# Moosic Baseline Model :: Iteration v1


* combining datasets as done prior [data preprocessing][data management]
* train test split [modelling]
* baseline model [discussion needed]
* baseline model sketch and implementation


## Importing required libraries




In [None]:
# IMPORT LIBRARIES


try:

    import numpy as np
    import pandas as pd
    import random as rnd
    #from tqdm.notebook import tqdm as tqdm
    from tqdm import tqdm 
    #from .autonotebook import tqdm as notebook_tqdm
    import time

    # databases - sql
    #from dotenv import dotenv_values
    #import sqlalchemy

    # visualisation
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    # split data - avoid data leakage
    from sklearn.model_selection import train_test_split

    # preprocessing, scaling
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler

    # modelling
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans, MiniBatchKMeans

    # cross validation, hyperparameter tuning
    #from surprise.model_selection import GridSearchCV
    from sklearn.model_selection import GridSearchCV

    # metrics
    from sklearn import metrics
    from sklearn.metrics import euclidean_distances, silhouette_score
    from sklearn.metrics.pairwise import sigmoid_kernel, cosine_similarity, pairwise_distances_argmin

    # high dimensional usage - dimensionality reduction
    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA

    # text converter/vectorizer
    from sentence_transformers import SentenceTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    # pipeline
    from sklearn.pipeline import Pipeline



except ImportError as error:
    print(f"Installation of the required dependencies necessary! {error}")

    %pip install numpy
    %pip install pandas
    #%pip install dotenv
    #%pip install sqlalchemy
    %pip install seaborn
    %pip install matplotlib
    %pip install scikit-learn
    %pip install tqdm
    %pip install ipywidgets
    %pip install scikit-surprise
    %pip install sentence-transformers

    print(f"Successful installation of the required dependencies necessary")


import warnings
warnings.filterwarnings('ignore')



# color scheme

- custom_palette = { violet: #2B2960, blue: #00A1D8, orange: #F08144, yellow: #FDC20C, green: #29A744, eggshell: #FFF4D5}


- custom_palette =[#2B2960, #00A1D8, #F08144, #FDC20C, #29A744, #FFF4D5]



In [None]:


# Project colour scheme.
# One list of hex codes drives both the matplotlib colormap and seaborn's
# default colour cycle.
hex_colors = [
    '#2B2960', '#00A1D8', '#F08144', '#FDC20C',
    '#29A744', '#32A87D', '#FFF4D5', '#E3CFBF',
]

# Discrete colormap built from the hex codes.
# pandas plot: colormap = custom_cmap_hex, plt/sns plot : cmap = custom_cmap_hex
custom_cmap_hex = ListedColormap(sns.color_palette(palette=hex_colors).as_hex())

# NOTE: sns.set_palette() only installs the default colour cycle and returns
# None, so `custom_palette` is None after this line.
custom_palette = sns.set_palette(hex_colors)

# show the colormap swatch as the cell output
custom_cmap_hex



In [None]:
# Re-apply the project colours as seaborn's default colour cycle
# (same hex codes as in the colour-scheme cell above the data loading).
hex_colors = [
    '#2B2960', '#00A1D8', '#F08144', '#FDC20C',
    '#29A744', '#32A87D', '#FFF4D5', '#E3CFBF',
]

# NOTE: sns.set_palette() returns None, so `custom_palette` is None and the
# last expression below renders no output.
custom_palette = sns.set_palette(hex_colors)

custom_palette


## Loading the data

In [None]:
# Read the raw Spotify artist and track tables.
df_artists = pd.read_csv('../data/moosic-raw/spotify_600k_artists.csv', low_memory=False)
df_tracks = pd.read_csv('../data/moosic-raw/spotify_600k_tracks.csv', low_memory=False)

# Report rows (observations) and columns (feature variables) of each table.
print(f"Artists data: There are {len(df_artists)} observations and {len(df_artists.columns)} feature variables ")
print('----------'*10)
print(f"Music Track data: There are {len(df_tracks)} observations and {len(df_tracks.columns)} feature variables ")
print('----------'*10)


## Data Overview Artists

| column | additional information |
|--------|------------------------|
| id | id of artist |
| followers | number of followers | 
| genres | genres associated with artist |
| name | name of artist |
| popularity | popularity of artist in range 0 to 100 |

## Data Overview Tracks

| column | additional information |
|--------|------------------------|
| id | id of track |
| name | name of track | 
| popularity | popularity of track in range 0 to 100 |
| duration_ms | duration of songs in ms |
| explicit | whether it contains explicit content or not |
| artists | artists who created the track | 
| id_artists | id of artists who created the track |
| release_date | date of release |
| danceability | how danceable a song is in range 0 to 1 |
| energy | how energized a song is in range 0 to 1 |
| key | The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1 |
| loudness | The overall loudness of a track in decibels (dB) |
| mode |  Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0 |
| speechiness | Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks |
| acousticness | A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic |
| instrumentalness | Predicts whether a track contains no vocals. "Ooh" and "aah" sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly "vocal". The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content |
| liveness | Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live |
| valence | A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry) |
| tempo | The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration | 
| time_signature | An estimated time signature. The time signature (meter) is a notational convention to specify how many beats are in each bar (or measure). The time signature ranges from 3 to 7 indicating time signatures of 3/4, to 7/4. | 

---

# Load merged data with genre 

- to be used for modelling

---

In [None]:
# Read the pre-processed table that already carries the `main_genres` column;
# this is the frame used for modelling.
df_musgenre = pd.read_csv('../data/processed/df_with_main_genres_v1.csv', low_memory=False)

# Report rows (observations) and columns (feature variables).
print(f"Music data: There are {len(df_musgenre)} observations and {len(df_musgenre.columns)} feature variables ")
print('----------'*10)

# preview the first two rows
df_musgenre.head(n=2)

In [None]:
## display the info of the data files

# spotify music data
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df_musgenre.info()
print('------'*10)


# display a transposed sample of the data file
display(df_musgenre.head().T)
print('------'*10)

# display the number of unique values in the music data
display(df_musgenre.nunique())
print('------'*10)








In [None]:
# # for the music tracks dataset: 
# # -  check for number of null values in each columns

# display(df_musgenre.isnull().sum())
# print('------'*10)

# # - drop null / NaNs values in columns

# df_musgenre = df_musgenre.dropna()
# print('------'*10)

# # - re-check for number of null values in each columns

# display(df_musgenre.isnull().sum())
# print('------'*10)

# # - check for duplicates

# display(df_musgenre.duplicated())
# print('------'*10)

# # get count of duplicated values in tracks dataframe

# display(df_musgenre.duplicated().value_counts())
# print('------'*10)

# # show top 5 rows of data (transposed)

# display(df_musgenre.head().T)
# print('------'*10)



In [None]:
# Frequency of every `main_genres` value; the literal string '[]' marks
# tracks for which no main genre is available (they are dropped further below).
display(df_musgenre["main_genres"].value_counts())
print('------'*10)


In [None]:
# preview the first two rows of the merged track/genre table
df_musgenre.head(2)

In [None]:
# Build the modelling frame from the merged table:
#   1. drop tracks whose `main_genres` is the literal string '[]' (empty genre list),
#   2. convert the columns to pandas' nullable dtypes,
#   3. drop every row that still contains a missing value.
# query() returns a new frame, so df_musgenre itself is left untouched.
moosic_data = (
    df_musgenre
    .query("main_genres != '[]' ")
    .reset_index(drop=True)
    .convert_dtypes()
    .dropna()
)

# Sanity check: a single count instead of a per-row boolean dump.
print(f"Rows with missing values remaining: {int(moosic_data.isnull().any(axis=1).sum())}")
print(moosic_data.shape)

moosic_data.head(2)


---

# Mood groups: valence values mapped to mood labels

* based on validated affect measures and their associated mood-word labels

    ```python 

    mood_1d_labels = {"depressed":(0.0, 0.125) , "sad":(0.125, 0.25) , "fear": (0.25, 0.375),  
                        "neutral":(0.375, 0.5), "content":(0.5, 0.625), 
                        "euphoric": (0.625, 0.75), "energetic":(0.75, 0.875), "happy":(0.875, 1.0)}


    ```
---



In [None]:
# mood targets (y) to music track_name and valence

# Based on the one dimensional (1D) mood indicator valence
# create new column 'mood_label' (text) and 'mood_class' (nominal value)

# Human-readable reference of the valence range behind each mood
# (not used by the binning below, which relies on `mood_valence_values`).
mood_val_labels = {"depressed":(0.0, 0.124) , "sad":(0.125, 0.249) , "fear_anxious": (0.250, 0.374),
                    "neutral":(0.375, 0.499), "calm_content":(0.500, 0.624),
                    "euphoric": (0.625, 0.749), "energetic":(0.750, 0.874), "happy":(0.875, 1.0)}

mood_1d_labels = ["depressed", "sad", "anxious",  "neutral", "calm", "euphoric", "energetic", "happy"]

# nine bin edges -> eight equally wide valence bins
mood_valence_values =  [0.0 , 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
mood_1d_class = [0, 1, 2, 3, 4, 5, 6, 7]


# add targets for evaluation for clusters later
# pd.cut builds right-closed bins, so without include_lowest=True the first bin
# is (0.0, 0.125] and every track with valence == 0.0 would get a missing
# mood_label / mood_class.
moosic_data["mood_label"] = pd.cut(
    moosic_data["valence"], bins=mood_valence_values, labels=mood_1d_labels, include_lowest=True
).astype('string')
moosic_data["mood_class"] = pd.cut(
    moosic_data["valence"], bins=mood_valence_values, labels=mood_1d_class, include_lowest=True
).astype('Int64')

moosic_data.head(2)

In [None]:
# save current 1-D moosic (mood-music) dataset

#moosic_data.to_csv('../data/processed/moosic_data.csv', chunksize=len(moosic_data)//5, index=False)





---

# Splitting the dataset for baseline modelling : 

* train, test split
* drop columns = ['artists_id', 'track_id', 'artist_name', 'genres', 'release_date', 'main_genres', 'explicit', 'mood_label', 'track_name', 'mood_class']

<!---
['artists_id', 'track_id', 'artist_name', 'genres', 
                        'duration_ms', 'danceability', 'energy', 'acousticness', 'instrumentalness', 'liveness', 'valence',	
                        'tempo', 'time_signature','followers', 'artist_popularity',	'track_popularity', 'main_genres',
                        'release_date', 'main_genres', 'explicit', 'track_name', 'mood_label', 'mood_class']

--->


In [None]:
# Train/test split, done before any scaling or fitting to avoid data leakage.

features = list(moosic_data.columns)

# X: everything except identifiers, free text, popularity/meta columns and the
# mood targets themselves.
X = moosic_data.loc[:, features].drop(
    columns=['artists_id', 'track_id', 'artist_name', 'genres', 'main_genres',
             'duration_ms', 'liveness', 'time_signature','followers', 'artist_popularity', 'track_popularity',
             'release_date',  'explicit', 'track_name',  'mood_label', 'mood_class']
)

# y: track name plus the two valence-derived mood targets.
y = moosic_data.loc[:, ['track_name', 'mood_label', 'mood_class']]

# 75 % train / 25 % test, shuffled with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42, test_size=0.25
)



In [None]:
# Test data: renumber the rows 0..n-1 after the shuffled split, keeping
# features and targets in the same row order.
X_test, y_test = (part.reset_index(drop=True) for part in (X_test, y_test))



In [None]:
# train data : input features

# renumber the rows 0..n-1; the shuffled split leaves the original index behind
X_train = X_train.reset_index(drop=True)

# preview the first two rows
X_train.head(2)



In [None]:
# train data : target? music track name, 
# classify by mood features later based on indicators: 1-D (valence), 2-D (valence -energy)

# renumber the rows 0..n-1 so the targets line up positionally with X_train
y_train = y_train.reset_index(drop=True)

# preview the first two rows
y_train.head(2)

---
# MODEL SKETCH : baseline model 

---

### Music track baseline content based recommender system by mood categories

* suggests data (music) based on user's interests? or users mood?
* insights and filter based on feature variables from our data
* ??metric: cosine similarity to measure the similarity of tracks/genres etc

<br>

### Algorithms and Options

* Kmeans clustering algorithm (unsupervised) ~ mini batch

* t-SNE for dimensionality reduction and visualisation based on our mood labels

* similarity modelling based on 1D mood indicator, valence (V)

* baseline focus: content-based recommender system based on user input query 
    - if-else construct based on mood clusters
    - output playlist with 10 randomized music track recommendations based on query
    - mood_choices are: the user's current mood cluster and their preferred mood for the playlist.

* optional baseline: 
    - music track name clustering and similarity measure, 
    - then get mood of clusters based on the average valence of the clusters gotten from the similarity

* options: 
    - svm 
    - text vectorization (cosine similarity)

<br>


---


---

[algo] define  model based on similarity of possible target features and focus

---


In [None]:
# defining models for baseline 
# baseline clustering 


def dimension_reduction(train_data):
    """Project `train_data` onto its first two principal components.

    Parameters
    ----------
    train_data : array-like of shape (n_samples, n_features)
        Numeric feature matrix handed to ``PCA.fit_transform``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 2)
        The data expressed in the two leading principal components.
    """
    return PCA(n_components=2).fit_transform(train_data)


def mini_batch_kmeans(train_data, test_data, params = {'n_clusters': 8}, *args, **kwargs):
    """Fit MiniBatchKMeans on `train_data` and assign `test_data` to the fitted clusters.

    Parameters
    ----------
    train_data : array-like
        Feature matrix the clusterer is fitted on.
    test_data : array-like
        Feature matrix whose rows are assigned to the nearest fitted centroid.
    params : dict
        Keyword arguments forwarded to ``MiniBatchKMeans``. The default dict is
        shared between calls; it is only read here, never modified.
    *args, **kwargs
        Accepted but not used.

    Returns
    -------
    tuple
        ``(predictions, cluster_labels, cluster_centers, sse)``: cluster ids of
        the test rows, cluster ids of the training rows, the centroids and the
        fitted model's inertia.
    """
    clusterer = MiniBatchKMeans(**params).fit(train_data)

    # nearest-centroid assignment of the held-out rows
    test_assignments = clusterer.predict(test_data)

    return (
        test_assignments,
        clusterer.labels_,
        clusterer.cluster_centers_,
        clusterer.inertia_,
    )



In [None]:
# Baseline cluster and dimensionality reduction model

def baseline_model(x_train, y_train, x_test,  cluster_params, sample_size = 50000, *args, **kwargs):
    """Cluster a sample of the training data and visualise the clusters with t-SNE.

    Steps: take the first `sample_size` rows, time a 2-component PCA, fit
    Mini-Batch K-Means, embed the features with 2-D t-SNE, plot the embedding
    coloured by cluster id and attach the targets to the clustered frame.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features (the caller passes the min-max scaled frame).
    y_train : pandas.DataFrame
        Training targets; must contain 'track_name', 'mood_label', 'mood_class'
        and be in the same row order as `x_train`.
    x_test : pandas.DataFrame
        Test features, handed to the clusterer's ``predict`` as-is.
        NOTE(review): nothing here rescales it - confirm the caller passes data
        on the same scale as `x_train`.
    cluster_params : dict
        Keyword arguments for ``MiniBatchKMeans``.
    sample_size : int
        Number of leading rows used from each input.
    *args, **kwargs
        Accepted but not used.

    Returns
    -------
    tuple
        ``(x_data, final_data, cluster_labels, cluster_centers, sse)`` where
        `x_data` is the sampled features plus a 'k_clusters' column and
        `final_data` additionally carries the three target columns.
    """
    # .copy(): the cluster column added below must not be written into a slice
    # of the caller's frame (avoids SettingWithCopyWarning / accidental sharing)
    x_data = x_train.head(sample_size).copy()
    y_data = y_train.head(sample_size)
    x_test = x_test.head(sample_size)

    # dimensionality reduction (only timed; the projection is not used further)
    pca_start_time = time.time()
    dimension_reduction(x_data)
    pca_end_time = time.time()

    # mini batch k-means (the test-set predictions are not used further)
    kme_start_time = time.time()
    _, cluster_labels, cluster_centers, sse = mini_batch_kmeans(x_data, x_test, params = cluster_params)
    kme_end_time = time.time()

    # 2D t-SNE on the features only: it is fitted BEFORE the cluster-id column
    # is attached, otherwise the integer ids (0..k-1) would dominate the
    # distances between [0, 1]-scaled features and fake a clean separation.
    tsne = TSNE(n_components = 2, random_state = 42)
    tsne_start_time = time.time()
    baseline_tsne = tsne.fit_transform(x_data)
    tsne_end_time = time.time()

    x_data["k_clusters"] = pd.Series(cluster_labels, index= x_data.index)

    pca_train_time = pca_end_time - pca_start_time
    kme_train_time = kme_end_time - kme_start_time
    tsne_train_time = tsne_end_time - tsne_start_time

    print(f"Time taken for PCA dimension reduction: {pca_train_time:.2f} seconds")
    print(f"Time taken for Mini-Batch K-Means: {kme_train_time:.2f} seconds")
    print(f"Time taken for t-SNE: {tsne_train_time:.2f} seconds")

    # baseline 2D plot : visualize the clustered data
    fig, ax = plt.subplots(figsize = (16, 10))
    points = ax.scatter(baseline_tsne[:, 0], baseline_tsne[:, 1], c=cluster_labels, cmap=custom_cmap_hex, s=50)
    ax.set_title('Baseline Music - Mood Clusters ', pad=15, fontsize = 20, weight = 'bold', color='#2B2960')
    # the colour bar is the key for the cluster ids (there are no labelled
    # artists, so an empty legend is not requested)
    fig.colorbar(points, ax=ax)

    # t-SNE axes carry no interpretable units
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.show()
    #plt.savefig('../images/kclusters_count_plot.png', transparent=True)

    # attach the targets positionally (.values) to the clustered features
    final_data = x_data.copy(deep=True)
    final_data["track_name"] =  y_data["track_name"].values
    final_data["mood_label"] =  y_data["mood_label"].values
    final_data["mood_class"] =  y_data["mood_class"].values

    return x_data, final_data, cluster_labels, cluster_centers, sse #, metrics





In [None]:
# train model

# global variables
epochs = 100  # NOTE: not referenced by any other cell shown in this notebook

# Min-max scale the training features and wrap the scaled array back into a
# DataFrame that keeps the original column names.
# NOTE(review): the fitted scaler is not kept, so X_test is never transformed
# with it - confirm whether the test features should be scaled the same way.
x_train_col = list(X_train.columns)
scaled_data = MinMaxScaler().fit_transform(X_train)
x_train_df = pd.DataFrame(data=scaled_data, columns=x_train_col)


In [None]:

# Mini-Batch K-Means settings for the baseline run:
# 8 clusters, batches of 5000 rows, fixed seed, k-means++ initialisation.
cluster_params = dict(
    n_clusters=8,
    batch_size=5000,
    random_state=42,
    init='k-means++',  # alternative: 'random'
)

# Fit and plot the baseline on the first 50,000 training rows.
(clustered_data, final_data, cluster_labels, cluster_centers, sse) = baseline_model(
    x_train_df, y_train, X_test,
    cluster_params=cluster_params,
    sample_size=50000,
)


In [None]:
# clus_params = {
#     'n_clusters' : 8,
#     'batch_size' : 5000,
#     'random_state' : 42,
#     'init' : 'k-means++' #random
# }


# clus_data, f_data, clus_labels, clus_centers, sse_0 = baseline_model(x_train_df, y_train, X_test, cluster_params=clus_params, sample_size = 50000)


In [None]:
# show data: sampled training features plus the 'k_clusters' column added by baseline_model

clustered_data.head(2)



In [None]:
# cluster id assigned to each sampled training row (ids 0 to 7 for n_clusters = 8)

cluster_labels

In [None]:
# cluster centres

# print the first coordinate of the centroid of every cluster id that occurs
# in the training labels
for center in cluster_centers[np.unique(cluster_labels)]:
    print(center[0])

# full centroid matrix as the cell output
cluster_centers

In [None]:
# `.n` is not an attribute of a numpy array and raised AttributeError (it looks
# like an unfinished tab-completion); show the number of array dimensions instead
cluster_labels.ndim

In [None]:

# list every cluster id that occurs in the labels together with its centroid
# (the original loop header was incomplete and did not parse)
for i in np.unique(cluster_labels):
    print(i, cluster_centers[i])



In [None]:
# how many unique clusters were actually assigned to the sampled training rows

clustered_data['k_clusters'].nunique()



In [None]:
# Remove every row of the clustered frame that contains a missing value, then
# confirm that the three columns used for the evaluation are complete.
null_clustered_rows = final_data.index[final_data.isna().any(axis=1)]
final_data = final_data.drop(index=null_clustered_rows)

# each line prints False when the column has no missing value left
print(final_data['k_clusters'].isna().any())
print(final_data['mood_label'].isna().any())
print(final_data['mood_class'].isna().any())


In [None]:
# show final data: features, k-means cluster id ('k_clusters') and the
# valence-derived targets ('mood_label', 'mood_class') of each music track

display(final_data.head(2))



In [None]:
## Targets and predictions used by the evaluation cells below

y_true_label = final_data.loc[:, 'mood_label']   # mood name derived from valence
y_true_class = final_data.loc[:, 'mood_class']   # integer class derived from valence
y_pred_clusters = final_data.loc[:, 'k_clusters']  # k-means cluster id

# number of entries in each; cluster_labels is the raw array returned by
# baseline_model, the other three come from final_data after the null drop
print(len(cluster_labels))
print(len(y_true_label))
print(len(y_true_class))
print(len(y_pred_clusters))

In [None]:

# Is any / every true mood class missing?
# The original passed the bound methods `.any` / `.all` to pd.isna without
# calling them, which always printed False regardless of the data.
print(pd.isna(y_true_class).any())
print(pd.isna(y_true_class).all())

# container type handed to the sklearn metrics
print(type(y_true_class.to_numpy()))

In [None]:

# Is any / every predicted cluster id missing?
# (`.any` / `.all` are now applied to the isna mask instead of being passed
# uncalled, and the type check targets the predictions, not the true classes.)
print(pd.isna(y_pred_clusters).any())
print(pd.isna(y_pred_clusters).all())
print(type(y_pred_clusters.to_numpy()))


In [None]:
# # Classification
# # simple baseline classifier
# # univariate data? k_clusters? true labels = mood_class
# # bi-variate? valence - k_clusters? true labels = mood_class

# from sklearn.linear_model import LogisticRegression


# clf_x = final_data['k_clusters'].to_numpy().reshape(-1, 1)
# clf_y = final_data['mood_class'].to_numpy().reshape(-1, 1)
# clf = LogisticRegression(random_state=42).fit(clf_x, clf_y)
# clf.predict(clf_x[:2, :])
# clf.predict_proba(clf_x[:2, :])
# clf.score(clf_x, clf_y)


---

# Clustering/Classification : Error Analysis and Evaluation

* Similarity measures
    - cosine similarity
    - euclidean similarity


* Offline Metrics
    Based on the mood clustering
    - Classification report
    - RMSE
    - Precision
    - ROC-curve?
    - F1-Score
    - Silhouette score 
    - Other cluster metrics


---

In [None]:
# Evaluation metrics for the mood clustering

# 1-D integer arrays: valence-derived mood class (reference) and k-means
# cluster id (prediction). An explicit dtype avoids handing object arrays from
# the nullable Int64 column to scikit-learn.
true_classes = y_true_class.to_numpy(dtype='int64')
pred_clusters = y_pred_clusters.to_numpy(dtype='int64')

# External metrics: compare the predicted partition with the reference one.
clustering_metrics = [
        # (n_samples, )
        metrics.rand_score,
        metrics.fowlkes_mallows_score,
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.mutual_info_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score
    ]

print(f"For the mini-batch kmeans clustering with:  ")

for metric in clustering_metrics:
    score_class = metric(true_classes, pred_clusters)
    print(f"{metric.__name__} of: {score_class:.2f} ")

print('_______'*10)

# Internal metrics: signature is metric(X, labels) with X the feature matrix
# that was clustered and labels a 1-D array. The original passed the true
# classes as X, so it scored a 1-D label column instead of the clustered
# feature space.
cluster_scores_metrics = [
        # X: (n_samples, n_features), labels: (n_samples, )
        metrics.silhouette_score,
        metrics.calinski_harabasz_score,
    ]

feature_matrix = (
    final_data
    .drop(columns=['k_clusters', 'track_name', 'mood_label', 'mood_class'])
    .to_numpy(dtype=float)
)

# The silhouette needs all pairwise distances; score a fixed random subsample
# so the cell stays fast and reproducible.
cluster_scores_kwargs = {
    'silhouette_score': {'sample_size': min(10000, len(pred_clusters)), 'random_state': 42},
}

print(f"For the mini-batch kmeans clustering with:  ")

for metric in cluster_scores_metrics:
    score_class = metric(feature_matrix, pred_clusters, **cluster_scores_kwargs.get(metric.__name__, {}))
    print(f"{metric.__name__}  of: {score_class:.2f} ")

print('_______'*10)







In [None]:
# similarity between the predicted and actual mood clusters
# by what percentage are they similar?
# The Rand index is computed from the current run instead of being typed in by
# hand, so the printed rate cannot go stale when the data or the model changes.
# It is the share of sample PAIRS on which both partitions agree, not the share
# of tracks placed in the "right" cluster.

RI = metrics.rand_score(y_true_class.to_numpy(dtype='int64'), y_pred_clusters.to_numpy(dtype='int64'))
RI_rate = RI * 100
print(f"The similarity rate between predicted and true clusters is {RI_rate:.2f} %")
print(f"The baseline model predicted clusters is approximately {RI_rate:.2f} % similar to the actual music-mood (1-D) clusters")




---

#### analysis of the metrics

For the mini-batch kmeans clustering with:  
* rand_score of: 0.78 — looks good in isolation, but the plain Rand index is not corrected for chance and must be read together with the adjusted_rand_score below
* fowlkes_mallows_score of: 0.17 — low; the predicted clusters match the true mood classes poorly
* homogeneity_score of: 0.09 — low; the clusters are not homogeneous with respect to the true mood_class labels
* completeness_score of: 0.09 — low; data points of the same mood class are split across several predicted clusters
* v_measure_score of: 0.09 — low overall clustering quality (harmonic mean of the two scores above)
* mutual_info_score of: 0.18 — little information is shared between the true and the predicted clusters
* adjusted_rand_score of: 0.04 — agreement is barely above what is expected by chance
* adjusted_mutual_info_score of: 0.09 — low level of agreement beyond what is expected by chance

<br>

For the mini-batch kmeans clustering with:  
* silhouette_score of: -0.22 , negative, the clusters overlap and are not well separated
* calinski_harabasz_score of: 2729.40 , better separation between clusters? , low within-cluster variance due to high value

<br>

* rand_score measures: the similarity of the predicted clusters and the true clusters for the mood music data, 0 (not a good match/clustering) to 1 (perfect identical to true clusters) 
* fowlkes_mallows_score: the similarity of the predicted clusters and the true clusters for the mood music data, 0 (not a good match/clustering) to 1 (perfect) 
* homogeneity_score: a measure of how much each cluster contains only data points that belong to a single class
* completeness_score: a measure of how well all cluster data points that belong to the same class are assigned to the same cluster
* v_measure_score: the harmonic mean of homogeneity and completeness, a balanced measure of the quality of clusters 
* mutual_info_score: the measure of the amount of information shared between true and predicted clusters
* adjusted_rand_score: a variation of the rand index score that accounts for chance
* adjusted_mutual_info_score: a variation of the mutual info score that accounts for chance
* silhouette_score : it measures the quality of clusters by evaluating how similar each data point is to its own cluster compared to other clusters
* calinski_harabasz_score: the variance ratio criterion, it measures the cluster quality based on between-cluster and within-cluster variance



<br>

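In summary, the model 

* treats about 78 % of all pairs of music tracks consistently with the valence-based mood classes (Rand index 0.78: both partitions either group a pair together or keep it apart); this is pairwise agreement, not the share of tracks assigned to the right mood cluster
* thus the baseline model's predicted clusters are approximately 78 % similar to the actual music-mood (1-D) clusters by this measure, while the chance-adjusted scores (adjusted_rand_score 0.04, adjusted_mutual_info_score 0.09) show that the agreement is only slightly better than random
* also shows that the clusters are not well separated and that many music tracks belonging to the same mood class were not assigned to the same cluster
* the music track samples seem to belong to multiple mood classes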



---


In [None]:
# classification_metrics = [
#         # (n_samples, n_classes)
#         #metrics.roc_auc_score,
#         #metrics.accuracy_score,
#         #metrics.balanced_accuracy_score,
#         #metrics.f1_score,
#         #metrics.jaccard_score,
#         metrics.classification_report,
#         #metrics.confusion_matrix
#     ]

# k_clusters = (y_true_class.nunique())
# n_samples = (y_true_class.to_numpy().size)//k_clusters

# print(f"For the label classification based on clustering :  ")

# for metric in classification_metrics:
#     # reshape (n_samples, ) to (n_samples, k_clusters)
    
#     #score_class = metric(y_true_class.to_numpy().reshape((n_samples, k_clusters)), y_pred_clusters.to_numpy().reshape((n_samples, k_clusters)))
#     score_class = metric(y_true_class.to_numpy().reshape((n_samples, k_clusters)), y_pred_clusters.to_numpy().reshape((n_samples, k_clusters)))

#     print(f"{metric.__name__} score of: {score_class:.2f} ")





---

# Predictions and Results

* predict valence clusters on test data
* ...
* ...

<br>


---

---

# Visualisations

* mood clusters plot 
* prediction plots
* etcs

<br>

---

In [None]:
# show data again: first two rows of the clustered frame with the mood targets

final_data.head(2) 

In [None]:
# long format for plotting: one row per (track, source) pair, where the source
# column says whether the value is the k-means cluster id or the true mood class
final_data_plot = final_data[['k_clusters', 'mood_class']].melt(var_name='Music mood clusters', value_name= 'Mood labels' )
final_data_plot

In [None]:

# plot count of data samples in clusters by predicted mood vs actual mood label for the music tracks

mood_names = ["depressed", "sad", "anxious",  "neutral", "calm", "euphoric", "energetic", "happy"]

# long format: one row per (track, source) pair; source = k-means cluster id or true mood class
final_data_plot = final_data[['k_clusters', 'mood_class']].melt(var_name='Music mood clusters', value_name= 'Mood labels' )

fig, ax = plt.subplots(figsize=(16, 10))

# Ids 0-7 are translated to mood names before plotting so seaborn builds the
# legend itself: relabelling afterwards with plt.legend(labels=...) pairs the
# names with whatever artists come first on the axes, not with the hue colours.
# NOTE: k-means cluster ids are arbitrary; showing them under mood names follows
# the original figure and does not mean cluster i corresponds to mood i.
sns.countplot(
    data=final_data_plot.assign(**{'Mood labels': final_data_plot['Mood labels'].map(dict(enumerate(mood_names)))}),
    y='Music mood clusters',
    hue='Mood labels',
    hue_order=mood_names,
    palette=dict(zip(mood_names, hex_colors)),  # explicit colours; `custom_palette` is None
    ax=ax,
)

# rows appear in melt order: k_clusters first, mood_class second
plt.yticks([0, 1], ["Predicted Clusters", "Actual Clusters"], rotation=75)

# annotate every bar with its count
for container in ax.containers:
    ax.bar_label(container)

plt.xlabel(' ')
plt.ylabel(' ')

plt.title("Mood of Music : Valence-Based", fontsize=28, color='#2B2960', pad=15)

plt.xticks(color='#2B2960',fontsize=16)
plt.yticks(color='#2B2960', fontsize=16)

# legend outside the axes, built from the hue artists seaborn created
ax.legend(
    title= "Mood of Music",
    loc='center left',
    bbox_to_anchor=(1.0, 0.5),
    fontsize='x-large',
    title_fontsize='xx-large'
    )

# mood_of_music_val1dbased.png

In [None]:

# Count of tracks per mood label as a horizontal bar chart.
# A fresh figure/axes pair is created and handed to pandas: previously
# plt.figure() produced an empty extra figure (DataFrame.plot opened its own)
# and set_facecolor() recoloured the axes left over from the previous cell.
fig, ax = plt.subplots(figsize=(16, 10))
ax.set_facecolor("#FFF4D5")

final_data.groupby('mood_label')[['track_name']].count().plot(
    kind='barh', colormap = custom_cmap_hex, #color ='#2B2960',
    xlabel=' ', ylabel=' ', legend= False, xlim = (0, 9000), rot=0, ax=ax,
)

# annotate every bar with its count
for container in ax.containers:
    ax.bar_label(container)

ax.set_title("Count of Music Tracks by Moods", pad=15, fontsize = 20,color='#2B2960')

fig.savefig('../images/mood_count_plot.png', transparent=True)


In [None]:
# Repeat the baseline clustering on a smaller sample: the first 10,000 training rows.
cluster_params_1 = dict(
    n_clusters=8,
    batch_size=5000,
    random_state=42,
    init='k-means++',  # alternative: 'random'
)

(clustered_data_1, final_data_1, cluster_labels_1, cluster_centers_1, sse_1) = baseline_model(
    x_train_df, y_train, X_test,
    cluster_params=cluster_params_1,
    sample_size=10000,
)


---
## Baseline Recommender System : 

* based on music-mood clusters with mini-batch kmeans + tsne visuals

---

In [None]:
# content based recommendation system
# - music track recommendation in a random order using valence clusters based on t-sne baseline model 

# print recommendations based on mood cluster choice

playlist_length = 5 # 10 

# mood name -> cluster id used to filter the clustered tracks
mood_clusters = {
            "depressed" : 0, 
            "sad" : 1, 
            "anxious" : 2,  
            "neutral" : 3, 
            "calm" : 4, 
            "euphoric" : 5, 
            "energetic" : 6, 
            "happy" : 7
}


user_current_mood = '1' # sad
moodsic_playlist_choice = 'cluster 4'

mood_choice = {
        'user_current_mood' :'sad', #sad
        'moodsic_playlist_choice':'happy' #happy 
}


def baseline_recommender(playlist_length= playlist_length, mood_clusters = mood_clusters, mood_choice= mood_choice, *args, data=None, **kwargs):
    """Print and return a random playlist drawn from the cluster of the chosen mood.

    Parameters
    ----------
    playlist_length : int
        Number of tracks requested. Fewer are returned when the cluster holds
        fewer tracks.
    mood_clusters : dict
        Mapping mood name -> cluster id.
    mood_choice : dict
        Must provide 'user_current_mood' and 'moodsic_playlist_choice'; each is
        a mood name from `mood_clusters` or one of its cluster ids.
    data : pandas.DataFrame, optional (keyword only)
        Clustered tracks with the columns 'track_name', 'k_clusters',
        'mood_class' and 'mood_label'. Defaults to the notebook-level
        `final_data`.
    **kwargs
        'user_current_mood' / 'moodsic_playlist_choice' given as keywords
        override the corresponding entry of `mood_choice` (so calling with
        ``**mood_choice`` works); other keywords are ignored.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, restricted to the four columns listed above.

    Raises
    ------
    ValueError
        If either mood is neither a known mood name nor a known cluster id.

    Notes
    -----
    Only the playlist choice selects tracks; the current mood is validated but
    not used otherwise. Tracks are selected by k-means cluster id, and those
    ids are not guaranteed to line up with the valence-based mood classes.
    """
    def _cluster_id(mood):
        # accept a mood name ...
        if isinstance(mood, str):
            if mood in mood_clusters:
                return mood_clusters[mood]
        # ... or the cluster id itself
        elif mood in mood_clusters.values():
            return mood
        raise ValueError(
            f"Input mood cluster choice is unavailable: {mood!r}. "
            f"Choose one of {list(mood_clusters)} or the matching cluster id."
        )

    # keyword overrides take precedence over the mood_choice dict
    selection = dict(mood_choice)
    for key in ('user_current_mood', 'moodsic_playlist_choice'):
        if key in kwargs:
            selection[key] = kwargs[key]

    _cluster_id(selection['user_current_mood'])  # validation only
    choice = _cluster_id(selection['moodsic_playlist_choice'])

    if data is None:
        data = final_data  # notebook-level clustered frame

    query_data = data[data['k_clusters'] == choice]

    # sampling without replacement cannot return more rows than exist
    n_tracks = min(playlist_length, len(query_data))

    print("________"*10)

    print(f"Recommended music tracks based on {selection['moodsic_playlist_choice']} : \n ")

    print(f" Enjoy these {n_tracks} music tracks from spotify")
    print("             "*10)
    n_mood_music = query_data[['track_name', 'k_clusters', 'mood_class', 'mood_label']].sample(
        n=n_tracks, random_state = 42, replace=False
    )

    print("________"*10)
    print(n_mood_music['track_name'])

    print("________"*10)
    print("________"*10)

    # option : print dataframe with music name, by which artist and spotify url

    return n_mood_music



In [None]:
# display recommendations and print recommendation dataframe

# The mood selection is passed as the `mood_choice` argument. Unpacking it with
# ** turned its keys into extra keyword arguments that the function swallowed
# in **kwargs, so the call only worked through the default value.
n_mood_music = baseline_recommender(playlist_length = playlist_length, mood_choice = mood_choice)

print("________"*10)

n_mood_music



In [None]:

# plot count of recommended tracks by predicted cluster vs actual mood class
# (rewritten: the original cell did not parse - `,,` in the first plt.bar call -
# and plt.bar has no data/y/hue interface; seaborn's countplot draws the same
# grouped counts as in the clustering section)

mood_names = ["depressed", "sad", "anxious",  "neutral", "calm", "euphoric", "energetic", "happy"]

# long format: one row per (track, source) pair; source = k-means cluster id or true mood class
rec_data_plot = n_mood_music[['k_clusters', 'mood_class']].melt(var_name='Music mood clusters', value_name= 'Mood labels' )

fig, ax = plt.subplots(figsize=(16, 10))

# ids 0-7 are translated to mood names so the legend entries always match the
# bar colours, even when only a few moods occur among the recommended tracks
sns.countplot(
    data=rec_data_plot.assign(**{'Mood labels': rec_data_plot['Mood labels'].map(dict(enumerate(mood_names)))}),
    y='Music mood clusters',
    hue='Mood labels',
    hue_order=mood_names,
    palette=dict(zip(mood_names, hex_colors)),
    ax=ax,
)

# rows appear in melt order: k_clusters first, mood_class second
plt.yticks([0, 1], ["Predicted Clusters", "Actual Clusters"], rotation=75)

# annotate every bar with its count
for container in ax.containers:
    ax.bar_label(container)

plt.xlabel(' ')
plt.ylabel(' ')

# title follows the playlist mood that was requested
plt.title(f"Mood of Music : {str(mood_choice['moodsic_playlist_choice']).capitalize()}", fontsize=28, color='#2B2960', pad=15)

plt.xticks(color='#2B2960',fontsize=16)
plt.yticks(color='#2B2960', fontsize=16)

# legend outside the axes, built from the hue artists seaborn created
ax.legend(
    title= "Mood of Music",
    loc='center left',
    bbox_to_anchor=(1.0, 0.5),
    fontsize='x-large',
    title_fontsize='xx-large'
    )

# mood_of_music_val1dbased.png

---

# Recommendation : Error Analysis and Evaluation

* Offline Metrics
    Based on the mood clustering
    - Classification report
    - Similarity score 

* Online Metrics
    Based on user usage: mood choices and filtering
    - Click through rate
    - A-B test
    - Churn
    - Product usage time

---

In [None]:
# shape of the cluster-id array of the recommended tracks: (number of tracks,)
(n_mood_music['k_clusters'].to_numpy()).shape

In [None]:
# independent copy of the recommended playlist, so later edits do not touch n_mood_music
baseline_rec_preds = n_mood_music.copy(deep=True)

baseline_rec_preds


In [None]:
# check the rand index score (and related agreement scores) for the recommended tracks
from sklearn.metrics import rand_score


rec_metrics = [
        # (n_samples, )
        metrics.rand_score,
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.mutual_info_score,
    ]

# Reference labels go first, predictions second - the same order as in the
# clustering evaluation. homogeneity and completeness are not symmetric, so the
# previously swapped arguments reported each one under the other's name.
rec_true_classes = n_mood_music['mood_class'].to_numpy(dtype='int64')
rec_pred_clusters = n_mood_music['k_clusters'].to_numpy(dtype='int64')

# NOTE: the recommender draws every track from one single cluster, so the
# predicted labels are constant here and these scores carry little information.
print(f"For the baseline recommendation system:  ")

for metric in rec_metrics:
    score_class = metric(rec_true_classes, rec_pred_clusters)
    print(f"{metric.__name__} of: {score_class:.2f} ")

print('_______'*10)


#similarity_metric = rand_score()
#RI_recsys = similarity_metric(n_mood_music['k_clusters'].to_numpy(), n_mood_music['mood_class'].to_numpy())
#RI_recsys = rand_score(n_mood_music['k_clusters'].to_numpy(), n_mood_music['mood_class'].to_numpy())
#RI_rate_rec = RI_recsys * 100
#print(f"The baseline model predicted clusters for the {similarity_metric} is approximately {RI_rate_rec:.2f} % similar to the actual music-mood (1-D) clusters")




