In [44]:
import os
import pickle
import warnings
from os import chdir

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [45]:
def clipAndNormalize(features, columns=None, lower_q=0.01, upper_q=0.99):
    """Clip outliers in the audio-feature columns, then standardize all features.

    Each column in ``columns`` is clipped to its own ``lower_q`` / ``upper_q``
    quantiles, computed from ``features`` itself. Every column except ``id``
    is then scaled with a freshly fitted ``StandardScaler``.

    Both the clipping bounds and the scaler statistics come from the rows
    passed in, so calling this on a frame that still contains test rows lets
    those rows influence the preprocessing.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the feature columns; an optional ``id`` column is
        ignored. The frame is not modified.
    columns : sequence of str, optional
        Columns to clip. Defaults to the eleven audio-feature columns the
        original implementation clipped.
    lower_q, upper_q : float, optional
        Quantiles used as the lower and upper clipping bounds (default 1st
        and 99th percentile).

    Returns
    -------
    (pandas.DataFrame, array)
        The scaled features as a DataFrame (same column names, fresh default
        index) and the raw array returned by ``StandardScaler.fit_transform``.

    Raises
    ------
    KeyError
        If one of ``columns`` is missing from ``features``.
    """
    if columns is None:
        columns = (
            'danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence',
            'tempo', 'duration_ms', 'time_signature',
        )

    # Work on a copy so the caller's frame is not clipped as a hidden side effect.
    clipped = features.copy()
    for column in columns:
        series = clipped[column]
        clipped[column] = series.clip(lower=series.quantile(lower_q),
                                      upper=series.quantile(upper_q))

    # The track id is an identifier, not a feature, so keep it out of the scaler.
    if 'id' in clipped.columns:
        rawfeatures = clipped.drop(columns=['id'])
    else:
        rawfeatures = clipped

    scaler = StandardScaler()
    preprocessedFeatures = scaler.fit_transform(rawfeatures)
    # Built without an explicit index, so the result carries a default
    # 0..n-1 index rather than the index of `features`.
    preprocessedFeaturesDF = pd.DataFrame(preprocessedFeatures, columns=rawfeatures.columns)

    return preprocessedFeaturesDF, preprocessedFeatures

In [46]:
def makeCategorical(df):
    """Add an integer ``mood_code`` column derived from the ``mood`` column.

    Codes are the positions of each mood in ``mood_order`` (``'sad'`` -> 0,
    ..., ``'depressed'`` -> 7). A mood that is not listed in ``mood_order``
    (including a missing value) receives the code -1; a warning is emitted
    when that happens so such rows are not fed to a model unnoticed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mood`` column. The ``mood_code`` column is added to
        this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``mood_code``.
    """
    mood_order = ['sad', 'angry', 'energetic', 'excited', 'happy', 'content', 'calm', 'depressed']

    # An ordered categorical with explicit categories fixes the label -> code mapping.
    cat = pd.Categorical(df['mood'], categories=mood_order, ordered=True)
    codes = cat.codes

    # pandas encodes values outside `categories` as -1; surface that instead of
    # letting an extra "-1" class appear silently downstream.
    unknown = codes == -1
    if unknown.any():
        bad_labels = sorted({str(label) for label in df.loc[unknown, 'mood'].unique()})
        warnings.warn(
            f"makeCategorical: {int(unknown.sum())} row(s) have a mood outside "
            f"mood_order and were coded -1: {bad_labels}",
            stacklevel=2,
        )

    df['mood_code'] = codes
    return df



In [47]:
# Directory that holds the merged emotion / audio-feature CSVs. Set the
# ASPIRE_DATA_DIR environment variable to run the notebook on another machine;
# the default is the original author's local folder.
DATA_DIR = os.environ.get(
    'ASPIRE_DATA_DIR',
    'C:/Users/mlar5/OneDrive/Desktop/Code Folder/Python Projects/IRL projects/Aspire - Affective Computing Project/Playlists Data/Audio Features/emotion joint data',
)
chdir(DATA_DIR)

In [48]:
# Load the merged emotions table; the relative path resolves against the
# working directory set by the chdir cell.
emotionsDF = pd.read_csv('Merged Emotions Data3.csv')

In [49]:
# Add the integer mood_code column derived from each row's 'mood' label.
emotionsDF = makeCategorical(emotionsDF)

In [50]:
# Number of songs per mood_code before any balancing.
emotionsDF['mood_code'].value_counts()

7    3783
6    1218
0    1020
5     773
3     720
1     694
2     631
4     405
Name: mood_code, dtype: int64

In [51]:
# Balance the classes by down-sampling: keep at most MAX_SONGS_PER_MOOD songs
# per mood_code. Moods with fewer songs keep all of their rows.
MAX_SONGS_PER_MOOD = 500

# Collect one sample per mood and concatenate once, so column dtypes are
# preserved instead of starting from an empty object-dtype frame.
# sort=False visits the moods in order of first appearance, like .unique().
# random_state is fixed for every mood (including the small ones, which are
# only shuffled) so the row order, and the later train/test split, is
# reproducible.
balancedDF = pd.concat([
    moodDF.sample(n=min(len(moodDF), MAX_SONGS_PER_MOOD), random_state=42)
    for _, moodDF in emotionsDF.groupby('mood_code', sort=False)
])

balancedDF['mood_code'].value_counts()

1    500
6    500
5    500
7    500
2    500
3    500
0    500
4    405
Name: mood_code, dtype: int64

In [52]:
# Preview the first rows of the balanced sample.
balancedDF.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,duration_ms,time_signature,song,mood,genre,mood_code
381,0.848,0.52,5,-10.663,0,0.501,0.00225,0.000799,0.679,0.304,149.996,spotify:track:2XSrt1dcuOXPgl3B4bxmBz,203897,4,Carrollton,angry,rap,1
666,0.713,0.698,10,-7.435,0,0.168,0.18,1e-06,0.304,0.48,124.973,spotify:track:1SSv8SA2OHfOUwLgb8yOum,180062,4,Cheat Cxdes,angry,rap,1
257,0.757,0.423,1,-2.311,1,0.0527,4e-06,0.897,0.118,0.125,130.058,spotify:track:0A8Mrg7ButLr17K3A0R61D,133308,4,TOTALITARIANISM,angry,EDM,1
338,0.516,0.515,1,-13.005,1,0.279,0.0336,2e-06,0.119,0.396,95.971,spotify:track:583TaS41X2JJGKoGXnTY3l,107159,4,KILLTHEPHARAOH,angry,rap,1
319,0.618,0.836,6,-4.75,0,0.0813,0.0024,0.0,0.363,0.397,175.06,spotify:track:7CMy59461Q3pgsPZ4Cj8CP,89143,4,EASE,angry,rap,1


In [53]:
# Keep only the model inputs: drop the identifier, free-text and label columns.
NON_FEATURE_COLUMNS = ['uri', 'song', 'mood', 'genre', 'mood_code']
rawfeatures = balancedDF.drop(columns=NON_FEATURE_COLUMNS)

In [54]:
# Clip and standardize the features. After this cell `rawfeatures` is rebound
# from the DataFrame to the scaled array returned second, so re-running this
# cell on its own passes that array back into clipAndNormalize.
# NOTE(review): the clipping quantiles and the scaler are fitted on all rows
# before the train/test split further down, so test rows influence the
# preprocessing. Confirm whether that is acceptable for the reported scores.
rawfeaturesDF, rawfeatures = clipAndNormalize(rawfeatures)

In [55]:
# Preview the standardized features (the frame has a fresh 0..n-1 index).
rawfeaturesDF.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,1.486461,-0.406225,-0.063089,-0.627609,-1.299055,3.624754,-0.881714,-0.490927,3.691332,-0.539255,0.943321,0.265175,0.176728
1,0.550314,0.321131,1.327226,0.058458,-1.299055,0.551285,-0.331506,-0.493447,0.904795,0.207345,0.070628,-0.194902,0.176728
2,0.855429,-0.802593,-1.175342,1.147494,0.769791,-0.512892,-0.88823,2.341344,-0.477327,-1.298581,0.247971,-1.097375,0.176728
3,-0.815767,-0.426657,-1.175342,-1.125369,0.769791,1.575774,-0.784673,-0.493447,-0.469896,-0.148987,-0.940835,-1.602118,0.176728
4,-0.108456,0.885036,0.214974,0.629118,-1.299055,-0.248925,-0.88125,-0.493452,1.34321,-0.144745,1.817443,-1.949873,0.176728


In [56]:
# Target vector: the mood code of every row in the balanced sample.
y = balancedDF['mood_code']

In [57]:
# Store the target with pandas' 'category' dtype.
y = y.astype('category')

In [58]:
# 80/20 split of the scaled feature array and the mood codes. stratify=y keeps
# each mood's share the same in both parts; random_state fixes the draw.
(X_train_standard,
 X_test_standard,
 y_train_standard,
 y_test_standard) = train_test_split(
    rawfeatures,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [77]:

# Multi-layer perceptron with two hidden layers (256 units, then 128).
# early_stopping=True lets scikit-learn stop training based on a validation
# part it holds out from the training data.
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    random_state=42,
    early_stopping=True,
)

In [78]:
# Fit the MLP on the training split and predict the held-out test split.
# fit() returns the estimator itself, so the two steps are chained.
y_pred = mlp.fit(X_train_standard, y_train_standard).predict(X_test_standard)

# Per-class precision / recall / F1, plus accuracy and macro / weighted averages.
print(classification_report(y_test_standard, y_pred))


              precision    recall  f1-score   support

           0       0.34      0.50      0.41       100
           1       0.59      0.57      0.58       100
           2       0.51      0.56      0.53       100
           3       0.42      0.43      0.42       100
           4       0.46      0.41      0.43        81
           5       0.34      0.22      0.27       100
           6       0.52      0.71      0.60       100
           7       0.36      0.19      0.25       100

    accuracy                           0.45       781
   macro avg       0.44      0.45      0.44       781
weighted avg       0.44      0.45      0.44       781



In [61]:
def offByOne(y_test_standard, y_pred, n_classes=8):
    """Print a classification report that forgives predictions one class away.

    A prediction counts as correct when it equals the true code or is its
    direct neighbour. The codes are treated as a ring: the first code (0) and
    the last code (``n_classes - 1``) are neighbours of each other. Forgiven
    predictions are replaced by the true code before the report is built;
    the inputs themselves are not modified.

    Parameters
    ----------
    y_test_standard : iterable of int
        True class codes.
    y_pred : iterable of int
        Predicted class codes, aligned with ``y_test_standard``.
    n_classes : int, optional
        Number of class codes on the ring. Defaults to 8, matching the eight
        mood codes used in this notebook.

    Returns
    -------
    None
        The report is printed, not returned.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    y_test_standard_list = list(y_test_standard)
    y_pred_list = list(y_pred)
    if len(y_test_standard_list) != len(y_pred_list):
        raise ValueError(
            f"y_test_standard and y_pred differ in length: "
            f"{len(y_test_standard_list)} != {len(y_pred_list)}"
        )

    last_code = n_classes - 1
    adjusted = []
    for true_code, pred_code in zip(y_test_standard_list, y_pred_list):
        # The two ends of the scale wrap around to each other.
        if true_code == 0:
            neighbours = (1, last_code)
        elif true_code == last_code:
            neighbours = (0, last_code - 1)
        else:
            neighbours = (true_code - 1, true_code + 1)
        adjusted.append(true_code if pred_code in neighbours else pred_code)

    print(classification_report(y_test_standard_list, adjusted))
    return

In [62]:
# Re-score the MLP predictions, counting a neighbouring mood code as correct.
offByOne(y_test_standard, y_pred)

              precision    recall  f1-score   support

           0       0.47      0.60      0.53       100
           1       0.76      0.79      0.77       100
           2       0.71      0.80      0.75       100
           3       0.59      0.58      0.58       100
           4       0.66      0.56      0.60        81
           5       0.60      0.49      0.54       100
           6       0.80      0.82      0.81       100
           7       0.75      0.64      0.69       100

    accuracy                           0.66       781
   macro avg       0.67      0.66      0.66       781
weighted avg       0.67      0.66      0.66       781



In [63]:
# Support-vector classifier with a cubic polynomial kernel;
# class_weight='balanced' reweights classes by their frequency.
# Alternative kept for reference:
#   SVC(kernel='linear', class_weight='balanced', random_state=42)
svm = SVC(
    kernel='poly',
    degree=3,
    class_weight='balanced',
    random_state=42,
)


In [64]:
# Fit the SVM on the training split and predict the held-out test split.
# fit() returns the estimator itself, so the two steps are chained.
y_pred_SVM = svm.fit(X_train_standard, y_train_standard).predict(X_test_standard)

# Per-class precision / recall / F1, plus accuracy and macro / weighted averages.
print(classification_report(y_test_standard, y_pred_SVM))

              precision    recall  f1-score   support

           0       0.33      0.48      0.39       100
           1       0.65      0.49      0.56       100
           2       0.50      0.54      0.52       100
           3       0.39      0.41      0.40       100
           4       0.30      0.57      0.39        81
           5       0.41      0.25      0.31       100
           6       0.58      0.53      0.55       100
           7       0.43      0.19      0.26       100

    accuracy                           0.43       781
   macro avg       0.45      0.43      0.42       781
weighted avg       0.45      0.43      0.43       781



In [65]:
# Re-score the SVM predictions, counting a neighbouring mood code as correct.
offByOne(y_test_standard, y_pred_SVM)

              precision    recall  f1-score   support

           0       0.45      0.53      0.49       100
           1       0.82      0.70      0.76       100
           2       0.69      0.74      0.71       100
           3       0.60      0.69      0.64       100
           4       0.46      0.68      0.55        81
           5       0.73      0.56      0.63       100
           6       0.88      0.71      0.78       100
           7       0.80      0.63      0.70       100

    accuracy                           0.65       781
   macro avg       0.68      0.65      0.66       781
weighted avg       0.68      0.65      0.66       781

