In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from os import chdir
import numpy as np
import pickle

In [2]:
def clipAndNormalize(features, scaler=None, scaler_path='scaler2.pkl'):
    """Clip audio features to fixed bounds, then apply a pre-fitted scaler.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain every column listed in ``clip_bounds`` below (a missing
        column raises ``KeyError``). An optional ``'id'`` column is excluded
        before scaling. Any other columns are passed to the scaler unclipped.
    scaler : object with a ``transform`` method, optional
        Pre-fitted scaler to apply. When ``None`` (the default) the scaler is
        unpickled from ``scaler_path``.
    scaler_path : str, optional
        Pickle file to load the scaler from when ``scaler`` is not given.
        Defaults to ``'scaler2.pkl'``, resolved against the working directory.

    Returns
    -------
    (pandas.DataFrame, object)
        The scaled values wrapped in a DataFrame carrying the feature column
        names (built without an explicit index, so the input index is not
        carried over), and the raw return value of ``scaler.transform``
        (presumably a NumPy array -- depends on the scaler).

    Notes
    -----
    The input DataFrame is copied first, so the caller's frame is not
    modified.
    """
    # (lower, upper) clip bounds per column; per the original note these are
    # the range of the training data.
    clip_bounds = {
        'danceability': (0.22718080000000002, 0.906),
        'energy': (0.03545904, 0.978),
        'loudness': (-26.4981552, -1.6015904000000007),
        'speechiness': (0.0257, 0.46640959999999926),
        'acousticness': (8.353136000000001e-05, 0.9884095999999992),
        'instrumentalness': (0.0, 0.956),
        'liveness': (0.0494, 0.697),
        'valence': (0.0382, 0.923),
        'tempo': (63.7631808, 188.00344319999996),
        'duration_ms': (88264.8768, 372339.1727999991),
        'time_signature': (3.0, 5.0),
    }

    # Work on a copy so clipping does not silently alter the caller's data.
    features = features.copy()
    for column, (lower, upper) in clip_bounds.items():
        features[column] = features[column].clip(lower=lower, upper=upper)

    if scaler is None:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # scaler files that come from a trusted source.
        with open(scaler_path, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)

    # The track id is an identifier, not a feature: keep it out of the scaler.
    if 'id' in features.columns:
        rawfeatures = features.drop(['id'], axis=1)
    else:
        rawfeatures = features

    # Transform the id-free frame (previously the unfiltered frame was passed,
    # which defeated the drop above).
    preprocessedFeatures = scaler.transform(rawfeatures)
    preprocessedFeaturesDF = pd.DataFrame(preprocessedFeatures, columns=rawfeatures.columns)
    return preprocessedFeaturesDF, preprocessedFeatures

In [9]:
def makeCategorical(df):
    """Add an integer ``mood_code`` column that encodes ``df['mood']``.

    The code of a mood is its position in ``mood_order`` (``'sad'`` -> 0 ...
    ``'depressed'`` -> 7). Values of ``df['mood']`` that are not in
    ``mood_order`` (including missing values) get the code ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'mood'`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``mood_code`` column.
    """
    mood_order = ['sad', 'angry', 'energetic', 'excited', 'happy', 'content', 'calm', 'depressed']

    # An ordered categorical pins each mood to its index in mood_order.
    mood_categorical = pd.Categorical(df['mood'], categories=mood_order, ordered=True)

    df['mood_code'] = mood_categorical.codes
    return df



In [3]:
# NOTE(review): machine-specific absolute path -- this cell only works where
# this exact folder exists. Relative paths used elsewhere in the notebook
# ('Merged Emotions Data2.csv', 'scaler2.pkl') resolve against this directory.
chdir('C:/Users/mlar5/OneDrive/Desktop/Code Folder/Python Projects/IRL projects/Aspire - Affective Computing Project/Playlists Data/Audio Features/emotion joint data')

In [4]:
# Load the merged emotions dataset; the path is relative to the current
# working directory.
emotionsDF = pd.read_csv('Merged Emotions Data2.csv')

In [10]:
# Add the integer mood_code column derived from the 'mood' column.
emotionsDF = makeCategorical(emotionsDF)

In [12]:
# Number of songs per mood_code, to check how imbalanced the classes are.
emotionsDF['mood_code'].value_counts()

7    3781
6    1218
0    1015
5     771
3     721
2     631
1     447
4     405
Name: mood_code, dtype: int64

In [13]:
# Balance the data by capping every mood_code at MAX_SONGS_PER_MOOD songs.
# Classes with more rows are randomly down-sampled to the cap; classes with
# MAX_SONGS_PER_MOOD rows or fewer keep all of their rows (in shuffled order).
# Every draw is seeded so the resulting row order -- and therefore the later
# train/test split -- is reproducible across runs.
MAX_SONGS_PER_MOOD = 500
BALANCE_SEED = 42

# Collect the per-class samples and concatenate once; this avoids growing a
# DataFrame inside the loop and starting from an empty object-dtype frame.
balanced_parts = []
for mood_code in emotionsDF['mood_code'].unique():
    mood_rows = emotionsDF[emotionsDF['mood_code'] == mood_code]
    sample_size = min(len(mood_rows), MAX_SONGS_PER_MOOD)
    balanced_parts.append(mood_rows.sample(n=sample_size, random_state=BALANCE_SEED))

balancedDF = pd.concat(balanced_parts)

balancedDF['mood_code'].value_counts()

6    500
5    500
7    500
2    500
3    500
0    500
1    447
4    405
Name: mood_code, dtype: int64

In [14]:
# Preview the first rows of the balanced frame.
balancedDF.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,duration_ms,time_signature,song,mood,genre,mood_code
124,0.613,0.764,2,-6.509,1,0.136,0.0527,0.0,0.197,0.417,160.015,spotify:track:4CdVzZTnf7JElTDw0kyUiN,208187,4,...Ready For It?,angry,pop,1
302,0.436,0.952,5,-5.089,1,0.122,0.0122,0.723,0.156,0.293,110.073,spotify:track:7aLFXFk0jTtBjwjosjC596,175636,4,GOD OF A DIFFERENT PLANE,angry,metal,1
295,0.663,0.66,2,-6.623,1,0.29,0.0905,0.0,0.348,0.606,200.117,spotify:track:0s1MIn7UaPbAfq85CnKVl6,134452,3,LED,angry,metal,1
165,0.697,0.377,3,-7.755,1,0.0397,0.556,0.0,0.0999,0.336,138.754,spotify:track:0xCA70t1ZA4fa9UOE0lIJm,244573,4,I'm a Mess,angry,pop,1
272,0.792,0.829,2,-6.612,1,0.256,0.00103,0.0755,0.582,0.17,93.044,spotify:track:3pXVmZh293nWtqxildz9jf,92856,4,Miss YOU!,angry,metal,1


In [15]:
# Drop the identifier, text and label columns; what remains is passed on as
# the model's input features.
rawfeatures = balancedDF.drop(['uri', 'song','mood','genre','mood_code'], axis=1)

In [16]:
# clipAndNormalize returns (scaled DataFrame, raw scaler output).
# NOTE(review): `rawfeatures` is rebound here from the unscaled DataFrame to
# the scaler output, so this cell is not idempotent -- re-running it alone
# would feed the scaler output back into clipAndNormalize, which indexes its
# argument by column name.
rawfeaturesDF, rawfeatures = clipAndNormalize(rawfeatures)

In [17]:
# Preview the scaled features.
rawfeaturesDF.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.009197,0.847953,-0.906534,0.458508,0.805606,0.420868,-0.902587,-0.556962,0.231004,0.137022,1.303041,0.243837,0.187342
1,-1.187071,1.584089,-0.064537,0.723935,0.805606,0.276312,-1.019543,1.501983,-0.103001,-0.412162,-0.363622,-0.342497,0.187342
2,0.347125,0.440729,-0.906534,0.437199,0.805606,2.010982,-0.793428,-0.556962,1.461121,0.974084,2.237071,-1.084336,-3.298001
3,0.576917,-0.66739,-0.625868,0.225605,0.805606,-0.57347,0.550841,-0.556962,-0.560018,-0.221719,0.593519,0.899251,0.187342
4,1.218982,1.102468,-0.906534,0.439255,0.805606,1.659918,-1.051799,-0.341955,3.367395,-0.956916,-0.931914,-1.833596,0.187342


In [18]:
# Target labels: the mood_code of every row in the balanced frame.
y = balancedDF['mood_code']

In [19]:
# Convert the labels to pandas' categorical dtype.
y = y.astype('category')

In [20]:
# Hold out 20% of the rows for testing, stratified on the mood label and
# seeded for reproducibility. Despite its name, `rawfeatures` is the scaled
# output of clipAndNormalize at this point.
X_train_standard, X_test_standard, y_train_standard, y_test_standard = train_test_split(rawfeatures, y, test_size=0.2, random_state=42, stratify=y)


In [21]:

# Initialize the MLP classifier: hidden layer sizes (256, 128), fixed seed,
# early stopping enabled.
mlp = MLPClassifier(hidden_layer_sizes=(256,128),random_state=42,early_stopping=True)

In [22]:
# Train the MLP on the training split (the data was balanced earlier by
# capping each mood at 500 songs; no oversampling is applied here).
mlp.fit(X_train_standard, y_train_standard)

# Make predictions on the test set
y_pred = mlp.predict(X_test_standard)

# Print per-class precision / recall / F1 plus overall accuracy and the
# macro and weighted averages.

print(classification_report(y_test_standard, y_pred))


              precision    recall  f1-score   support

           0       0.41      0.43      0.42       100
           1       0.29      0.20      0.24        90
           2       0.41      0.67      0.51       100
           3       0.37      0.43      0.40       100
           4       0.37      0.30      0.33        81
           5       0.40      0.25      0.31       100
           6       0.56      0.72      0.63       100
           7       0.35      0.24      0.29       100

    accuracy                           0.41       771
   macro avg       0.40      0.40      0.39       771
weighted avg       0.40      0.41      0.39       771



In [24]:
def offByOne(y_test_standard, y_pred, n_classes=8):
    """Print a classification report that forgives adjacent-class mistakes.

    Labels are treated as positions on a circle of ``n_classes`` classes, so
    with the default of 8 the neighbours of 0 are 1 and 7, and the neighbours
    of 7 are 6 and 0. Every prediction that is exactly one step away from the
    true label is replaced by the true label before the report is computed.
    Because the true labels are used to rewrite the predictions, the result
    is a relaxed ("within one class") score, not a regular accuracy.

    Parameters
    ----------
    y_test_standard : iterable of int
        True labels, expected to lie in ``range(n_classes)``.
    y_pred : iterable of int
        Predicted labels, same length as ``y_test_standard``.
    n_classes : int, optional
        Number of classes on the circle. Defaults to 8.

    Returns
    -------
    None
        The report is printed, not returned.

    Raises
    ------
    ValueError
        If the two label sequences differ in length.
    """
    y_true_list = list(y_test_standard)
    y_pred_list = list(y_pred)
    if len(y_true_list) != len(y_pred_list):
        raise ValueError(
            'y_test_standard and y_pred must have the same length '
            '(%d != %d)' % (len(y_true_list), len(y_pred_list))
        )

    # A circular distance of one step is a difference of +1 or -1 modulo
    # n_classes, i.e. a remainder of 1 or n_classes - 1.
    neighbour_steps = (1, n_classes - 1)
    relaxed_pred = [
        true_label
        if (int(pred_label) - int(true_label)) % n_classes in neighbour_steps
        else pred_label
        for true_label, pred_label in zip(y_true_list, y_pred_list)
    ]

    print(classification_report(y_true_list, relaxed_pred))
    return

In [25]:
# Re-score the MLP predictions with adjacent-class misses counted as correct.
# This relaxed report is not directly comparable to the strict one above.
offByOne(y_test_standard, y_pred)

              precision    recall  f1-score   support

           0       0.58      0.58      0.58       100
           1       0.69      0.72      0.71        90
           2       0.64      0.86      0.73       100
           3       0.57      0.71      0.63       100
           4       0.67      0.48      0.56        81
           5       0.65      0.48      0.55       100
           6       0.82      0.85      0.83       100
           7       0.67      0.55      0.60       100

    accuracy                           0.66       771
   macro avg       0.66      0.65      0.65       771
weighted avg       0.66      0.66      0.65       771



In [26]:
# Degree-3 polynomial-kernel SVM with balanced class weights and a fixed seed.
# Alternative kept for reference: kernel='linear' with the same class_weight
# and random_state.
svm =SVC(kernel='poly', degree=3,class_weight='balanced', random_state=42)


In [27]:
# Train the SVM on the same training split used for the MLP
svm.fit(X_train_standard, y_train_standard)

# Make predictions on the test set
y_pred_SVM = svm.predict(X_test_standard)

# Print per-class precision / recall / F1 plus overall accuracy and averages
print(classification_report(y_test_standard, y_pred_SVM))

              precision    recall  f1-score   support

           0       0.35      0.50      0.41       100
           1       0.33      0.17      0.22        90
           2       0.47      0.60      0.53       100
           3       0.38      0.37      0.38       100
           4       0.33      0.40      0.36        81
           5       0.26      0.32      0.29       100
           6       0.61      0.54      0.57       100
           7       0.35      0.17      0.23       100

    accuracy                           0.39       771
   macro avg       0.38      0.38      0.37       771
weighted avg       0.39      0.39      0.37       771



In [28]:
# Re-score the SVM predictions with adjacent-class misses counted as correct.
offByOne(y_test_standard, y_pred_SVM)

              precision    recall  f1-score   support

           0       0.48      0.56      0.52       100
           1       0.71      0.61      0.66        90
           2       0.70      0.74      0.72       100
           3       0.60      0.64      0.62       100
           4       0.60      0.65      0.63        81
           5       0.48      0.56      0.52       100
           6       0.88      0.82      0.85       100
           7       0.74      0.50      0.60       100

    accuracy                           0.64       771
   macro avg       0.65      0.64      0.64       771
weighted avg       0.65      0.64      0.64       771

