# Imports

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
# from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sn
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
import nltk
from sklearn.metrics import roc_auc_score, f1_score
import seaborn as sn
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error #add rmse
from data import merged


# Reading files and making it regressionable

In [6]:
df_train = pd.read_csv('data/merged/merged_cleaned_sentiment_train.csv').drop(['pos','neg','neu', 'compound'], axis = 1)
df_val = pd.read_csv('data/merged/merged_cleaned_sentiment_validation.csv').drop(['pos','neg','neu', 'compound'], axis = 1)
df_test = pd.read_csv('data/merged/merged_cleaned_sentiment_test.csv').drop(['pos','neg','neu', 'compound'], axis = 1)

In [7]:
df_train = df_train[['danceability', 'energy', 'instrumentalness', 'valence','mode', 'y_valence', 'y_arousal']]
df_val = df_val[['danceability', 'energy', 'instrumentalness', 'valence','mode', 'y_valence', 'y_arousal']]
df_test = df_test[['danceability', 'energy', 'instrumentalness', 'valence','mode','y_valence', 'y_arousal']]


In [9]:
df_train = pd.concat([df_train, pd.read_csv('data/lyrics/lyrics_features_train.csv').iloc[:, :-200]], axis=1)
df_val = pd.concat([df_val, pd.read_csv('data/lyrics/lyrics_features_val.csv').iloc[:, :-200]], axis=1)
df_test = pd.concat([df_test, pd.read_csv('data/lyrics/lyrics_features_test.csv').iloc[:, :-200]], axis=1)

### This was when we used all audio features

df_train = pd.concat([df_train, pd.get_dummies(df_train.key, drop_first = True, prefix = 'key')], axis=1)
df_val = pd.concat([df_val, pd.get_dummies(df_val.key, drop_first = True, prefix = 'key')], axis=1)
df_test = pd.concat([df_test, pd.get_dummies(df_val.key, drop_first = True, prefix = 'key')], axis=1)

df_train = df_train.drop(['Unnamed: 0', 'artist', 'trackname', 'id', 'time_signature', 'lyrics', 'key','lyrics_cleaned' ], axis=1)
df_val = df_val.drop(['Unnamed: 0', 'artist', 'trackname', 'id', 'time_signature', 'lyrics', 'key', 'lyrics_cleaned' ], axis=1)
df_test = df_test.drop(['Unnamed: 0', 'artist', 'trackname', 'id', 'time_signature', 'lyrics', 'key', 'lyrics_cleaned'], axis=1)

In [10]:
df_train = df_train.dropna()
df_val = df_val.dropna()
df_test = df_test.dropna()

In [11]:
df_train.columns

Index(['danceability', 'energy', 'instrumentalness', 'valence', 'mode',
       'y_valence', 'y_arousal', 'Unnamed: 0', 'pos', 'neg',
       ...
       'tfidf_pca_91', 'tfidf_pca_92', 'tfidf_pca_93', 'tfidf_pca_94',
       'tfidf_pca_95', 'tfidf_pca_96', 'tfidf_pca_97', 'tfidf_pca_98',
       'tfidf_pca_99', 'tfidf_pca_100'],
      dtype='object', length=112)

In [12]:
# function to get cross validation scores
def get_cv_scores(model, X_train, y_train):
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='r2')
    
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))

# Splitting the data into X and y

In [13]:
#     train set
X_train = df_train.drop(['y_valence', 'y_arousal'], axis=1).values
y_train_valence = df_train.y_valence.values 
y_train_arousal = df_train.y_arousal.values
    
#     validation set
X_val = df_val.drop(['y_valence', 'y_arousal'], axis=1).values
y_val_valence = df_val.y_valence.values 
y_val_arousal = df_val.y_arousal.values 

#      test set
X_test = df_test.drop(['y_valence', 'y_arousal'], axis=1).values
y_test_valence = df_test.y_valence.values 
y_test_arousal = df_test.y_arousal.values

In [14]:
LinearRegression().fit(y_train_arousal.reshape(-1, 1), y_train_valence.reshape(-1, 1)).score(y_train_arousal.reshape(-1, 1), y_train_valence.reshape(-1, 1))

0.054508397801767416

In [15]:
def do_regression(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):
    

    # parameters
    param_grid = {'fit_intercept':[True,False], 'positive':[True, False]}
    
    # Initialize model for Grid search
    lr_val = LinearRegression()
    lr_arou = LinearRegression()
    
    # Grid search
    clf_vale = GridSearchCV(lr_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(lr_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    print()


    #Initialize models with best parameters
    lr_val_top = LinearRegression(fit_intercept=clf_vale.best_params_['fit_intercept'],  
                                    positive = clf_vale.best_params_['positive'])
    lr_arou_top = LinearRegression(fit_intercept=clf_arou.best_params_['fit_intercept'],  
                                    positive = clf_arou.best_params_['positive'])

    # get cross val scores for models 
    get_cv_scores(lr_val_top, X, y_1)
    get_cv_scores(lr_arou_top, X, y_2)


    #fit optimal models to train data 
    lr_val_fit = lr_val_top.fit(X, y_1)
    lr_arou_fit = lr_arou_top.fit(X, y_2)
    
    # validation scores 
    r2_validation_valence = lr_val_fit.score(X_validation, y_1_validation)
    r2_validation_arousal = lr_arou_fit.score(X_validation, y_2_validation)
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')
    
    return clf_vale.best_params_, clf_arou.best_params_

In [16]:
def do_forest_regression(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):
    
    # Initialize models
    rf_val = RandomForestRegressor(random_state=0)
    rf_arou = RandomForestRegressor(random_state=0)
    
    param_grid = { 
    'n_estimators': [100, 500],
    'max_depth' : [5,10, 15]
    }

    # Grid search
    clf_vale = GridSearchCV(rf_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(rf_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)

    # Print best results on training data
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)


    #Initialize models with best parameters
    rf_val_top = RandomForestRegressor(n_estimators = clf_vale.best_params_['n_estimators'], 
                                        max_depth = clf_vale.best_params_['max_depth'], random_state=0)
    rf_arou_top = RandomForestRegressor(n_estimators = clf_arou.best_params_['n_estimators'], 
                                        max_depth = clf_arou.best_params_['max_depth'], random_state=0)
    

    # get cross val scores
    get_cv_scores(rf_val_top, X, y_1)
    get_cv_scores(rf_arou_top, X, y_2)

    rf_val_fit = rf_val_top.fit(X, y_1)
    rf_arou_fit = rf_arou_top.fit(X, y_2)
    
    r2_validation_valence = rf_val_fit.score(X_validation, y_1_validation)
    r2_validation_arousal = rf_arou_fit.score(X_validation, y_2_validation)

    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')
    
    
    return clf_vale.best_params_, clf_arou.best_params_

In [17]:
def do_svr(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):

    # Normalization
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X_validation = scaler.fit_transform(X_validation)

    # Train model
    svr_val = SVR()
    svr_arou = SVR()
    
    param_grid = {'kernel' : ('linear', 'rbf', 'poly'),
                  'C' : [1,5,10]
                 }

    # Grid search
    clf_vale = GridSearchCV(svr_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(svr_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    print()


    # Train model
    svr_val_top = SVR(kernel = clf_vale.best_params_['kernel'], C = clf_vale.best_params_['C'])
    svr_arou_top = SVR(kernel = clf_arou.best_params_['kernel'], C = clf_arou.best_params_['C'])


    # get cross val scores
    get_cv_scores(svr_val_top, X, y_1)
    get_cv_scores(svr_arou_top, X, y_2)

    #fit
    svr_val_fit = svr_val_top.fit(X,y_1)
    svr_arou_fit = svr_arou_top.fit(X, y_2)
    
    r2_validation_valence = svr_val_fit.score(X_validation, y_1_validation)
    r2_validation_arousal = svr_arou_fit.score(X_validation, y_2_validation)
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')


    return clf_vale.best_params_, clf_arou.best_params_

In [18]:
def do_mlp(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):

    # Normalization
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X_validation = scaler.fit_transform(X_validation)

    # Initialize model
    mlp_val = MLPRegressor(random_state = 2)
    mlp_arou = MLPRegressor(random_state = 2)
    
    param_grid = {'hidden_layer_sizes':[(5), (10), (15), (5,5), (10,10), (15,15), (5,5,5), (10,10,10), (15,15,15)], 
    'max_iter':[500, 1000, 2000, 2500]}

    # Grid search
    clf_vale = GridSearchCV(mlp_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(mlp_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)


    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    print()


    #Train model with best params
    mlp_val_top = MLPRegressor(hidden_layer_sizes=clf_vale.best_params_['hidden_layer_sizes'], max_iter=clf_vale.best_params_['max_iter'], random_state = 2)
    mlp_arou_top = MLPRegressor(hidden_layer_sizes=clf_arou.best_params_['hidden_layer_sizes'], max_iter=clf_arou.best_params_['max_iter'], random_state = 2)

    # get cross val scores 
    get_cv_scores(mlp_val_top, X, y_1)
    get_cv_scores(mlp_arou_top, X, y_2)

    mlp_val_fit = mlp_val_top.fit(X, y_1)
    mlp_arou_fit = mlp_arou_top.fit(X, y_2)
    
    r2_validation_valence = mlp_val_fit.score(X_validation, y_1_validation)
    r2_validation_arousal = mlp_arou_fit.score(X_validation, y_2_validation)
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')

    
    return clf_vale.best_params_, clf_arou.best_params_

### Random Forest Regression - Optimization

In [None]:
val_par_rf, arou_par_rf = do_forest_regression(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits


    Fitting 5 folds for each of 6 candidates, totalling 30 fits
    Fitting 5 folds for each of 6 candidates, totalling 30 fits

    Best parameter for Valence (CV score=0.185):
    {'max_depth': 10, 'n_estimators': 500}

    Best parameter for Arousal (CV score=0.199):
    {'max_depth': 10, 'n_estimators': 500}
    CV Mean:  0.18477634266575188
    STD:  0.010064224970315037
    CV Mean:  0.1987682194468258
    STD:  0.025811738331714372

    Validation score for Valence: 0.17296757628631831
    Validation score for Arousal: 0.24101399592965878

#### Test Set - RF

In [None]:
rf_val = RandomForestRegressor(n_estimators = 100, 
                                        max_depth = 5, random_state=0)
rf_arou = RandomForestRegressor(n_estimators = 100, 
                                        max_depth = 5, random_state=0)


In [None]:
print(rf_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(rf_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))

### Linear Regression - optimization

In [None]:
val_par, arou_par = do_regression(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

    Fitting 5 folds for each of 4 candidates, totalling 20 fits
    Fitting 5 folds for each of 4 candidates, totalling 20 fits

    Best parameter for Valence (CV score=0.193):
    {'fit_intercept': True, 'positive': False}

    Best parameter for Arousal (CV score=0.164):
    {'fit_intercept': True, 'positive': True}

    CV Mean:  0.19263909417367236
    STD:  0.01423911801666168
    CV Mean:  0.16387005569169152
    STD:  0.013789304326383702

    Validation score for Valence: 0.18365452891704448
    Validation score for Arousal: 0.18418929034767606

#### LR Test Set 

In [None]:
lr_val = LinearRegression(fit_intercept=val_par['fit_intercept'],  
                                    positive = val_par['positive'])
lr_arou = LinearRegression(fit_intercept=arou_par['fit_intercept'],  
                                    positive = arou_par['positive'])

In [None]:
print(lr_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(lr_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))


### Support Vector Regression - optimization

In [None]:
val_par_svr, arou_par_svr = do_svr(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

#### Test Score - SVR

In [None]:
svr_val = SVR(kernel = val_par_svr['kernel'], C = val_par_svr['C'])
svr_arou = SVR(kernel = arou_par_svr['kernel'], C = arou_par_svr['C'])

In [None]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [None]:
print(svr_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(svr_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))

### Multilayer Perceptron Regression - optimization

In [None]:
val_par_mlp, arou_par_mlp = do_mlp(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

    Fitting 5 folds for each of 36 candidates, totalling 180 fits
    Fitting 5 folds for each of 36 candidates, totalling 180 fits

    Best parameter for Valence (CV score=0.198):
    {'hidden_layer_sizes': (5, 5, 5), 'max_iter': 500}

    Best parameter for Arousal (CV score=0.177):
    {'hidden_layer_sizes': (5, 5, 5), 'max_iter': 500}

    CV Mean:  0.19816558523065714
    STD:  0.013616324243766003
    CV Mean:  0.17746596775912477
    STD:  0.022757705925352777

    Validation score for Valence: 0.16755552530343532
    Validation score for Arousal: 0.2115436506271312
    

### Test Set - MLP 

In [None]:
mlp_val = MLPRegressor(hidden_layer_sizes=val_par_mlp['hidden_layer_sizes'], max_iter=val_par_mlp['max_iter'], random_state = 2)
mlp_arou = MLPRegressor(hidden_layer_sizes=arou_par_mlp['hidden_layer_sizes'], max_iter=arou_par_mlp['max_iter'], random_state = 2)


In [None]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [None]:
X_train.shape

In [None]:
print(mlp_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(mlp_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))

### Feature Selection

In [None]:
from sklearn.feature_selection import RFE


In [None]:

for i in [5,10,15,20,25]:

    print(i, 'FEATURES')
    estimator = LinearRegression(fit_intercept= True, positive= True)
    selector = RFE(estimator, n_features_to_select=i, step=1)
    selector = selector.fit(X_train, y_train_valence)

    estimator2 = LinearRegression(fit_intercept= True, positive= True)
    selector2 = RFE(estimator2, n_features_to_select=i, step=1)
    selector_arou = selector2.fit(X_train, y_train_arousal)

    lr_valence = LinearRegression(fit_intercept= True, positive= True)
    lr_arousal = LinearRegression(fit_intercept= True, positive= True)
    get_cv_scores(estimator, X_train[:,selector.support_], y_train_valence)
    get_cv_scores(estimator, X_train[:,selector_arou.support_], y_train_arousal)

    print()

In [None]:
# 20 features
estimator = LinearRegression(fit_intercept= True, positive= True)
selector = RFE(estimator, n_features_to_select=20, step=1)
selector = selector.fit(X_train, y_train_valence)

estimator2 = LinearRegression(fit_intercept= True, positive= True)
selector2 = RFE(estimator2, n_features_to_select=20, step=1)
selector_arou = selector2.fit(X_train, y_train_arousal)

In [None]:
mlp_valence = MLPRegressor(hidden_layer_sizes= (10,10), max_iter=500)
mlp_arousal = MLPRegressor(hidden_layer_sizes= 5, max_iter=500)
mlp_val_fit = mlp_valence.fit(X_train[:,selector.support_], y_train_valence)
mlp_arou_fit = mlp_arousal.fit(X_train[:,selector_arou.support_], y_train_arousal)

r2_validation_valence = mlp_val_fit.score(X_val[:,selector.support_], y_val_valence)
r2_validation_arousal = mlp_arou_fit.score(X_val[:,selector_arou.support_], y_val_arousal)

In [None]:

print()
print(f'Validation score for Valence: {r2_validation_valence}')
print(f'Validation score for Arousal: {r2_validation_arousal}')

In [None]:
df_train.columns[selector.support_]

In [None]:
df_train.columns[selector2.support_]

In [None]:
df_train.head()

In [None]:
do_mlp(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)