# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
# from sklearn.linear_model import LogisticRegression

import seaborn as sn
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
import nltk
from sklearn.metrics import roc_auc_score, f1_score
import seaborn as sn
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, GridSearchCV


  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


# Reading files and making it regressionable

In [3]:
df_train = pd.read_csv('merged_cleaned_sentiment_train.csv')
df_val = pd.read_csv('merged_cleaned_sentiment_validation.csv')

In [4]:
df_train = pd.concat([df_train, pd.read_csv('lyrics_features_train.csv')], axis=1).drop(['pos','neg','neu', 'compound'], axis = 1)
df_val = pd.concat([df_val, pd.read_csv('lyrics_features_val.csv')], axis=1).drop(['pos','neg','neu', 'compound'], axis = 1)

In [5]:
df_train = pd.concat([df_train, pd.get_dummies(df_train.key, drop_first = True, prefix = 'key')], axis=1)
df_val = pd.concat([df_val, pd.get_dummies(df_val.key, drop_first = True, prefix = 'key')], axis=1)

In [6]:
df_train = df_train.drop(['Unnamed: 0', 'artist', 'trackname', 'id', 'time_signature', 'lyrics', 'lyrics_cleaned'], axis=1)
df_val = df_val.drop(['Unnamed: 0', 'artist', 'trackname', 'id', 'time_signature', 'lyrics', 'lyrics_cleaned'], axis=1)

In [7]:
df_train = df_train.dropna()
df_val = df_val.dropna()

In [8]:
# function to get cross validation scores
def get_cv_scores(model, X_train, y_train):
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='r2')
    
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))

# Splitting the data into X and y

In [9]:
#     train set
X_train = df_train.drop(['y_valence', 'y_arousal'], axis=1).values
y_train_valence = df_train.y_valence.values 
y_train_arousal = df_train.y_arousal.values
    
#     validation set
X_val = df_val.drop(['y_valence', 'y_arousal'], axis=1).values
y_val_valence = df_val.y_valence.values 
y_val_arousal = df_val.y_arousal.values 

In [10]:
def do_regression(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):
    # find optimal alpha with grid search
    #alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    
    param_grid = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
    
    # Train model
    lr_val = LinearRegression().fit(X, y_1)
    lr_arou = LinearRegression().fit(X, y_2)
    
    # get cross val scores
    get_cv_scores(lr_val, X, y_1)
    get_cv_scores(lr_arou, X, y_2)
    
    r2_validation_valence = lr_val.score(X_validation, y_1_validation)
    r2_validation_arousal = lr_arou.score(X_validation, y_2_validation)
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')

    # Grid search
    clf_vale = GridSearchCV(lr_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(lr_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    
    return clf_vale.best_params_, clf_arou.best_params_

In [11]:
def do_forest_regression(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):
    
    # Train model
    rf_val = RandomForestRegressor(random_state=0).fit(X, y_1)
    rf_arou = RandomForestRegressor(random_state=0).fit(X, y_2)
    
    # get cross val scores
    get_cv_scores(rf_val, X, y_1)
    get_cv_scores(rf_arou, X, y_2)
    
    r2_validation_valence = rf_val.score(X_validation, y_1_validation)
    r2_validation_arousal = rf_arou.score(X_validation, y_2_validation)
    
    param_grid = { 
    'n_estimators': [100, 500],
    'max_features': ['auto'], #, 'sqrt', 'log2'
    'max_depth' : [5,10]
    #'criterion' :['gini', 'entropy']
    }
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')

    # Grid search
    clf_vale = GridSearchCV(rf_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(rf_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    
    return clf_vale.best_params_, clf_arou.best_params_

In [12]:
def do_svr(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):

    # Train model
    svr_val = SVR().fit(X, y_1)
    svr_arou = SVR().fit(X, y_2)
    
    param_grid = {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),
                  'C' : [1,5,10],
                  'degree' : [3,8],
                  'coef0' : [0.01,10,0.5],
                  'gamma' : ('auto','scale')
                 }

    # get cross val scores
    get_cv_scores(svr_val, X, y_1)
    get_cv_scores(svr_arou, X, y_2)
    
    r2_validation_valence = svr_val.score(X_validation, y_1_validation)
    r2_validation_arousal = svr_arou.score(X_validation, y_2_validation)
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')

    # Grid search
    clf_vale = GridSearchCV(svr_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(svr_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    
    return clf_vale.best_params_, clf_arou.best_params_

In [13]:
def do_mlp(X, y_1, y_2, X_validation, y_1_validation, y_2_validation):

    # Normalization
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X_validation = scaler.fit_transform(X_validation)

    # Train model
    mlp_val = MLPRegressor(random_state = 2).fit(X, y_1)
    mlp_arou = MLPRegressor(random_state = 2).fit(X, y_2)
    
    param_grid = {'hidden_layer_sizes':[(5), (10), (15), (5,5), (10,10), (15,15), (5,5,5), (10,10,10), (15,15,15)], 
    'max_iter':[500, 1000, 2000, 2500]}

    # get cross val scores
    get_cv_scores(mlp_val, X, y_1)
    get_cv_scores(mlp_arou, X, y_2)
    
    r2_validation_valence = mlp_val.score(X_validation, y_1_validation)
    r2_validation_arousal = mlp_arou.score(X_validation, y_2_validation)
    
    print()
    print(f'Validation score for Valence: {r2_validation_valence}')
    print(f'Validation score for Arousal: {r2_validation_arousal}')

    # Grid search
    clf_vale = GridSearchCV(mlp_val, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(mlp_arou, 
                            param_grid, 
                            scoring='r2', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Print best results on training data    
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    # add new lines to separate rows
    print()
    print("Best parameter for Valence (CV score=%0.3f):" % clf_vale.best_score_)
    print(clf_vale.best_params_)
    
    print()
    print("Best parameter for Arousal (CV score=%0.3f):" % clf_arou.best_score_)
    print(clf_arou.best_params_)
    
    return clf_vale.best_params_, clf_arou.best_params_

In [16]:
do_forest_regression(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

KeyboardInterrupt: 

In [None]:
do_regression(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

In [None]:
do_svr(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

In [17]:
do_mlp(X_train, y_train_valence, y_train_arousal, X_val, y_val_valence, y_val_arousal)

CV Mean:  -2.9055657941817072
STD:  0.6025851016565362
CV Mean:  -3.3818495924592256
STD:  0.4305334619433558

Validation score for Valence: -11.523837912835157
Validation score for Arousal: -13.025722013280896
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best parameter for Valence (CV score=-0.213):
{'hidden_layer_sizes': 5, 'max_iter': 500}

Best parameter for Arousal (CV score=-0.043):
{'hidden_layer_sizes': (5, 5), 'max_iter': 500}


({'hidden_layer_sizes': 5, 'max_iter': 500},
 {'hidden_layer_sizes': (5, 5), 'max_iter': 500})