# Using K-Fold Cross Validation to find the best model to predict song outcome (Dataset: 6k songs. Data: Uncleaned, Duplicates, No Genres)


## Imports and DB Set Up

In [34]:
import requests
import json
import pymongo
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from pymongo import MongoClient

In [3]:
# Pull every document of the `test_songlist` collection into a DataFrame.
conn = MongoClient("mongodb://localhost:27017/")  # MongoDB instance on localhost:27017
db = conn['gym-music-database']
content_col = db['test_songlist']
cursor = content_col.find({})  # empty filter: no documents are excluded
df =  pd.DataFrame(list(cursor))  # drain the cursor into a list, then build the frame

## Setting features, matrices, KFold and Stratified KFold

In [7]:
# Columns of `df` used as model inputs.
feature_cols = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
]
# Feature matrix: only the selected columns, all rows.
X = df[feature_cols]
# Target vector: the label each model is trained to predict.
y = df['song_type']

In [8]:
# Plain 10-split KFold; the bare `kf` on the last line displays its configuration.
# NOTE(review): `kf` is not referenced by any other cell visible here (the
# comparison loop uses `skf`) — confirm whether it is still needed.
kf = KFold(n_splits=10)
kf

KFold(n_splits=10, random_state=None, shuffle=False)

In [14]:
from sklearn.model_selection import StratifiedKFold
# 10-split stratified splitter; the comparison loop iterates over skf.split(X, y).
skf = StratifiedKFold(n_splits = 10)

### Create a helper function and loop to compute the accuracy of each prediction model using Stratified K-Fold

In [31]:
def get_model_accuracy(model, X_train, X_test, y_train, y_test):
    """Train ``model`` on one split and report how it scores on the held-out part.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator to evaluate; it is fitted in place.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score

In [35]:
# Per-fold scores for each candidate model; entry i belongs to the i-th
# split yielded by skf.split(X, y).
scores_logistic = []
scores_svm = []
scores_rf = []
for train_index, test_index in skf.split(X, y):
    # Positional row selection: the splitter yields integer index arrays.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    fold = (X_train, X_test, y_train, y_test)

    # A fresh estimator is built for every fold so no fitted state carries over.
    scores_logistic.append(get_model_accuracy(LogisticRegression(solver='liblinear'), *fold))
    # SVC is scored here too so that `scores_svm` is actually populated; the
    # settings match the cross_val_score comparison later in the notebook.
    scores_svm.append(get_model_accuracy(SVC(gamma='auto'), *fold))
    scores_rf.append(get_model_accuracy(RandomForestClassifier(n_estimators=40), *fold))

In [36]:
# Display the per-fold random-forest scores collected by the stratified loop.
scores_rf

[0.953416149068323,
 0.8726708074534162,
 0.8400621118012422,
 0.9797822706065319,
 0.9735202492211839,
 0.9314641744548287,
 0.9485981308411215,
 0.926791277258567,
 0.8894080996884736,
 0.9485981308411215]

In [37]:
# Display the per-fold logistic-regression scores collected by the stratified loop.
scores_logistic

[0.5807453416149069,
 0.8385093167701864,
 0.6055900621118012,
 0.609642301710731,
 0.602803738317757,
 0.5607476635514018,
 0.6012461059190031,
 0.5934579439252337,
 0.6074766355140186,
 0.5514018691588785]

## Comparing and param tuning using cross_val_score method

In [39]:
from sklearn.model_selection import cross_val_score

In [49]:
# Mean cross-validated score (cv=10) for a 40-tree random forest.
# Author's note: changing n to 200 only increases the score by 0.002%
# (presumably n refers to n_estimators).
rf_score = cross_val_score(
    RandomForestClassifier(n_estimators=40),
    X,
    y,
    cv=10,
)
rf_score.mean()

0.9259645771374778

In [55]:
# Mean cross-validated score (cv=10) for liblinear logistic regression.
lg_score = cross_val_score(
    LogisticRegression(solver='liblinear'),
    X,
    y,
    cv=10,
)
lg_score.mean()

0.6151620978593917

In [44]:
# Mean cross-validated score (cv=10) for an SVC with gamma='auto'.
# NOTE(review): X is passed without any scaling step in the cells visible here,
# and columns such as duration_ms and tempo are presumably on much larger scales
# than the rest — consider standardising before judging SVC. TODO confirm.
svm_score = cross_val_score(
    SVC(gamma='auto'),
    X,
    y,
    cv=10,
)
svm_score.mean()

0.5352439641763356

## Visualising data using seaborn

In [56]:
import seaborn as sns
%matplotlib inline 