In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from matplotlib import pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
# Directory prefix (relative path) prepended to the input file names read below.
DATA_DIR="data/"

In [96]:
# Load the pickled song table, then keep only the 13 columns listed below.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at a file from a trusted source.
selected_columns = [
    'tempo',
    'artist_familiarity',
    'mode',
    'loudness',
    'artist_hotttnesss',
    'key_confidence',
    'end_of_fade_in',
    'start_of_fade_out',
    'duration',
    'time_signature',
    'key',
    'song_hotttnesss',
    'genre',
]
pickle_path = DATA_DIR + "lyrics_genre_MSD_1000chunks.pickle"
with open(pickle_path, "rb") as fh:
    data = pickle.load(fh)

data = data[selected_columns]
print(data.shape)

(49658, 13)


In [97]:
# Read the saved feature-vector array and wrap it in a DataFrame that uses
# the default integer row and column labels.
vectors_path = DATA_DIR + "feature_vectors.npy"
arr = np.load(vectors_path)
dataf = pd.DataFrame(arr)
print(dataf.shape)

(49658, 300)


In [99]:
from sklearn.decomposition import PCA

# Reduce the feature vectors to their first 20 principal components.
# fit_transform() fits the model and projects the data in one pass, so a
# separate fit() call beforehand would only repeat the same work.
# random_state pins the output when the randomized SVD solver is selected,
# making the cell reproducible across kernel restarts.
# NOTE(review): the PCA is fitted on every row, i.e. before the train/test
# split performed further down - confirm this is acceptable for the evaluation.
pca = PCA(n_components=20, random_state=42)
dataf_red = pd.DataFrame(pca.fit_transform(dataf))
dataf_red.shape

(49658, 20)

In [100]:
# Put the metadata columns and the PCA components side by side.
# Both indexes are reset first so rows are paired by position rather than
# by whatever labels each frame carried.
metadata_part = data.reset_index(drop=True)
components_part = dataf_red.reset_index(drop=True)
df = pd.concat([metadata_part, components_part], axis=1)
print(df.shape)

(49658, 33)


In [101]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")
df.shape

(34294, 33)

In [102]:
# Separate the regression target from the predictor columns.
target_column = "song_hotttnesss"
y = df[target_column]
X = df.drop(columns=[target_column])

for part in (X, y):
    print(part.shape)

(34294, 32)
(34294,)


In [103]:
# Label encoding: every object-dtype column of X is replaced by integer codes.
# The same encoder instance is refitted for each such column, so after the
# loop `le` holds the classes of the last column it was applied to.
le = LabelEncoder()
object_columns = [name for name in X.columns if X[name].dtype == 'object']
for column in object_columns:
    print("Transforming column: ", column)
    X[column] = le.fit_transform(X[column])
print(X.shape)

Transforming column:  genre
(34294, 32)


In [113]:
# Split first, then standardise with statistics taken from the training rows
# only, so that no information from the held-out rows leaks into the scaling.
# The partition is determined by the number of rows, test_size and
# random_state, so the same rows land in each split as when splitting the
# already-scaled frame.
# test_size=0.6 holds out 60% of the rows for testing.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.6, random_state=42
)

train_mean = x_train_raw.mean(axis=0)
# ddof=0 matches the population standard deviation that np.std computes.
# A zero standard deviation (constant column) is replaced by 1 so the
# division yields 0 for that column instead of NaN.
train_std = x_train_raw.std(axis=0, ddof=0).replace(0, 1.0)

x_train = (x_train_raw - train_mean) / train_std
x_test = (x_test_raw - train_mean) / train_std

# Full standardised frame, kept under its original name for any cell that reads it.
cX = (X - train_mean) / train_std

In [114]:
# Show the dimensions of each piece of the train/test split, one per line.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(13717, 32)
(20577, 32)
(13717,)
(20577,)


<h1>Linear Regression</h1>

In [115]:
# Ordinary least-squares baseline: fit on the training split, then report the
# estimator's score on both splits and the RMSE of the test predictions.
reg = linear_model.LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)


Train Score:  0.37621277863484975
Test Score:  0.3620605190818106
Root Mean squared error: 0.18


<h1>Ridge Regression</h1>

In [68]:
# Ridge regression with a fixed regularisation strength (alpha=0.5):
# fit on the training split, then report scores and test RMSE.
reg = linear_model.Ridge(alpha=0.5).fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)

Train Score:  0.37139109374730944
Test Score:  0.34762160954788346
Root Mean squared error: 0.18


<h1>Ridge Regression (with Cross-Validation)</h1>

In [69]:
# Ridge regression where alpha is chosen by 10-fold cross-validation from
# 13 log-spaced candidates between 1e-6 and 1e6.
alpha_grid = np.logspace(-6, 6, 13)
reg = linear_model.RidgeCV(alphas=alpha_grid, cv=10).fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)

Train Score:  0.37139107149870976
Test Score:  0.3476220704729228
Root Mean squared error: 0.18


<h1>Lasso Regression</h1>

In [70]:
# Lasso fitted with the LARS algorithm, using 10-fold cross-validation
# (LassoLarsCV); report scores on both splits and the test RMSE.
reg = linear_model.LassoLarsCV(cv=10).fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)

Train Score:  0.371166451873956
Test Score:  0.34762259074782087
Root Mean squared error: 0.18


<h1>Elastic Net</h1>

In [71]:
# Elastic net with 5-fold cross-validation (ElasticNetCV, seed 3);
# report scores on both splits and the test RMSE.
reg = linear_model.ElasticNetCV(cv=5, random_state=3).fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)

Train Score:  0.3711481491827278
Test Score:  0.34762382791456303
Root Mean squared error: 0.18


<h1>Orthogonal Matching Pursuit</h1>

In [72]:
# Orthogonal Matching Pursuit with the estimator's default settings;
# report scores on both splits and the test RMSE.
reg = linear_model.OrthogonalMatchingPursuit().fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)

Train Score:  0.3268726213393013
Test Score:  0.3092903361235064
Root Mean squared error: 0.19


<h1>Bayesian Ridge Regression</h1>

In [73]:
# Bayesian ridge regression with the estimator's default settings;
# report scores on both splits and the test RMSE.
reg = linear_model.BayesianRidge().fit(x_train, y_train)
y_pred = reg.predict(x_test)

train_score = reg.score(x_train, y_train)
test_score = reg.score(x_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Root Mean squared error: %.2f" % rmse)

Train Score:  0.37137011212773213
Test Score:  0.3476245608337668
Root Mean squared error: 0.18


<h1>Decision Trees</h1>

In [74]:
# Regression tree limited to depth 10.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run; without it the fitted tree (and therefore the
# scores printed below) can differ between executions of this cell.
reg = tree.DecisionTreeRegressor(max_depth=10, random_state=0)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
print("Train Score: ", reg.score(x_train, y_train))
print("Test Score: ", reg.score(x_test, y_test))
print("Root Mean squared error: %.2f" % mean_squared_error(y_test, y_pred)**0.5)

Train Score:  0.5221942716062586
Test Score:  0.3285538333233089
Root Mean squared error: 0.19


<h1>Gradient Boosting</h1>

In [75]:
# Gradient-boosted regression trees: 100 estimators of depth 10, learning rate 0.1.
# The loss argument is intentionally left at its default (squared error).
# The former spelling loss='ls' was deprecated in scikit-learn 1.0 and removed
# in 1.2, where passing it raises an error; the default selects the same loss
# on both old and new versions.
reg = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=10,
    random_state=0,
)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
print("Train Score: ", reg.score(x_train, y_train))
print("Test Score: ", reg.score(x_test, y_test))
print("Root Mean squared error: %.2f" % mean_squared_error(y_test, y_pred)**0.5)

Train Score:  0.8089634296598317
Test Score:  0.4099645513305516
Root Mean squared error: 0.17
