In [None]:
#Import packages
import os
import warnings
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.special import boxcox1p
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

#sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is the supported home
#of train_test_split. The legacy import is kept only as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

#Render matplotlib figures inline in the notebook output
%matplotlib inline
#NOTE(review): this silences every warning for the rest of the session, including any
#convergence warnings raised while fitting models below - consider narrowing the filter
warnings.filterwarnings('ignore')

In [None]:
#Import data
#Both CSVs are read straight from the project's GitHub repository, so this cell needs network access
DATA_URL = 'https://raw.githubusercontent.com/PixarJunkie/dsc-424-final-project/master/data/'
song_data = pd.read_csv(DATA_URL + 'song_data.csv')
song_info = pd.read_csv(DATA_URL + 'song_info.csv')

#Shape of data
print('song_data shape: ' + str(song_data.shape))
print('song_info shape: ' + str(song_info.shape))

In [None]:
#Columns
#List the column names of each frame to see which fields are available
print(f'song_data columns: {song_data.columns.tolist()}')
print(f'song_info columns: {song_info.columns.tolist()}')

In [None]:
#Convert song_duration
#Milliseconds -> minutes. Guarded so the cell can be re-run: after the first run
#song_duration_ms has already been dropped and the conversion must be skipped.
if 'song_duration_ms' in song_data.columns:
    song_data['song_duration_min'] = (song_data['song_duration_ms'] / 1000) / 60
#errors = 'ignore' skips columns that an earlier run of this cell already removed
song_data = song_data.drop(columns = ['song_name', 'song_duration_ms'], errors = 'ignore')

# Box-Cox Transform

In [None]:
#Boxcox Transform
cols_ = ['acousticness', 'instrumentalness', 'time_signature', 'song_duration_min', 'speechiness']
for col in cols_:
    column_values = song_data[col]
    #scipy's boxcox requires positive input, so columns whose minimum is exactly zero get a tiny shift
    if column_values.min() == 0:
        column_values = column_values + 0.000001
    #boxcox returns the transformed values together with the fitted lambda; only the values are stored
    transformed_values, fitted_lambda = stats.boxcox(column_values)
    song_data[col] = transformed_values

In [None]:
#Summary statistics of song_data, to inspect the columns after the Box-Cox transform above
song_data.describe()

# Train-test split

In [None]:
#Training and test sets
#song_popularity is the target; every other column of song_data is used as a feature
y = song_data['song_popularity']
X = song_data.drop(columns = 'song_popularity')

#Hold out 30% of the rows for testing; the fixed random_state keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 23)
for split_name, split in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{split_name} shape: {split.shape}')

# Lasso Regression

In [None]:
#Model definition
lasso_ = Lasso(max_iter = 1000)

#Candidate values for the L1 penalty strength tried by the grid search
lasso_params = {'alpha': [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10]}

#Gridsearch cross-validation: 5 folds, scored by negated mean squared error
lasso_model = GridSearchCV(
    estimator = lasso_,
    param_grid = lasso_params,
    scoring = 'neg_mean_squared_error',
    cv = 5,
).fit(X_train, y_train)

#Predictions of the fitted search object on both splits
lasso_preds_train, lasso_preds_test = (lasso_model.predict(features) for features in (X_train, X_test))

In [None]:
#Best Params/score
#abs() is applied so the score is reported as a non-negative number
best_params_found = lasso_model.best_params_
best_score_found = abs(lasso_model.best_score_)
print(f'Best Params: {best_params_found}')
print(f'Best Score: {best_score_found!s}')

In [None]:
#Train probability Plot
#Residuals of the training fit, plotted against normal quantiles to check how close they are to normal
lasso_res = y_train - lasso_preds_train
#The residuals live in lasso_res; the previous reference to an undefined `res` raised a NameError
stats.probplot(lasso_res, dist="norm", plot=plt)
plt.show()

In [None]:
#Root mean square error of the Lasso predictions on each split
train_mse = mean_squared_error(y_train, lasso_preds_train)
test_mse = mean_squared_error(y_test, lasso_preds_test)
train_rmse = sqrt(train_mse)
test_rmse = sqrt(test_mse)

#Prediction on test set/plot predicted vs. actuals
plt.scatter(y_test, lasso_preds_test)
plt.xlabel('Actuals')
plt.ylabel('Predicted')

print(f'Train Root Mean Square Error: {train_rmse}')
print(f'Test Root Mean Square Error: {test_rmse}')