In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rc('figure', figsize=(16,9))

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt

from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.ensemble import IsolationForest, RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

# Wrangling

In [3]:
from preprocessing import spotify_split, scale_data #,modeling_prep 
from acquire import concat_csv_files
from prepare import prepare_df, set_index

In [4]:
def modeling_prep():
    '''
    This function prepares the data for modeling
    '''
    # all local csv data compiled into a dataframe
    df = concat_csv_files()
    # adds new features, handles nulls, fixes data types, 
    # set the index to track_id, and fixes the tempo feature
    df = prepare_df(df)

    #encode album_type
    album_dummies = pd.get_dummies(df.album_type, drop_first=True).astype('int')
    df = pd.concat([df, album_dummies], axis=1)

    # drop any columns that won't contribute to modeling
    df = df.drop(columns=['album_popularity','label', 'artist', 
                        'album', 'release_date', 'track_name', 'album_id', 'album_type',
                        'release_year', 'release_month', 'release_day', 'duration_minutes', 
                        'duration_seconds', 'decade'])
    return df

In [5]:
df = modeling_prep()

In [6]:
#genre = pd.read_csv('genre_count_df.csv')
#genre = set_index(genre)
#genre = genre[genre.columns[32:]]

In [7]:
#df = df.merge(genre, on='track_id', suffixes=[None, '_'])
#df = df.dropna()

In [8]:
#df.info()

In [9]:
#df.head(2)

In [10]:
#genre.head(2)

In [11]:
df.head(2)

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,explicit,popularity,disc_number,track_number,is_featured_artist,is_top_billboard_label,compilation,single
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6mecZbKK3JDeMdFRNxsCV5,0.792,0.594,2,-8.544,1,0.3,0.0,0.244,0.351,82.512,232803,4,1,43,1,1,0,0,0,1
5PtMwNq8Dp31uYdGGacVJE,0.816,0.578,9,-6.912,1,0.233,0.0,0.114,0.265,148.077,193920,4,1,61,1,11,0,0,0,0


In [12]:
df.shape

(5733, 20)

---
### Split the Data

In [13]:
X_train, y_train, X_validate, y_validate, X_test, y_test, train, validate, test = spotify_split(df, 'popularity')
X_train.head(2)

Shape of train: (4012, 19) | Shape of validate: (861, 19) | Shape of test: (860, 19)
Percent train: 70.0        | Percent validate: 15.0       | Percent test: 15.0


Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,explicit,disc_number,track_number,is_featured_artist,is_top_billboard_label,compilation,single
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
30bqVoKjX479ab90a8Pafp,0.585,0.471,4,-9.934,0,0.0616,0.0184,0.115,0.323,93.099,142000,4,1,1,1,0,0,0,1
0HO8pCseEpgozNi3z0R4bc,0.833,0.518,10,-10.126,0,0.349,0.0,0.635,0.773,90.004,120000,4,1,1,11,0,0,0,0


---
### Scale the Data

In [14]:
#  Using MIN-MAX scaler
X_train_mm, X_validate_mm, X_test_mm = scale_data(train, validate, test, 'popularity', 'MinMax')
X_train_mm.head(3)

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,explicit,disc_number,track_number,is_featured_artist,is_top_billboard_label,compilation,single
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
30bqVoKjX479ab90a8Pafp,0.593306,0.439493,0.363636,0.580823,0.0,0.064033,0.019127,0.101302,0.328921,0.547673,0.234238,0.8,1.0,0.0,0.0,0.0,0.0,0.0,1.0
0HO8pCseEpgozNi3z0R4bc,0.844828,0.48996,0.909091,0.572667,0.0,0.362786,0.0,0.651741,0.787169,0.529466,0.194025,0.8,1.0,0.0,0.163934,0.0,0.0,0.0,0.0
643K3eEgRvdJiXjSzlz7dg,0.477688,0.654247,0.090909,0.745826,1.0,0.35447,0.0,0.3056,0.86558,0.520216,0.435841,0.8,1.0,0.0,0.016393,0.0,0.0,0.0,0.0


--- 
# Feature Selection

In [15]:
from preprocessing import rfe, select_kbest

In [16]:
skb_features = select_kbest(X_train_mm, y_train, 5)
skb_features

['speechiness', 'explicit', 'track_number', 'is_featured_artist', 'single']

In [17]:
rfe_features = rfe(X_train, y_train, 5)
rfe_features

['danceability', 'energy', 'speechiness', 'explicit', 'is_top_billboard_label']

In [18]:
# Select K Best Top 5 Features DF
X_tr_skb = X_train_mm[skb_features]
X_v_skb = X_validate_mm[skb_features]
X_te_skb = X_test_mm[skb_features]

In [19]:
# Recursive Feature Elimination Top 5 Features DF
X_tr_rfe = X_train_mm[rfe_features]
X_v_rfe = X_validate_mm[rfe_features]
X_te_rfe = X_test_mm[rfe_features]

---
# Cross Validation
- Optimize Hyperparameters

### CV Using SKB Features

In [20]:
from crossval import crossval_GSCV#, PolynomialRegression
from sklearn.svm import SVR

In [21]:
# OLS
params = {'fit_intercept': [True, False],
         'normalize': [True, False]}
lm = LinearRegression()
crossval_GSCV(params, lm, X_tr_skb, y_train).head(3)

Unnamed: 0,fit_intercept,normalize,RMSE
0,True,True,-21.942778
1,True,False,-21.942778
2,False,True,-24.605475


In [22]:
# LASSO + LARS
params = {'fit_intercept': [True, False],
          'alpha': [.0001, .001, .01]
         }
lars = LassoLars()
crossval_GSCV(params, lars, X_tr_skb, y_train).head(3)

Unnamed: 0,alpha,fit_intercept,RMSE
0,0.0001,True,-21.942764
2,0.001,True,-21.942866
4,0.01,True,-21.966555


In [23]:
# Support Vector Regressor
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
          'gamma': ['scale', 'auto']
         }
svr = SVR()
crossval_GSCV(params, svr, X_tr_skb, y_train).head(3)

Unnamed: 0,gamma,kernel,RMSE
0,scale,linear,-22.018246
4,auto,linear,-22.018246
2,scale,rbf,-22.049793


In [24]:
# Genralized Linear Model with Tweedie Regressor
params = {'power': [0, 1], 
          'alpha': [0, 1],
          'link': ['auto', 'log']
         }
glm = TweedieRegressor()
crossval_GSCV(params, glm, X_tr_skb, y_train).head(3)

Unnamed: 0,alpha,link,power,RMSE
0,0,auto,0,-21.942779
1,0,auto,1,-21.972706
3,0,log,1,-21.972706


### CV Using RFE Features

In [25]:
# OLS
params = {'fit_intercept': [True, False],
         'normalize': [True, False]}
lm = LinearRegression()
crossval_GSCV(params, lm, X_tr_rfe, y_train).head(3)

Unnamed: 0,fit_intercept,normalize,RMSE
0,True,True,-22.070954
1,True,False,-22.070954
2,False,True,-22.391851


In [26]:
# LASSO + LARS
params = {'fit_intercept': [True, False],
          'alpha': [.0001, .001, .01]
         }
lars = LassoLars()
crossval_GSCV(params, lars, X_tr_rfe, y_train).head(3)

Unnamed: 0,alpha,fit_intercept,RMSE
0,0.0001,True,-22.070956
2,0.001,True,-22.071243
4,0.01,True,-22.101186


In [27]:
# Support Vector Regressor
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
          'gamma': ['scale', 'auto']
         }
svr = SVR()
crossval_GSCV(params, svr, X_tr_rfe, y_train).head(3)

Unnamed: 0,gamma,kernel,RMSE
2,scale,rbf,-22.065233
0,scale,linear,-22.123579
4,auto,linear,-22.123579


In [28]:
# Genralized Linear Model with Tweedie Regressor
params = {'power': [0, 1], 
          'alpha': [0, 1],
          'link': ['auto', 'log']
         }
glm = TweedieRegressor()
crossval_GSCV(params, glm, X_tr_rfe, y_train).head(3)

Unnamed: 0,alpha,link,power,RMSE
0,0,auto,0,-22.070953
6,1,log,0,-22.099155
2,0,log,0,-22.100629


---
# Modeling
### Set the baseline

In [29]:
from model import get_baseline_metrics, linear_regression_model, lasso_lars
from model import polynomial_regression, svr_model, glm_model, evaluate_df
from model import visualize_model, visualize_error

In [30]:
bl, bl_train_rmse = get_baseline_metrics(y_train)

RMSE (Root Mean Square Error) of Baseline on train data:
 22.897138


---
### Models using ALL Features

In [31]:
# OLS Model(defaults)
lm_rmse, lm_rmse_v, lm_rmse_t = linear_regression_model(
    X_train_mm, y_train, X_validate_mm, y_validate, X_test_mm, y_test)
# LASSO + LARS Model(alpha=.00001)
lars_rmse, lars_rmse_v, lars_rmse_t = lasso_lars(
    X_train_mm, y_train, X_validate_mm, y_validate, X_test_mm, y_test, 
    alpha=.00001, fit_intercept=True)
# Polynomial Features (squared, deg=2) with Linear Regression
lm_sq_rmse, lm_sq_rmse_v, lm_sq_rmse_t = polynomial_regression(
    X_train_mm, y_train, X_validate_mm, y_validate, X_test_mm, y_test, 
    'Squared', degree=2) 
# Support Vector Regression with RBF Kernel Scaled
svr_rmse, svr_rmse_v, svr_rmse_t = svr_model(
    X_train_mm, y_train, X_validate_mm, y_validate, X_test_mm, y_test, 
    'RBF', kernel='rbf', gamma='scale')
# General Linearized Model with Normal Distribution
glm_rmse, glm_rmse_v, glm_rmse_t, glm_pred_t = glm_model(
    X_train_mm, y_train, X_validate_mm, y_validate, X_test_mm, y_test, 
    'Normal', alpha=0, link='auto', power=0)

RMSE for OLS using Linear Regression 

On train data:
 21.24588 

RMSE for LASSO + LARS 

On train data:
 21.24588 

RMSE for Polynomial Squared + Linear Regression 

On train data:
 20.39603 

RMSE for SVR using RBF Kernel 

On train data:
 21.388908 

RMSE for GLM using Normal Distribution 

On train data:
 21.24588 



In [32]:
columns = ['train_rmse', 'validate_rmse', 'test_rmse']
index = ['baseline', 'ols', 'lassolars', 'pf2_lr', 'SVM', 'GLM']
data = [[bl_train_rmse, '', ''],
        [lm_rmse, lm_rmse_v, ''],
        [lars_rmse, lars_rmse_v, ''],
        [lm_sq_rmse, lm_sq_rmse_v, ''], 
        [svr_rmse, svr_rmse_v, ''],
        [glm_rmse, glm_rmse_v, '']]
print('ALL FEATURES')
print(f'Model beat baseline by {abs((glm_rmse_t - bl_train_rmse)/bl_train_rmse)*100:.2f}%')
pd.DataFrame(columns=columns, data=data, index=index).sort_values(by='train_rmse')

ALL FEATURES
Model beat baseline by 7.33%


Unnamed: 0,train_rmse,validate_rmse,test_rmse
pf2_lr,20.39603,21.4233,
ols,21.24588,21.2105,
GLM,21.24588,21.2105,
lassolars,21.24588,21.2103,
SVM,21.388908,21.4733,
baseline,22.897138,,


In [33]:
# create the model object
glm = TweedieRegressor(alpha = 0, link= 'log', power= 0)

# fit the model to our training data
glm.fit(X_train_mm, y_train)
feature_importances = pd.DataFrame(glm.coef_,
                                       index = X_train_mm.columns,
                                        columns=['importance']).sort_values('importance',ascending=False)
feature_importances.head(5), feature_importances.tail(5)

(                        importance
 loudness                  1.061048
 is_top_billboard_label    0.224020
 danceability              0.210244
 explicit                  0.192505
 is_featured_artist        0.174633,
                 importance
 time_signature   -0.112405
 duration_ms      -0.303110
 speechiness      -0.360544
 track_number     -0.632601
 energy           -0.737007)

---
### Models using SKB Features

In [35]:
# OLS Model(defaults)
lm_rmse, lm_rmse_v, lm_rmse_t = linear_regression_model(
    X_tr_skb, y_train, X_v_skb, y_validate, X_te_skb, y_test)
# LASSO + LARS Model(alpha=.00001)
lars_rmse, lars_rmse_v, lars_rmse_t = lasso_lars(
    X_tr_skb, y_train, X_v_skb, y_validate, X_te_skb, y_test, 
    alpha=.00001, fit_intercept=True)
# Polynomial Features (squared, deg=2) with Linear Regression
lm_sq_rmse, lm_sq_rmse_v, lm_sq_rmse_t = polynomial_regression(
    X_tr_skb, y_train, X_v_skb, y_validate, X_te_skb, y_test, 
    'Squared', degree=2) 
# Support Vector Regression with RBF Kernel Scaled
svr_rmse, svr_rmse_v, svr_rmse_t = svr_model(
    X_tr_skb, y_train, X_v_skb, y_validate, X_te_skb, y_test, 
    'RBF', kernel='rbf', gamma='scale')
# General Linearized Model with Normal Distribution
glm_rmse, glm_rmse_v, glm_rmse_t, glm_pred_t = glm_model(
    X_tr_skb, y_train, X_v_skb, y_validate, X_te_skb, y_test, 
    'Normal', alpha=0, link='auto', power=0)

RMSE for OLS using Linear Regression 

On train data:
 21.905982 

RMSE for LASSO + LARS 

On train data:
 21.905982 

RMSE for Polynomial Squared + Linear Regression 

On train data:
 21.82078 

RMSE for SVR using RBF Kernel 

On train data:
 21.960632 

RMSE for GLM using Normal Distribution 

On train data:
 21.905982 



In [36]:
columns = ['train_rmse', 'validate_rmse', 'test_rmse']
index = ['baseline', 'ols', 'lassolars', 'pf2_lr', 'SVM', 'GLM']
data = [[bl_train_rmse, '', ''],
        [lm_rmse, lm_rmse_v, ''],
        [lars_rmse, lars_rmse_v, ''],
        [lm_sq_rmse, lm_sq_rmse_v, ''], 
        [svr_rmse, svr_rmse_v, ''],
        [glm_rmse, glm_rmse_v, '']]
print('SKB FEATURES')
print(f'Model beat baseline by {abs((lm_sq_rmse_t - bl_train_rmse)/bl_train_rmse)*100:.2f}%')
print(skb_features)
pd.DataFrame(columns=columns, data=data, index=index).sort_values(by='train_rmse')

SKB FEATURES
Model beat baseline by 5.16%
['speechiness', 'explicit', 'track_number', 'is_featured_artist', 'single']


Unnamed: 0,train_rmse,validate_rmse,test_rmse
pf2_lr,21.82078,21.8672,
ols,21.905982,21.9519,
GLM,21.905982,21.9519,
lassolars,21.905982,21.9519,
SVM,21.960632,21.9514,
baseline,22.897138,,


---
### Models using RFE Features

In [37]:
# OLS Model(defaults)
lm_rmse, lm_rmse_v, lm_rmse_t = linear_regression_model(
    X_tr_rfe, y_train, X_v_rfe, y_validate, X_te_rfe, y_test)
# LASSO + LARS Model(alpha=.00001)
lars_rmse, lars_rmse_v, lars_rmse_t = lasso_lars(
    X_tr_rfe, y_train, X_v_rfe, y_validate, X_te_rfe, y_test, 
    alpha=.00001, fit_intercept=True)
# Polynomial Features (squared, deg=2) with Linear Regression
lm_sq_rmse, lm_sq_rmse_v, lm_sq_rmse_t = polynomial_regression(
    X_tr_rfe, y_train, X_v_rfe, y_validate, X_te_rfe, y_test, 
    'Squared', degree=2) 
# Support Vector Regression with RBF Kernel Scaled
svr_rmse, svr_rmse_v, svr_rmse_t = svr_model(
    X_tr_rfe, y_train, X_v_rfe, y_validate, X_te_rfe, y_test, 
    'RBF', kernel='rbf', gamma='scale')
# General Linearized Model with Normal Distribution
glm_rmse, glm_rmse_v, glm_rmse_t, glm_pred_t = glm_model(
    X_tr_rfe, y_train, X_v_rfe, y_validate, X_te_rfe, y_test, 
    'Normal', alpha=0, link='auto', power=0)

RMSE for OLS using Linear Regression 

On train data:
 22.02482 

RMSE for LASSO + LARS 

On train data:
 22.02482 

RMSE for Polynomial Squared + Linear Regression 

On train data:
 21.843232 

RMSE for SVR using RBF Kernel 

On train data:
 21.979183 

RMSE for GLM using Normal Distribution 

On train data:
 22.02482 



In [38]:
columns = ['train_rmse', 'validate_rmse', 'test_rmse']
index = ['baseline', 'ols', 'lassolars', 'pf2_lr', 'SVM', 'GLM']
data = [[bl_train_rmse, '', ''],
        [lm_rmse, lm_rmse_v, ''],
        [lars_rmse, lars_rmse_v, ''],
        [lm_sq_rmse, lm_sq_rmse_v, ''], 
        [svr_rmse, svr_rmse_v, ''],
        [glm_rmse, glm_rmse_v, '']]
print('RFE FEATURES')
print(f'Model beat baseline by {abs((lm_rmse_t - bl_train_rmse)/bl_train_rmse)*100:.2f}%')
print(rfe_features)
pd.DataFrame(columns=columns, data=data, index=index).sort_values(by='train_rmse')

RFE FEATURES
Model beat baseline by 3.22%
['danceability', 'energy', 'speechiness', 'explicit', 'is_top_billboard_label']


Unnamed: 0,train_rmse,validate_rmse,test_rmse
pf2_lr,21.843232,21.8576,
SVM,21.979183,21.9367,
ols,22.02482,22.0338,
GLM,22.02482,22.0338,
lassolars,22.02482,22.0338,
baseline,22.897138,,


In [39]:
#visualize_model(glm_pred_t, y_test, baseline, 'GLM using Normal Distribution')

In [40]:
#visualize_error(glm_pred_t, y_test, baseline, 'GLM using Normal Distribution')

In [41]:
def get_important_feats(model, X):
    feature_importances = pd.DataFrame(rf.featureimportances,
                                       index = X_train_scaled.columns,
                                        columns=['importance']).sort_values('importance',ascending=False)
return feature_importances

SyntaxError: 'return' outside function (<ipython-input-41-ca7696b7f0e7>, line 5)

Next
- put modeling prep into a function
- check the important feats for the ALL FEATURES MODEL
- clean up the notebook