In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
#Import our data
songs_df = pd.read_csv('SpotifyFeatures.csv')

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['genre'] == 'Indie']
genre_df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
92824,Indie,Halsey,Without Me,5p7ujcrUXASCNwRaWNHR1C,97,0.297,0.752,201661,0.488,9e-06,F#,0.0936,-7.05,Major,0.0705,136.041,4/4,0.533
92825,Indie,YUNGBLUD,11 Minutes (with Halsey feat. Travis Barker),4mGdjNMo0RonTlOEb7cYg4,87,0.0116,0.464,239507,0.852,0.0,B,0.108,-3.804,Major,0.067,160.075,4/4,0.233
92826,Indie,Grouplove,Tongue Tied,0GO8y8jQk1PkHzS31d699N,80,0.00847,0.56,218013,0.936,0.0,D#,0.161,-5.835,Major,0.0439,112.96,4/4,0.371
92827,Indie,The Killers,Mr. Brightside,7oK9VyNzrYvRFo7nQEYkWN,80,0.00108,0.33,222587,0.936,0.0,C#,0.0926,-3.66,Major,0.0917,148.112,4/4,0.234
92828,Indie,Lord Huron,The Night We Met,0QZ5yyl6B6utIWkxeBDxQN,78,0.968,0.441,208227,0.379,0.262,D,0.639,-9.545,Major,0.0449,174.118,3/4,0.105


In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['genre','artist_name','track_name','track_id','time_signature','liveness','acousticness','speechiness'])

genre_df.head()

Unnamed: 0,popularity,danceability,duration_ms,energy,instrumentalness,key,loudness,mode,tempo,valence
92824,97,0.752,201661,0.488,9e-06,F#,-7.05,Major,136.041,0.533
92825,87,0.464,239507,0.852,0.0,B,-3.804,Major,160.075,0.233
92826,80,0.56,218013,0.936,0.0,D#,-5.835,Major,112.96,0.371
92827,80,0.33,222587,0.936,0.0,C#,-3.66,Major,148.112,0.234
92828,78,0.441,208227,0.379,0.262,D,-9.545,Major,174.118,0.105


In [5]:
#Encode our categorical variables
genre_df_encoded = pd.get_dummies(genre_df, columns = ['mode','key'])
genre_df_encoded

Unnamed: 0,popularity,danceability,duration_ms,energy,instrumentalness,loudness,tempo,valence,mode_Major,mode_Minor,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
92824,97,0.752,201661,0.488,0.000009,-7.050,136.041,0.533,1,0,...,0,0,0,0,0,0,0,1,0,0
92825,87,0.464,239507,0.852,0.000000,-3.804,160.075,0.233,1,0,...,1,0,0,0,0,0,0,0,0,0
92826,80,0.560,218013,0.936,0.000000,-5.835,112.960,0.371,1,0,...,0,0,0,0,1,0,0,0,0,0
92827,80,0.330,222587,0.936,0.000000,-3.660,148.112,0.234,1,0,...,0,0,1,0,0,0,0,0,0,0
92828,78,0.441,208227,0.379,0.262000,-9.545,174.118,0.105,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145424,45,0.528,232528,0.874,0.000022,-3.396,123.503,0.886,0,1,...,1,0,0,0,0,0,0,0,0,0
145425,43,0.551,181773,0.305,0.000001,-11.360,145.811,0.420,1,0,...,1,0,0,0,0,0,0,0,0,0
145426,52,0.605,200240,0.731,0.000951,-4.683,92.483,0.412,1,0,...,0,0,0,1,0,0,0,0,0,0
145427,48,0.599,231111,0.870,0.000000,-4.394,80.949,0.611,1,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
#Split our data into target and feature variables
X = genre_df_encoded.drop(columns=['popularity']).values
y= genre_df_encoded['popularity'].values


In [7]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, random_state=1)

X_train

array([[4.15000e-01, 1.56987e+05, 7.13000e-01, ..., 0.00000e+00,
        1.00000e+00, 0.00000e+00],
       [4.06000e-01, 2.58531e+05, 4.49000e-01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [6.94000e-01, 2.58000e+05, 8.28000e-01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [4.92000e-01, 1.96760e+05, 1.65000e-01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.98000e-01, 2.02111e+05, 6.48000e-01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.25000e-01, 1.92028e+05, 6.35000e-01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [8]:
#Create and fit RandomForestRegression Model
rr= RandomForestRegressor(random_state = 0)

In [9]:
#Set up parameters
n_estimators = [5,10,50,100,250,500]
max_features = ['auto','sqrt']
max_depth = [2,3,4]
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]

In [10]:
#Optimize parameters
parameters = {
    "n_estimators": n_estimators,
    "max_depth" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "bootstrap" : bootstrap
    }
cv = GridSearchCV(rr,parameters,cv=5)
cv.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=0),
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 3, 4],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [5, 10, 50, 100, 250, 500]})

In [11]:
print(f'The best parameters are {cv.best_params_}')

The best parameters are {'bootstrap': True, 'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 500}


In [12]:
#Fit the model with best parameters
rr = RandomForestRegressor(n_estimators = cv.best_params_['n_estimators'], 
                           max_depth = cv.best_params_['max_depth'], 
                           min_samples_split = cv.best_params_['min_samples_split'],
                           min_samples_leaf =  cv.best_params_['min_samples_leaf'],
                           bootstrap = cv.best_params_['bootstrap'],
                           random_state = 0)
rr.fit(X_train, y_train)

RandomForestRegressor(max_depth=4, min_samples_leaf=3, n_estimators=500,
                      random_state=0)

In [13]:
#Create predictions
y_pred = rr.predict(X_test)
print(y_pred)

[54.33204468 54.21916441 54.79926564 ... 55.04620869 55.14464108
 55.22299164]


In [14]:
#Check R-squared value
r_square = rr.score(X,y)
r_square

0.04986144104167656

In [15]:
#Check Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
mse

52.680148942159114

In [16]:
#Check Root Mean Squared Error
rmse = np.sqrt(mse)
rmse

7.258109185053578

In [17]:
#Mean Absolute error
mae = mean_absolute_error(y_test, y_pred)
mae

5.702081234998351

In [18]:
#Check feature importance
feat_imp = rr.feature_importances_
pd.DataFrame(feat_imp)

Unnamed: 0,0
0,0.157072
1,0.107536
2,0.138493
3,0.244038
4,0.109658
5,0.115159
6,0.072244
7,0.002661
8,0.003217
9,0.001325


In [19]:
rr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 4,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}