In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
#Import our data
songs_df = pd.read_csv('SpotifyFeatures.csv')

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['genre'] == 'Indie']
genre_df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
92824,Indie,Halsey,Without Me,5p7ujcrUXASCNwRaWNHR1C,97,0.297,0.752,201661,0.488,9e-06,F#,0.0936,-7.05,Major,0.0705,136.041,4/4,0.533
92825,Indie,YUNGBLUD,11 Minutes (with Halsey feat. Travis Barker),4mGdjNMo0RonTlOEb7cYg4,87,0.0116,0.464,239507,0.852,0.0,B,0.108,-3.804,Major,0.067,160.075,4/4,0.233
92826,Indie,Grouplove,Tongue Tied,0GO8y8jQk1PkHzS31d699N,80,0.00847,0.56,218013,0.936,0.0,D#,0.161,-5.835,Major,0.0439,112.96,4/4,0.371
92827,Indie,The Killers,Mr. Brightside,7oK9VyNzrYvRFo7nQEYkWN,80,0.00108,0.33,222587,0.936,0.0,C#,0.0926,-3.66,Major,0.0917,148.112,4/4,0.234
92828,Indie,Lord Huron,The Night We Met,0QZ5yyl6B6utIWkxeBDxQN,78,0.968,0.441,208227,0.379,0.262,D,0.639,-9.545,Major,0.0449,174.118,3/4,0.105


In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['genre','artist_name','track_name','track_id','time_signature'])

genre_df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
92824,97,0.297,0.752,201661,0.488,9e-06,F#,0.0936,-7.05,Major,0.0705,136.041,0.533
92825,87,0.0116,0.464,239507,0.852,0.0,B,0.108,-3.804,Major,0.067,160.075,0.233
92826,80,0.00847,0.56,218013,0.936,0.0,D#,0.161,-5.835,Major,0.0439,112.96,0.371
92827,80,0.00108,0.33,222587,0.936,0.0,C#,0.0926,-3.66,Major,0.0917,148.112,0.234
92828,78,0.968,0.441,208227,0.379,0.262,D,0.639,-9.545,Major,0.0449,174.118,0.105


In [5]:
#Encode our categorical variables
genre_df_encoded = pd.get_dummies(genre_df, columns = ['mode','key'])
genre_df_encoded

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
92824,97,0.29700,0.752,201661,0.488,0.000009,0.0936,-7.050,0.0705,136.041,...,0,0,0,0,0,0,0,1,0,0
92825,87,0.01160,0.464,239507,0.852,0.000000,0.1080,-3.804,0.0670,160.075,...,1,0,0,0,0,0,0,0,0,0
92826,80,0.00847,0.560,218013,0.936,0.000000,0.1610,-5.835,0.0439,112.960,...,0,0,0,0,1,0,0,0,0,0
92827,80,0.00108,0.330,222587,0.936,0.000000,0.0926,-3.660,0.0917,148.112,...,0,0,1,0,0,0,0,0,0,0
92828,78,0.96800,0.441,208227,0.379,0.262000,0.6390,-9.545,0.0449,174.118,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145424,45,0.02980,0.528,232528,0.874,0.000022,0.1800,-3.396,0.0334,123.503,...,1,0,0,0,0,0,0,0,0,0
145425,43,0.89100,0.551,181773,0.305,0.000001,0.2580,-11.360,0.0321,145.811,...,1,0,0,0,0,0,0,0,0,0
145426,52,0.53300,0.605,200240,0.731,0.000951,0.1110,-4.683,0.0349,92.483,...,0,0,0,1,0,0,0,0,0,0
145427,48,0.03140,0.599,231111,0.870,0.000000,0.2610,-4.394,0.1050,80.949,...,0,0,0,0,0,0,0,0,0,1


In [6]:
#Split our data into target and feature variables
X = genre_df_encoded.drop(columns=['popularity']).values
y= genre_df_encoded['popularity'].values


In [7]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, random_state=1)

X_train

array([[2.11000e-02, 4.15000e-01, 1.56987e+05, ..., 0.00000e+00,
        1.00000e+00, 0.00000e+00],
       [5.20000e-01, 4.06000e-01, 2.58531e+05, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.45000e-02, 6.94000e-01, 2.58000e+05, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [9.67000e-01, 4.92000e-01, 1.96760e+05, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.51000e-01, 5.98000e-01, 2.02111e+05, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.47000e-01, 5.25000e-01, 1.92028e+05, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [8]:
#Create and fit RandomForestRegression Model
rr= RandomForestRegressor(n_estimators = 50, random_state = 0)

rr.fit(X_train, y_train)

RandomForestRegressor(n_estimators=50, random_state=0)

In [9]:
#Create predictions
y_pred = rr.predict(X_test)
print(y_pred)

[51.48 55.72 55.94 ... 56.06 53.86 55.36]


In [10]:
#Check R-squared value
r_square = rr.score(X,y)
r_square

0.20959584481196591

In [11]:
#Check Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
mse

54.47428906897676

In [12]:
#Check Root Mean Squared Error
rmse = np.sqrt(mse)
rmse

7.380669960713374

In [13]:
#Mean Absolute error
mae = mean_absolute_error(y_test, y_pred)
mae

5.783255554003073

In [14]:
#Check feature importance
feat_imp = rr.feature_importances_
pd.DataFrame(feat_imp)

Unnamed: 0,0
0,0.088011
1,0.098696
2,0.096703
3,0.087261
4,0.086342
5,0.0859
6,0.094032
7,0.083374
8,0.102216
9,0.088004


In [15]:
rr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}