In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [2]:
#Import our data
songs_df = pd.read_csv('SpotifyFeatures.csv')
songs_df.shape

(232725, 18)

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['genre'] == 'Rap']
genre_df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
86951,Rap,Post Malone,Wow.,6MWtB6iiXyIwun0YzU6DFP,99,0.163,0.833,149520,0.539,2e-06,B,0.101,-7.399,Minor,0.178,99.947,4/4,0.385
86952,Rap,J. Cole,MIDDLE CHILD,2JvzF1RMd7lE3KmFlsyZD8,96,0.149,0.837,213594,0.364,0.0,G#,0.271,-11.713,Major,0.276,123.984,4/4,0.463
86953,Rap,Post Malone,Sunflower - Spider-Man: Into the Spider-Verse,3KkXRkHbMCARz0aVfEt68P,97,0.556,0.76,158040,0.479,0.0,D,0.0703,-5.574,Major,0.0466,89.911,4/4,0.913
86954,Rap,Travis Scott,SICKO MODE,2xLMifQCjDGFmkHkpNLD9h,94,0.00513,0.834,312820,0.73,0.0,G#,0.124,-3.714,Major,0.222,155.008,4/4,0.446
86955,Rap,Meek Mill,Going Bad (feat. Drake),2IRZnDFmlqMuOrYOLnZZyc,95,0.259,0.889,180522,0.496,0.0,E,0.252,-6.365,Minor,0.0905,86.003,4/4,0.544


In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['genre','artist_name','track_name','track_id','time_signature','liveness','acousticness','speechiness'])

genre_df.head()

Unnamed: 0,popularity,danceability,duration_ms,energy,instrumentalness,key,loudness,mode,tempo,valence
86951,99,0.833,149520,0.539,2e-06,B,-7.399,Minor,99.947,0.385
86952,96,0.837,213594,0.364,0.0,G#,-11.713,Major,123.984,0.463
86953,97,0.76,158040,0.479,0.0,D,-5.574,Major,89.911,0.913
86954,94,0.834,312820,0.73,0.0,G#,-3.714,Major,155.008,0.446
86955,95,0.889,180522,0.496,0.0,E,-6.365,Minor,86.003,0.544


In [5]:
#bin target variable (popularity) into 5 classes: 'Very Unpopular', 'Unpopular', 'Somewhat popular', 'Popular', 'Very Popular'
#bin target variable relative to genre of interest
#drop old popularity 
bins_1 = [0,50,65,80,100]
labels_1 = ['Unpopular','Somewhat Popular','Popular', 'Very Popular']

bins_2 = [0,50,60,70,100]
labels_2 = ['Unpoopular','Somewhat Popular', 'Popular','Very Popular']

if genre_df['popularity'][genre_df['popularity'] > 80].count() <= 100:
        genre_df['popularity_binned'] = pd.cut(genre_df['popularity'], bins = bins_2, labels = labels_2)
        
else:
    genre_df['popularity_binned'] = pd.cut(genre_df['popularity'], bins = bins_1, labels = labels_1)
    
genre_df = genre_df.drop(columns = ['popularity'])

genre_df['popularity_binned'].value_counts()

Somewhat Popular    6357
Popular             2107
Unpopular            596
Very Popular         172
Name: popularity_binned, dtype: int64

In [6]:
#encode categorical variables
genre_df_encoded = pd.get_dummies(genre_df, columns = ['key','mode'])
genre_df_encoded['popularity_binned'].value_counts()

Somewhat Popular    6357
Popular             2107
Unpopular            596
Very Popular         172
Name: popularity_binned, dtype: int64

In [7]:
#Split our data into target and feature variables
X = genre_df_encoded.drop(columns=['popularity_binned']).values
y= genre_df_encoded['popularity_binned'].values


In [8]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=1)


In [9]:
#Encode our categorical variables
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

encoder.fit(y_train)
y_train = encoder.transform(y_train)

print(np.unique(y_train, return_counts = True))

(array([0, 1, 2, 3]), array([ 622, 1935,  167,   46], dtype=int64))


In [10]:
#Scale and Transform our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [11]:
#Set up parameters
n_estimators = [5,25,50]
max_features = ['auto','sqrt']
max_depth = [2,3,4]
criterion = ['gain','entropy']
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]

In [12]:
#create Random Forest Classifier model and optimize parameters
rfc = RandomForestClassifier()
parameters = {
    "n_estimators": n_estimators,
    "max_depth" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "bootstrap" : bootstrap
    }
cv = GridSearchCV(rfc,parameters,cv=5)
cv.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 3, 4],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [5, 25, 50]})

In [13]:
print(f'The best parameters are {cv.best_params_}')

The best parameters are {'bootstrap': True, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 5}


In [14]:
#Fit the model with best parameters
rfc = RandomForestClassifier(n_estimators = cv.best_params_['n_estimators'], 
                           max_depth = cv.best_params_['max_depth'], 
                           min_samples_split = cv.best_params_['min_samples_split'],
                           min_samples_leaf =  cv.best_params_['min_samples_leaf'],
                           bootstrap = cv.best_params_['bootstrap'],
                           random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, min_samples_split=10, n_estimators=5,
                       random_state=0)

In [15]:
#predictions
y_pred = rfc.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [16]:
cm = multilabel_confusion_matrix(y_test, y_pred)

In [17]:
cm_dict = {}
for i in np.arange(0,4):
    cm_dict.update({f'Confusion Matrix {i+1}': pd.DataFrame(cm[i], columns = ['Pred_Pos','Pred_Neg'], index = ['Acc_Pos','Acc_Neg'])})

In [18]:
cm_dict['Confusion Matrix 1']

Unnamed: 0,Pred_Pos,Pred_Neg
Acc_Pos,4970,7
Acc_Neg,1484,1


In [19]:
for i in np.arange(1,5):
    TP = cm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[0]
    TN = cm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[1]
    FP = cm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[1]
    FN = cm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[0]
    print(cm_dict[f'Confusion Matrix {i}'])
    precision = TP / (FP + TP)
    recall = TP / (FN + TP)
    accuracy = (TP + TN)/ (TP + FN + TN + FP)
    
    print(f'The precision score is {precision}')
    print(f'The recall score is {recall}')
    print(f'The accuracy score is {accuracy}')
    
    print('-------------------------------------')
          

    
    
   

         Pred_Pos  Pred_Neg
Acc_Pos      4970         7
Acc_Neg      1484         1
The precision score is 0.7700650759219089
The recall score is 0.9985935302390999
The accuracy score is 0.7692664809656453
-------------------------------------
         Pred_Pos  Pred_Neg
Acc_Pos         2      2038
Acc_Neg         6      4416
The precision score is 0.25
The recall score is 0.000980392156862745
The accuracy score is 0.6836892602909316
-------------------------------------
         Pred_Pos  Pred_Neg
Acc_Pos      6033         0
Acc_Neg       429         0
The precision score is 0.9336118848653667
The recall score is 1.0
The accuracy score is 0.9336118848653667
-------------------------------------
         Pred_Pos  Pred_Neg
Acc_Pos      6336         0
Acc_Neg       126         0
The precision score is 0.9805013927576601
The recall score is 1.0
The accuracy score is 0.9805013927576601
-------------------------------------


In [20]:
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.6835345094398019

In [21]:
#Check feature importance
feat_imp = rfc.feature_importances_
pd.DataFrame(feat_imp)

Unnamed: 0,0
0,0.058322
1,0.203746
2,0.159306
3,0.027806
4,0.22944
5,0.108566
6,0.090423
7,0.016139
8,0.008339
9,0.0
