In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import multilabel_confusion_matrix, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.combine import SMOTEENN


In [2]:
#Import our data
songs_df = pd.read_csv('spotify_songs.csv')

songs_df['playlist_genre'].unique()  

array(['rock', 'r&b', 'pop', 'edm', 'latin', 'rap'], dtype=object)

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['playlist_genre'] == 'latin']

genre_df['track_popularity'].count()



2178

In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['key','playlist_genre','playlist_name','track_artist','lyrics','track_album_name','track_name','track_album_id','track_id','track_album_release_date','playlist_id','language','playlist_subgenre','liveness','acousticness','speechiness'])

genre_df['track_popularity'].hist

<bound method hist_series of 10        1
12       69
14       74
19       62
24       32
         ..
18425    54
18427     3
18441    61
18444     0
18445    57
Name: track_popularity, Length: 2178, dtype: int64>

In [5]:
#bin target variable (popularity) into 5 classes: 'Very Unpopular', 'Unpopular', 'Somewhat popular', 'Popular', 'Very Popular'
#bin target variable relative to genre of interest
#drop old popularity 
bins_1 = [0,35,60,100]
labels_1 = ['U','A','P']

bins_2 = [0,20,40,60,100]
labels_2 = ['vu','u','swp','p']

#if genre_df['track_popularity'][genre_df['track_popularity'] > 80].count() <= 100:
        #genre_df['popularity_binned'] = pd.cut(genre_df['track_popularity'], bins = bins_2, labels = labels_2)
        
#else:
genre_df['popularity_binned'] = pd.cut(genre_df['track_popularity'], bins = bins_1, labels = labels_1)
    
genre_df = genre_df.drop(columns = ['track_popularity'])

genre_df['popularity_binned'].unique()

['U', 'P', 'A', NaN]
Categories (3, object): ['U' < 'A' < 'P']

In [6]:
#encode categorical variables
#genre_df = pd.get_dummies(genre_df, columns = ['mode'])
genre_df['popularity_binned'].value_counts()

genre_df = genre_df.dropna(axis=0)

genre_df.shape

(2018, 9)

In [7]:
#Split our data into target and feature variables
X = genre_df.drop(columns=['popularity_binned']).values
y= genre_df['popularity_binned'].dropna().values


In [8]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, train_size = 0.50, test_size = 0.50, random_state=1)


In [9]:
#Fix class imbalance by using SMOTEENN resampling
#smote_enn = SMOTEENN(random_state = 0)
#X_resampled, y_resampled = smote_enn.fit_resample(X,y)

In [10]:
#Encode our categorical variables
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

encoder.fit(y_train)
y_train = encoder.transform(y_train)

print(np.unique(y_train, return_counts = True))

(array([0, 1, 2]), array([450, 351, 208], dtype=int64))


In [11]:
#Scale and Transform our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [12]:
#Set up parameters
n_estimators = [5,25,50,100]
max_features = ['auto','sqrt']
max_depth = [2,3,4,5]
criterion = ['gain','entropy']
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]

In [13]:
#create Random Forest Classifier model and optimize parameters
rfc = RandomForestClassifier()
parameters = {
    "n_estimators": n_estimators,
    "max_depth" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "bootstrap" : bootstrap
    }
cv = GridSearchCV(rfc,parameters,cv=5)
cv.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 3, 4, 5],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [5, 25, 50, 100]})

In [14]:
print(f'The best parameters are {cv.best_params_}')

The best parameters are {'bootstrap': False, 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 50}


In [15]:
#Fit the model with best parameters
rfc = RandomForestClassifier(n_estimators = cv.best_params_['n_estimators'], 
                           max_depth = cv.best_params_['max_depth'], 
                           min_samples_split = cv.best_params_['min_samples_split'],
                           min_samples_leaf =  cv.best_params_['min_samples_leaf'],
                           bootstrap = cv.best_params_['bootstrap'],
                           random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=5, min_samples_leaf=3,
                       n_estimators=50, random_state=0)

In [16]:
#predictions
y_pred = rfc.predict(X_test)
y_pred


array([0, 0, 1, ..., 0, 1, 1])

In [17]:
mcm = multilabel_confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)


In [18]:
cm_df = pd.DataFrame(cm, index = ['Actual_U','Actual_A','Actual_P'], columns = ['Pred_U','Actual_A','Pred_P'])

cm_df

Unnamed: 0,Pred_U,Actual_A,Pred_P
Actual_U,308,131,8
Actual_A,188,131,1
Actual_P,180,53,9


In [19]:
mcm_dict = {}
for i in np.arange(0,3):
    mcm_dict.update({f'Confusion Matrix {i+1}': pd.DataFrame(mcm[i], columns = ['Pred_Neg','Pred_Pos'], index = ['Acc_Neg','Acc_Pos'])})


In [20]:
np.arange(0,2)

array([0, 1])

In [21]:
classes = ['Unpopular','Average','Popular']


for i in np.arange(1,4):
    print(f'Model Scoring for {classes[i-1]} songs')
    TP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[0]                
    TN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[1]
    FP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[1]
    FN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[0]
    print(mcm_dict[f'Confusion Matrix {i}'])
    precision = TP / (FP + TP)
    recall = TP / (FN + TP)
    accuracy = (TP + TN)/ (TP + FN + TN + FP)
    
    print(f'The precision score is {precision}')
    print(f'The recall score is {recall}')
    print(f'The accuracy score is {accuracy}')
    print('-------------------------------------')
          

    
    
   

Model Scoring for Unpopular songs
         Pred_Neg  Pred_Pos
Acc_Neg       194       368
Acc_Pos       139       308
The precision score is 0.5443786982248521
The recall score is 0.6548042704626335
The accuracy score is 0.5024777006937562
-------------------------------------
Model Scoring for Average songs
         Pred_Neg  Pred_Pos
Acc_Neg       505       184
Acc_Pos       189       131
The precision score is 0.5841269841269842
The recall score is 0.26705370101596515
The accuracy score is 0.36967294350842417
-------------------------------------
Model Scoring for Popular songs
         Pred_Neg  Pred_Pos
Acc_Neg       758         9
Acc_Pos       233         9
The precision score is 0.5
The recall score is 0.011734028683181226
The accuracy score is 0.2398414271555996
-------------------------------------


In [22]:
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.44400396432111

In [23]:
X_df = genre_df.drop(columns=['popularity_binned'])
X_df.columns
pd.DataFrame(y_pred).value_counts()

0    676
1    315
2     18
dtype: int64

In [24]:
#Evaluate the model
cr = classification_report(y_test, y_pred, target_names = classes)
print(cr)

              precision    recall  f1-score   support

   Unpopular       0.46      0.69      0.55       447
     Average       0.42      0.41      0.41       320
     Popular       0.50      0.04      0.07       242

    accuracy                           0.44      1009
   macro avg       0.46      0.38      0.34      1009
weighted avg       0.45      0.44      0.39      1009



In [25]:
#Check feature importance
feat_imp = rfc.feature_importances_

# We can sort the features by their importance.
sorted(zip(rfc.feature_importances_, X_df.columns), reverse=True)

[(0.243309393537558, 'loudness'),
 (0.14303531753106857, 'tempo'),
 (0.13180711450644836, 'duration_ms'),
 (0.12130193116723936, 'valence'),
 (0.11281485537788823, 'danceability'),
 (0.1103883195393008, 'energy'),
 (0.10222640524620873, 'instrumentalness'),
 (0.03511666309428789, 'mode')]