# ECS 171 - Group 1
## Random Forest Model Development

In [1]:
# Import some libraries
import numpy as np
import pandas as pd
from numpy import array
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, multilabel_confusion_matrix, classification_report

In [2]:
# Dataset Readin
df = pd.read_csv('SpotifyWithDate.csv',encoding='latin-1')
df

Unnamed: 0,track_id,popularity,danceability,instrumentalness,liveness,loudness,speechiness,valence,year
0,3kxkjirben9RVm9NqYa6rm,81,0.477,0.783000,0.1560,-9.413,0.0364,0.1230,2017
1,1foMv2HQwfQ2vntFf9HFeG,81,0.676,0.000687,0.0463,-5.815,0.0302,0.8520,2010
2,2Y0iGXY6m6immVb2ktbseM,82,0.705,0.000855,0.1000,-6.156,0.0385,0.6200,2018
3,4DpNNXFMMxQEKl7r0ykkWA,81,0.680,0.000000,0.2240,-5.077,0.0475,0.4460,2015
4,2jQiSYrwJehQAcuaaQrXnS,81,0.422,0.000005,0.1080,-6.361,0.0335,0.0911,2015
...,...,...,...,...,...,...,...,...,...
122658,2dag0uQrHLihj1vvaMcIfK,9,0.897,0.002050,0.0784,-9.583,0.0753,0.8410,1998
122659,0iiteHczcpMQs9FcpjaDYB,20,0.180,0.930000,0.1040,-27.710,0.0373,0.0424,2014
122660,2KpiSebbOdyR5vv3ocAMoj,7,0.224,0.112000,0.0680,-28.260,0.0432,0.1420,2000
122661,6R6L70n7FGNYLSgwiBuKB1,12,0.686,0.000000,0.2050,-18.487,0.9110,0.4190,2012


In [4]:
# Split to X and y 
X = df.copy().drop(columns = ['popularity','track_id'])
# Set Popularity for each category
y = df['popularity']
y = ((y-1)//20)+1

In [5]:
# Get distribution for y
def getdistribution(data):
    distribution = dict()
    for value in data:
        if value in distribution:
            distribution[value] += 1
        else:
            distribution[value] = 1
    return distribution

dist = getdistribution(y)
print(dist) # print frequency

{5: 534, 4: 9622, 3: 40131, 2: 59120, 1: 13256}


In [64]:
# GridSearch to find the best combination of hyperparameters
gridSearch = GridSearchCV(RandomForestClassifier(random_state=4), {
    'n_estimators':[150,200],
    'max_depth': [20,30,40],
    'max_features':['auto','log2']
})
test_result = gridSearch.fit(X, y)
print(test_result)
print(test_result.best_score_)
print(test_result.best_params_)

GridSearchCV(estimator=RandomForestClassifier(random_state=4),
             param_grid={'max_depth': [20, 30, 40],
                         'max_features': ['auto', 'log2'],
                         'n_estimators': [150, 200]})
0.5113848562997657
{'max_depth': 20, 'max_features': 'auto', 'n_estimators': 200}


In [10]:
# Split to training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=5)
# Create the model and fit
rf_cla = RandomForestClassifier(n_estimators=200,random_state=4, max_depth = 20)
rf_cla.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, n_estimators=200, random_state=4)

In [11]:
# Make prediction
y_pred = rf_cla.predict(X_test)

# Accuracy
print("\nAccuracy: ")
print(accuracy_score(y_test, y_pred))

# MSE
print('\nMSE: ')
print(mean_squared_error(y_test,y_pred))

# The map of label and their precision and recall
print("\nPrecision and recall:" )
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion matrix: ")
matrix = confusion_matrix(y_test, y_pred)
multi_matrix = multilabel_confusion_matrix(y_test, y_pred)
print(multi_matrix)


Accuracy: 
0.5313251538743733

MSE: 
0.6209595239065748

Precision and recall:
              precision    recall  f1-score   support

           1       0.63      0.18      0.28      2650
           2       0.55      0.77      0.64     11821
           3       0.47      0.40      0.43      8083
           4       0.77      0.09      0.17      1880
           5       0.69      0.24      0.36        99

    accuracy                           0.53     24533
   macro avg       0.62      0.34      0.38     24533
weighted avg       0.55      0.53      0.50     24533

Confusion matrix: 
[[[21603   280]
  [ 2175   475]]

 [[ 5221  7491]
  [ 2701  9120]]

 [[12788  3662]
  [ 4843  3240]]

 [[22599    54]
  [ 1704   176]]

 [[24423    11]
  [   75    24]]]


In [12]:
import joblib
# save the model to disk
filename = 'random_forest_model200.joblib'
joblib.dump(rf_cla, filename)

['random_forest_model200.joblib']