In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


## Loading and Preprocessing Loans Encoded Data

Load the `songs_normalize.csv` in a pandas DataFrame called `spotify_df`

In [2]:
# Loading data
file_path = Path("../Resources/songs_normalize.csv")
spotify_df = pd.read_csv(file_path)
spotify_df.head()


Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


In [3]:
#Select only the following from the genres column: "pop", "hip hop", "rock", "Dance/Electronic", "latin", "R&B", "country", "metal"
genres = ["pop", "hip hop", "rock", "Dance/Electronic", "latin", "R&B", "country", "metal"]


spotify_df = spotify_df[spotify_df["genre"].isin(genres)]

spotify_df.head()


Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop
6,Eminem,The Real Slim Shady,284200,True,2000,86,0.949,0.661,5,-4.244,0,0.0572,0.0302,0.0,0.0454,0.76,104.504,hip hop
9,Modjo,Lady - Hear Me Tonight,307153,False,2001,77,0.72,0.808,6,-5.627,1,0.0379,0.00793,0.0293,0.0634,0.869,126.041,Dance/Electronic
10,Gigi D'Agostino,L'Amour Toujours,238759,False,2011,1,0.617,0.728,7,-7.932,1,0.0292,0.0328,0.0482,0.36,0.808,139.066,pop


In [4]:
# Define features set
X = spotify_df.copy()
X.drop(["genre", "artist", "song"], axis=1, inplace=True)
X.head()


Unnamed: 0,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053
4,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656
6,284200,True,2000,86,0.949,0.661,5,-4.244,0,0.0572,0.0302,0.0,0.0454,0.76,104.504
9,307153,False,2001,77,0.72,0.808,6,-5.627,1,0.0379,0.00793,0.0293,0.0634,0.869,126.041
10,238759,False,2011,1,0.617,0.728,7,-7.932,1,0.0292,0.0328,0.0482,0.36,0.808,139.066


Create the target vector by assigning the values of the `genre` column from the `spotify_df` DataFrame.

In [5]:
spotify_df["genre"].values[0:5]

array(['pop', 'pop', 'hip hop', 'Dance/Electronic', 'pop'], dtype=object)

In [6]:
# Define target vector
y = spotify_df["genre"].values.reshape(-1, 1)
y[:5]


array([['pop'],
       ['pop'],
       ['hip hop'],
       ['Dance/Electronic'],
       ['pop']], dtype=object)

Split the data into training and testing sets.

In [7]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Use the `StandardScaler` to scale the features data, remember that only `X_train` and `X_testing` DataFrames should be scaled.

In [8]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [9]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [10]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

Once the data is scaled, create a random forest instance and train it with the training data (`X_train_scaled` and `y_train`), define `n_estimators=500` and `random_state=78`.

In [11]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [12]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

## Making Predictions Using the Random Forest Model

Validate the trained model by predicting loan defaults using the testing data (`X_test_scaled`).

In [13]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array(['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop', 'hip hop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'hip hop', 'pop', 'pop', 'hip hop', 'pop', 'hip hop',
       'rock', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'Dance/Electronic', 'pop', 'pop', 'pop',
       'pop', 'pop', 'hip hop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'hip hop', 'pop', 'hip hop', 'hip hop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'Dance/Electronic', 'hip hop', 'hip hop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'hip hop', 'hip hop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop',
       'hi

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [14]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4", "Actual 5", "Actual 6", "Actual 7"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4", "Predicted 5", "Predicted 6", "Predicted 7"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [15]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7
Actual 0,0,0,0,0,0,0,12,0
Actual 1,0,0,0,0,0,0,3,0
Actual 2,0,0,0,0,0,0,3,0
Actual 3,0,0,0,17,0,0,7,0
Actual 4,0,0,0,0,0,0,3,0
Actual 5,0,0,0,1,0,0,1,0
Actual 6,3,0,0,6,0,0,110,0
Actual 7,1,0,0,0,0,0,6,2


Accuracy Score : 0.7371428571428571
Classification Report
                  precision    recall  f1-score   support

Dance/Electronic       0.00      0.00      0.00        12
             R&B       0.00      0.00      0.00         3
         country       0.00      0.00      0.00         3
         hip hop       0.71      0.71      0.71        24
           latin       0.00      0.00      0.00         3
           metal       0.00      0.00      0.00         2
             pop       0.76      0.92      0.83       119
            rock       1.00      0.22      0.36         9

        accuracy                           0.74       175
       macro avg       0.31      0.23      0.24       175
    weighted avg       0.66      0.74      0.68       175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Feature Importance

In this section, you are asked to fetch the features' importance from the random forest model and display the top 10 most important features.

In [16]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.11387716183106786, 'speechiness'),
 (0.1052316114313298, 'explicit'),
 (0.08842745320085862, 'danceability'),
 (0.07462211903743451, 'instrumentalness'),
 (0.07326570692707403, 'duration_ms'),
 (0.0711688419446682, 'popularity'),
 (0.06721287436684628, 'acousticness'),
 (0.06604648033774425, 'tempo'),
 (0.06210315517631599, 'energy'),
 (0.06017571790694437, 'loudness')]

# Rerun with top 5 features

In [17]:
features = ['speechiness', 'explicit', 'danceability', 'instrumentalness', 'duration_ms']
X = spotify_df[features]
y = spotify_df["genre"].values.reshape(-1, 1)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [19]:
scaler = StandardScaler()

In [20]:
X_scaler = scaler.fit(X_train)

In [21]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [23]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [24]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array(['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop',
       'pop', 'pop', 'Dance/Electronic', 'rock', 'pop', 'hip hop', 'pop',
       'pop', 'pop', 'hip hop', 'hip hop', 'latin', 'hip hop', 'pop',
       'hip hop', 'rock', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'rock', 'pop', 'pop', 'pop', 'Dance/Electronic', 'pop',
       'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'hip hop', 'pop', 'hip hop', 'hip hop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop',
       'Dance/Electronic', 'pop', 'pop', 'pop', 'Dance/Electronic',
       'hip hop', 'hip hop', 'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop',
       'hip hop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'rock', 'pop',

In [25]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4", "Actual 5", "Actual 6", "Actual 7"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4", "Predicted 5", "Predicted 6", "Predicted 7"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [26]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7
Actual 0,2,0,0,1,0,0,7,2
Actual 1,0,0,0,0,0,0,3,0
Actual 2,0,0,0,0,0,0,3,0
Actual 3,0,0,0,16,0,0,8,0
Actual 4,0,0,0,0,0,0,3,0
Actual 5,0,0,0,1,0,0,1,0
Actual 6,3,0,0,9,1,0,104,2
Actual 7,2,0,0,0,0,0,5,2


Accuracy Score : 0.7085714285714285
Classification Report
                  precision    recall  f1-score   support

Dance/Electronic       0.29      0.17      0.21        12
             R&B       0.00      0.00      0.00         3
         country       0.00      0.00      0.00         3
         hip hop       0.59      0.67      0.63        24
           latin       0.00      0.00      0.00         3
           metal       0.00      0.00      0.00         2
             pop       0.78      0.87      0.82       119
            rock       0.33      0.22      0.27         9

        accuracy                           0.71       175
       macro avg       0.25      0.24      0.24       175
    weighted avg       0.65      0.71      0.67       175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Run top 10 features 

In [27]:
features = ['speechiness', 'explicit', 'danceability', 'instrumentalness', 'duration_ms', 'popularity', 'acousticness', 'tempo', 'energy', 'loudness']
X = spotify_df[features]
y = spotify_df["genre"].values.reshape(-1, 1)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [29]:
scaler = StandardScaler()

In [30]:
X_scaler = scaler.fit(X_train)

In [31]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define a range of values for n_estimators
param_grid = {'n_estimators': [100, 200, 300, 500, 1000]}

# Initialize the random forest model
rf = RandomForestClassifier(random_state=78)

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best estimator
best_n_estimators = grid_search.best_params_['n_estimators']
print(f"Optimal n_estimators: {best_n_estimators}")


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Optimal n_estimators: 1000


In [33]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=1000, random_state=78)

In [34]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [35]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array(['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'hip hop', 'pop', 'hip hop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'rock', 'hip hop', 'pop', 'pop', 'hip hop', 'pop', 'hip hop',
       'rock', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'rock', 'pop',
       'rock', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'Dance/Electronic', 'pop', 'pop', 'pop',
       'pop', 'pop', 'hip hop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
       'pop', 'hip hop', 'pop', 'hip hop', 'hip hop', 'pop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'hip hop', 'pop', 'Dance/Electronic', 'pop',
       'pop', 'pop', 'Dance/Electronic', 'hip hop', 'hip hop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'hip hop', 'hip hop', 'pop', 'pop',
       'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop',
 

In [36]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4", "Actual 5", "Actual 6", "Actual 7"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4", "Predicted 5", "Predicted 6", "Predicted 7"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [37]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7
Actual 0,3,0,0,0,0,0,9,0
Actual 1,0,0,0,0,0,0,3,0
Actual 2,0,0,0,0,0,0,3,0
Actual 3,0,0,0,17,0,0,7,0
Actual 4,0,0,0,0,0,0,3,0
Actual 5,0,0,0,1,0,0,1,0
Actual 6,3,0,0,5,0,0,108,3
Actual 7,1,0,0,0,0,0,5,3


Accuracy Score : 0.7485714285714286
Classification Report
                  precision    recall  f1-score   support

Dance/Electronic       0.43      0.25      0.32        12
             R&B       0.00      0.00      0.00         3
         country       0.00      0.00      0.00         3
         hip hop       0.74      0.71      0.72        24
           latin       0.00      0.00      0.00         3
           metal       0.00      0.00      0.00         2
             pop       0.78      0.91      0.84       119
            rock       0.50      0.33      0.40         9

        accuracy                           0.75       175
       macro avg       0.31      0.27      0.28       175
    weighted avg       0.68      0.75      0.71       175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
