# Random Forrest Regression 

## Import Jobs and get Dataframe

In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

#For Big Query
from google.cloud import bigquery     
from google.oauth2 import service_account

#For ML Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from scipy.stats import randint

#For Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz


In [123]:
# Connect to BQ

credentials = service_account.Credentials.from_service_account_file(
'C:/Users/miria/Desktop/music-recommendation-system-24-3d0d21fb1f8b.json')
# music-recommendation-system-24-3d0d21fb1f8b.json is the service account JSON file. Save the file locally on your device and add the path here. 
# make sure that the slashes in the path are '/' and not '\'

project_id = 'music-recommendation-system-24'
client = bigquery.Client(credentials= credentials,project=project_id)

In [124]:
# Query BQ

query_job = client.query("""
   SELECT 
      *
   FROM `music-recommendation-system-24.ml_tables_eu.song_list_obama_wo_duplicates_view`""")


results = query_job.result() # Wait for the job to complete.

rows = [dict(row) for row in results]

# Convert the list of dictionaries to a DataFrame
df_ml = pd.DataFrame(rows)

In [125]:
df_ml.dtypes


acousticness                float64
danceability                float64
duration_min                float64
energy                      float64
genre                        object
genre2                       object
instrumentalness            float64
key                          object
liveness                    float64
loudness                    float64
mode                         object
speechiness                 float64
track_album_name             object
track_album_release_year      int64
track_artist                 object
track_id                     object
track_name                   object
track_popularity              int64
tempo                         int64
valence                     float64
in_obama_playlist             int64
dtype: object

## Build Model

In [126]:
#split X and y
X = df_ml[['acousticness', 
           'danceability', 
           'duration_min', 
           'energy', 
           'genre',
           'instrumentalness',
           'key',
           'liveness',
           'loudness',
           'mode',
           'speechiness',
           'track_popularity',
           'tempo',
           'valence']]
#drop: 
# genres (too much difference between origninal tables), 
# track_album_name, 
# track_album_release_year (to much difference between orignal tables)
# track_id & track_name (no value for model)
# in_obama_playlist (is y)
y = df_ml[['in_obama_playlist']]

In [127]:
#split test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.shape, y_train.shape

((373, 14), (373, 1))

In [128]:
# split numeric and non numeric columns for X_train and X_test
X_train_numeric = X_train.select_dtypes(include=['int64', 'float64'])
X_test_numeric = X_test.select_dtypes(include=['int64', 'float64'])

X_train_non_numeric = X_train.select_dtypes(exclude=['int64', 'float64'])
X_test_non_numeric = X_test.select_dtypes(exclude=['int64', 'float64'])

X_train_numeric.shape, X_train_non_numeric.shape, y_train.shape

((373, 11), (373, 3), (373, 1))

In [129]:
#Classify non numeric data

X_train_non_numeric_ohe = enc.fit_transform(X_train_non_numeric)
X_test_non_numeric_ohe = enc.transform(X_test_non_numeric)

X_train_non_numeric_ohe = pd.DataFrame(X_train_non_numeric_ohe, columns=enc.get_feature_names_out())

X_test_non_numeric_ohe = pd.DataFrame(X_test_non_numeric_ohe, columns=enc.get_feature_names_out())

X_train_numeric.shape, X_train_non_numeric_ohe.shape, y_train.shape


((373, 11), (373, 32), (373, 1))

In [130]:
# concat normalised X_train's and X_test's

X_train_normalised = pd.concat([X_train_numeric.reset_index(), X_train_non_numeric_ohe.reset_index()], axis=1)

X_test_normalised = pd.concat([X_test_numeric.reset_index(), X_test_non_numeric_ohe.reset_index()], axis=1)

X_test_normalised.shape, X_train_normalised.shape, y_test.shape, y_train.shape


((185, 45), (373, 45), (185, 1), (373, 1))

In [131]:
#start Random Forrest
rf.fit(X_train_normalised, y_train)

  return fit_method(estimator, *args, **kwargs)


In [132]:
#predict Model
y_pred = rf.predict_proba(X_test_normalised)

#y_pred = pd.DataFrame(y_pred, columns = y_test)

#Prediction = pd.concat([X_test_normalised.reset_index(), y_pred.reset_index()], axis=1)


In [133]:
#Test Model
rf.score(X_test_normalised, y_test)

0.9027027027027027

In [134]:
# get the indices of the songs of (test)
test_indices = X_test_normalised[["index"]].iloc[:, 0].values

# convert predictions into a data frame and name columns as probabilities

df_proba_test = pd.DataFrame(y_pred, columns = ["proba_0", "proba_1"])

# Merging 

merged_proba = pd.concat([df_ml.loc[test_indices].reset_index(), df_proba_test], axis = 1)

## Get Prediction Data 

In [135]:
# Query BQ

query_job = client.query("""
   SELECT 
      *
   FROM `music-recommendation-system-24.ml_tables_eu.spotify_top_100_for_prediction_csv_V1`""")


results = query_job.result() # Wait for the job to complete.

rows = [dict(row) for row in results]

# Convert the list of dictionaries to a DataFrame
df_pred = pd.DataFrame(rows)

In [136]:
df_pred

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_name,track_album_release_year,duration_min,genre,record_label,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,3hRV0jL3vUpRrcy398teAU,The Night We Met,Lord Huron,88,Strange Trails,2015,3.47,folk,Play It Again Sam,44.8,...,A,-9.514,Major,4.52,96.8,27.2,63.8,9.98,174,3
1,0nrRP2bk19rLc0orkWPQk2,Wake Me Up,Avicii,83,TRUE,2013,4.12,edm,Universal Music AB,53.2,...,D,-5.697,Major,5.23,0.38,0.12,16.1,64.3,124,4
2,7BqHUALzNBTanL6OvsqmC1,Happier,"Marshmello,Bastille",82,Happier,2018,3.57,edm,Joytime Collective,68.7,...,F,-2.749,Major,4.52,19.1,0.0,16.7,67.1,100,4
3,5HCyWlXZPP0y6Gqq8TgA20,STAY (with Justin Bieber),"The Kid LAROI,Justin Bieber",78,STAY (with Justin Bieber),2021,2.36,pop,Columbia,59.1,...,C#,-5.484,Major,4.83,3.83,0.0,10.3,47.8,170,4
4,1BxfuPKGuaTgP7aM0Bbdwr,Cruel Summer,Taylor Swift,89,Lover,2019,2.97,pop,Taylor Swift,55.2,...,A,-5.707,Major,15.7,11.7,0.0,10.5,56.4,170,4
5,5wANPM4fQCJwkGd4rN57mH,drivers license,Olivia Rodrigo,82,SOUR,2021,4.03,pop,Olivia Rodrigo PS,56.1,...,A#,-8.81,Major,5.78,76.8,0.0,10.6,13.7,144,4
6,6FZDfxM3a3UCqtzo5pxSLZ,Without Me,Halsey,77,Manic,2020,3.36,pop,Capitol Records,75.2,...,F#,-7.05,Major,7.05,29.3,0.0,9.36,53.3,136,4
7,4cktbXiXOapiLBMprHFErI,Memories,Maroon 5,81,JORDI (Deluxe),2021,3.16,pop,Interscope Records*,77.5,...,B,-7.241,Major,5.57,84.1,0.0,8.21,59.5,91,4
8,7DSAEUvxU8FajXtRloy8M0,Flowers,Miley Cyrus,87,Endless Summer Vacation,2023,3.34,pop,Columbia,70.6,...,C,-4.775,Major,6.33,5.84,0.01,2.32,63.2,118,4
9,6P4d1NWBCNIYZjzF9k1mVN,good 4 u,Olivia Rodrigo,57,SOUR,2021,2.97,pop,Olivia Rodrigo PS,56.2,...,A,-5.025,Major,14.1,29.7,0.0,8.48,67.2,167,4


In [137]:
#Set x
X_pred = df_pred[['acousticness', 
           'danceability', 
           'duration_min', 
           'energy', 
           'genre',
           'instrumentalness',
           'key',
           'liveness',
           'loudness',
           'mode',
           'speechiness',
           'track_popularity',
           'tempo',
           'valence']]



In [138]:
#transform non numeric data and concat it back
X_pred_numeric = X_pred.select_dtypes(include=['int64', 'float64'])
X_pred_non_numeric = X_pred.select_dtypes(exclude=['int64', 'float64'])

X_pred_non_numeric_ohe = enc.transform(X_pred_non_numeric)
X_pred_non_numeric_ohe = pd.DataFrame(X_pred_non_numeric_ohe, columns=enc.get_feature_names_out())

X_pred_normalised = pd.concat([X_pred_numeric.reset_index(), X_pred_non_numeric_ohe.reset_index()], axis=1)

In [139]:
liked_procentage = rf.predict_proba(X_pred_normalised)

#liked_procentage = pd.DataFrame(liked_procentage, columns = y_test)

In [140]:
#top_songs_prediction = pd.concat([df_pred[['track_id', 'track_name', 'track_artist']].reset_index(), liked_procentage.reset_index()], axis=1)
# get the indices of the songs of (test)
pred_indices = X_pred_normalised[["index"]].iloc[:, 0].values

# convert predictions into a data frame and name columns as probabilities

df_proba_pred = pd.DataFrame(liked_procentage, columns = ["proba_0", "proba_1"])

# Merging 

merged_proba_pred = pd.concat([df_pred.loc[pred_indices].reset_index(), df_proba_pred], axis = 1)

In [143]:
merged_proba_pred[['track_id', 'track_name', 'track_artist', 'proba_1']].sort_values(['proba_1'], ascending=False)

Unnamed: 0,track_id,track_name,track_artist,proba_1
11,3bNv3VuUOKgrf5hu3YcuRo,Someone Like You,Adele,0.8
30,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,0.78
34,68Dni7IE4VyPkTOH9mRWHr,No Role Modelz,J. Cole,0.73
0,3hRV0jL3vUpRrcy398teAU,The Night We Met,Lord Huron,0.73
18,3JvKfv6T31zO0ini8iNItO,Another Love,Tom Odell,0.72
14,1mXVgsBdtIVeCLJnSnmtdV,Too Good At Goodbyes,Sam Smith,0.71
29,3z8h0TU7ReDPLIbEnYhWZb,Bohemian Rhapsody,Queen,0.7
5,5wANPM4fQCJwkGd4rN57mH,drivers license,Olivia Rodrigo,0.69
36,27NovPIUIRrOZoCHxABJwK,INDUSTRY BABY (feat. Jack Harlow),"Lil Nas X,Jack Harlow",0.68
7,4cktbXiXOapiLBMprHFErI,Memories,Maroon 5,0.63
