## Ensemble methods - XGBoost & Random Forest

This notebook focuses on the benchmark & optimization of ensemble methods models such as XGBoost and Random Forest regressors. Ensemble methods are models of utilizing nested models  
such as large number of decision trees in the case of forests for example. Ensemble methods are considered more complex ML methods, we will compare the results against simpler models such as linear models.

In [1]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

from etl import *

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

from dotenv import load_dotenv
load_dotenv()

import warnings

warnings.filterwarnings('ignore')

In [2]:
DATA_DIR = './Data/'
DATA_FILE = 'spotify_tracks_kaggle_weekly.csv'
ARTIST_FILE = 'spotify_tracks_artist_details.csv'

RANDOM_STATE = 21
TEST_SIZE = 0.1

In [3]:
data_tracks = pd.read_csv(DATA_DIR + DATA_FILE)
data_artist = pd.read_csv(DATA_DIR + ARTIST_FILE)

data = pd.merge(data_tracks, data_artist, on='track_id', how='left')

In [4]:
X = data.drop('popularity', axis=1)
y = data['popularity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [5]:
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name']

X_train = X_train.drop(drop_columns, axis=1, errors='ignore')
X_test = X_test.drop(drop_columns, axis=1, errors='ignore')

In [6]:
target = 'popularity'

onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']
genre_col = ['artist_genres']
follower_col = ['artist_followers']
artist_popularity_col = ['artist_popularities']


numeric_columns = list(X_train.columns[X_train.dtypes != object].difference(['key', 'mode']))

nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

In [7]:
numeric_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer()),
    ('polynomials', PolynomialFeatures()),
    ('scaling', StandardScaler())
])

artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('imputation', SimpleImputer()),
    ('scaling', StandardScaler())
])

album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('scaling', StandardScaler())
])

circle_of_fifths_pipeline = Pipeline(steps=[
    ('encoding', CircleOfFifthsEncoding()),
    ('imputation', SimpleImputer())
])

genre_pipeline = Pipeline(steps=[
    ('encoding', GenreEncoder()),
    ('scaling', StandardScaler())
])

followers_pipeline = Pipeline(steps=[
    ('encoding', FollowerCountEncoder()),
    ('scaling', StandardScaler())
])

artist_popularity_pipeline = Pipeline(steps=[
    ('encoding', ArtistPopularityEncoder()),
    ('scaling', StandardScaler())
])


transformations = ColumnTransformer(transformers=[
    
    ('onehot_encoding', OneHotEncoder(sparse_output=False), onehot_col),
    ('trigonometric_encoding', circle_of_fifths_pipeline, circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('follower_encoding', followers_pipeline, follower_col),
    ('genres_encoding', genre_pipeline, genre_col),
    ('artist_popularity_encoding', artist_popularity_pipeline, artist_popularity_col),
    ('numeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop').set_output(transform='pandas')


preprocessing = Pipeline(steps=[
    
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)

]).set_output(transform='pandas')

In [8]:
pipeline_config_subset = {
    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2]
}

### XGBoost algorithm

- we will utilize GPU acceleration
- requires scikit-learn version <1.6.0 (due to release updates there is a conflict with xgboost library)

In [9]:
from xgboost import XGBRegressor

optimization config

In [10]:
param_grid = {
    'model__n_estimators': [100, 200, 300, 500, 1000],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 5, 10],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [1, 5, 10],

    'model__tree_method' : ['gpu_hist'],

    'model__verbosity' : [2]
}

k_fold = 5

In [11]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', XGBRegressor())
])

fitting

In [12]:
gscv = GridSearchCV(model_pipeline, param_grid=param_grid | pipeline_config_subset, cv=k_fold, n_jobs=5, verbose=1)

Fitting time is not feasible -> cost of optimization

In [13]:
#gscv.fit(X_train, y_train)

we will use random search instead

In [14]:
rscv = RandomizedSearchCV(model_pipeline, param_distributions=param_grid | pipeline_config_subset, cv=k_fold, n_iter=100, n_jobs=8)

In [15]:
rscv.fit(X_train, y_train)

KeyboardInterrupt: 

Tuning on preprocessing pipeline

In [35]:
pipeline_config_subset = {

    # attribute calculation strategies

    'preprocessing__transformation__artist_encoding__encoding__strategy' : ['max', 'avg'],
    'preprocessing__transformation__follower_encoding__encoding__strategy' : ['max', 'avg'],
    ''


    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2]
}

#### Results

In [16]:
results = pd.DataFrame(rscv.cv_results_)

In [17]:
results.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__transformation__numeric_processing__polynomials__degree,param_model__verbosity,param_model__tree_method,param_model__subsample,param_model__reg_lambda,param_model__reg_alpha,...,param_model__colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
52,9.182378,0.655404,0.171857,0.022349,1,2,gpu_hist,1.0,5,0.5,...,1.0,{'preprocessing__transformation__numeric_proce...,0.572067,0.587101,0.579728,0.590265,0.585344,0.582901,0.006407,1
76,9.161959,0.338591,0.268341,0.047249,1,2,gpu_hist,0.8,10,0.5,...,0.8,{'preprocessing__transformation__numeric_proce...,0.571304,0.58716,0.583456,0.586014,0.584198,0.582427,0.005713,2
86,8.258669,0.799266,0.223362,0.031572,1,2,gpu_hist,1.0,1,0.1,...,0.6,{'preprocessing__transformation__numeric_proce...,0.572174,0.587177,0.582075,0.584158,0.583545,0.581826,0.005104,3
92,30.198967,0.231026,0.286923,0.008619,1,2,gpu_hist,0.8,5,0.0,...,1.0,{'preprocessing__transformation__numeric_proce...,0.569715,0.576123,0.575477,0.579811,0.579682,0.576161,0.003681,4
95,6.339097,0.237366,0.191917,0.016652,1,2,gpu_hist,1.0,5,0.0,...,0.6,{'preprocessing__transformation__numeric_proce...,0.564807,0.577699,0.572004,0.579854,0.57409,0.573691,0.005215,5


In [18]:
best_model = rscv.best_estimator_

In [19]:
rscv.best_params_

{'preprocessing__transformation__numeric_processing__polynomials__degree': 1,
 'model__verbosity': 2,
 'model__tree_method': 'gpu_hist',
 'model__subsample': 1.0,
 'model__reg_lambda': 5,
 'model__reg_alpha': 0.5,
 'model__n_estimators': 300,
 'model__min_child_weight': 10,
 'model__max_depth': 7,
 'model__learning_rate': 0.1,
 'model__colsample_bytree': 1.0}

testing data

In [23]:
metric = 'MSE'

In [24]:
y_pred = best_model.predict(X_test)

print("model: XGBoost")
print(f"Train R^2: {best_model.score(X_train, y_train):.4f}")
print(f"Test R^2: {best_model.score(X_test, y_test):.4f}")

print(f"Metric: {metric}")
print(f"{metric}: {mean_squared_error(y_test, y_pred)}")

model: XGBoost
Train R^2: 0.7350
Test R^2: 0.5864
Metric: MSE
MSE: 147.50786783958216


model exports

In [27]:
import joblib

export1

In [26]:
rscv.best_params_

{'preprocessing__transformation__numeric_processing__polynomials__degree': 1,
 'model__verbosity': 2,
 'model__tree_method': 'gpu_hist',
 'model__subsample': 1.0,
 'model__reg_lambda': 5,
 'model__reg_alpha': 0.5,
 'model__n_estimators': 300,
 'model__min_child_weight': 10,
 'model__max_depth': 7,
 'model__learning_rate': 0.1,
 'model__colsample_bytree': 1.0}

In [32]:
DIR = '../../Prod/Models/Ensemble/'
FILE = 'xgboost-regressor-tuning-1.joblib'

In [33]:
joblib.dump(rscv.best_estimator_, DIR + FILE)

['../../Prod/Models/Ensemble/xgboost-regressor-tuning-1.joblib']

### RandomForest algorithm

In [2]:
from sklearn.ensemble import RandomForestRegressor