## Ensemble methods - XGBoost & Random Forest

This notebook focuses on the benchmark & optimization of ensemble methods models such as XGBoost and Random Forest regressors. Ensemble methods are models of utilizing nested models  
such as large number of decision trees in the case of forests for example. Ensemble methods are considered more complex ML methods, we will compare the results against simpler models such as linear models.

In [61]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

from etl import *

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

from dotenv import load_dotenv
load_dotenv()

import warnings

warnings.filterwarnings('ignore')

In [62]:
DATA_DIR = './Data/'
DATA_FILE = 'spotify_tracks_kaggle_weekly.csv'
ARTIST_FILE = 'spotify_tracks_artist_details.csv'

RANDOM_STATE = 21
TEST_SIZE = 0.1

In [63]:
data_tracks = pd.read_csv(DATA_DIR + DATA_FILE)
data_artist = pd.read_csv(DATA_DIR + ARTIST_FILE)

data = pd.merge(data_tracks, data_artist, on='track_id', how='left')

In [64]:
X = data.drop('popularity', axis=1)
y = data['popularity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [65]:
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name']

X_train = X_train.drop(drop_columns, axis=1, errors='ignore')
X_test = X_test.drop(drop_columns, axis=1, errors='ignore')

In [66]:
target = 'popularity'

onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']
genre_col = ['artist_genres']
follower_col = ['artist_followers']
artist_popularity_col = ['artist_popularities']


numeric_columns = list(X_train.columns[X_train.dtypes != object].difference(['key', 'mode']))

nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

In [67]:
numeric_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer()),
    ('polynomials', PolynomialFeatures()),
    ('scaling', StandardScaler())
])

artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('imputation', SimpleImputer()),
    ('scaling', StandardScaler())
])

album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('scaling', StandardScaler())
])

circle_of_fifths_pipeline = Pipeline(steps=[
    ('encoding', CircleOfFifthsEncoding()),
    ('imputation', SimpleImputer())
])

genre_pipeline = Pipeline(steps=[
    ('encoding', GenreEncoder()),
    ('scaling', StandardScaler())
])

followers_pipeline = Pipeline(steps=[
    ('encoding', FollowerCountEncoder()),
    ('scaling', StandardScaler())
])

artist_popularity_pipeline = Pipeline(steps=[
    ('encoding', ArtistPopularityEncoder()),
    ('scaling', StandardScaler())
])


transformations = ColumnTransformer(transformers=[
    
    ('onehot_encoding', OneHotEncoder(sparse_output=False), onehot_col),
    ('trigonometric_encoding', circle_of_fifths_pipeline, circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('follower_encoding', followers_pipeline, follower_col),
    ('genres_encoding', genre_pipeline, genre_col),
    ('artist_popularity_encoding', artist_popularity_pipeline, artist_popularity_col),
    ('numeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop').set_output(transform='pandas')


preprocessing = Pipeline(steps=[
    
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)

]).set_output(transform='pandas')

In [68]:
pipeline_config_subset = {
    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2]
}

### XGBoost algorithm

- we will utilize GPU acceleration
- requires scikit-learn version <1.6.0 (due to release updates there is a conflict with xgboost library)

In [69]:
from xgboost import XGBRegressor

Xgboost benchmark

In [70]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', XGBRegressor())
])

In [71]:
model_pipeline.fit(X_train, y_train)

In [74]:
joblib.dump(model_pipeline, '../../Prod/Models/Ensemble/xgb-benchmark.joblib')

['../../Prod/Models/Ensemble/xgb-benchmark.joblib']

exporting benchmark

In [11]:
import joblib

In [12]:
DIR = '../../Prod/Models/Ensemble/'
FILE = 'xgboost-regressor-benchmark-1.joblib'

In [13]:
#joblib.dump(model_pipeline, DIR + FILE)

In [14]:
#joblib.load(DIR + FILE).score(X_test, y_test)

optimization config

In [15]:
param_grid = {
    'model__n_estimators': [100, 200, 300, 500, 1000],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 5, 10],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [1, 5, 10],

    'model__tree_method' : ['gpu_hist'],

    'model__verbosity' : [2]
}

k_fold = 5

In [16]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', XGBRegressor())
])

fitting

In [17]:
gscv = GridSearchCV(model_pipeline, param_grid=param_grid | pipeline_config_subset, cv=k_fold, n_jobs=5, verbose=1)

Fitting time is not feasible -> cost of optimization

In [18]:
#gscv.fit(X_train, y_train)

we will use random search instead

In [20]:
rscv = RandomizedSearchCV(model_pipeline, param_distributions=param_grid | pipeline_config_subset, cv=k_fold, n_iter=100, n_jobs=8)

In [21]:
rscv.fit(X_train, y_train)

In [22]:
results = pd.DataFrame(rscv.cv_results_)
results.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__transformation__numeric_processing__polynomials__degree,param_model__verbosity,param_model__tree_method,param_model__subsample,param_model__reg_lambda,param_model__reg_alpha,...,param_model__colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
99,11.454962,1.507466,0.247091,0.025394,1,2,gpu_hist,0.8,1,0.1,...,1.0,{'preprocessing__transformation__numeric_proce...,0.577766,0.588508,0.583704,0.586672,0.58556,0.584442,0.003684,1
10,10.442371,0.412418,0.25631,0.024192,1,2,gpu_hist,0.8,5,0.1,...,1.0,{'preprocessing__transformation__numeric_proce...,0.574581,0.586746,0.582828,0.589942,0.583508,0.583521,0.005139,2
84,22.231233,0.539706,0.34245,0.030573,1,2,gpu_hist,1.0,10,0.1,...,1.0,{'preprocessing__transformation__numeric_proce...,0.572587,0.586452,0.578858,0.586465,0.581091,0.581091,0.005195,3
72,11.236942,0.262089,0.335681,0.017189,1,2,gpu_hist,1.0,1,0.5,...,0.6,{'preprocessing__transformation__numeric_proce...,0.568451,0.582113,0.580989,0.585287,0.58378,0.580124,0.006016,4
77,20.730443,2.353031,0.463441,0.040066,1,2,gpu_hist,1.0,10,0.0,...,0.8,{'preprocessing__transformation__numeric_proce...,0.568904,0.584211,0.579069,0.579016,0.579334,0.578107,0.005004,5


In [42]:
results.query('rank_test_score == 1')['params'][99]

{'preprocessing__transformation__numeric_processing__polynomials__degree': 1,
 'model__verbosity': 2,
 'model__tree_method': 'gpu_hist',
 'model__subsample': 0.8,
 'model__reg_lambda': 1,
 'model__reg_alpha': 0.1,
 'model__n_estimators': 200,
 'model__min_child_weight': 10,
 'model__max_depth': 7,
 'model__learning_rate': 0.1,
 'model__colsample_bytree': 1.0}

In [49]:
#rscv.best_estimator_.named_steps['preprocessing'].transform(X_train).columns.to_frame().reset_index(drop=True).to_csv('../../Prod/Models/Ensemble/optimal-fit-30-columns.csv')

In [48]:
rscv.best_estimator_.named_steps['model'].feature_importances_

array([0.23225313, 0.01629204, 0.14204417, 0.01171475, 0.01479839,
       0.00851094, 0.01206668, 0.00616339, 0.00613781, 0.06996829,
       0.04566331, 0.03394162, 0.13473098, 0.04823739, 0.05015741,
       0.        , 0.00819255, 0.01683648, 0.027273  , 0.00790621,
       0.01073089, 0.0079278 , 0.01329653, 0.00939185, 0.01251799,
       0.00761676, 0.00733105, 0.00583847, 0.00750937, 0.02495088],
      dtype=float32)

In [59]:
#joblib.dump(rscv.best_estimator_, '../../Prod/Models/Ensemble/rgboost-regressor-optimized-30-columns.joblib')

In [49]:
print(rscv.score(X_train, y_train))
print(rscv.score(X_test, y_test))

0.7434321045875549
0.5873437523841858


export

In [26]:
import joblib

In [50]:
DIR = '../../Prod/Models/Ensemble/'
FILE = 'xgboost-regressor-tuning-1.joblib'

In [60]:
#joblib.dump(rscv.best_estimator_, DIR + FILE)

Tuning on preprocessing pipeline

In [31]:
pipeline_config_subset_2 = {

    # attribute calculation strategies

    'preprocessing__transformation__artist_encoding__encoding__strategy' : ['max', 'avg'],
    'preprocessing__transformation__follower_encoding__encoding__strategy' : ['max', 'avg'],
    'preprocessing__transformation__artist_popularity_encoding__encoding__strategy' : ['max', 'avg', 'both'],
    'preprocessing__transformation__genres_encoding__encoding__strategy' : ['max', 'avg', 'sum'],

    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2]
}

In [32]:
rscv_2 = RandomizedSearchCV(model_pipeline, param_distributions=param_grid | pipeline_config_subset_2, cv=k_fold, n_iter=100, n_jobs=8)

fitting

In [33]:
rscv_2.fit(X_train, y_train)

In [39]:
print(rscv_2.score(X_train, y_train))
print(rscv_2.score(X_test, y_test))

0.7587664723396301
0.5823652744293213


Huber loss metric

In [44]:
param_grid = {
    'model__n_estimators': [100, 200, 300, 500, 1000],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 5, 10],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [1, 5, 10],

    'model__tree_method' : ['gpu_hist'],

    'model__verbosity' : [2]
}

k_fold = 5

model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', XGBRegressor(objective='reg:pseudohubererror'))
])

In [45]:
rscv_3 = RandomizedSearchCV(model_pipeline, param_distributions=param_grid | pipeline_config_subset_2, cv=k_fold, n_iter=100, n_jobs=8)

In [46]:
rscv_3.fit(X_train, y_train)

In [48]:
rscv_3.best_params_

{'preprocessing__transformation__numeric_processing__polynomials__degree': 1,
 'preprocessing__transformation__genres_encoding__encoding__strategy': 'sum',
 'preprocessing__transformation__follower_encoding__encoding__strategy': 'avg',
 'preprocessing__transformation__artist_popularity_encoding__encoding__strategy': 'both',
 'preprocessing__transformation__artist_encoding__encoding__strategy': 'max',
 'model__verbosity': 2,
 'model__tree_method': 'gpu_hist',
 'model__subsample': 0.8,
 'model__reg_lambda': 5,
 'model__reg_alpha': 0,
 'model__n_estimators': 1000,
 'model__min_child_weight': 1,
 'model__max_depth': 7,
 'model__learning_rate': 0.2,
 'model__colsample_bytree': 1.0}

In [47]:
print(rscv_3.score(X_train, y_train))
print(rscv_3.score(X_test, y_test))

0.7637568712234497
0.5506798624992371


### RandomForest algorithm

In [2]:
from sklearn.ensemble import RandomForestRegressor