## Modelling schema

This notebook provides schema for preparing data transformation pipeline for modelling, data needs to be transformed to model ready form.  
This notebook serves as a guide, feel free to modify your code however you want - it just serves to explain the transformation logic.

In [213]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

import importlib
import etl

importlib.reload(etl)
from etl import (
    FrequencyEncoder,
    CircleOfFifthsEncoding,
    ConvertNull,
    ArtistPopularityEncoder,
    FollowerCountEncoder,
    AlbumNameEncoder,
    GenreEncoder,
)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from dotenv import load_dotenv

load_dotenv()

True

loading raw data

In [214]:
DATA_DIR = './Data/'
TRACK_FILE = 'spotify_tracks_kaggle_weekly.csv'
ARTIST_FILE = 'spotify_tracks_artist_details.csv'

In [215]:
tracks = pd.read_csv(DATA_DIR + TRACK_FILE)
artists = pd.read_csv(DATA_DIR + ARTIST_FILE)

data = pd.merge(tracks, artists, on='track_id', how='left')

Data splitting -> working with predefined $X_{train}$

In [216]:
RANDOM_STATE = 21
TEST_SIZE = 0.1

In [217]:
X = data.drop('popularity', axis=1)
y = data['popularity']

In [218]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

Dropping columns defined in EDA

In [219]:
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name_x', 'track_name_y', 'artist_ids', 'artist_names']

# removed the calculation from scraper as it is added in etl.py
# this is in case data is not re-run
if 'avg_artist_popularity' in X_train.columns:
    drop_columns.append('avg_artist_popularity')

In [220]:
X_train = X_train.drop(drop_columns, axis=1, errors='ignore')
X_test = X_test.drop(drop_columns, axis=1, errors='ignore')

transformation pipeline

In [221]:
target = 'popularity'

onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']
artist_popularity_col = ['artist_popularities']
follower_count_col = ['artist_followers']
artist_genres_col = ['artist_genres']

numeric_columns = list(X_train.columns[X_train.dtypes != object].difference(['key', 'mode', 'artist_popularities', 'artist_followers']))

nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence', 'artist_popularities', 'artist_followers']

In [245]:
numeric_pipeline = Pipeline(steps=[
    ('scaling', StandardScaler())
])

artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('scaling', StandardScaler())
])

album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('scaling', StandardScaler())
])

artist_popularity_pipeline = Pipeline(steps=[
    ('encoding', ArtistPopularityEncoder()),
    ('scaling', StandardScaler())
])

artist_followers_pipeline = Pipeline(steps=[
    ('encoding', FollowerCountEncoder(strategy='avg')),
    ('scaling', StandardScaler())
])

artist_genres_pipeline = Pipeline(steps=[
    ('encoding', GenreEncoder()),
    ('scaling', StandardScaler())
])


transformations = ColumnTransformer(transformers=[
    
    ('onehot_encoding', OneHotEncoder(sparse_output=False), onehot_col),
    ('trigonometric_encoding', CircleOfFifthsEncoding(), circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('artist_popularity_encoding', artist_popularity_pipeline, artist_popularity_col),
    ('follower_count_encoding', artist_followers_pipeline, follower_count_col),
    ('genres_encoding', GenreEncoder(), artist_genres_col),
    ('nummeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop')


preprocessing = Pipeline(steps=[
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)
])

transformation of data

In [231]:
pd.DataFrame(preprocessing.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,8.660254e-01,-1.136780,...,0.705974,-1.247168,-0.452275,-0.482662,0.021763,0.057611,0.411660,0.285070,-0.998566,0.577459
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.500000,-8.660254e-01,-0.751348,...,-0.232340,0.438704,-0.476210,0.788392,0.024609,-0.028825,-1.271181,0.285070,-0.773930,0.681340
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.000000,1.224647e-16,-0.558060,...,-0.086448,-0.321994,-0.476210,-0.111938,0.024680,-0.573281,1.007857,0.285070,-0.910996,0.889102
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000e+00,0.226529,...,-0.375460,0.594956,-0.476210,0.611857,0.025782,-0.443182,-0.632847,0.285070,-0.221858,0.473579
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.500000,-8.660254e-01,-0.323599,...,0.796792,-1.066245,-0.476053,-0.676263,0.023992,-0.534073,0.069415,0.285070,-0.773930,0.265817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56080,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.866025,-5.000000e-01,-0.878301,...,-0.943066,-0.860651,-0.476210,-0.447355,0.023165,-0.537637,-0.457916,-5.671029,-1.291735,0.577459
56081,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.866025,-5.000000e-01,-0.301868,...,0.090251,0.944466,-0.476167,-0.476778,0.023966,-0.353181,1.907194,0.285070,1.415320,0.369698
56082,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.500000,-8.660254e-01,-0.377353,...,-1.217829,1.413220,2.429753,0.152865,0.023702,-0.249815,0.356159,0.285070,0.082733,0.992982
56083,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.866025,5.000000e-01,-0.211514,...,0.522147,-0.794861,-0.475891,-0.653313,0.023778,-0.173181,-1.515118,0.285070,-0.023873,-0.149706


### Random Forest Modeling

In [232]:
from sklearn.ensemble import RandomForestRegressor

In [233]:
RF_model = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE,
                                    max_depth=None, min_samples_split=10, min_samples_leaf=3))
])

RF_model.fit(X_train, y_train)

"""
RF_param_grid = {
    'model__n_estimators': [300, 350],
    'model__max_depth': [None],
    'model__min_samples_split': [10, 12],
    'model__min_samples_leaf': [3, 4]
}

RF_grid_search = GridSearchCV(estimator=RF_model, param_grid=RF_param_grid, 
                              cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

RF_grid_search.fit(X_train, y_train)
"""

KeyboardInterrupt: 

In [182]:
"""
RF_best_model = RF_grid_search.best_estimator_
RF_best_params = RF_grid_search.best_params_
print("Best Parameters:", RF_best_params)
"""

y_pred = RF_model.predict(X_test)

print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2: ', RF_best_model.score(X_test, y_test))

Best Parameters: {'model__max_depth': None, 'model__min_samples_leaf': 3, 'model__min_samples_split': 10, 'model__n_estimators': 350}
MSE:  146.6028819854889
RMSE:  12.107967706658657
R2:  0.5889142178365582


#### Hyperparameter Tuning Results
RF_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [2, 4]
}
best: 200, None, 10, 4
    
RF_param_grid = {
    'model__n_estimators': [200, 250],
    'model__max_depth': [None, 2],
    'model__min_samples_split': [10, 12],
    'model__min_samples_leaf': [4, 6]
}
best: 250, None, 10, 4
R2:  0.5890150062574195

RF_param_grid = {
    'model__n_estimators': [250, 300],
    'model__max_depth': [None],
    'model__min_samples_split': [8, 10],
    'model__min_samples_leaf': [3, 5]
}
best: 300, None, 10, 3
R2:  0.5890150062574195

RF_param_grid = {
    'model__n_estimators': [300, 350],
    'model__max_depth': [None],
    'model__min_samples_split': [10, 12],
    'model__min_samples_leaf': [3, 4]
}
best: 350, None, 10, 3
R2:  0.5889142178365582

### XGBoost Modeling

In [183]:
from xgboost import XGBRegressor

In [246]:
XGB_model = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE, 
                           n_estimators=127, max_depth=12, learning_rate=0.05, min_child_weight=7,
                           subsample=1.0, colsample_bytree=0.7))
])

XGB_model.fit(X_train, y_train)

"""
XGB_param_grid = {
    'model__n_estimators': [127],
    'model__max_depth': [12],
    'model__learning_rate': [0.05],
    'model__min_child_weight': [7],
    'model__subsample': [1.0, 1.1, 1.2],
    'model__colsample_bytree': [0.6, 0.7, 0.8]
}

XGB_grid_search = GridSearchCV(
    estimator=XGB_model,
    param_grid=XGB_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

XGB_grid_search.fit(X_train, y_train)

XGB_best_model = XGB_grid_search.best_estimator_
XGB_best_params = XGB_grid_search.best_params_
print("Best Parameters:", XGB_best_params)
"""

'\nXGB_param_grid = {\n    \'model__n_estimators\': [127],\n    \'model__max_depth\': [12],\n    \'model__learning_rate\': [0.05],\n    \'model__min_child_weight\': [7],\n    \'model__subsample\': [1.0, 1.1, 1.2],\n    \'model__colsample_bytree\': [0.6, 0.7, 0.8]\n}\n\nXGB_grid_search = GridSearchCV(\n    estimator=XGB_model,\n    param_grid=XGB_param_grid,\n    cv=5,\n    scoring=\'neg_mean_squared_error\',\n    n_jobs=-1\n)\n\nXGB_grid_search.fit(X_train, y_train)\n\nXGB_best_model = XGB_grid_search.best_estimator_\nXGB_best_params = XGB_grid_search.best_params_\nprint("Best Parameters:", XGB_best_params)\n'

In [247]:
y_pred = XGB_model.predict(X_test)

print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2: ', XGB_model.score(X_test, y_test))

MSE:  139.77702734803748
RMSE:  11.822733497293994
R2:  0.608054518699646


#### Hyperparameter Tuning Results

XGB_param_grid = {
    'model__n_estimators': [150, 200, 250],
    'model__max_depth': [6, 9, 12],
    'model__learning_rate': [0.05, 0.1],
    'model__min_child_weight': [1, 3]
}
best: 150, 12, 0.05, 3
R2: 0.5993564128875732

XGB_param_grid = {
    'model__n_estimators': [100,150, 175],
    'model__max_depth': [12, 15, 18],
    'model__learning_rate': [0.25, 0.75],
    'model__min_child_weight': [3, 5]
}
best: 100, 12, 0.25, 5
R2:  0.5655938982963562

XGB_param_grid = {
    'model__n_estimators': [75, 100, 150],
    'model__max_depth': [9, 12, 15],
    'model__learning_rate': [0.03, 0.05],
    'model__min_child_weight': [5, 7]
}
best: 150, 12, 0.05, 7
R2:  0.6038635969161987

XGB_param_grid = {
    'model__n_estimators': [125, 150, 175],
    'model__max_depth': [11, 12, 13],
    'model__learning_rate': [0.05, 0.07],
    'model__min_child_weight': [7, 9]
}
best: 125, 12, 0.05, 7
R2:  0.605049729347229

XGB_param_grid = {
    'model__n_estimators': [120, 125, 130],
    'model__max_depth': [12],
    'model__learning_rate': [0.04, 0.06],
    'model__min_child_weight': [6, 8]
}
best: 130, 12, 0.04, 6
R2:  0.6025806069374084

XGB_param_grid = {
    'model__n_estimators': [127, 130, 140],
    'model__max_depth': [12],
    'model__learning_rate': [0.04, 0.05],
    'model__min_child_weight': [6, 7]
}
best: 127, 12, 0.05, 7
R2:  0.6050568222999573

XGB_param_grid = {
    'model__n_estimators': [127],
    'model__max_depth': [12],
    'model__learning_rate': [0.05],
    'model__min_child_weight': [7],
    'model__subsample': [0.8, 0.9 <strong>1.0</strong>],
    'model__colsample_bytree': [<strong>0.8</strong>, 0.9 1.0]
}
R2:  0.6078505516052246

127, 12, 0.05, 7, 1.0, 0.7
R2:  0.6079931259155273

In [248]:
scores = cross_val_score(XGB_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("Cross-Validation RMSE:", (-scores.mean()) ** 0.5)

Cross-Validation RMSE: 11.627782311767973
