## Modelling schema

This notebook provides a schema for building the data transformation pipeline used for modelling: the raw data must be transformed into a model-ready form.  
The notebook serves as a guide; feel free to modify the code however you want - it is only meant to explain the transformation logic.

In [None]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

import importlib
import etl

importlib.reload(etl)
from etl import (
    FrequencyEncoder,
    CircleOfFifthsEncoding,
    ConvertNull,
    ArtistPopularityEncoder,
    FollowerCountEncoder,
    AlbumNameEncoder,
    GenreEncoder,
)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from dotenv import load_dotenv

load_dotenv()

Loading raw data

In [155]:
# Directory holding the raw CSV files (relative path, trailing slash included
# because the file names are appended by plain string concatenation).
DATA_DIR = "./Data/"
# File names of the track table and of the artist-details table.
TRACK_FILE = "spotify_tracks_kaggle_weekly.csv"
ARTIST_FILE = "spotify_tracks_artist_details.csv"

In [156]:
# Read both CSV files from DATA_DIR.
tracks = pd.read_csv(f"{DATA_DIR}{TRACK_FILE}")
artists = pd.read_csv(f"{DATA_DIR}{ARTIST_FILE}")

# Left join on `track_id`: every row of `tracks` is kept, and the artist
# columns are attached wherever a matching `track_id` exists.
data = tracks.merge(artists, how='left', on='track_id')

Data splitting -> working with predefined $X_{train}$

In [157]:
# Fraction of the rows held out as the test split.
TEST_SIZE = 0.1
# Seed passed to the train/test split and to the models below.
RANDOM_STATE = 21

In [158]:
# Separate the regression target (`popularity`) from the feature columns.
y = data['popularity']
X = data.drop(columns=['popularity'])

In [159]:
# Hold out TEST_SIZE of the rows for evaluation; the fixed seed makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

Dropping columns defined in EDA

In [160]:
# Columns that EDA marked for removal before modelling.
drop_columns = [
    'track_id',
    'artwork_url',
    'track_url',
    'track_name_x',
    'track_name_y',
    'artist_ids',
    'artist_names',
]

# `avg_artist_popularity` is no longer calculated by the scraper because
# etl.py adds it; data that has not been re-run may still contain the column,
# so it is dropped as well when present.
if 'avg_artist_popularity' in X_train.columns:
    drop_columns += ['avg_artist_popularity']

In [161]:
# Remove the same columns from both splits. errors='ignore' skips any listed
# name that is absent, so this cell can be re-run safely.
X_train, X_test = (
    frame.drop(columns=drop_columns, errors='ignore')
    for frame in (X_train, X_test)
)

Transformation pipeline

In [162]:
# Name of the regression target column.
target = 'popularity'

# One single-column list per dedicated encoder.
onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']
artist_popularity_col = ['artist_popularities']
follower_count_col = ['artist_followers']
artist_genres_col = ['artist_genres']

# Every non-object column counts as numeric, except the ones listed here.
# Index.difference returns the remaining names in sorted order.
_not_plain_numeric = ['key', 'mode', 'artist_popularities', 'artist_followers']
_is_not_object = X_train.dtypes != object
numeric_columns = list(X_train.columns[_is_not_object].difference(_not_plain_numeric))

# Columns handed to ConvertNull for null handling.
nan_columns = [
    'acousticness',
    'danceability',
    'energy',
    'liveness',
    'speechiness',
    'tempo',
    'valence',
    'artist_popularities',
    'artist_followers',
]

In [163]:
# Plain numeric features only need scaling.
numeric_pipeline = Pipeline(steps=[
    ('scaling', StandardScaler())
])

# Each of the following pipelines applies a custom encoder from etl.py and
# then standardises the encoded output.
artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('scaling', StandardScaler())
])

album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('scaling', StandardScaler())
])

artist_popularity_pipeline = Pipeline(steps=[
    ('encoding', ArtistPopularityEncoder()),
    ('scaling', StandardScaler())
])

artist_followers_pipeline = Pipeline(steps=[
    ('encoding', FollowerCountEncoder()),
    ('scaling', StandardScaler())
])

# NOTE(review): this pipeline is defined but not used below - the
# ColumnTransformer applies a bare GenreEncoder() without scaling. Confirm
# which of the two is intended before wiring it in.
artist_genres_pipeline = Pipeline(steps=[
    ('encoding', GenreEncoder()),
    ('scaling', StandardScaler())
])


# Route every column group to its encoder; columns that are not listed in any
# group are discarded (remainder='drop').
transformations = ColumnTransformer(transformers=[

    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as an all-zero row instead of raising. Without it, a `language` value
    # that only occurs in a validation fold or in the test split makes
    # transform() fail.
    ('onehot_encoding', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_col),
    ('trigonometric_encoding', CircleOfFifthsEncoding(), circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('artist_popularity_encoding', artist_popularity_pipeline, artist_popularity_col),
    ('follower_count_encoding', artist_followers_pipeline, follower_count_col),
    ('genres_encoding', GenreEncoder(), artist_genres_col),
    ('nummeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop')


# Full preprocessing: null handling on `nan_columns` first, then the
# column-wise transformations.
preprocessing = Pipeline(steps=[
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)
])

Transformation of data

In [None]:
# Fit the preprocessing pipeline on the training split and show the
# transformed feature matrix as a DataFrame.
X_train_transformed = preprocessing.fit_transform(X_train)
pd.DataFrame(X_train_transformed)

### Random Forest Modeling

In [165]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest on top of the preprocessing pipeline. The n_estimators value
# set here is overridden by the grid search below.
RF_model = Pipeline([
    ('preprocessing', preprocessing),
    ('model', RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)),
])

# Candidate values; the `model__` prefix addresses the 'model' pipeline step.
RF_param_grid = dict(
    model__n_estimators=[250, 300],
    model__max_depth=[None],
    model__min_samples_split=[8, 10],
    model__min_samples_leaf=[3, 5],
)

# 5-fold cross-validated search scored by negative MSE, using all CPU cores.
RF_grid_search = GridSearchCV(
    RF_model,
    RF_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
RF_grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the corresponding estimator.
RF_best_params = RF_grid_search.best_params_
RF_best_model = RF_grid_search.best_estimator_
print("Best Parameters:", RF_best_params)

In [None]:
# Evaluate the tuned random forest on the held-out test split.
y_pred = RF_best_model.predict(X_test)

# The MSE is computed once and reused for the RMSE.
rf_mse = mean_squared_error(y_test, y_pred)
print('MSE: ', rf_mse)
print('RMSE: ', np.sqrt(rf_mse))
print('R2: ', RF_best_model.score(X_test, y_test))

#### Hyperparameter Tuning Results
```python
RF_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [2, 4]
}
```
- best (n_estimators, max_depth, min_samples_split, min_samples_leaf): 200, None, 10, 4
    
```python
RF_param_grid = {
    'model__n_estimators': [200, 250],
    'model__max_depth': [None, 2],
    'model__min_samples_split': [10, 12],
    'model__min_samples_leaf': [4, 6]
}
```
- best (n_estimators, max_depth, min_samples_split, min_samples_leaf): 250, None, 10, 4
- R2: 0.5890150062574195

```python
RF_param_grid = {
    'model__n_estimators': [250, 300],
    'model__max_depth': [None],
    'model__min_samples_split': [8, 10],
    'model__min_samples_leaf': [3, 5]
}
```

### XGBoost Modeling

In [141]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor on top of the preprocessing pipeline. The hyperparameters
# are fixed to the best combination found by the last grid search recorded
# under "Hyperparameter Tuning Results" (127, 12, 0.05, 7).
XGB_model = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE,
                           n_estimators=127, max_depth=12, learning_rate=0.05, min_child_weight=7))
])

XGB_model.fit(X_train, y_train)

# The grid search used for tuning is kept below for reference. It is written
# as `#` comments instead of a bare triple-quoted string, because a string
# left as the last expression of a cell is echoed as the cell's output.
#
# XGB_param_grid = {
#     'model__n_estimators': [127, 130, 140],
#     'model__max_depth': [12],
#     'model__learning_rate': [0.04, 0.05],
#     'model__min_child_weight': [6, 7]
# }
#
# XGB_grid_search = GridSearchCV(
#     estimator=XGB_model,
#     param_grid=XGB_param_grid,
#     cv=5,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1
# )
#
# XGB_grid_search.fit(X_train, y_train)
#
# XGB_best_model = XGB_grid_search.best_estimator_
# XGB_best_params = XGB_grid_search.best_params_
# print("Best Parameters:", XGB_best_params)

In [None]:
# Evaluate the fitted XGBoost pipeline on the held-out test split.
y_pred = XGB_model.predict(X_test)

# The MSE is computed once and reused for the RMSE.
xgb_mse = mean_squared_error(y_test, y_pred)
print('MSE: ', xgb_mse)
print('RMSE: ', np.sqrt(xgb_mse))
print('R2: ', XGB_model.score(X_test, y_test))

#### Hyperparameter Tuning Results

```python
XGB_param_grid = {
    'model__n_estimators': [150, 200, 250],
    'model__max_depth': [6, 9, 12],
    'model__learning_rate': [0.05, 0.1],
    'model__min_child_weight': [1, 3]
}
```
- best (n_estimators, max_depth, learning_rate, min_child_weight): 150, 12, 0.05, 3
- R2: 0.5993564128875732

```python
XGB_param_grid = {
    'model__n_estimators': [100, 150, 175],
    'model__max_depth': [12, 15, 18],
    'model__learning_rate': [0.25, 0.75],
    'model__min_child_weight': [3, 5]
}
```
- best (n_estimators, max_depth, learning_rate, min_child_weight): 100, 12, 0.25, 5
- R2: 0.5655938982963562

```python
XGB_param_grid = {
    'model__n_estimators': [75, 100, 150],
    'model__max_depth': [9, 12, 15],
    'model__learning_rate': [0.03, 0.05],
    'model__min_child_weight': [5, 7]
}
```
- best (n_estimators, max_depth, learning_rate, min_child_weight): 150, 12, 0.05, 7
- R2: 0.6038635969161987

```python
XGB_param_grid = {
    'model__n_estimators': [125, 150, 175],
    'model__max_depth': [11, 12, 13],
    'model__learning_rate': [0.05, 0.07],
    'model__min_child_weight': [7, 9]
}
```
- best (n_estimators, max_depth, learning_rate, min_child_weight): 125, 12, 0.05, 7
- R2: 0.605049729347229

```python
XGB_param_grid = {
    'model__n_estimators': [120, 125, 130],
    'model__max_depth': [12],
    'model__learning_rate': [0.04, 0.06],
    'model__min_child_weight': [6, 8]
}
```
- best (n_estimators, max_depth, learning_rate, min_child_weight): 130, 12, 0.04, 6
- R2: 0.6025806069374084

```python
XGB_param_grid = {
    'model__n_estimators': [127, 130, 140],
    'model__max_depth': [12],
    'model__learning_rate': [0.04, 0.05],
    'model__min_child_weight': [6, 7]
}
```
- best (n_estimators, max_depth, learning_rate, min_child_weight): 127, 12, 0.05, 7
- R2: 0.6050568222999573