## Automated Modelling Schema

This modelling schema includes full parametrization of the transformations, the objective is to achieve encapsulated gri-search viable pipeline  
to improve optimization processes and efficiency. Pipeline model allows us to export parameters of the entire pipeline instead of just model weights.

Includes:
- Imputation strategies (ETL)
- New columns processing (ETL)

In [4]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

from etl import *

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from dotenv import load_dotenv
load_dotenv()

True

loading raw data

In [5]:
DATA_DIR = './Data/'
DATA_FILE = 'spotify_tracks_kaggle_weekly.csv'
ARTIST_FILE = 'spotify_tracks_artist_details.csv'

In [6]:
data_tracks = pd.read_csv(DATA_DIR + DATA_FILE)
data_artist = pd.read_csv(DATA_DIR + ARTIST_FILE)

Merge

In [7]:
data = pd.merge(data_tracks, data_artist, on='track_id', how='left')

Data splitting -> working with predefined $X_{train}$

In [8]:
RANDOM_STATE = 21
TEST_SIZE = 0.1

In [9]:
X = data.drop('popularity', axis=1)
y = data['popularity']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

Dropping columns defined in EDA

In [11]:
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name']

In [12]:
X_train = X_train.drop(drop_columns, axis=1, errors='ignore')
X_test = X_test.drop(drop_columns, axis=1, errors='ignore')

transformation pipeline

In [13]:
target = 'popularity'

onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']
genre_col = ['artist_genres']
follower_col = ['artist_followers']
artist_popularity_col = ['artist_popularities']


numeric_columns = list(X_train.columns[X_train.dtypes != object].difference(['key', 'mode']))

nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

In [14]:
numeric_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer()),
    ('polynomials', PolynomialFeatures()),
    ('scaling', StandardScaler())
])

artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('imputation', SimpleImputer()),
    ('scaling', StandardScaler())
])

album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('scaling', StandardScaler())
])

circle_of_fifths_pipeline = Pipeline(steps=[
    ('encoding', CircleOfFifthsEncoding()),
    ('imputation', SimpleImputer())
])

genre_pipeline = Pipeline(steps=[
    ('encoding', GenreEncoder()),
    ('scaling', StandardScaler())
])

followers_pipeline = Pipeline(steps=[
    ('encoding', FollowerCountEncoder()),
    ('scaling', StandardScaler())
])

artist_popularity_pipeline = Pipeline(steps=[
    ('encoding', ArtistPopularityEncoder()),
    ('scaling', StandardScaler())
])


transformations = ColumnTransformer(transformers=[
    
    ('onehot_encoding', OneHotEncoder(sparse_output=False), onehot_col),
    ('trigonometric_encoding', circle_of_fifths_pipeline, circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('follower_encoding', followers_pipeline, follower_col),
    ('genres_encoding', genre_pipeline, genre_col),
    ('artist_popularity_encoding', artist_popularity_pipeline, artist_popularity_col),
    ('numeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop').set_output(transform='pandas')


preprocessing = Pipeline(steps=[
    
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)

]).set_output(transform='pandas')

In [15]:
preprocessing

### Model configuration

- select model
- provide appropriate hyperparameter selection
- autoamted optimization
- export best fit

In [16]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=RANDOM_STATE)

In [17]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

In [18]:
# run model pipeline without hyperparameter optimization
results = model_pipeline.fit(X_train, y_train)
y_pred = results.predict(X_test)

NameError: name 'mean_squared_error' is not defined

In [22]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {np.sqrt(mse)}')
print(f'R2: {model_pipeline.score(X_test, y_test)}')

MSE: 158.86820180463812
RMSE: 12.604292991066105


R2: 0.5545213155753408


optimization - hyperparameter tuning

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [41]:
n_iter_search = 50
pipeline_config_subset_2 = {

    # attribute calculation strategies

    'preprocessing__transformation__artist_encoding__encoding__strategy' : ['max', 'avg'],
    'preprocessing__transformation__follower_encoding__encoding__strategy' : ['max', 'avg'],
    'preprocessing__transformation__artist_popularity_encoding__encoding__strategy' : ['max', 'avg', 'both'],
    'preprocessing__transformation__genres_encoding__encoding__strategy' : ['max', 'avg', 'sum'],

    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2]
}
param_grid = {
    'model__n_estimators': [350],
    'model__max_depth': [None],
    'model__min_samples_split': [10],
    'model__min_samples_leaf': [3]
}
k_fold = 3

In [42]:
results = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_grid | pipeline_config_subset_2,
    cv=k_fold,
    n_iter=n_iter_search,
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE)

In [43]:
import os
print(f"Kernel PID: {os.getpid()}")

Kernel PID: 45484


In [44]:
# ran for 14 hours, could not complete
# ran for 47 minutes with less parameters and logging, only logged the initial info
# not sure what the issue was, abndoning this method
results.fit(X_train, y_train)   

Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

In [None]:
y_pred = results.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')
print(f'RMSE: {np.sqrt(mse)}')
print(f'R2: {results.score(X_test, y_test)}')
print(f'Best Parameters: {results.best_params_}')