## Optimized modelling - linear models

The objective is to find the best avaible fit across the entire pipeline.

In [1]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

from etl import *

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

from dotenv import load_dotenv
load_dotenv()

False

In [2]:
DATA_DIR = './Data/'
DATA_FILE = 'spotify_tracks_kaggle_weekly.csv'
ARTIST_FILE = 'spotify_tracks_artist_details.csv'

RANDOM_STATE = 21
TEST_SIZE = 0.1

In [3]:
data_tracks = pd.read_csv(DATA_DIR + DATA_FILE)
data_artist = pd.read_csv(DATA_DIR + ARTIST_FILE)

data = pd.merge(data_tracks, data_artist, on='track_id', how='left')

In [4]:
X = data.drop('popularity', axis=1)
y = data['popularity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [5]:
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name']

X_train = X_train.drop(drop_columns, axis=1, errors='ignore')
X_test = X_test.drop(drop_columns, axis=1, errors='ignore')

In [6]:
target = 'popularity'

onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']

numeric_columns = list(X_train.columns[X_train.dtypes != object].difference(['key', 'mode']))

nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

In [7]:
numeric_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer()),
    ('polynomials', PolynomialFeatures()),
    ('scaling', StandardScaler())
])

artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('imputation', SimpleImputer()),
    ('scaling', StandardScaler())
])

album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('imputation', SimpleImputer())
])

circle_of_fifths_pipeline = Pipeline(steps=[
    ('encoding', CircleOfFifthsEncoding()),
    ('imputation', SimpleImputer())
])


transformations = ColumnTransformer(transformers=[
    
    ('onehot_encoding', OneHotEncoder(sparse_output=False), onehot_col),
    ('trigonometric_encoding', circle_of_fifths_pipeline, circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('numeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop').set_output(transform='pandas')


preprocessing = Pipeline(steps=[
    
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)

]).set_output(transform='pandas')

### Configurations

In [23]:
pipeline_config = {

    'preprocessing__transformation__numeric_processing__imputation__strategy' : ['mean', 'median', 'most_frequent'],
    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2, 3],

    'preprocessing__transformation__artist_encoding__imputation__strategy' : ['mean', 'median', 'most_frequent'],
    'preprocessing__transformation__album_encoding__imputation__strategy' : ['mean', 'median', 'most_frequent'],

}

pipeline_config_subset = {
    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2, 3]
}

model configs

In [9]:
param_grid = [
    {
        'model' : [LinearRegression()]
    },

    {
        'model' : [Ridge()],
        'model__alpha': [0.01, 0.1, 1, 10, 100, 1000]
    },

    {
        'model' : [Lasso()],
        'model__alpha': [0.01, 0.1, 1, 10, 100, 1000]
    }
]

adding pipeline config

In [10]:
selected_pipeline_config = pipeline_config_subset

In [11]:
param_grid = [config | selected_pipeline_config for config in param_grid]

#### Fitting

In [12]:
k_fold = 5

model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', LinearRegression())                   # NOTE: placeholder
])

determining number of fits

In [13]:
fit_count = 1

for config in param_grid:
    
    for hyperparams in config.values():
        
        fit_count *= len(hyperparams)

fit_count = fit_count * k_fold

In [14]:
print(f"Total number of fitted models based on provided param_grid = {fit_count} models.")

Total number of fitted models based on provided param_grid = 11520 models.


In [15]:
gscv = GridSearchCV(model_pipeline, param_grid, cv=k_fold, scoring='neg_mean_squared_error', n_jobs=2)

In [16]:
gscv.fit(X_train, y_train)

In [17]:
cv_results = pd.DataFrame(gscv.cv_results_).sort_values(by='rank_test_score')

In [19]:
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocessing__transformation__numeric_processing__polynomials__degree,param_model__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,1.151512,0.152694,0.118829,0.040881,LinearRegression(),2,,"{'model': LinearRegression(), 'preprocessing__...",-255.281684,-259.039169,-254.014619,-247.848866,-250.569908,-253.350849,3.860859,1
5,1.130567,0.041799,0.235298,0.012971,Ridge(),2,0.01,"{'model': Ridge(), 'model__alpha': 0.01, 'prep...",-255.433532,-259.109425,-254.175132,-247.942963,-250.382191,-253.408649,3.904348,2
22,1.6458,0.052266,0.206863,0.009917,Ridge(),3,100.0,"{'model': Ridge(), 'model__alpha': 100, 'prepr...",-256.232844,-259.877443,-254.753442,-247.974381,-250.493485,-253.866319,4.209798,3
9,0.858775,0.359657,0.131727,0.066966,Ridge(),2,0.1,"{'model': Ridge(), 'model__alpha': 0.1, 'prepr...",-256.249333,-259.550417,-254.56618,-248.258813,-250.788123,-253.882573,3.984862,4
30,63.188108,0.74766,0.249044,0.104145,Lasso(),3,0.01,"{'model': Lasso(), 'model__alpha': 0.01, 'prep...",-257.511869,-261.140928,-255.519984,-248.952573,-251.059776,-254.837026,4.387592,5


selection of best models per model type

In [20]:
best_models = cv_results.drop_duplicates(subset='param_model')

In [21]:
best_models

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocessing__transformation__numeric_processing__polynomials__degree,param_model__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,1.151512,0.152694,0.118829,0.040881,LinearRegression(),2,,"{'model': LinearRegression(), 'preprocessing__...",-255.281684,-259.039169,-254.014619,-247.848866,-250.569908,-253.350849,3.860859,1
5,1.130567,0.041799,0.235298,0.012971,Ridge(),2,0.01,"{'model': Ridge(), 'model__alpha': 0.01, 'prep...",-255.433532,-259.109425,-254.175132,-247.942963,-250.382191,-253.408649,3.904348,2
30,63.188108,0.74766,0.249044,0.104145,Lasso(),3,0.01,"{'model': Lasso(), 'model__alpha': 0.01, 'prep...",-257.511869,-261.140928,-255.519984,-248.952573,-251.059776,-254.837026,4.387592,5


Testing against test data

In [22]:
best_model = gscv.best_estimator_

y_pred = best_model.predict(X_test)

r2 = best_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)


print(f"Best Model: {best_model['model']}\nconfiguration: {gscv.best_params_}")
print(f"...")
print(f"Best fit scores:")
print(f"Train R^2 = {best_model.score(X_train, y_train):.5f}")
print(f"Test R^2 = {r2:.5f}")
print(f"MSE = {mse:.5f}")
print(f"RMSE = {rmse:.5f}")

Best Model: LinearRegression()
configuration: {'model': LinearRegression(), 'preprocessing__transformation__numeric_processing__polynomials__degree': 2}
...
Best fit scores:
Train R^2 = 0.27227
Test R^2 = 0.26790
MSE = 261.08458
RMSE = 16.15811


Saving best model

In [25]:
import joblib
import pickle

In [26]:
DIR = '../../Prod/Models/Linear/'
FILE = 'best_linear_model.joblib'

In [29]:
#joblib.dump(best_model, DIR + FILE)