## Optimized modelling - linear models

The objective is to find the best available fit across the entire pipeline.

In [57]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

from etl import *

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

from dotenv import load_dotenv
load_dotenv()

False

In [58]:
# Input locations: a directory relative to the working directory and the two
# CSV files read from it.
DATA_DIR = "./Data/"
DATA_FILE = "spotify_tracks_kaggle_weekly.csv"
ARTIST_FILE = "spotify_tracks_artist_details.csv"

# Settings for the train/test split: held-out fraction and the seed.
TEST_SIZE = 0.1
RANDOM_STATE = 21

In [59]:
# Read both CSV files from DATA_DIR.
# os.path.join only inserts a separator when one is missing, so the paths stay
# valid whether or not DATA_DIR ends with a slash (plain `+` concatenation
# silently produced a wrong file name in the latter case).
data_tracks = pd.read_csv(os.path.join(DATA_DIR, DATA_FILE))
data_artist = pd.read_csv(os.path.join(DATA_DIR, ARTIST_FILE))

# Left join on track_id: every row of data_tracks is kept, and rows without a
# match in data_artist get NaN in the columns coming from data_artist.
data = pd.merge(data_tracks, data_artist, on='track_id', how='left')

In [60]:
# Separate the regression target from the predictor columns.
y = data['popularity']
X = data.drop(columns=['popularity'])

# Hold out a TEST_SIZE fraction of the rows; RANDOM_STATE seeds the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [61]:
# Columns removed from the feature frames before modelling.
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name']

# Apply the same removal to both splits; errors='ignore' skips any listed
# column that is not present in the frame.
X_train, X_test = (
    frame.drop(columns=drop_columns, errors='ignore')
    for frame in (X_train, X_test)
)

In [62]:
# Name of the regression target column.
target = 'popularity'

# Columns that get a dedicated encoder in the ColumnTransformer.
onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']

# Numeric feature columns, excluding 'key' and 'mode', in sorted order.
# The check uses is_numeric_dtype rather than `dtype != object`: text columns
# are not guaranteed to be stored as object (pandas string dtypes, categoricals,
# datetimes are all "not object"), and such columns must not reach the numeric
# imputation / polynomial / scaling steps.
_column_dtypes = X_train.dtypes
numeric_columns = [
    column
    for column in X_train.columns.difference(['key', 'mode'])
    if pd.api.types.is_numeric_dtype(_column_dtypes[column])
]

# Columns passed to ConvertNull at the start of the preprocessing pipeline.
nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

In [63]:
# Numeric features: impute missing values, expand to polynomial terms, then
# standardize. The imputation strategy and polynomial degree are tuned by the
# grid search through the 'imputation' / 'polynomials' step names.
numeric_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer()),
    ('polynomials', PolynomialFeatures()),
    ('scaling', StandardScaler())
])

# Artist name: FrequencyEncoder output is imputed and standardized.
artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('imputation', SimpleImputer()),
    ('scaling', StandardScaler())
])

# Album name: AlbumNameEncoder output is imputed (no scaling step).
album_name_pipeline = Pipeline(steps=[
    ('encoding', AlbumNameEncoder()),
    ('imputation', SimpleImputer())
])

# Musical key: CircleOfFifthsEncoding output is imputed (no scaling step).
circle_of_fifths_pipeline = Pipeline(steps=[
    ('encoding', CircleOfFifthsEncoding()),
    ('imputation', SimpleImputer())
])


# Route each column group to its encoder; columns not listed in any group are
# dropped (remainder='drop'). Output is kept as a pandas DataFrame.
transformations = ColumnTransformer(transformers=[

    # handle_unknown='ignore': the encoder is re-fitted on every cross-validation
    # training fold, so a category that only occurs in the validation fold (or in
    # the test set) would otherwise raise during transform. Unknown categories
    # are encoded as an all-zero row instead.
    ('onehot_encoding', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_col),
    ('trigonometric_encoding', circle_of_fifths_pipeline, circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('numeric_processing', numeric_pipeline, numeric_columns)

], remainder='drop').set_output(transform='pandas')


# Full preprocessing: ConvertNull runs on nan_columns first, then the
# column-wise transformations above are applied.
preprocessing = Pipeline(steps=[

    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations)

]).set_output(transform='pandas')

### Configurations

In [64]:
# Preprocessing hyper-parameters searched together with every model.
# Each key addresses a nested step parameter using "__" as the separator.
_imputation_strategies = ('mean', 'median', 'most_frequent')

pipeline_config = {
    # numeric features: imputation strategy and polynomial degree (1 to 5)
    'preprocessing__transformation__numeric_processing__imputation__strategy': list(_imputation_strategies),
    'preprocessing__transformation__numeric_processing__polynomials__degree': list(range(1, 6)),
    # encoded artist / album names: imputation strategy only
    'preprocessing__transformation__artist_encoding__imputation__strategy': list(_imputation_strategies),
    'preprocessing__transformation__album_encoding__imputation__strategy': list(_imputation_strategies),
}

Model configurations

In [65]:
# One grid per candidate estimator. Each grid sets the pipeline's 'model' step;
# Ridge and Lasso additionally search the same list of alpha values.
_alpha_values = (0.01, 0.1, 1, 10, 100, 1000)

param_grid = [
    {'model': [LinearRegression()]},
    {'model': [Ridge()], 'model__alpha': list(_alpha_values)},
    {'model': [Lasso()], 'model__alpha': list(_alpha_values)},
]

Merge the preprocessing configuration into each model configuration.

In [66]:
# Extend every model grid with the preprocessing grid; if a key occurs in
# both, the value from pipeline_config takes precedence.
param_grid = [{**config, **pipeline_config} for config in param_grid]

#### Fitting

In [67]:
# Number of cross-validation folds.
k_fold = 5

# Preprocessing followed by an estimator. The LinearRegression instance is only
# a placeholder: every entry of param_grid sets the 'model' step itself.
model_steps = [
    ('preprocessing', preprocessing),
    ('model', LinearRegression()),
]
model_pipeline = Pipeline(steps=model_steps)

In [2]:
# Grid search over param_grid with k_fold-fold cross-validation, scored by
# negative mean squared error, using two parallel jobs.
gscv = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=k_fold,
    n_jobs=2,
)

NameError: name 'GridSearchCV' is not defined

In [1]:
#gscv.fit(X_train, y_train)