## Benchmark modelling - Linear models

The purpose of this file is to create model benchmarks which we will compare against the final optimized fits. This ensures we can  
check whether the optimization was successful and significant, and track changes in attribute weights (e.g. regression analysis).

The second part of the file performs regression analysis using the benchmark models to determine which attributes contribute most  
to the target attribute, popularity.

In [1]:
import os
import sys

sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

False

In [2]:
from etl import *

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

loading data

In [3]:
# Directory holding the input CSV files (relative to the notebook's working directory).
DATA_DIR = './Data/'
# Track-level table; read into `data_tracks` below.
DATA_FILE = 'spotify_tracks_kaggle_weekly.csv'
# Artist-detail table; joined to the tracks on 'track_id' below.
ARTIST_FILE = 'spotify_tracks_artist_details.csv'

# Seed passed to train_test_split so the train/test split is reproducible.
RANDOM_STATE = 21
# Fraction of rows held out as the test set.
TEST_SIZE = 0.1

In [4]:
# Read the track table and the artist-detail table from DATA_DIR.
data_tracks, data_artist = (
    pd.read_csv(f"{DATA_DIR}{file_name}") for file_name in (DATA_FILE, ARTIST_FILE)
)

# Left join: every track row is kept; artist columns are attached where 'track_id' matches.
data = data_tracks.merge(data_artist, how='left', on='track_id')

benchmark

In [5]:
# Discard every row containing at least one missing value; `data` is rebound to the filtered frame.
data = data.dropna()

In [6]:
# Target vector first, then the feature frame (every column except the target).
y = data['popularity']
X = data.drop(columns='popularity')

# Hold out TEST_SIZE of the rows; RANDOM_STATE fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [7]:
# Columns removed from both feature frames before any model is fitted.
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name']

# errors='ignore' skips any listed column that is not present in a frame.
X_train, X_test = (
    frame.drop(columns=drop_columns, errors='ignore') for frame in (X_train, X_test)
)

### Testing scenario

We will compare the transformation pipeline results against the basic dataset form, to determine the utility of the transformations.  
It is important to state that the overall benefit of transformation pipeline could be present after the optimization - that means the verdict of the effectiveness  
is valid only for the benchmark linear fits.

#### 1. Original data - rows with nulls are dropped (not imputed), original encoding, not scaled

In [8]:
# Benchmark estimators with default hyper-parameters, keyed by the name used in the printed reports.
models_1 = dict(
    linear_regression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
)

In [9]:
# Columns excluded from the raw-data benchmark; whatever remains in the
# frames after this drop is what the models in this section are fitted on.
drop_columns = [
    'track_id', 'artwork_url', 'track_url',
    'track_name', 'album_name', 'artist_name',
    'language', 'track_name_x', 'track_name_y',
    'artist_ids', 'artist_names', 'artist_popularities',
    'artist_genres', 'artist_followers',
]

In [10]:
# Remove the listed columns from both splits; names missing from a frame are skipped.
X_train, X_test = (
    split.drop(columns=drop_columns, errors='ignore') for split in (X_train, X_test)
)

In [11]:
# Fit each benchmark estimator on the raw training split and report its in-sample R^2.
for model_name, model in models_1.items():
    model.fit(X_train, y_train)
    train_r2 = model.score(X_train, y_train)
    print(f"fitting {model_name:<20} ... train R^2 = {train_r2:.5f}")

fitting linear_regression    ... train R^2 = 0.11610
fitting Ridge                ... train R^2 = 0.11610
fitting Lasso                ... train R^2 = 0.08303


prediction

In [12]:
# Score each fitted benchmark estimator on the held-out split.
for model_name, model in models_1.items():
    y_pred = model.predict(X_test)
    test_r2 = model.score(X_test, y_test)
    mse, rmse = mean_squared_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred)

    print(f"Model: {model_name:<20} ... test R^2 = {test_r2:.5f} | MSE: {mse:.5f} | RMSE: {rmse:.5f}")

Model: linear_regression    ... test R^2 = 0.10258 | MSE: 308.37629 | RMSE: 17.56065
Model: Ridge                ... test R^2 = 0.10259 | MSE: 308.37475 | RMSE: 17.56060
Model: Lasso                ... test R^2 = 0.07485 | MSE: 317.90659 | RMSE: 17.82994


In [13]:
import joblib

In [14]:
# Output directory and file name for the saved benchmark Lasso model.
# Only referenced by the commented-out joblib.dump call further down.
DIR = '../../Prod/Models/Linear/'
FILE = 'linear-benchmark-lasso-1.joblib'

In [19]:
#X_train.columns.to_frame().reset_index(drop=True).to_csv('../../Prod/Models/Linear/linear-benchmark-columns.csv')

In [18]:
#joblib.dump(models_1['Lasso'], DIR + FILE)

#### 2. Transformed data - includes custom encoding, scaling, ...

In [17]:
# Re-read both CSV files so this section starts again from the unfiltered tables.
data_tracks, data_artist = (
    pd.read_csv(f"{DATA_DIR}{file_name}") for file_name in (DATA_FILE, ARTIST_FILE)
)

# Left join on 'track_id': all track rows are kept.
data = data_tracks.merge(data_artist, how='left', on='track_id')

In [20]:
# Rebuild the features and target from the `data` frame loaded in the previous
# cell, instead of silently reusing the `X` / `y` left over from section 1.
# The same null-row filter as in section 1 is applied, so both sections are
# evaluated on the same rows and (with the same seed) the same split.
# NOTE(review): if rows with nulls are meant to go through the imputers of the
# transformation pipeline instead, remove the dropna() here - confirm intent.
data_complete = data.dropna()

X = data_complete.drop('popularity', axis=1)
y = data_complete['popularity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Only identifiers, URLs and the duplicated track-name columns are removed here;
# the remaining text columns are kept because the pipeline below encodes them.
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name_x', 'track_name_y']

X_train = X_train.drop(drop_columns, axis=1, errors='ignore')
X_test = X_test.drop(drop_columns, axis=1, errors='ignore')

In [21]:
# Name of the column being predicted.
target = 'popularity'

# Single-column groups; each one is routed to its own encoder in the
# ColumnTransformer built in the next cell.
onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']
album_name_col = ['album_name']
genre_col = ['artist_genres']
follower_col = ['artist_followers']
artist_popularity_col = ['artist_popularities']


# Every non-object column of X_train except 'key' and 'mode'.
# 'key' is handled separately through circle_of_fifths_col.
# NOTE(review): 'mode' is not part of any column group in this cell, so with
# remainder='drop' in the ColumnTransformer it appears to be discarded - confirm this is intended.
numeric_columns = list(X_train.columns[X_train.dtypes != object].difference(['key', 'mode']))

# Columns handed to ConvertNull(columns=...) as the first preprocessing step.
nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

In [22]:
def _encode_and_scale(encoder):
    """Return a two-step pipeline: `encoder` as step 'encoding', then StandardScaler as step 'scaling'."""
    return Pipeline(steps=[('encoding', encoder), ('scaling', StandardScaler())])


# Numeric columns: impute -> polynomial expansion -> standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer()),
    ('polynomials', PolynomialFeatures()),
    ('scaling', StandardScaler()),
])

# Artist name: encode, impute whatever the encoder left missing, then standardise.
artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('imputation', SimpleImputer()),
    ('scaling', StandardScaler()),
])

# Musical key: encode and impute; no scaling step.
circle_of_fifths_pipeline = Pipeline(steps=[
    ('encoding', CircleOfFifthsEncoding()),
    ('imputation', SimpleImputer()),
])

# The remaining column groups share the same "encode, then standardise" shape.
album_name_pipeline = _encode_and_scale(AlbumNameEncoder())
genre_pipeline = _encode_and_scale(GenreEncoder())
followers_pipeline = _encode_and_scale(FollowerCountEncoder())
artist_popularity_pipeline = _encode_and_scale(ArtistPopularityEncoder())


# (transformer name, transformer, columns). The names become the prefixes of the
# output columns and are part of the hyper-parameter paths used by the searches below.
column_routes = [
    ('onehot_encoding', OneHotEncoder(sparse_output=False), onehot_col),
    ('trigonometric_encoding', circle_of_fifths_pipeline, circle_of_fifths_col),
    ('artist_encoding', artist_name_pipeline, artist_name_col),
    ('album_encoding', album_name_pipeline, album_name_col),
    ('follower_encoding', followers_pipeline, follower_col),
    ('genres_encoding', genre_pipeline, genre_col),
    ('artist_popularity_encoding', artist_popularity_pipeline, artist_popularity_col),
    ('numeric_processing', numeric_pipeline, numeric_columns),
]

# Columns not named in any route are dropped; output is a pandas DataFrame.
transformations = ColumnTransformer(
    transformers=column_routes,
    remainder='drop',
).set_output(transform='pandas')


# Complete preprocessing: null conversion on nan_columns first, then the column-wise transformations.
preprocessing = Pipeline(steps=[
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations),
]).set_output(transform='pandas')

In [23]:
# Sanity check: evaluates to True if any column produced by the preprocessing pipeline still has object dtype.
(preprocessing.fit_transform(X_train).dtypes == object).any()

False

In [24]:
# Fit the preprocessing pipeline on the training split only.
preprocessing.fit(X_train, y_train)

In [25]:
# Apply the already-fitted preprocessing to both splits (the test split is only transformed, never fitted on).
X_train_transformed = preprocessing.transform(X_train)
X_test_transformed = preprocessing.transform(X_test)

In [26]:
# Fresh default-parameter estimators for the transformed-data benchmark, keyed by report name.
models_2 = dict(
    linear_regression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
)

In [27]:
# Fit every benchmark estimator on the transformed training split and report its in-sample R^2.
for model_name, model in models_2.items():
    model.fit(X_train_transformed, y_train)
    train_r2 = model.score(X_train_transformed, y_train)
    print(f"fitting {model_name:<20} ... train R^2 = {train_r2:.5f}")

# Two blank lines between the training report and the test report.
print()
print()

# Score every fitted estimator on the transformed held-out split.
for model_name, model in models_2.items():
    y_pred = model.predict(X_test_transformed)
    test_r2 = model.score(X_test_transformed, y_test)
    mse, rmse = mean_squared_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred)

    print(f"Model: {model_name:<20} ... test R^2 = {test_r2:.5f} | MSE: {mse:.5f} | RMSE: {rmse:.5f}")

fitting linear_regression    ... train R^2 = 0.28612
fitting Ridge                ... train R^2 = 0.27459
fitting Lasso                ... train R^2 = 0.16885


Model: linear_regression    ... test R^2 = 0.26178 | MSE: 253.67128 | RMSE: 15.92706
Model: Ridge                ... test R^2 = 0.25034 | MSE: 257.60392 | RMSE: 16.05004
Model: Lasso                ... test R^2 = 0.15985 | MSE: 288.69616 | RMSE: 16.99106


#### Optimization

In [103]:
# Estimator handed to the hyper-parameter searches: the preprocessing pipeline followed
# by a 'model' step. The search grids below replace this step through the 'model' parameter.
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', LinearRegression())
])

In [36]:
# Preprocessing hyper-parameters that are merged into every model grid below.
# Only the polynomial degree is active; the encoder strategy options are kept
# commented out and are therefore not part of the search.
pipeline_config_subset_2 = {

    # attribute calculation strategies

    #'preprocessing__transformation__artist_encoding__encoding__strategy' : ['max', 'avg'],
    #'preprocessing__transformation__follower_encoding__encoding__strategy' : ['max', 'avg'],
    #'preprocessing__transformation__artist_popularity_encoding__encoding__strategy' : ['max', 'avg', 'both'],
    #'preprocessing__transformation__genres_encoding__encoding__strategy' : ['max', 'avg', 'sum'],

    # degree of the PolynomialFeatures step inside the numeric pipeline
    'preprocessing__transformation__numeric_processing__polynomials__degree' : [1, 2]
}

In [37]:
# Regularisation strengths shared by Ridge and Lasso: 100 log-spaced values from 1e-4 to 1e4.
alpha_grid = np.logspace(-4, 4, 100)

# One sub-grid per model family; plain linear regression has nothing to tune.
param_grid = [
    {'model' : [LinearRegression()]},
    {'model' : [Ridge()], 'model__alpha' : alpha_grid},
    {'model' : [Lasso()], 'model__alpha' : alpha_grid},
]

In [38]:
# Add the preprocessing options to every model sub-grid (on a key clash the preprocessing value wins).
param_grid = [{**grid, **pipeline_config_subset_2} for grid in param_grid]

random search

In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [40]:
# Number of cross-validation folds intended for the hyper-parameter search.
k_fold = 5

In [41]:
# Randomised search over the model / preprocessing grids.
# - cv=k_fold actually uses the fold count defined above (previously the default
#   was relied on and k_fold was unused).
# - random_state=RANDOM_STATE makes the 100 sampled candidates reproducible, so the
#   "best" parameters copied into later cells can be regenerated.
rscv = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_grid,
    n_iter=100,
    cv=k_fold,
    random_state=RANDOM_STATE,
    n_jobs=8,
    verbose=2,
)

In [42]:
# Run the search on the untransformed training split; preprocessing happens inside model_pipeline.
rscv.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [43]:
# All evaluated candidates as a table, best-ranked first.
optimization_results = pd.DataFrame(rscv.cv_results_).sort_values('rank_test_score')

In [44]:
# Rows are sorted by rank, so keeping the first occurrence of each 'param_model'
# value leaves the best-ranked candidate per model.
best_models = optimization_results.drop_duplicates(subset='param_model')

In [65]:
from sklearn.model_selection import GridSearchCV

In [73]:
# Show the winning parameter set for each model family.
best_models['params'].values

array([{'preprocessing__transformation__numeric_processing__polynomials__degree': 2, 'model__alpha': 0.0019630406500402726, 'model': Ridge()},
       {'preprocessing__transformation__numeric_processing__polynomials__degree': 2, 'model': LinearRegression()},
       {'preprocessing__transformation__numeric_processing__polynomials__degree': 2, 'model__alpha': 0.0003053855508833416, 'model': Lasso()}],
      dtype=object)

In [90]:
# Parameter path of the polynomial degree inside the nested pipeline.
degree_key = 'preprocessing__transformation__numeric_processing__polynomials__degree'

# Single-candidate grid searches that refit each model family with the parameters
# reported by the random search (values copied from its printed output).
optimized_lr = GridSearchCV(
    model_pipeline,
    param_grid={degree_key: [2], 'model': [LinearRegression()]},
    cv=5,
    n_jobs=5,
    verbose=3,
)
optimized_ridge = GridSearchCV(
    model_pipeline,
    param_grid={degree_key: [2], 'model__alpha': [0.0019630406500402726], 'model': [Ridge()]},
    cv=5,
)
optimized_lasso = GridSearchCV(
    model_pipeline,
    param_grid={degree_key: [2], 'model__alpha': [0.0003053855508833416], 'model': [Lasso()]},
    cv=5,
    n_jobs=10,
)

In [105]:
# Refit the three selected configurations on the training split.
optimized_lr.fit(X_train, y_train)
optimized_ridge.fit(X_train, y_train)
optimized_lasso.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  model = cd_fast.enet_coordinate_descent(


In [97]:
# Shape of the coefficient vector of each refitted model, printed in the order ridge, lr, lasso.
for search in (optimized_ridge, optimized_lr, optimized_lasso):
    print(search.best_estimator_.named_steps['model'].coef_.shape)

(135,)
(135,)
(135,)


0.26178097968480785

In [122]:
# Refitted searches keyed by the row label used in the results table.
fitted_searches = {
    'Linear_regression' : optimized_lr,
    'Ridge' : optimized_ridge,
    'Lasso' : optimized_lasso,
}

# Per model: [score on the training split, score on the test split, test MSE].
opt_data = {
    label : [
        search.score(X_train, y_train),
        search.score(X_test, y_test),
        mean_squared_error(y_test, search.predict(X_test)),
    ]
    for label, search in fitted_searches.items()
}

In [123]:
# One row per model, one named column per metric.
opt_results = pd.DataFrame.from_dict(
    opt_data,
    orient='index',
    columns=['Train R2', 'Test R2', 'MSE'],
)

In [124]:
# Persist the comparison table next to the saved models (same directory as DIR above).
opt_results.to_csv('../../Prod/Models/Linear/optimization-results.csv')

In [102]:
#joblib.dump(optimized_lr.best_estimator_, '../../Prod/Models/Linear/linear-optimized-lr.joblib')
#joblib.dump(optimized_ridge.best_estimator_, '../../Prod/Models/Linear/linear-optimized-ridge.joblib')
#joblib.dump(optimized_lasso.best_estimator_, '../../Prod/Models/Linear/linear-optimized-lasso.joblib')

In [52]:
# File-name stems, listed in the order the models appear in `best_models`
# for the recorded run (Ridge, LinearRegression, Lasso).
# NOTE(review): the order is hard-coded; if the ranking changes on a re-run the
# stems would no longer line up with the rows - confirm before re-enabling the dump.
files = ['ridge', 'lr', 'lasso']

prefix = 'linear-optimized-'

# With the dump commented out this loop has no effect beyond rebinding `selected`.
for i, (idx, row) in enumerate(best_models.iterrows()):
    
    # NOTE(review): 'param_model' holds the estimator object from the search grid,
    # presumably unfitted - verify before persisting it.
    selected = row['param_model']
    
    #joblib.dump(selected, f'../../Prod/Models/Linear/{prefix}{files[i]}-1.joblib')

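Verdict: The transformation pipeline more than doubled the quality of the benchmark fit - for linear regression the test R^2 rose from roughly 0.10 (original data) to roughly 0.26 (transformed data).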

### Regression analysis - interpretation of feature importance

In [103]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

In [104]:
# Names of the features the benchmark models were fitted on, in coefficient order.
columns = preprocessing.transform(X_train).columns

# (feature name, coefficient) pairs for each section-2 benchmark model.
weights = {
    name : list(zip(columns, fitted.coef_))
    for name, fitted in models_2.items()
}

# Same information as one DataFrame per model, with columns 'attribute' and 'weight'.
results = {
    name : pd.DataFrame(pairs, columns=['attribute', 'weight'])
    for name, pairs in weights.items()
}

In [1]:
# One figure row per benchmark model: every weight on the left, the n largest and
# n smallest weights on the right.
n_extremes = 10
model_names = list(models_2.keys())

# squeeze=False keeps `axs` two-dimensional whatever the number of models.
fig, axs = plt.subplots(
    len(model_names), 2,
    figsize=(19, 18),
    squeeze=False,
    gridspec_kw={'width_ratios' : [1, 2], 'wspace' : 0.5},
)

for name, (ax_all, ax_extremes) in zip(model_names, axs):

    ax_all.set_title(f"{name} - parameter weights")
    ax_extremes.set_title(f"{name} - best & worst parameters ({n_extremes})")

    sns.barplot(data=results[name], x='weight', y='attribute', ax=ax_all, color='tab:blue')

    weights_sorted = results[name].sort_values(by='weight', ascending=False)

    # Kept under its own name so the `target` column-name variable defined with
    # the column groups is not overwritten by a DataFrame.
    extremes = pd.concat([weights_sorted.head(n_extremes), weights_sorted.tail(n_extremes)])

    # Strip the "<transformer>__" prefix so the labels show only the feature name.
    extremes['attribute'] = extremes['attribute'].str.split('__').str[1]

    sns.barplot(data=extremes, x='weight', y='attribute', ax=ax_extremes, color='tab:blue')

    # The full-weight panel has too many attributes for readable tick labels.
    ax_all.set_yticks([])

NameError: name 'plt' is not defined

Best model results

In [107]:
results['linear_regression']

Unnamed: 0,attribute,weight
0,onehot_encoding__language_English,6.306953e+11
1,onehot_encoding__language_Hindi,6.306953e+11
2,onehot_encoding__language_Korean,6.306953e+11
3,onehot_encoding__language_Malayalam,6.306953e+11
4,onehot_encoding__language_Tamil,6.306953e+11
...,...,...
130,numeric_processing__time_signature valence,-6.029735e-01
131,numeric_processing__time_signature year,-6.012339e+01
132,numeric_processing__valence^2,-1.084747e+00
133,numeric_processing__valence year,-6.191901e+01


In [4]:
# Weight overview for a single benchmark model: every weight on the left, the
# n_params largest and smallest on the right, with the benchmark statistics as legend.
model_name = 'linear_regression'
n_params = 10

fig, ax = plt.subplots(1, 2, figsize=(18, 7), gridspec_kw={'width_ratios' : [1, 2], 'wspace' : 0.5})

ax[0].set_title(f'{model_name} - parameter weights')
ax[1].set_title(f'{model_name} - best & worst parameters ({n_params})')

sns.barplot(data=results[model_name], x='weight', y='attribute', ax=ax[0], color='tab:blue')

weights_sorted = results[model_name].sort_values(by='weight', ascending=False)

# Kept under its own name so the `target` column-name variable defined with the
# column groups is not overwritten by a DataFrame.
extremes = pd.concat([weights_sorted.head(n_params), weights_sorted.tail(n_params)])

# Strip the "<transformer>__" prefix so the labels show only the feature name.
extremes['attribute'] = extremes['attribute'].str.split('__').str[1]

sns.barplot(data=extremes, x='weight', y='attribute', ax=ax[1], color='tab:blue')

# NOTE(review): `pipeline_results` is not defined in any cell visible in this notebook.
# It must map the model name to a mapping of statistic name -> numeric value; confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
model_stats = pipeline_results[model_name]
custom_legend = [f"{stat} = {model_stats[stat]:.4f}" for stat in model_stats]

ax[1].legend(
    title='Benchmark statistics:',
    labels=custom_legend,
    loc='lower right'
)

# The full-weight panel has too many attributes for readable tick labels.
ax[0].set_yticks([])

fig.suptitle('Linear models benchmark results:')

#fig.savefig('../../Prod/Images/linear-benchmark-fit-lr-draft-2.png', bbox_inches='tight')

NameError: name 'plt' is not defined