In [75]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [76]:
# tr = training set
# ts = test set

df_tr = pd.read_csv('./dataset (missing + split)/processed.csv', skipinitialspace=True)
df_ts = pd.read_csv('./dataset (missing + split)/test.csv', skipinitialspace=True)

In [77]:
# preprocessing del TS set (rimozione feature, missing values e outliers)

timesig = round((df_ts['n_beats'] / (df_ts['n_bars']))).fillna(0)
df_ts['time_signature'].fillna(value=timesig, inplace=True)

df_ts['mode'].fillna(df_ts.groupby('key')['mode'].transform(lambda x: x.mode()[0]), inplace=True)


df_ts.drop(df_ts[df_ts['duration_ms'] >= 1331839].index, inplace = True)
df_ts.drop(df_ts[df_ts['speechiness'] >= 0.567].index, inplace = True)


df_ts.drop(columns=['features_duration_ms','n_bars','n_beats','popularity_confidence','processing','acousticness','loudness'], inplace=True)

In [78]:
cat = ['name','explicit','artists','album_name','key','mode','time_signature','genre']
num = [col for col in df_tr.columns if col not in cat]

# Univariate

In [79]:
target = 'danceability'

In [80]:
# TR set (input e target)
X_tr = df_tr[num].drop(columns=['danceability']).values
y_tr = df_tr[target]

# TS set (input e target)
X_ts = df_ts[num].drop(columns=['danceability']).values
y_ts = df_ts[target]

## Linear

In [81]:
linear = LinearRegression()
linear.fit(X_tr, y_tr)

ridge = Ridge()
ridge.fit(X_tr, y_tr)

lasso = Lasso()
lasso.fit(X_tr, y_tr)

#### Generalization error

In [82]:
print('Generalization error\n')

print('LINEAR'.ljust(15), 'RIDGE'.ljust(15), 'LASSO'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_ts, linear.predict(X_ts)),
      'R2: %.3f'.ljust(14) % r2_score(y_ts, ridge.predict(X_ts)),
      'R2: %.3f'.ljust(15) % r2_score(y_ts, lasso.predict(X_ts)))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, linear.predict(X_ts)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, ridge.predict(X_ts)),
      'MSE: %.3f'.ljust(15) % mean_squared_error(y_ts, lasso.predict(X_ts)))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, linear.predict(X_ts)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, ridge.predict(X_ts)),
      'MAE: %.3f'.ljust(15) % mean_absolute_error(y_ts, lasso.predict(X_ts)))

print('\n')
print('Loss\n')

print('LINEAR'.ljust(15), 'RIDGE'.ljust(15), 'LASSO'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_tr, linear.predict(X_tr)),
      'R2: %.3f'.ljust(14) % r2_score(y_tr, ridge.predict(X_tr)),
      'R2: %.3f'.ljust(15) % r2_score(y_tr, lasso.predict(X_tr)))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, linear.predict(X_tr)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, ridge.predict(X_tr)),
      'MSE: %.3f'.ljust(15) % mean_squared_error(y_tr, lasso.predict(X_tr)))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, linear.predict(X_tr)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, ridge.predict(X_tr)),
      'MAE: %.3f'.ljust(15) % mean_absolute_error(y_tr, lasso.predict(X_tr)))

Generalization error

LINEAR          RIDGE           LASSO          
R2: 0.356       R2: 0.356       R2: 0.002       
MSE: 0.024      MSE: 0.024      MSE: 0.037      
MAE: 0.125      MAE: 0.125      MAE: 0.153      


Loss

LINEAR          RIDGE           LASSO          
R2: 0.350       R2: 0.350       R2: 0.004       
MSE: 0.024      MSE: 0.024      MSE: 0.037      
MAE: 0.126      MAE: 0.126      MAE: 0.154      


### Nonlinear Regression

In [83]:
from sklearn.preprocessing import StandardScaler
norm = StandardScaler()
norm.fit(X_tr)

In [84]:
%%time

# Iperparametri scelti con grid search + k-fold cross validation
param_grid = {
    "n_neighbors": np.arange(start=10, stop=501, step=10),
    "weights": ["uniform"],
    "metric": ["euclidean", "cityblock"],
}

grid_search = GridSearchCV(
    KNeighborsRegressor(),
    param_grid=param_grid,
    cv=KFold(n_splits=10, shuffle=True, random_state=0),
    n_jobs=-1,
    refit=True,
    verbose=1,
)

grid_search.fit(norm.transform(X_tr), y_tr)
knn_reg = grid_search.best_estimator_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
CPU times: user 1.82 s, sys: 421 ms, total: 2.24 s
Wall time: 2min 5s


In [85]:
print(knn_reg.metric, knn_reg.n_neighbors, knn_reg.weights)

cityblock 20 uniform


In [9]:
%%time

# Ricerca di iperparametri come sopra ('None' a max_depth corrisponde a nessun limite di profondità)
param_grid = {
    'max_depth': list(np.arange(1, 102, step=20)) + [None],
    'min_samples_split': [2, 4, 8, 16, 32],
    'min_samples_leaf': [2, 4, 8, 16, 32, 64, 128],
    'criterion': ['squared_error', 'absolute_error']
}

grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    n_jobs=-1,
    refit=True,
    verbose=1,
)

grid_search.fit(X_tr, y_tr)
dt_reg = grid_search.best_estimator_

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
CPU times: user 4.95 s, sys: 1.56 s, total: 6.51 s
Wall time: 21min 54s


In [10]:
print(dt_reg.max_depth, dt_reg.min_samples_split, dt_reg.min_samples_leaf, dt_reg.criterion)

21 2 64 squared_error


In [86]:
dt_reg = DecisionTreeRegressor(max_depth=21, min_samples_split=2, min_samples_leaf=64, criterion='squared_error')
dt_reg.fit(X_tr, y_tr)

knn_reg = KNeighborsRegressor(metric='cityblock', n_neighbors=20, weights='uniform')
knn_reg.fit(norm.transform(X_tr), y_tr)

#### Generalization error and Loss

In [87]:
print('Generalization error\n')

print('DEC TREE'.ljust(15), 'KNN'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_ts, dt_reg.predict(X_ts)),
      'R2: %.3f'.ljust(14) % r2_score(y_ts, knn_reg.predict(norm.transform(X_ts))))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, dt_reg.predict(X_ts)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, knn_reg.predict(norm.transform(X_ts))))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, dt_reg.predict(X_ts)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, knn_reg.predict(norm.transform(X_ts))))

print('\n')
print('Loss\n')

print('DEC TREE'.ljust(15), 'KNN'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_tr, dt_reg.predict(X_tr)),
      'R2: %.3f'.ljust(14) % r2_score(y_tr, knn_reg.predict(norm.transform(X_tr))))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, dt_reg.predict(X_tr)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, knn_reg.predict(norm.transform(X_tr))))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, dt_reg.predict(X_tr)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, knn_reg.predict(norm.transform(X_tr))))

Generalization error

DEC TREE        KNN            
R2: 0.541       R2: 0.590      
MSE: 0.017      MSE: 0.015     
MAE: 0.101      MAE: 0.095     


Loss

DEC TREE        KNN            
R2: 0.599       R2: 0.625      
MSE: 0.015      MSE: 0.014     
MAE: 0.095      MAE: 0.092     


# Multivariate

In [88]:
target = ['danceability', 'instrumentalness']

In [89]:
# TR set (input e target)
X_tr = df_tr[num].drop(columns=['danceability', 'instrumentalness']).values
y_tr = df_tr[target].values

# TS set (input e target)
X_ts = df_ts[num].drop(columns=['danceability', 'instrumentalness']).values
y_ts = df_ts[target].values

## Linear

In [90]:
linear = LinearRegression()
linear.fit(X_tr, y_tr)

ridge = Ridge()
ridge.fit(X_tr, y_tr)

lasso = Lasso()
lasso.fit(X_tr, y_tr)

#### Generalization error and Loss

In [91]:
print('Generalization error\n')

print('LINEAR'.ljust(15), 'RIDGE'.ljust(15), 'LASSO'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_ts, linear.predict(X_ts)),
      'R2: %.3f'.ljust(14) % r2_score(y_ts, ridge.predict(X_ts)),
      'R2: %.3f'.ljust(15) % r2_score(y_ts, lasso.predict(X_ts)))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, linear.predict(X_ts)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, ridge.predict(X_ts)),
      'MSE: %.3f'.ljust(15) % mean_squared_error(y_ts, lasso.predict(X_ts)))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, linear.predict(X_ts)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, ridge.predict(X_ts)),
      'MAE: %.3f'.ljust(15) % mean_absolute_error(y_ts, lasso.predict(X_ts)))

print('\n')
print('Loss\n')

print('LINEAR'.ljust(15), 'RIDGE'.ljust(15), 'LASSO'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_tr, linear.predict(X_tr)),
      'R2: %.3f'.ljust(14) % r2_score(y_tr, ridge.predict(X_tr)),
      'R2: %.3f'.ljust(15) % r2_score(y_tr, lasso.predict(X_tr)))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, linear.predict(X_tr)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, ridge.predict(X_tr)),
      'MSE: %.3f'.ljust(15) % mean_squared_error(y_tr, lasso.predict(X_tr)))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, linear.predict(X_tr)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, ridge.predict(X_tr)),
      'MAE: %.3f'.ljust(15) % mean_absolute_error(y_tr, lasso.predict(X_tr)))

Generalization error

LINEAR          RIDGE           LASSO          
R2: 0.290       R2: 0.290       R2: 0.039       
MSE: 0.069      MSE: 0.069      MSE: 0.086      
MAE: 0.205      MAE: 0.205      MAE: 0.243      


Loss

LINEAR          RIDGE           LASSO          
R2: 0.282       R2: 0.282       R2: 0.035       
MSE: 0.070      MSE: 0.070      MSE: 0.087      
MAE: 0.206      MAE: 0.206      MAE: 0.244      


In [92]:
norm = StandardScaler()
norm.fit(X_tr)

In [93]:
%%time

# Iperparametri scelti con grid search + k-fold cross validation
param_grid = {
    "n_neighbors": np.arange(start=10, stop=1000, step=10),
    "weights": ["distance", "uniform"],
    "metric": ["euclidean", "cityblock"],
}

grid_search = GridSearchCV(
    KNeighborsRegressor(),
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    n_jobs=-1,
    refit=True,
    verbose=1,
)

grid_search.fit(norm.transform(X_tr), y_tr)
knn_reg = grid_search.best_estimator_

Fitting 5 folds for each of 396 candidates, totalling 1980 fits
CPU times: user 4.11 s, sys: 1.15 s, total: 5.25 s
Wall time: 9min 2s


In [94]:
print(knn_reg.metric, knn_reg.n_neighbors, knn_reg.weights)

cityblock 30 distance


In [17]:
%%time

# Ricerca di iperparametri come sopra ('None' a max_depth corrisponde a nessun limite di profondità)
param_grid = {
    'max_depth': list(np.arange(1, 102, step=20)) + [None],
    'min_samples_split': [2, 4, 8, 16, 32, 64, 128],
    'min_samples_leaf': [2, 4, 8, 16, 32, 64, 128],
    'criterion': ['squared_error', 'absolute_error']
}

grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    n_jobs=-1,
    refit=True,
    verbose=1,
)

grid_search.fit(X_tr, y_tr)
dt_reg = grid_search.best_estimator_

Fitting 5 folds for each of 686 candidates, totalling 3430 fits
CPU times: user 10.3 s, sys: 3.87 s, total: 14.2 s
Wall time: 1h 6min 6s


In [18]:
print(dt_reg.max_depth, dt_reg.min_samples_split, dt_reg.min_samples_leaf, dt_reg.criterion)

21 2 32 squared_error


In [97]:
dt_reg = DecisionTreeRegressor(max_depth=21, min_samples_split=2, min_samples_leaf=32, criterion='squared_error')
dt_reg.fit(X_tr, y_tr)

knn_reg = KNeighborsRegressor(metric='cityblock', weights='uniform', n_neighbors=30)
knn_reg.fit(norm.transform(X_tr), y_tr)

#### Generalization error and Loss

In [98]:
print('Generalization error\n')

print('DEC TREE'.ljust(15), 'KNN'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_ts, dt_reg.predict(X_ts)),
      'R2: %.3f'.ljust(14) % r2_score(y_ts, knn_reg.predict(norm.transform(X_ts))))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, dt_reg.predict(X_ts)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_ts, knn_reg.predict(norm.transform(X_ts))))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, dt_reg.predict(X_ts)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_ts, knn_reg.predict(norm.transform(X_ts))))

print('\n')
print('Loss\n')

print('DEC TREE'.ljust(15), 'KNN'.ljust(15))
print('R2: %.3f'.ljust(14) % r2_score(y_tr, dt_reg.predict(X_tr)),
      'R2: %.3f'.ljust(14) % r2_score(y_tr, knn_reg.predict(norm.transform(X_tr))))

print('MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, dt_reg.predict(X_tr)),
      'MSE: %.3f'.ljust(14) % mean_squared_error(y_tr, knn_reg.predict(norm.transform(X_tr))))

print('MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, dt_reg.predict(X_tr)),
      'MAE: %.3f'.ljust(14) % mean_absolute_error(y_tr, knn_reg.predict(norm.transform(X_tr))))

Generalization error

DEC TREE        KNN            
R2: 0.424       R2: 0.503      
MSE: 0.056      MSE: 0.050     
MAE: 0.164      MAE: 0.158     


Loss

DEC TREE        KNN            
R2: 0.513       R2: 0.527      
MSE: 0.047      MSE: 0.048     
MAE: 0.150      MAE: 0.155     


#### Conclusioni:
- I modelli lineari non riescono a generalizzare molto bene, perché i dati non sono perfettamente allineati lungo una retta/iperpiano
- I modelli non lineari (se selezionati con grid search e cross validation) producono risultati migliori, con errori più bassi e R^2 più alto