In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.covariance import (
    OAS,
    EmpiricalCovariance,
    LedoitWolf,
    MinCovDet,
)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression



plt.style.use("seaborn-v0_8")

In [2]:
random_state = 42

df = pd.read_csv("train.csv")

df_X = df.iloc[:, 1:]
df_y = df.iloc[:, 0]

# "stratify" is set because the target is very imbalanced
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, stratify=df_y, random_state=random_state)
print("Train length: {}\nTest length: {}".format(X_train.shape[0], X_test.shape[0]))

Train length: 176522
Test length: 75653


---
## Preprocessing

- To speed up the GridSearch, were removed the parameters of the following preprocessing methods that gave, in general, very low results.
Some of them are just commented.
- [Whitening transformation](https://en.wikipedia.org/wiki/Whitening_transformation) is tested
- [Shrinkage](https://en.wikipedia.org/wiki/Shrinkage_(statistics)#:~:text=In%20statistics%2C%20shrinkage%20is%20the,coefficient%20of%20determination%20'shrinks') is tested
- It's applied StandardScaler and not MinMaxScaler before PCA because the data has a normal distribution and PCA prefers standardized data.

In [40]:
# step_name, [(transformer_name, transformer_constructor()), ...], step_param_grid
transformers_list = [
    # ("raw", [("raw", "passthrough")], {}),  # does nothing to the data
    # ("std-scaled", [("std-scaler", preprocessing.StandardScaler())], {}), # DA TOGLIEREEEE???????????
    # ("min-maxed", [("min-max", preprocessing.MinMaxScaler())], {}), # DA TOGLIEREEEE???????????
    # (
    #     "std-scaler + pca",
    #     [("std-scaler", preprocessing.StandardScaler()), ("pca", PCA(random_state=random_state))],
    #     {
    #         "pca__svd_solver": ["auto"],
    #         "pca__n_components": ["mle"],
    #         "pca__whiten": [True, False],
    #     },
    # ),
    # (
    #     "std-scaler + pca 0.95%",
    #     [("std-scaler", preprocessing.StandardScaler()), ("pca", PCA(random_state=random_state))],
    #     {
    #         "pca__svd_solver": ["full"],
    #         "pca__n_components": [0.95],
    #         "pca__whiten": [True, False],
    #     },
    # ),
    # (
    #     "lda1",
    #     [("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["eigen"],
    #         "lda__shrinkage": ["auto", None],
    #     },
    # ),
    (
        "lda2",
        [("lda", LinearDiscriminantAnalysis())],
        {
            "lda__solver": ["eigen"],
            "lda__covariance_estimator": [
                EmpiricalCovariance(),
                LedoitWolf(),
                MinCovDet(),
                OAS(),
            ],
        },
    ),
    # (
    #     "lda3",
    #     [("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["svd"],
    #         "lda__store_covariance": [True, False],
    #     },
    # ),
    # (
    #     "ica",
    #     [("ica", FastICA(random_state=random_state))],
    #     {
    #         "ica__whiten": ["arbitrary-variance", "unit-variance"],
    #         "ica__whiten_solver": ["eigh"],
    #         "ica__fun": ["logcosh", "exp", "cube"],
    #         "ica__max_iter": [400],
    #     },
    # ),
]

---
## Modeling
- For every preprocess method defined above, execute a GridSearch over the preprocess parameters plus the model parameters
- Then it's chosen the best model with the best preprocess method looking at the R^2 and MSE scores (confermare???????????????)
- "StratifiedKFold" is chosen to keep the proportion of umbalanced target lables

### RandomForestRegressor
- The parameter "max_samples=0.1" is set to speed up the train phase, losing some accuracy.

In [10]:
best_estimators = {}

cv = StratifiedKFold(n_splits=5)

param_estimator = {
    "rf__max_samples": [0.1],
    "rf__criterion": ["squared_error"],
    "rf__n_estimators": [200],
}

for name_step, transformers, param_grid in transformers_list:
    pipe = Pipeline([*transformers, ("rf", RandomForestRegressor(n_jobs=-1))])
    gs = GridSearchCV(pipe, param_grid={**param_estimator, **param_grid}, n_jobs=-1, cv=cv, verbose=3)

    gs.fit(X_train.values, y_train)

    best_estimators[name_step] = gs

    y_pred = gs.best_estimator_.predict(X_test.values)

    print(
        "----> {} data. Params: {}. R^2: {}. MSE: {}".format(
            name_step, gs.best_params_, r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
        )
    )

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[CV 1/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200, rf__warm_start=False;, score=0.112 total time= 5.4min
[CV 3/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200, rf__warm_start=True;, score=0.120 total time= 5.5min
[CV 4/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200, rf__warm_start=True;, score=0.115 total time= 5.5min
[CV 5/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200, rf__warm_start=True;, score=0.109 total time= 5.5min
[CV 1/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200, rf__warm_start=True;, score=0.113 total time= 5.5min
[CV 3/5] END l

In [14]:
# ATTENZIONE VEDI SOTTOOOOO

for name_step, estimator in best_estimators.items():
    
    y_pred = gs.best_estimator_.predict(X_test.values)
    
    print(
        "----> {} data. Params: {}. R^2: {}. MSE: {}".format(
            name_step, gs.best_params_, r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
        )
    )
    if name_step in ["std-scaler + pca", "std-scaler + pca 0.95%", "lda1", "lda2", "lda3"]:
        ratio = best_estimators[name_step][0].explained_variance_ratio_
        print(
            "{}: number of components={}, total variance={}".format(name_step, len(ratio), np.flip(ratio.cumsum())[0])
        )
    if name_step in ["ica"]:
        print("{}: number of components={}".format(name_step, best_estimators[name_step][0].components_))

----> lda2 data. Params: {'lda__covariance_estimator': MinCovDet(), 'lda__solver': 'eigen', 'rf__criterion': 'squared_error', 'rf__max_samples': 0.1, 'rf__n_estimators': 200, 'rf__warm_start': True}. R^2: 0.30339866892557454. MSE: 76.76298913394048
lda2: number of components=53, total variance=1.1941348329657178


---
## Linear Regression

In [41]:
best_estimators = {}

cv = StratifiedKFold(n_splits=5)

param_estimator = {}

for name_step, transformers, param_grid in transformers_list:
    pipe = Pipeline([*transformers, ("lr", LinearRegression())])
    gs = GridSearchCV(pipe, param_grid={**param_estimator, **param_grid}, n_jobs=-1, cv=cv, verbose=3)

    gs.fit(X_train.values, y_train)

    best_estimators[name_step] = gs

    y_pred = gs.best_estimator_.predict(X_test.values)

    print(
        "----> {} data. Params: {}. R^2: {}. MSE: {}".format(
            name_step, gs.best_params_, r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
        )
    )

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END lda__covariance_estimator=EmpiricalCovariance(), lda__solver=eigen;, score=0.226 total time=   2.2s
[CV 5/5] END lda__covariance_estimator=EmpiricalCovariance(), lda__solver=eigen;, score=0.232 total time=   2.3s
[CV 3/5] END lda__covariance_estimator=EmpiricalCovariance(), lda__solver=eigen;, score=0.233 total time=   2.6s
[CV 2/5] END lda__covariance_estimator=EmpiricalCovariance(), lda__solver=eigen;, score=0.233 total time=   3.5s
[CV 2/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen;, score=0.093 total time=   3.6s
[CV 4/5] END lda__covariance_estimator=EmpiricalCovariance(), lda__solver=eigen;, score=0.231 total time=   3.6s
[CV 3/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen;, score=0.090 total time=   3.7s
[CV 1/5] END lda__covariance_estimator=LedoitWolf(), lda__solver=eigen;, score=0.088 total time=   3.8s
[CV 5/5] END lda__covariance_estimator=LedoitWolf(), lda__solve

In [42]:
for name_step, gs in best_estimators.items():
    y_pred = gs.best_estimator_.predict(X_test.values)

    print(
        "----> {} data. Params: {}. R^2: {}. MSE: {}".format(
            name_step, gs.best_params_, r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
        )
    )
    if name_step in ["std-scaler + pca", "std-scaler + pca 0.95%"]:
        ratio = gs.best_estimator_[1].explained_variance_ratio_
        print(
            "\t{}: number of components={}, total variance={}".format(name_step, len(ratio), np.flip(ratio.cumsum())[0])
        )
    if name_step in ["lda1", "lda2", "lda3"]:
        ratio = gs.best_estimator_[0].explained_variance_ratio_
        print(
            "\t{}: number of components={}, total variance={}".format(name_step, len(ratio), np.flip(ratio.cumsum())[0])
        )
    if name_step in ["ica"]:
        print("\t{}: number of components={}".format(name_step, len(gs.best_estimator_[0].components_)))

----> lda2 data. Params: {'lda__covariance_estimator': EmpiricalCovariance(), 'lda__solver': 'eigen'}. R^2: 0.2378258492422084. MSE: 83.9888806450468
	lda2: number of components=53, total variance=0.9999999999999859
