In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.covariance import (
    OAS,
    EmpiricalCovariance,
    LedoitWolf,
    MinCovDet,
)
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.neighbors import KNeighborsRegressor


plt.style.use("seaborn-v0_8")

In [2]:
random_state = 42

df = pd.read_csv("train.csv")

df_X = df.iloc[:, 1:]
df_y = df.iloc[:, 0]

# "stratify" is set because the target is very imbalanced
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, stratify=df_y, random_state=random_state)
print("Train: {}\nTest: {}".format(X_train.shape, X_test.shape))

Train: (176522, 90)
Test: (75653, 90)


---
# Preprocessing

- To speed up the GridSearch, were removed the parameters of the following preprocessing methods that gave, in general, very low results.
Some of them are just commented.
- [Whitening transformation](https://en.wikipedia.org/wiki/Whitening_transformation) is tested
- [Shrinkage](https://en.wikipedia.org/wiki/Shrinkage_(statistics)#:~:text=In%20statistics%2C%20shrinkage%20is%20the,coefficient%20of%20determination%20'shrinks') is tested
- It's applied StandardScaler and not MinMaxScaler before PCA because the data has a normal distribution and PCA prefers standardized data.

In [11]:
# step_name, [(transformer_name, transformer_constructor()), ...], step_param_grid
transformers_list = [
    # ("raw", [("raw", "passthrough")], {}),  # does nothing to the data
    # ("std-scaled", [("std-scaler", preprocessing.StandardScaler())], {}), # DA TOGLIEREEEE???????????
    # ("min-maxed", [("min-max", preprocessing.MinMaxScaler())], {}), # DA TOGLIEREEEE???????????
    # (
    #     "min-max + pca 0.95",
    #     [("min-max", preprocessing.MinMaxScaler()), ("pca", PCA(random_state=random_state))],
    #     {
    #         "pca__svd_solver": ["full"],
    #         "pca__n_components": [0.95],
    #         "pca__whiten": [True, False],
    #     },
    # ),
    # (
    #     "min-max + lda1",
    #     [("min-max", preprocessing.MinMaxScaler()), ("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["eigen"],
    #         "lda__shrinkage": ["auto", None],
    #     },
    # ),
    (
        "min-max + lda2",
        [("min-max", preprocessing.MinMaxScaler()), ("lda", LinearDiscriminantAnalysis())],
        {
            "lda__solver": ["eigen"],
            "lda__covariance_estimator": [
                EmpiricalCovariance(),
                LedoitWolf(),
                MinCovDet(),
                OAS(),
            ],
        },
    ),
    # (
    #     "std-scaler + lda2",
    #     [("std-scaler", preprocessing.StandardScaler()), ("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["eigen"],
    #         "lda__covariance_estimator": [
    #             EmpiricalCovariance(),
    #             LedoitWolf(),
    #             MinCovDet(),
    #             OAS(),
    #         ],
    #     },
    # ),
    # (
    #     "min-max + lda3",
    #     [("min-max", preprocessing.MinMaxScaler()), ("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["svd"],
    #         "lda__store_covariance": [True, False],
    #     },
    # ),
    # (
    #     "min-max + ica",
    #     [("min-max", preprocessing.MinMaxScaler()), ("ica", FastICA(random_state=random_state))],
    #     {
    #         "ica__whiten": ["arbitrary-variance", "unit-variance"],
    #         "ica__whiten_solver": ["eigh"],
    #         "ica__fun": ["logcosh", "exp", "cube"],
    #         "ica__max_iter": [400],
    #     },
    # ),
    # (
    #     "lda1",
    #     [("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["eigen"],
    #         "lda__shrinkage": ["auto", None],
    #     },
    # ),
    # (
    #     "lda2",
    #     [("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["eigen"],
    #         "lda__covariance_estimator": [
    #             EmpiricalCovariance(),
    #             LedoitWolf(),
    #             MinCovDet(),
    #             OAS(),
    #         ],
    #     },
    # ),
    # (
    #     "lda3",
    #     [("lda", LinearDiscriminantAnalysis())],
    #     {
    #         "lda__solver": ["svd"],
    #         "lda__store_covariance": [True, False],
    #     },
    # ),
    # (
    #     "ica",
    #     [("ica", FastICA(random_state=random_state))],
    #     {
    #         "ica__whiten": ["arbitrary-variance", "unit-variance"],
    #         "ica__whiten_solver": ["eigh"],
    #         "ica__fun": ["logcosh", "exp", "cube"],
    #         "ica__max_iter": [400],
    #     },
    # ),
]

---
---
# Modeling
- For every preprocess method defined above, execute a GridSearch over the preprocess parameters plus the model parameters
- Then it's chosen the best model with the best preprocess method looking at the R^2 score
- "StratifiedKFold" is chosen to keep the proportion of imbalanced target lables

In [4]:
def gridSearch(name_estimator, estimator, param_estimator, cv):
    searches = {}
    predictions = {}
    for name_step, transformers, param_grid in transformers_list:
        pipe = Pipeline([*transformers, (name_estimator, estimator)])
        gs = GridSearchCV(pipe, param_grid={**param_estimator, **param_grid}, n_jobs=-1, cv=cv, verbose=3)

        gs.fit(X_train.values, y_train)

        y_pred = gs.best_estimator_.predict(X_test.values)

        searches[name_step] = gs
        predictions[name_step] = y_pred

        print(
            "----> {} data. Params: {}. R^2: {}. MSE: {}".format(
                name_step, gs.best_params_, r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
            )
        )
    return searches, predictions

In [5]:
def print_store_results(searches: dict, predictions: dict, name_est):
    best_r2_score = -100
    best_preprocessor = []
    best_estimator = None
    best_search_name = None

    for name_step, gs in searches.items():
        r2 = r2_score(y_test, predictions[name_step])

        if r2 > best_r2_score:
            best_r2_score = r2
            best_search_name = name_step
            best_preprocessor = gs.best_estimator_[:-1]  # take all but estimator
            best_estimator = gs.best_estimator_[-1]

        print(
            "----> {} data. Params: {}. R^2: {}. MSE: {}".format(
                name_step, gs.best_params_, r2, mean_squared_error(y_test, predictions[name_step])
            )
        )

        if name_step in [
            "min-max + pca",
            "min-max + pca 0.95%",
            "min-max + lda1",
            "min-max + lda2",
            "min-max + lda3",
            "std-scaler + lda2",
        ]:
            ratio = gs.best_estimator_[1].explained_variance_ratio_
            print(
                "\t\t{}: number of components={}, total variance={}".format(
                    name_step, len(ratio), np.flip(ratio.cumsum())[0]
                )
            )
        if name_step in ["lda1", "lda2", "lda3"]:
            ratio = gs.best_estimator_[0].explained_variance_ratio_
            print(
                "\t\t{}: number of components={}, total variance={}".format(
                    name_step, len(ratio), np.flip(ratio.cumsum())[0]
                )
            )
        if name_step in ["min-max + ica"]:
            print("\t\t{}: number of components={}".format(name_step, len(gs.best_estimator_[1].components_)))

    file = open(name_est + "_preproc_" + ".save", "wb")
    pickle.dump(best_preprocessor, file)
    file = open(name_est + "_model_" + ".save", "wb")
    pickle.dump(best_estimator, file)
    file.close()

    print("\nSAVED: ----> {} data. R^2: {}".format(best_search_name, best_r2_score))

---
## Linear Regression
- It appears that PCA with an explained variance ratio of 0.95 performs much better with a min-max scaling (54 components) instead of a standard-scaling (67 components) (the test was removed for clearance).
- Overall, looking at R^2 and MSE, the best model is LDA with explained variance ratio of almost 1 and 53 components. No differencies between a plain or pior min-max or standard-scaling.

In [25]:
cv = StratifiedKFold(n_splits=5)

param_estimator = {}

searches, predictions = gridSearch("lr", LinearRegression(), param_estimator, cv)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
----> raw data. Params: {}. R^2: 0.23782584924220862. MSE: 83.98888064504679
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----> min-max + lda2 data. Params: {'lda__covariance_estimator': EmpiricalCovariance(), 'lda__solver': 'eigen'}. R^2: 0.2378258492422083. MSE: 83.98888064504682
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----> std-scaler + lda2 data. Params: {'lda__covariance_estimator': EmpiricalCovariance(), 'lda__solver': 'eigen'}. R^2: 0.23782584924220818. MSE: 83.98888064504683


In [27]:
print_store_results(searches, predictions, "lr")

----> raw data. Params: {}. R^2: 0.23782584924220862. MSE: 83.98888064504679
----> min-max + lda2 data. Params: {'lda__covariance_estimator': EmpiricalCovariance(), 'lda__solver': 'eigen'}. R^2: 0.2378258492422083. MSE: 83.98888064504682
		min-max + lda2: number of components=53, total variance=1.0000000000000004
----> std-scaler + lda2 data. Params: {'lda__covariance_estimator': EmpiricalCovariance(), 'lda__solver': 'eigen'}. R^2: 0.23782584924220818. MSE: 83.98888064504683
		std-scaler + lda2: number of components=53, total variance=1.000000000000015

SAVED: ----> raw data. R^2: 0.23782584924220862


---
## RandomForestRegressor
- The parameter "max_samples=0.1" is set to speed up the train phase, losing some accuracy.

In [11]:
best_estimators = {}

cv = StratifiedKFold(n_splits=3)

param_estimator = {
    "rf__max_samples": [0.1],
    "rf__criterion": ["squared_error"],
    "rf__n_estimators": [200],
}

searches, predictions = gridSearch("rf", RandomForestRegressor(n_jobs=-1), param_estimator, cv)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[CV 1/3] END ica__fun=logcosh, ica__max_iter=400, ica__whiten=unit-variance, ica__whiten_solver=eigh, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200;, score=0.213 total time= 6.5min
[CV 2/3] END ica__fun=logcosh, ica__max_iter=400, ica__whiten=arbitrary-variance, ica__whiten_solver=eigh, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200;, score=0.220 total time= 7.0min
[CV 2/3] END ica__fun=logcosh, ica__max_iter=400, ica__whiten=unit-variance, ica__whiten_solver=eigh, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200;, score=0.219 total time= 7.0min
[CV 3/3] END ica__fun=logcosh, ica__max_iter=400, ica__whiten=arbitrary-variance, ica__whiten_solver=eigh, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_estimators=200;, score=0.225 total time= 7.1min
[CV 1/3] END ica__fun=logcosh, ica__max_iter=400, ica__whiten=arbitrary-variance, ica__whiten_solver=eigh, rf__criterion=squared_error, rf__max_samples=0.1, rf__n_est

In [13]:
print_store_results(searches, predictions, "rf")

----> ica data. Params: {'ica__fun': 'logcosh', 'ica__max_iter': 400, 'ica__whiten': 'arbitrary-variance', 'ica__whiten_solver': 'eigh', 'rf__criterion': 'squared_error', 'rf__max_samples': 0.1, 'rf__n_estimators': 200}. R^2: 0.23227960709198137. MSE: 84.60005680409236
	ica: number of components=90

SAVED: ----> ica data. R^2: 0.23227960709198137


---
## KNN

In [14]:
best_estimators = {}

cv = StratifiedKFold(n_splits=5)

param_estimator = {
    "knr__n_neighbors": [20],
    "knr__weights": ["uniform", "distance"],
    "knr__metric": [
        "cityblock",
        "cosine",
        "euclidean",
        "haversine",
        "l1",
        "l2",
        "manhattan",
        "nan_euclidean",
    ],
}

searches, predictions = gridSearch("knr", KNeighborsRegressor(n_jobs=-1), param_estimator, cv)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [13]:
print_store_results(searches, predictions, "knr")

----> min-max + lda2 data. Params: {'knr__n_neighbors': 20, 'knr__weights': 'distance', 'lda__covariance_estimator': OAS(), 'lda__solver': 'eigen'}. R^2: 0.31206933420092875. MSE: 75.80751265891068
		min-max + lda2: number of components=53, total variance=1.491844798635474

SAVED: ----> min-max + lda2 data. R^2: 0.31206933420092875
