In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# regression models
from sklearn import linear_model

In [3]:
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# config

In [4]:
SEED = 0
TARGET = 'water_T'
SCORING = 'neg_root_mean_squared_error'
N_ITER = 3
N_SPLITS = 4

# Load Data

In [5]:
df = pd.read_csv("./new_MinMaxScaled.csv", index_col=0)
df_not_normalized = pd.read_csv("./df_final_original_clean.csv", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: './new_MinMaxScaled.csv'

In [6]:
df.drop('regime', axis=1, inplace=True)

In [7]:
y = df_not_normalized.loc[df.index][TARGET]
X = df.drop(TARGET, axis=1)

In [8]:
X.shape, y.shape

((677080, 30), (677080,))

# Train Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Feature Selection

In [10]:
df_train = pd.concat([X_train, y_train], axis=1)

In [11]:
corr = df_train.corr()[TARGET]

In [12]:
corr_features = corr[corr > 0.05].drop(TARGET)

In [13]:
corr_features

Month     0.074515
lon       0.073936
Dry_T     0.155902
Wet_T     0.137357
O2mll     0.659036
O2Sat     0.694239
Secchi    0.061262
Name: water_T, dtype: float64

In [14]:
X_train = X_train[corr_features.index]
X_test = X_test[corr_features.index]

In [15]:
X_train.shape, y_train.shape

((541664, 7), (541664,))

# Regression Models

In [10]:
regressors = {
    linear_model.Lasso(): {
        'reg__alpha': [0, 0.001, 0.003, 0.005, 0.02, 0.03, 0.05, 0.06, 0.08, 0.1],
    },
    linear_model.Ridge(): {
        'reg__alpha': [10, 100, 300, 500, 600, 700, 750, 800],
    }
}

In [12]:
def train(Xtrain, ytrain):
    print("Start training...")

    train_output = {}
    for reg, params in regressors.items():
        print(f"Running {reg.__class__}...")

        search_cv = GridSearchCV(
            # pipline
            estimator=Pipeline(
                steps=[
                    ('reg', reg)
                ]
            ),

            # cross validation
            cv=KFold(
                n_splits=N_SPLITS,
                shuffle=True,
                random_state=SEED
            ),

            # parameters
#             param_distributions=params,
            param_grid=params,

            scoring=SCORING,
#             n_iter=N_ITER,
#             random_state=SEED,
            n_jobs=8,
            verbose=0,
        )

        # fit model
        search_cv.fit(Xtrain, ytrain)

        # best params and model
        search_cv.best_params_

        train_output[reg] = {
            'best_estimator': search_cv.best_estimator_,
            'cv_results': search_cv.cv_results_
        }
    return train_output

In [13]:
train_output = train(X_train, y_train)

Start training...
Running <class 'sklearn.linear_model._coordinate_descent.Lasso'>...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  positive)
  positive)


Running <class 'sklearn.linear_model._ridge.Ridge'>...


In [15]:
for est, output in train_output.items():
    
    cv_results = output['cv_results']
    best_est = output['best_estimator']
    
    pred_test = best_est.predict(X_test)
    pred_train = best_est.predict(X_train)

    print(best_est.named_steps['reg'])
    print(mean_squared_error(pred_test, y_test, squared=True))
    print(mean_squared_error(pred_train, y_train, squared=True))
    print(len(cv_results['mean_test_score']))
    print('-'*50)

Lasso(alpha=0)
2.532678017975696
2.512930875162607
10
--------------------------------------------------
Ridge(alpha=10)
2.5308138905806277
2.518439925372517
8
--------------------------------------------------


In [12]:
sns.displot(pred_test - y_test)

NameError: name 'pred_test' is not defined