In [1]:
import pandas as pd
import numpy as np
# from xgboost import XGBRegressor
# import torch
# from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import accuracy_score,mean_squared_error,mean_absolute_error,r2_score
# from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import optuna as opt
import joblib
import random
import os

In [2]:
def make_save_cv_model(i,model_name,model,optim,mse_loss,trial_data,output_path="./"):
    '''Save a tuned model, its losses and the tuning trial data for one fold.

    Everything is written into ``<output_path>/trial_<i>_<model_name>_<optim>/``.
    The directory (including any missing parent directories) is created when
    it does not exist yet; existing files inside it are overwritten.

    Parameters
    ----------
    i : fold identifier; used in the directory name and in the file names.
    model_name : label used in the directory name.
    model : object serialised with ``joblib.dump`` to ``<i>_model.z``.
    optim : label used in the directory name.
    mse_loss : value written with ``str()`` into ``losses.txt``.
    trial_data : object serialised with ``joblib.dump`` to ``<i>_trial_data.z``.
    output_path : parent directory of the per-trial directory (default ``"./"``).
    '''
    trial_dir = os.path.join(output_path, f"trial_{i}_{model_name}_{optim}")
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no duplicated
    # branches, no race between the check and the creation, and a missing
    # output_path no longer makes the save fail.
    os.makedirs(trial_dir, exist_ok=True)

    joblib.dump(model, os.path.join(trial_dir, f"{i}_model.z"))
    with open(os.path.join(trial_dir, "losses.txt"), "w+") as file:
        file.write(f" mse_loss :: {str(mse_loss)}")
    joblib.dump(trial_data, os.path.join(trial_dir, f"{i}_trial_data.z"))

In [3]:
def save_cv_model(i,model_name,model,optim,mse_loss,output_path="./"):
    '''Save a cross-validation model and its losses for one fold.

    Files are written into ``<output_path>/<i>_<model_name>_<optim>/``.
    The directory (including any missing parent directories) is created when
    it does not exist yet; existing files inside it are overwritten.

    Parameters
    ----------
    i : fold identifier; used in the directory name and in both file names.
    model_name : label used in the directory name.
    model : object serialised with ``joblib.dump`` to ``<i>_model.z``.
    optim : label used in the directory name.
    mse_loss : value written with ``str()`` into ``losses_<i>.txt``.
    output_path : parent directory of the per-fold directory (default ``"./"``).
    '''
    fold_dir = os.path.join(output_path, f"{i}_{model_name}_{optim}")
    # One idempotent call instead of exists()/mkdir() with duplicated branches.
    os.makedirs(fold_dir, exist_ok=True)

    joblib.dump(model, os.path.join(fold_dir, f"{i}_model.z"))
    # The losses file is keyed by the fold id passed in as `i`. The previous
    # code read an undefined name `fold`, which only resolved when a global of
    # that name happened to exist in the calling notebook.
    with open(os.path.join(fold_dir, f"losses_{i}.txt"), "w+") as file:
        file.write(f" mse_loss :: {str(mse_loss)}")

In [4]:
def get_trial_data(trial) -> dict:
    '''Collect per-trial parameters and objective values for plotting/comparison.

    Parameters
    ----------
    trial : object exposing ``get_trials()``; ``train_trial`` passes the Optuna
        study here. Each returned item must provide ``number``, ``params`` and
        ``values``.

    Returns
    -------
    dict
        ``{trial.number: {"params": trial.params, "rmse": trial.values}}``.
        The ``"rmse"`` entry is stored exactly as ``values`` provides it.

    The parameters and values of every trial are also printed.
    '''
    value_dict = {}
    for i in trial.get_trials():
        print(i.params)
        value_dict[i.number] = {"params": i.params , "rmse": i.values}
        print(f"{i.number} : {i.values}")
    return value_dict

In [5]:
def train_trial(fold_dict,fold,model_name,sc_df,tar_col,optim,optim_trial,k_folds,tar_cols="",verbose=1):
    '''Tune Ridge hyper-parameters with Optuna on one pre-computed fold, refit and save.

    The train/test row positions for ``fold`` are read from ``fold_dict``; an
    Optuna study then minimises the test RMSE over ``max_iter``, ``tol`` and
    ``solver``. A Ridge model is refitted with the best parameters, evaluated
    on the same test split and saved through ``make_save_cv_model``.

    Parameters
    ----------
    fold_dict : mapping ``fold -> {"train": positions, "test": positions}``;
        the positions are used with ``.iloc``.
    fold : key of the fold in ``fold_dict`` to tune on.
    model_name : label forwarded to ``make_save_cv_model``.
    sc_df : DataFrame holding the features plus the target column.
    tar_col : name of the target column in ``sc_df``.
    optim : label forwarded to ``make_save_cv_model``.
    optim_trial : number of Optuna trials to run.
    k_folds : total number of folds; only used in progress messages.
    tar_cols : unused, kept for interface compatibility.
    verbose : when falsy, the shape printouts are suppressed (default 1 prints them).

    Returns
    -------
    (trial_data, best_params) : the dictionary built by ``get_trial_data`` and
        the best parameter dictionary of the study.
    '''
    # Same seed for every trial and for the final refit, so the refitted model
    # matches the trial that produced the best score.
    random_state = 1234

    y = sc_df[tar_col]
    x = sc_df.drop([tar_col],axis=1)
    if verbose:
        print(y.shape)
        print(x.shape)

    # The split does not depend on the trial, so build it once instead of
    # re-slicing and re-converting the frame inside every objective call.
    train_index = fold_dict[fold]["train"]
    test_index = fold_dict[fold]["test"]
    X_train = x.iloc[train_index,:].to_numpy(dtype=np.float64)
    X_test = x.iloc[test_index,:].to_numpy(dtype=np.float64)
    Y_train = y.iloc[train_index].to_numpy(dtype=np.float64)
    Y_test = y.iloc[test_index].to_numpy(dtype=np.float64)

    def objective(trial):
        clf = Ridge(max_iter = trial.suggest_categorical("max_iter",[2000,4000,5000]),
                    tol = trial.suggest_categorical("tol",[1e-4,1e-3,1e-5]),
                    solver = trial.suggest_categorical("solver",["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
                    random_state=random_state)
        if verbose:
            print(X_train.shape)
            print(Y_train.shape)
            print(X_test.shape)
            print(Y_test.shape)
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        # RMSE computed as sqrt(MSE): the `squared=False` keyword is not
        # available in recent scikit-learn releases.
        return float(np.sqrt(mean_squared_error(Y_test, Y_pred)))

    print(f"Starting optimization for fold : [{fold}/{k_folds}]")
    study = opt.create_study(direction='minimize')
    study.optimize(objective, n_trials=optim_trial)
    best_params = study.best_params
    trial_data = get_trial_data(study)
    print(f" Best params for fold : [{fold}/{k_folds}]")
    print(best_params)

    clf_model = Ridge(**best_params, random_state=random_state)
    clf_model.fit(X_train,Y_train)
    Y_pred = clf_model.predict(X_test)

    # Metrics take (y_true, y_pred). The order matters for r2_score, which is
    # not symmetric. "mse_error" now really holds the MSE and "rmse_error" its
    # square root (the two values used to be stored under each other's key).
    mse = mean_squared_error(Y_test, Y_pred)
    error = {"mse_error" : mse,
             "mae_error" : mean_absolute_error(Y_test, Y_pred),
             "rmse_error" : float(np.sqrt(mse)),
             "r2_score" : r2_score(Y_test, Y_pred) }

    print("[++] Saving the model and parameters in corresponding directories")
    make_save_cv_model(fold,model_name,clf_model,optim,mse_loss=error,trial_data=trial_data)
    return trial_data,best_params

In [6]:
def train(fold_dict,fold,model_name,sc_df,tar_col,optim,k_folds,best_params,tar_cols="",verbose=1):
    '''Fit a Ridge model with fixed parameters on one pre-computed fold and save it.

    No hyper-parameter search happens here: ``best_params`` is used as given.
    The model is fitted on the fold's training rows, evaluated on its test
    rows, and the model plus its error metrics are saved via ``save_cv_model``.

    Parameters
    ----------
    fold_dict : mapping ``fold -> {"train": positions, "test": positions}``;
        the positions are used with ``.iloc``.
    fold : key of the fold in ``fold_dict`` to train on.
    model_name : label forwarded to ``save_cv_model``.
    sc_df : DataFrame holding the features plus the target column.
    tar_col : name of the target column in ``sc_df``.
    optim : label forwarded to ``save_cv_model``.
    k_folds : unused, kept for interface compatibility.
    best_params : keyword arguments for ``Ridge``. ``random_state`` defaults to
        1234 (the seed used while tuning) unless given here.
    tar_cols : unused, kept for interface compatibility.
    verbose : when falsy, the shape printouts are suppressed (default 1 prints them).

    Returns
    -------
    None
    '''
    y = sc_df[tar_col]
    x = sc_df.drop([tar_col],axis=1)
    if verbose:
        print(y.shape)
        print(x.shape)

    train_index = fold_dict[fold]["train"]
    test_index = fold_dict[fold]["test"]
    X_train = x.iloc[train_index,:].to_numpy(dtype=np.float64)
    X_test = x.iloc[test_index,:].to_numpy(dtype=np.float64)
    Y_train = y.iloc[train_index].to_numpy(dtype=np.float64)
    Y_test = y.iloc[test_index].to_numpy(dtype=np.float64)

    # Seed the stochastic solvers the same way the tuning run did; an explicit
    # random_state inside best_params still takes precedence.
    clf_model = Ridge(**{"random_state": 1234, **best_params})

    # The target is kept 2-D as before so the shape of the saved model's
    # coefficients and predictions does not change for existing consumers.
    Y_train = Y_train.reshape(-1,1)
    clf_model.fit(X_train,Y_train)
    Y_pred = clf_model.predict(X_test)

    # Metrics take (y_true, y_pred). The order matters for r2_score, which is
    # not symmetric. "mse_error" now really holds the MSE and "rmse_error" its
    # square root (the two values used to be stored under each other's key).
    # sqrt(MSE) replaces `squared=False`, which recent scikit-learn lacks.
    mse = mean_squared_error(Y_test, Y_pred)
    error = {
        "mse_error" : mse,
        "mae_error" : mean_absolute_error(Y_test, Y_pred),
        "rmse_error" : float(np.sqrt(mse)),
        "r2_score" : r2_score(Y_test, Y_pred) }

    print("[++] Saving the model and parameters in corresponding directories")
    save_cv_model(fold,model_name,clf_model,optim,mse_loss=error)

In [7]:
# --- Inputs and run configuration -------------------------------------------
use_df = pd.read_csv("../input/perov-scaled-data/scaled_trainable.csv")
tar_col = "JV_default_PCE_numeric"
model_name = "ridge_reg"
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
fold_dict = joblib.load("../input/perov-fold-data/fold_data_export.z")
optim = "no_optim"
k_folds = 20
# One fold is picked at random (unseeded) for the hyper-parameter search.
trial_fold = random.choice(list(range(k_folds)))
num_trials = 15

# --- Hyper-parameter search on the selected fold ----------------------------
trial_data, best_params = train_trial(
    fold_dict=fold_dict,
    fold=trial_fold,
    k_folds=k_folds,
    model_name=model_name,
    sc_df=use_df,
    tar_col=tar_col,
    optim=optim,
    optim_trial=num_trials,
)
for key, value in trial_data.items():
    print(f"{key}: {value['rmse']}")
print(f"[++] Ended the training process for fold {trial_fold}")


# --- Train and save one model per fold with the parameters found above ------
main_folds = list(range(k_folds))
for fold in main_folds:
    train(
        fold_dict=fold_dict,
        fold=fold,
        k_folds=k_folds,
        model_name=model_name,
        sc_df=use_df,
        tar_col=tar_col,
        optim=optim,
        best_params=best_params,
    )
    print(f"[++] Ended the training process for fold {fold}")

[32m[I 2023-01-01 12:54:47,791][0m A new study created in memory with name: no-name-ed55acaf-47f1-4cfc-9ffd-55beadebe20b[0m


(46820,)
(46820, 103)
Starting optimization for fold : [1/20]
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:00:27,672][0m Trial 0 finished with value: 6.58362344468907 and parameters: {'max_iter': 4000, 'tol': 1e-05, 'solver': 'sag'}. Best is trial 0 with value: 6.58362344468907.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:01:09,521][0m Trial 1 finished with value: 6.597167727345976 and parameters: {'max_iter': 2000, 'tol': 0.001, 'solver': 'saga'}. Best is trial 0 with value: 6.58362344468907.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:01:09,760][0m Trial 2 finished with value: 6.5835290602572245 and parameters: {'max_iter': 4000, 'tol': 1e-05, 'solver': 'svd'}. Best is trial 2 with value: 6.5835290602572245.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:01:51,488][0m Trial 3 finished with value: 6.597167727345976 and parameters: {'max_iter': 2000, 'tol': 0.001, 'solver': 'saga'}. Best is trial 2 with value: 6.5835290602572245.[0m
[32m[I 2023-01-01 13:01:51,569][0m Trial 4 finished with value: 6.583513741973652 and parameters: {'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 4 with value: 6.583513741973652.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:04:59,264][0m Trial 5 finished with value: 6.584440212197521 and parameters: {'max_iter': 2000, 'tol': 0.0001, 'solver': 'sag'}. Best is trial 4 with value: 6.583513741973652.[0m
[32m[I 2023-01-01 13:04:59,343][0m Trial 6 finished with value: 6.583540253589651 and parameters: {'max_iter': 2000, 'tol': 1e-05, 'solver': 'lsqr'}. Best is trial 4 with value: 6.583513741973652.[0m
[32m[I 2023-01-01 13:04:59,428][0m Trial 7 finished with value: 6.583540253589651 and parameters: {'max_iter': 2000, 'tol': 1e-05, 'solver': 'lsqr'}. Best is trial 4 with value: 6.583513741973652.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:10:40,382][0m Trial 8 finished with value: 6.58362344468907 and parameters: {'max_iter': 4000, 'tol': 1e-05, 'solver': 'sag'}. Best is trial 4 with value: 6.583513741973652.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:14:05,237][0m Trial 9 finished with value: 6.586980254476669 and parameters: {'max_iter': 2000, 'tol': 1e-05, 'solver': 'saga'}. Best is trial 4 with value: 6.583513741973652.[0m
[32m[I 2023-01-01 13:14:05,314][0m Trial 10 finished with value: 6.583513741973652 and parameters: {'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 4 with value: 6.583513741973652.[0m
[32m[I 2023-01-01 13:14:05,385][0m Trial 11 finished with value: 6.583513741973652 and parameters: {'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 4 with value: 6.583513741973652.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-01-01 13:14:05,463][0m Trial 12 finished with value: 6.583513741973652 and parameters: {'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 4 with value: 6.583513741973652.[0m
[32m[I 2023-01-01 13:14:05,544][0m Trial 13 finished with value: 6.583513741973652 and parameters: {'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 4 with value: 6.583513741973652.[0m
[32m[I 2023-01-01 13:14:05,639][0m Trial 14 finished with value: 6.583529060257231 and parameters: {'max_iter': 5000, 'tol': 0.0001, 'solver': 'cholesky'}. Best is trial 4 with value: 6.583513741973652.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)
{'max_iter': 4000, 'tol': 1e-05, 'solver': 'sag'}
0 : [6.58362344468907]
{'max_iter': 2000, 'tol': 0.001, 'solver': 'saga'}
1 : [6.597167727345976]
{'max_iter': 4000, 'tol': 1e-05, 'solver': 'svd'}
2 : [6.5835290602572245]
{'max_iter': 2000, 'tol': 0.001, 'solver': 'saga'}
3 : [6.597167727345976]
{'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}
4 : [6.583513741973652]
{'max_iter': 2000, 'tol': 0.0001, 'solver': 'sag'}
5 : [6.584440212197521]
{'max_iter': 2000, 'tol': 1e-05, 'solver': 'lsqr'}
6 : [6.583540253589651]
{'max_iter': 2000, 'tol': 1e-05, 'solver': 'lsqr'}
7 : [6.583540253589651]
{'max_iter': 4000, 'tol': 1e-05, 'solver': 'sag'}
8 : [6.58362344468907]
{'max_iter': 2000, 'tol': 1e-05, 'solver': 'saga'}
9 : [6.586980254476669]
{'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}
10 : [6.583513741973652]
{'max_iter': 5000, 'tol': 0.0001, 'solver': 'sparse_cg'}
11 : [6.583513741973652]
