In [1]:
import pandas as pd
import numpy as np
# from xgboost import XGBRegressor
# import torch
# from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import accuracy_score,mean_squared_error,mean_absolute_error,r2_score
# from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import optuna as opt
import joblib
import random
import os

In [2]:
! pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1
[0m

In [3]:
def make_save_cv_model(i,model_name,model,optim,mse_loss,trial_data,output_path="./"):

    ''' This function saves cross validation model in the corresponding directory ( if the path does not exist it creates the path for it'''


    if os.path.exists(os.path.join(output_path,f"trial_{i}_{model_name}_{optim}")):
        joblib.dump(model, os.path.join(output_path,f"trial_{i}_{model_name}_{optim}/{i}_model.z"))
        with open(os.path.join(output_path,f"trial_{i}_{model_name}_{optim}/losses.txt"),"w+") as file:file.write(f" mse_loss :: {str(mse_loss)}")
        joblib.dump(trial_data, os.path.join(output_path,f"trial_{i}_{model_name}_{optim}/{i}_trial_data.z"))
    else:
        os.mkdir(os.path.join(output_path,f"trial_{i}_{model_name}_{optim}"))
        joblib.dump(model, os.path.join(output_path,f"trial_{i}_{model_name}_{optim}/{i}_model.z"))
        with open(os.path.join(output_path,f"trial_{i}_{model_name}_{optim}/losses.txt"),"w+") as file:file.write(f" mse_loss :: {str(mse_loss)}")
        joblib.dump(trial_data, os.path.join(output_path,f"trial_{i}_{model_name}_{optim}/{i}_trial_data.z"))

In [4]:
def save_cv_model(i,model_name,model,optim,mse_loss,output_path="./"):

    ''' This function saves cross validation model in the corresponding directory ( if the path does not exist it creates the path for it'''


    if os.path.exists(os.path.join(output_path,f"{i}_{model_name}_{optim}")):
        joblib.dump(model, os.path.join(output_path,f"{i}_{model_name}_{optim}/{i}_model.z"))
        with open(os.path.join(output_path,f"{i}_{model_name}_{optim}/losses_{fold}.txt"),"w+") as file:file.write(f" mse_loss :: {str(mse_loss)}")
    else:
        os.mkdir(os.path.join(output_path,f"{i}_{model_name}_{optim}"))
        joblib.dump(model, os.path.join(output_path,f"{i}_{model_name}_{optim}/{i}_model.z"))
        with open(os.path.join(output_path,f"{i}_{model_name}_{optim}/losses_{fold}.txt"),"w+") as file:file.write(f" mse_loss :: {str(mse_loss)}")

In [5]:
def get_trial_data(trial) -> list:
  ''' This function takes the trial objects and returns the dictionary containing the trial details for plotting and comparing purposes '''
  trial_data = trial.get_trials()
  value_dict = {}
  for i in trial_data:
    print(i.params)
    value_dict[i.number] = {"params": i.params , "rmse": i.values}
    print(f"{i.number} : {i.values}")
  return value_dict

In [6]:
def save_optuna_plots(study,dirname):
    optim_hist = opt.visualization.plot_optimization_history(study)
    intermediate = opt.visualization.plot_intermediate_values(study)
    parallel = opt.visualization.plot_parallel_coordinate(study)
    plot_slice = opt.visualization.plot_slice(study)
    name_lists = ["optim_hist","intermediate","parallel", "plot_slice"]
    plot_lists = [optim_hist,intermediate,parallel, plot_slice]
    for name,plot in zip(name_lists,plot_lists):
        if os.path.exists(f"./{dirname}"):
            print("getting into if block")
            plot.write_image(f"./{dirname}/{name}.jpg",width=2, height=2)
        else:
            print("getting into else block")
            os.mkdir(f"./{dirname}")
            plot.write_image(f"./{dirname}/{name}.jpg",width=2, height=2)

In [7]:
def train_trial(fold_dict,fold,model_name,sc_df,tar_col,optim,optim_trial,k_folds,tar_cols="",verbose=1):

    ''' this function is used to train the model with parameters optimization using optuna and cross validation using stratified k_folds'''
    print(f"running for fold :{fold}")
    y = sc_df[tar_col]
    print(y.shape)
    x = sc_df.drop([tar_col],axis=1)
    print(x.shape)
    model_name = model_name 
    def objective(trial):
      train_index = fold_dict[fold]["train"]
      test_index = fold_dict[fold]["test"]
      clf = Ridge(max_iter = trial.suggest_categorical("max_iter",[2000,4000,5000]),
                 tol = trial.suggest_categorical("tol",[1e-4,1e-3,1e-5]),
                 solver = trial.suggest_categorical("solver",["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
                 random_state=1234)
      # print(f" train_index :: {train_index}")
      # print(f" test_index :: {test_index}")
      X_train,X_test = x.iloc[train_index,:], x.iloc[test_index,:]
      # print(X_train.shape, X_test.shape)
      X_train, X_test = X_train.to_numpy(dtype=np.float64), X_test.to_numpy(dtype=np.float64)
      Y_train, Y_test = y.iloc[train_index].to_numpy(dtype=np.float64), y.iloc[test_index].to_numpy(np.float64)
      # Y_train, Y_test = Y_train.to_numpy(dtype=np.float64), Y_test.to_numpy(dtype=np.float64)
      print(X_train.shape)
      print(Y_train.shape)
      print(X_test.shape)
      print(Y_test.shape)
      clf.fit(X_train, Y_train)
      Y_pred = clf.predict(X_test)
      mse_error = mean_squared_error(Y_pred, Y_test, squared=False)
      return mse_error

    print(f"Starting optimization for fold : [{fold}/{k_folds}]")
    study = opt.create_study(direction='minimize')
    study.optimize(objective, n_trials=optim_trial)
    save_optuna_plots(study,"optuna_plots")
    best_params = study.best_params
    trial_data = get_trial_data(study)
    print(f" Best params for fold : [{fold}/{k_folds}]")
    print(best_params)
    train_index = fold_dict[fold]["train"]
    test_index = fold_dict[fold]["test"]
    X_train,X_test = x.iloc[train_index,:], x.iloc[test_index,:]
    # print(X_train.shape, X_test.shape)
    X_train, X_test = X_train.to_numpy(dtype=np.float64), X_test.to_numpy(dtype=np.float64)
    Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]
    Y_train, Y_test = Y_train.to_numpy(dtype=np.float64), Y_test.to_numpy(dtype=np.float64)
    clf_model = Ridge(**study.best_params)
    clf_model.fit(X_train,Y_train)
    Y_pred = clf_model.predict(X_test)
    error =  clf_model.predict(X_test)
    error = {"mse_error" : mean_squared_error(Y_pred, Y_test, squared=False),
    "mae_error" : mean_absolute_error(Y_pred,Y_test),
    "rmse_error" : mean_squared_error(Y_pred, Y_test),
    "r2_score" : r2_score(Y_pred,Y_test) }
    # try:
    print("[++] Saving the model and parameters in corresponding directories")
    make_save_cv_model(fold,model_name,clf_model,optim,mse_loss=error,trial_data=trial_data)
    return trial_data,best_params

In [8]:
def train(fold_dict,fold,model_name,sc_df,tar_col,optim,k_folds,best_params,tar_cols="",verbose=1):

    ''' this function is used to train the model with parameters optimization using optuna and cross validation using stratified k_folds'''

    y = sc_df[tar_col]
    print(y.shape)
    x = sc_df.drop([tar_col],axis=1)
    print(x.shape)
    model_name = model_name 
    train_index = fold_dict[fold]["train"]
    test_index = fold_dict[fold]["test"]
    X_train,X_test = x.iloc[train_index,:], x.iloc[test_index,:]
    # print(X_train.shape, X_test.shape)
    X_train, X_test = X_train.to_numpy(dtype=np.float64), X_test.to_numpy(dtype=np.float64)
    Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]
    Y_train, Y_test = Y_train.to_numpy(dtype=np.float64), Y_test.to_numpy(dtype=np.float64)
    clf_model = Ridge(**best_params)
    Y_train = Y_train.reshape(-1,1)
    clf_model.fit(X_train,Y_train)
    Y_pred = clf_model.predict(X_test)
    error = {
    "mse_error" : mean_squared_error(Y_pred, Y_test, squared=False),
    "mae_error" : mean_absolute_error(Y_pred,Y_test),
    "rmse_error" : mean_squared_error(Y_pred, Y_test),
    "r2_score" : r2_score(Y_pred,Y_test) }
    # try:
    print("[++] Saving the model and parameters in corresponding directories")
    save_cv_model(fold,model_name,clf_model,optim,mse_loss=error)

In [9]:
use_df = pd.read_csv("../input/perov-scaled-data/scaled_trainable.csv")
tar_col = "JV_default_PCE_numeric"
model_name = "ridge_reg"
fold_dict = joblib.load("../input/perov-fold-data/fold_data_export.z")
optim = "no_optim"
k_folds = 20
trial_fold = random.choice([x for x in range(k_folds)])
num_trials = 20

trial_data,best_params = train_trial(fold_dict = fold_dict,
      fold = trial_fold,
      k_folds=k_folds,
      model_name=model_name,
      sc_df=use_df,
      tar_col=tar_col,
      optim = optim,
      optim_trial = num_trials)
for key,value in trial_data.items():
  print(f"{key}: {value['rmse']}")
print(f"[++] Ended the training process for fold {trial_fold}")


main_folds = [x for x in range(k_folds)]
for fold in main_folds:
    train(fold_dict = fold_dict,
          fold = fold,
          k_folds=k_folds,
          model_name=model_name,
          sc_df=use_df,
          tar_col=tar_col,
          optim = optim,
          best_params = best_params)
    print(f"[++] Ended the training process for fold {fold}")

[32m[I 2023-02-17 10:10:51,765][0m A new study created in memory with name: no-name-2845fa9a-ea6c-4a33-941c-9d8e21c1c133[0m
[32m[I 2023-02-17 10:10:51,955][0m Trial 0 finished with value: 6.7534243245664864 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m


running for fold :11
(46820,)
(46820, 103)
Starting optimization for fold : [11/20]
(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:10:52,450][0m Trial 1 finished with value: 6.753584816605659 and parameters: {'max_iter': 2000, 'tol': 0.0001, 'solver': 'svd'}. Best is trial 0 with value: 6.7534243245664864.[0m
[32m[I 2023-02-17 10:10:52,630][0m Trial 2 finished with value: 6.753584816605664 and parameters: {'max_iter': 4000, 'tol': 0.001, 'solver': 'cholesky'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:10:52,868][0m Trial 3 finished with value: 6.753761274419008 and parameters: {'max_iter': 5000, 'tol': 0.001, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:10:53,380][0m Trial 4 finished with value: 6.753584816605659 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'svd'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:15:50,287][0m Trial 5 finished with value: 6.755457102350513 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'saga'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:15:50,760][0m Trial 6 finished with value: 6.753584816605659 and parameters: {'max_iter': 5000, 'tol': 1e-05, 'solver': 'svd'}. Best is trial 0 with value: 6.7534243245664864.[0m
[32m[I 2023-02-17 10:15:50,910][0m Trial 7 finished with value: 6.753584816605664 and parameters: {'max_iter': 2000, 'tol': 1e-05, 'solver': 'cholesky'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:15:51,125][0m Trial 8 finished with value: 6.753761274419008 and parameters: {'max_iter': 2000, 'tol': 0.001, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m
[32m[I 2023-02-17 10:15:51,262][0m Trial 9 finished with value: 6.753584816605664 and parameters: {'max_iter': 5000, 'tol': 1e-05, 'solver': 'cholesky'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:15:51,461][0m Trial 10 finished with value: 6.753584756876941 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 0 with value: 6.7534243245664864.[0m
[32m[I 2023-02-17 10:15:51,660][0m Trial 11 finished with value: 6.753584756876941 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:18:56,759][0m Trial 12 finished with value: 6.754559480493831 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'sag'}. Best is trial 0 with value: 6.7534243245664864.[0m
[32m[I 2023-02-17 10:18:56,966][0m Trial 13 finished with value: 6.753584756876941 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:18:57,160][0m Trial 14 finished with value: 6.753584756876941 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'sparse_cg'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:18:57,366][0m Trial 15 finished with value: 6.7534243245664864 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m
[32m[I 2023-02-17 10:18:57,612][0m Trial 16 finished with value: 6.7534243245664864 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:18:57,821][0m Trial 17 finished with value: 6.7534243245664864 and parameters: {'max_iter': 4000, 'tol': 0.0001, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:18:58,027][0m Trial 18 finished with value: 6.753583961123179 and parameters: {'max_iter': 5000, 'tol': 1e-05, 'solver': 'lsqr'}. Best is trial 0 with value: 6.7534243245664864.[0m


(44479, 103)
(44479,)
(2341, 103)
(2341,)
(44479, 103)
(44479,)
(2341, 103)
(2341,)


[32m[I 2023-02-17 10:19:43,202][0m Trial 19 finished with value: 6.766972137425879 and parameters: {'max_iter': 2000, 'tol': 0.001, 'solver': 'saga'}. Best is trial 0 with value: 6.7534243245664864.[0m
[33m[W 2023-02-17 10:19:43,347][0m You need to set up the pruning feature to utilize `plot_intermediate_values()`[0m


getting into else block
getting into if block
getting into if block
getting into if block
{'max_iter': 4000, 'tol': 0.0001, 'solver': 'lsqr'}
0 : [6.7534243245664864]
{'max_iter': 2000, 'tol': 0.0001, 'solver': 'svd'}
1 : [6.753584816605659]
{'max_iter': 4000, 'tol': 0.001, 'solver': 'cholesky'}
2 : [6.753584816605664]
{'max_iter': 5000, 'tol': 0.001, 'solver': 'lsqr'}
3 : [6.753761274419008]
{'max_iter': 4000, 'tol': 0.0001, 'solver': 'svd'}
4 : [6.753584816605659]
{'max_iter': 4000, 'tol': 0.0001, 'solver': 'saga'}
5 : [6.755457102350513]
{'max_iter': 5000, 'tol': 1e-05, 'solver': 'svd'}
6 : [6.753584816605659]
{'max_iter': 2000, 'tol': 1e-05, 'solver': 'cholesky'}
7 : [6.753584816605664]
{'max_iter': 2000, 'tol': 0.001, 'solver': 'lsqr'}
8 : [6.753761274419008]
{'max_iter': 5000, 'tol': 1e-05, 'solver': 'cholesky'}
9 : [6.753584816605664]
{'max_iter': 4000, 'tol': 0.0001, 'solver': 'sparse_cg'}
10 : [6.753584756876941]
{'max_iter': 4000, 'tol': 0.0001, 'solver': 'sparse_cg'}
11 : [6