In [None]:
# Show the NVIDIA GPU status for this runtime; the XGBoost models trained below are configured with tree_method="gpu_hist".
! nvidia-smi

In [None]:
!pip install optuna xgboost==1.6.1 

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
# from pytorch_tabnet import TabNetRegressor
from sklearn.metrics import accuracy_score,mean_squared_error
import optuna as opt
import os
import joblib

In [None]:
def make_save_cv_model(i,model_name,model,best_params,optim,mse_loss,trial_data,output_path="./"):
    '''Save one cross-validation fold's model and its tuning artefacts to disk.

    Everything is written into ``<output_path>/<i>_<model_name>_<optim>/``.
    The directory, including any missing parent directories, is created when
    it does not exist; files already present in it are overwritten.

    Files written:
        ``<i>_model.z``       ``model`` serialised with ``joblib.dump``
        ``model_params.txt``  ``str(best_params)``
        ``mse_loss.txt``      the text `` mse_loss :: <mse_loss>``
        ``<i>_trial_data.z``  ``trial_data`` serialised with ``joblib.dump``

    Args:
        i: fold identifier; used in the directory name and as a file prefix.
        model_name: model label used in the directory name.
        model: fitted model object to serialise.
        best_params: hyper-parameters to record (stored via ``str``).
        optim: optimisation label used in the directory name.
        mse_loss: error value to record (stored via ``str``).
        trial_data: per-trial summary to serialise.
        output_path: parent directory for the fold directory.

    Returns:
        None.
    '''
    fold_dir = os.path.join(output_path, f"{i}_{model_name}_{optim}")

    # makedirs(exist_ok=True) covers both the "already there" and the
    # "output_path itself is missing" cases; os.mkdir failed on the latter.
    os.makedirs(fold_dir, exist_ok=True)

    joblib.dump(model, os.path.join(fold_dir, f"{i}_model.z"))
    with open(os.path.join(fold_dir, "model_params.txt"), "w") as file:
        file.write(str(best_params))
    with open(os.path.join(fold_dir, "mse_loss.txt"), "w") as file:
        file.write(f" mse_loss :: {str(mse_loss)}")
    joblib.dump(trial_data, os.path.join(fold_dir, f"{i}_trial_data.z"))

In [None]:
def get_trial_data(trial) -> dict:
  '''Summarise every trial of a study as a plain dictionary.

  Despite its name, ``trial`` is the study-like object (anything exposing
  ``get_trials()``); ``train`` passes the Optuna study here. Each trial's
  parameters and values are also printed as they are collected.

  Args:
      trial: object whose ``get_trials()`` yields items with ``number``,
          ``params`` and ``values`` attributes.

  Returns:
      dict mapping trial number -> ``{"params": <trial.params>, "rmse": <trial.values>}``.
      The ``"rmse"`` entry holds the trial's ``values`` attribute unchanged.
  '''
  value_dict = {}
  for finished_trial in trial.get_trials():
    print(finished_trial.params)
    value_dict[finished_trial.number] = {"params": finished_trial.params , "rmse": finished_trial.values}
    print(f"{finished_trial.number} : {finished_trial.values}")
  return value_dict

In [None]:
def train(fold_dict,fold,model_name,sc_df,tar_col,optim,optim_trial,k_folds,tar_cols="",verbose=1):
    '''Tune an XGBoost regressor on one fold with Optuna, refit with the best setting, and save it.

    The train/test rows come from the pre-computed indices in
    ``fold_dict[fold]``. Each Optuna trial fits a GPU ``XGBRegressor`` on the
    training rows and is scored by RMSE on the test rows. The best
    hyper-parameters are then used to fit a final model on the same split,
    which is written to disk with ``make_save_cv_model``.

    Args:
        fold_dict: mapping ``fold -> {"train": <row positions>, "test": <row positions>}``.
        fold: key of the fold to run.
        model_name: label forwarded to ``make_save_cv_model``.
        sc_df: DataFrame holding the features plus the target column.
        tar_col: name of the target column in ``sc_df``.
        optim: label forwarded to ``make_save_cv_model``.
        optim_trial: number of Optuna trials to run.
        k_folds: total number of folds; only used in progress messages.
        tar_cols: unused, kept for backward compatibility.
        verbose: forwarded to ``XGBRegressor.fit`` during tuning to control
            how often the evaluation metric is printed (0 silences it).

    Returns:
        The per-trial summary produced by ``get_trial_data``.
    '''
    y = sc_df[tar_col]
    print(y.shape)
    x = sc_df.drop([tar_col],axis=1)
    print(x.shape)

    # The split does not depend on the trial, so build the arrays once instead
    # of re-slicing and converting the DataFrame inside every trial.
    train_index = fold_dict[fold]["train"]
    test_index = fold_dict[fold]["test"]
    X_train = x.iloc[train_index,:].to_numpy(dtype=np.float64)
    X_test = x.iloc[test_index,:].to_numpy(dtype=np.float64)
    Y_train = y.iloc[train_index].to_numpy(dtype=np.float64)
    Y_test = y.iloc[test_index].to_numpy(dtype=np.float64)
    print(X_train.shape)
    print(Y_train.shape)
    print(X_test.shape)
    print(Y_test.shape)

    def build_regressor(n_estimators, learning_rate, booster):
        # Single construction site so the tuned and the final model always
        # share the same fixed settings (GPU training, RMSE eval metric).
        return XGBRegressor(n_estimators=n_estimators,
                            learning_rate=learning_rate,
                            booster=booster,
                            tree_method="gpu_hist",
                            predictor="gpu_predictor",
                            eval_metric="rmse")

    def rmse_on_test(model):
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` argument is no longer available in recent scikit-learn.
        return float(np.sqrt(mean_squared_error(Y_test, model.predict(X_test))))

    def objective(trial):
        clf = build_regressor(
            n_estimators=trial.suggest_categorical("xgb_est",[4500,5000,6000]),
            learning_rate=trial.suggest_categorical("xgb_lr",[0.01,3e-4,0.1]),
            booster=trial.suggest_categorical("xgb_booster",["gbtree","gblinear","dart"]))
        clf.fit(X_train, Y_train,
                eval_set=[(X_test, Y_test)],
                verbose=verbose)
        return rmse_on_test(clf)

    print(f"Starting optimization for fold : [{fold}/{k_folds}]")
    study = opt.create_study(direction='minimize')
    study.optimize(objective, n_trials=optim_trial)
    best_params = study.best_params
    trial_data = get_trial_data(study)
    print(f" Best params for fold : [{fold}/{k_folds}]")
    print(best_params)

    # best_params is keyed by the Optuna names ("xgb_est", "xgb_lr",
    # "xgb_booster"), which are not XGBoost arguments. Passing the dict with
    # ** left the final model on default hyper-parameters, so map explicitly.
    clf_model = build_regressor(n_estimators=best_params["xgb_est"],
                                learning_rate=best_params["xgb_lr"],
                                booster=best_params["xgb_booster"])
    clf_model.fit(X_train,Y_train)
    error = rmse_on_test(clf_model)

    print("[++] Saving the model and parameters in corresponding directories")
    make_save_cv_model(fold,model_name,clf_model,best_params,optim,mse_loss=error,trial_data=trial_data)
    return trial_data

In [None]:
# Inputs: the scaled training table and the pre-computed fold indices.
use_df = pd.read_csv("../input/sclaed-perov-trainable/scaled_trainable.csv")
fold_dict = joblib.load("../input/sclaed-perov-trainable/fold_data_export.z")

# Run configuration.
tar_col = "JV_default_PCE_numeric"   # target column in use_df
model_name = "xg_boost_reg"          # label used in the output directory name
optim = "no_optim"                   # label used in the output directory name
folds = [0]                          # fold keys to run
k_folds = 20                         # total fold count, shown in progress messages
num_trials = 20                      # Optuna trials per fold

for num_fold in folds:
    trial_data = train(
        fold_dict=fold_dict,
        fold=num_fold,
        model_name=model_name,
        sc_df=use_df,
        tar_col=tar_col,
        optim=optim,
        optim_trial=num_trials,
        k_folds=k_folds,
    )
    # One line per trial: trial number followed by its recorded score.
    for trial_number, trial_summary in trial_data.items():
        print(f"{trial_number}: {trial_summary['rmse']}")
    print(f"[++] Ended the training process for fold {num_fold}")