In [41]:
# Ask the NVIDIA driver for GPU status to see whether a GPU is attached to this runtime.
! nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [42]:
!pip install pytorch-tabnet optuna xgboost==1.6.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [50]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,classification_report
import optuna as opt
import torch
import os
import joblib

In [51]:
def make_save_cv_model(i,model_name,model,best_params,optim,clf_report,accuracy,output_path="./drive/MyDrive/SOLAR_CELL/ML_PROCESSED_DATA/outputs/xg_boost_output"):

    ''' Save a cross-validation model and its evaluation artefacts.

    Everything is written into the directory ``<output_path>/<i>_<model_name>_<optim>``,
    which is created (together with any missing parent directories) when it
    does not exist yet. Existing files in that directory are overwritten.

    Files written:
        <i>_model.z                -- the model, serialised with joblib
        model_params.txt           -- str(best_params)
        classification_report.txt  -- str(clf_report)
        accuracy_score.txt         -- " accuracy :: <accuracy>"

    Parameters:
        i            : fold identifier, used in the directory and model file names
        model_name   : model label, used in the directory name
        model        : fitted model object to serialise
        best_params  : hyper-parameters to record
        optim        : optimiser label, used in the directory name
        clf_report   : classification report to record
        accuracy     : accuracy value to record
        output_path  : root directory under which the run directory is created
    '''

    run_dir = os.path.join(output_path,f"{i}_{model_name}_{optim}")
    # makedirs + exist_ok covers both "already there" and "parents missing"
    # without a separate exists() check.
    os.makedirs(run_dir, exist_ok=True)

    joblib.dump(model, os.path.join(run_dir,f"{i}_model.z"))

    text_artefacts = {
        "model_params.txt": str(best_params),
        "classification_report.txt": str(clf_report),
        "accuracy_score.txt": f" accuracy :: {str(accuracy)}",
    }
    for file_name, content in text_artefacts.items():
        with open(os.path.join(run_dir, file_name),"w") as file:
            file.write(content)

In [52]:
def train(fold_dict,fold,model_name,sc_df,tar_col,optim_trial,k_folds=10,tar_cols="",verbose=1,optim="optuna"):

    ''' Tune and fit an XGBoost classifier on one pre-computed cross-validation fold.

    An Optuna study searches over n_estimators, learning_rate and booster,
    scoring every trial by accuracy on the fold's test split. A final model
    is then fitted on the train split with the best parameters, evaluated on
    the test split, and handed to ``make_save_cv_model`` for saving.

    Parameters:
        fold_dict    : mapping such that fold_dict[fold]["train"] and
                       fold_dict[fold]["test"] are positional row indices into sc_df
        fold         : key of the fold to run
        model_name   : model label, forwarded to make_save_cv_model
        sc_df        : DataFrame holding the features plus the target column
        tar_col      : name of the target column in sc_df
        optim_trial  : number of Optuna trials
        k_folds      : total number of folds; only used in progress messages
        tar_cols     : currently unused
        verbose      : currently unused
        optim        : optimiser label, forwarded to make_save_cv_model where it
                       becomes part of the output directory name

    Returns:
        None. Saving is best-effort: a failure is reported on stdout, not raised.

    NOTE(review): the same test split is used both to pick the hyper-parameters
    and to produce the final report, so the reported accuracy is optimistic.
    '''

    y = sc_df[tar_col]
    print(y.shape)
    x = sc_df.drop([tar_col],axis=1)
    print(x.shape)

    # The split is fixed for the whole call, so the arrays are built once
    # instead of once per trial and once more for the final fit.
    train_index = fold_dict[fold]["train"]
    test_index = fold_dict[fold]["test"]
    X_train = x.iloc[train_index,:].to_numpy(dtype=np.float64)
    X_test = x.iloc[test_index,:].to_numpy(dtype=np.float64)
    Y_train = y.iloc[train_index].to_numpy(dtype=np.float64)
    Y_test = y.iloc[test_index].to_numpy(dtype=np.float64)

    # Rows of the classification report are fixed to labels 0..5.
    class_labels = list(range(6))

    def build_classifier(n_estimators, learning_rate, booster):
        # Single place that maps the tuned values onto real XGBClassifier arguments,
        # so the trials and the final model are configured identically.
        return XGBClassifier(n_estimators=n_estimators,
                             learning_rate=learning_rate,
                             booster=booster,
                             tree_method="auto",
                             predictor="cpu_predictor")

    def objective(trial):
        clf = build_classifier(
            n_estimators=trial.suggest_categorical("xgb_est",[100,200,300,400,500]),
            learning_rate=trial.suggest_categorical("xgb_lr",[0.1,0.01,0.001]),
            booster=trial.suggest_categorical("xgb_booster",["gbtree","gblinear","dart"]))
        clf.fit(X_train, Y_train,
                eval_set=[(X_test, Y_test)],
                eval_metric=['mlogloss'])
        Y_pred = clf.predict(X_test)
        print(classification_report(Y_test, Y_pred, labels=class_labels))
        return accuracy_score(Y_pred, Y_test)

    print(f"Starting optimization for fold : [{fold}/{k_folds}]")
    print(X_train.shape)
    print(Y_train.shape)
    print(X_test.shape)
    print(Y_test.shape)
    study = opt.create_study(direction='maximize')
    study.optimize(objective, n_trials=optim_trial)
    best_params = study.best_params
    print(f" Best params for fold : [{fold}/{k_folds}]")
    print(best_params)

    # best_params is keyed by the Optuna suggestion names ("xgb_est", ...), which
    # are not XGBClassifier arguments; translate them explicitly so the final
    # model really uses the tuned values.
    clf_model = build_classifier(n_estimators=best_params["xgb_est"],
                                 learning_rate=best_params["xgb_lr"],
                                 booster=best_params["xgb_booster"])
    clf_model.fit(X_train,Y_train)
    Y_pred = clf_model.predict(X_test)
    clf_report = classification_report(Y_test, Y_pred, labels=class_labels)
    accuracy = accuracy_score(Y_pred, Y_test)
    try:
        print("[++] Saving the model and parameters in corresponding directories")
        # `optim` must be passed in its positional slot; leaving it out shifts
        # every following argument and makes the call fail.
        make_save_cv_model(fold,model_name,clf_model,best_params,optim,clf_report,accuracy)
    except Exception as err:
        # Saving stays best-effort, but the reason is surfaced instead of hidden.
        print(f"[-] Failed to save the model: {err!r}")

**Total number of folds: 20**

In [53]:
# Target column, and the model label that is used when naming saved outputs.
tar_col = "PCE_categorical"
model_name = "xg_boost"

# Key of the cross-validation fold handled by this run.
fold = 0

# Dataset read from CSV, followed by the saved per-fold split object.
use_df = pd.read_csv(
    "./drive/MyDrive/SOLAR_CELL/ML_PROCESSED_DATA/outputs/data/trainable_scaled_balanced.csv"
)
fold_dict = joblib.load(
    "./drive/MyDrive/SOLAR_CELL/ML_PROCESSED_DATA/inputs/fold_vals/fold_data_xg_boost.z"
)

In [54]:
# Show how many row indices fold 0 holds in its train and test splits.
for split_name in ("train", "test"):
    print(len(fold_dict[0][split_name]))

44004
2316


In [None]:
# Tune and fit the selected fold using 15 Optuna trials, then report completion.
train(fold_dict, fold, model_name, sc_df=use_df, tar_col=tar_col, optim_trial=15)
print(f"[++] Ended the training process for fold {fold}")

[32m[I 2022-06-03 12:15:17,930][0m A new study created in memory with name: no-name-d42c63e0-d5df-4b19-9c82-82cfc66a6272[0m


(46320,)
(46320, 114)
Starting optimization for fold : [0/10]
(44004, 114)
(44004,)
(2316, 114)
(2316,)
[0]	validation_0-mlogloss:1.75719
[1]	validation_0-mlogloss:1.72567
[2]	validation_0-mlogloss:1.69701
[3]	validation_0-mlogloss:1.67098
[4]	validation_0-mlogloss:1.64735
[5]	validation_0-mlogloss:1.62587
[6]	validation_0-mlogloss:1.60633
[7]	validation_0-mlogloss:1.58852
[8]	validation_0-mlogloss:1.57225
[9]	validation_0-mlogloss:1.55735
[10]	validation_0-mlogloss:1.54367
[11]	validation_0-mlogloss:1.53107
[12]	validation_0-mlogloss:1.51944
[13]	validation_0-mlogloss:1.50866
[14]	validation_0-mlogloss:1.49865
[15]	validation_0-mlogloss:1.48932
[16]	validation_0-mlogloss:1.48062
[17]	validation_0-mlogloss:1.47246
[18]	validation_0-mlogloss:1.46481
[19]	validation_0-mlogloss:1.4576
[20]	validation_0-mlogloss:1.4508
[21]	validation_0-mlogloss:1.44437
[22]	validation_0-mlogloss:1.43829
[23]	validation_0-mlogloss:1.4325
[24]	validation_0-mlogloss:1.42701
[25]	validation_0-mlogloss:1.42176

Fold 0 has started running on 20-05-22 


Fold 0 has completed successfully at 17:00 on 20-05-22

Fold 1 has started running at 15:15 21-05-22

Fold 2 has started running at 09:45 22-05-22

Fold 2 has completed successfully at 10:58 on 22-05-22

Fold 3 has started running at 18:40 22-05-22

Fold 3 has completed successfully on 22-05-22

Fold 4 completed successfully at 21:04 on 22-05-22

Fold 5 started at 18:21 on 23-05-22

Fold 5 completed successfully at 19:44 on 23-05-22

Fold 6 started at 12:53 on 24-05-22

Fold 6 has completed at 14:14 on 24-05-22

Fold 7 started at 14:18 on 24-05-22

Fold 7 execution failed due to colab gpu time limit

Fold 7 trial 1 started at 11:00 on 25-05-22

Fold 7 has completed successfully at 12:14 on 25-05-22

Fold 8 has started at 9:38 on 26-05-22

Fold 8 failed due to an interrupted internet connection

Fold 8 trial 1 started at 13:38 on 26-05-22

Fold 8 has successfully executed at 15:33 on 26-05-22

Fold 9 has started at 13:35 on 27-05-22

Fold 9 has completed at 14:55 on 27-05-22

Re-running with the rectified dataset, which no longer contains the duplicates caused by whitespace values

Fold 0 started at 13:21 on 28-05-22

Fold 0 completed successfully at 14:46 on 28-05-22

Fold 1 failed to run due to an index error

_____ Restarting the training process again due to data distribution failure____



Fold 0 started at 10:47 on 30-05-22

Fold 0 completed successfully at 12:30 on 30-05-22

Fold 1 started at 8:27 on 31-05-22

Fold 1 execution failed due to runtime disconnection

Fold 1 started again at 9:38 on 31-05-22

Fold 1 execution failed due to gpu disconnect 

Fold 1 started again at 8:36 on 1-06-22

Fold 1 execution failed due to network disconnection

Fold 1 started again at 13:11 on 01-06-22

Fold 1 has successfully executed at 14:25 on 01-06-22

Fold 2 started at 14:29 on 01-06-22

Fold 2 completed successfully at 16:00 on 01-06-22

Fold 3 started at 09:43 on 02-06-22

Fold 3 execution failed due to gpu server disconnection 

Fold 3 started again at 13:34 on 02-06-22

Fold 3 ran successfully at 14:47 on 02-06-22

Fold 4 started at 14:48 on 02-06-22

Fold 4 ran successfully at 16:01 on 02-06-22

Fold 5 started at 13:51 on 03-06-22

Fold 5 ran successfully at 15:06 on 03-06-22

Fold 6 started at 15:11 on 03-06-22