In [1]:
# Import Library
import pandas as pd
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
from sklearn import svm
from sklearn.model_selection import cross_validate

from imblearn.over_sampling import SMOTE

In [2]:
import os
# Select the GPU for this session: order devices by PCI bus id and make only device "0" visible.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# 📂DATA : 2 Classes

In [3]:
# Held-out fold id: rows whose `fold` column equals this value form the validation set,
# all other rows are used for training. It is also embedded in the saved model's path.
fold = 3

In [4]:
# Read the pre-folded feature table and report which fold ids it contains.
path = "/home/kannika/code/Rheology2023/Rheology_Blood/DataBlood_Viscosity_TrainML_3Fold_EMclass.csv"
data_feature = pd.read_csv(path)
print(data_feature.shape)
print("-"*100)
print(f"All Fold : {set(data_feature['fold'])}")

# One boolean mask drives both splits: held-out fold -> validation, the rest -> training.
is_heldout = data_feature["fold"] == fold
feature_train = data_feature[~is_heldout].reset_index(drop=True)
feature_test = data_feature[is_heldout].reset_index(drop=True)

## Training split summary
print(f"Train Set : Fold ==> {set(feature_train['fold'])}")
print("Train = ", feature_train.shape)
## Validation split summary
print(f"Valiadtion Set : Fold ==> {set(feature_test['fold'])}")
print("Validation = ", feature_test.shape)
## Preview the training rows
feature_train.head()

(33, 11)
----------------------------------------------------------------------------------------------------
All Fold : {1, 2, 3}
Train Set : Fold ==> {1, 2}
Train =  (22, 11)
Valiadtion Set : Fold ==> {3}
Validation =  (11, 11)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Code,classes,subclass,classes_binary,MCV,MCH,Hb,typeBEvsBM,fold
0,0,42,HN44,HN,Splenectomy,1.0,61.7,19.7,7.0,E,1
1,1,17,HN21,HN,No_Splenectomy,1.0,64.8,19.7,6.5,E,1
2,2,8,HN13,HN,Splenectomy,1.0,72.6,23.4,5.8,E,1
3,3,43,HN29,HN,No_Splenectomy,1.0,64.2,19.1,6.6,E,1
4,4,50,HN39,HN,Splenectomy,1.0,76.4,22.5,6.4,E,1


In [5]:
# Model inputs are the MCV, MCH and Hb columns; the target is the typeBEvsBM label.
feature_columns = ['MCV', 'MCH', 'Hb']
X_train = feature_train[feature_columns]
y_train = feature_train["typeBEvsBM"]
# Shapes of both, followed by the first label as a quick sanity check.
print(X_train.shape, y_train.shape, y_train[0], sep="\n")

(22, 3)
(22,)
E


In [6]:
# Distinct target labels present in the training split: how many, and which ones.
train_labels = set(y_train)
print(len(train_labels))
print(train_labels)

2
{'M', 'E'}


In [7]:
# Class balance of the training labels before any resampling.
count_be = (y_train == 'E').sum()
count_bm = (y_train == 'M').sum()
print("Before OverSampling, counts of label 'BE': {}".format(count_be))
print("Before OverSampling, counts of label 'BM': {} \n".format(count_bm))

Before OverSampling, counts of label 'BE': 16
Before OverSampling, counts of label 'BM': 6 



# 💡Dealing with Class Imbalance with SMOTE

In [8]:
# Oversample the minority class with SMOTE so both labels are equally represented.
# random_state is fixed so the synthetic samples -- and every model trained on them --
# are reproducible across Restart & Run All.
smote = SMOTE(random_state=100)
# .to_numpy() yields the same 1-D label array as the deprecated Series.ravel().
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_smote.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_smote.shape))

print("After OverSampling, counts of label 'BE': {}".format(sum(y_train_smote=='E')))
print("After OverSampling, counts of label 'BM': {}".format(sum(y_train_smote=='M')))

After OverSampling, the shape of train_X: (32, 3)
After OverSampling, the shape of train_y: (32,) 

After OverSampling, counts of label 'BE': 16
After OverSampling, counts of label 'BM': 16


## 🩸 Hyperparameter tuning: XGBoost

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from the ML libraries -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [10]:
from xgboost import XGBClassifier

## Candidate values explored by the grid search
learning_rates = [0.01, 0.1]
max_depths = [5, 10]
gammas = [0, 0.5, 0.8]
param_grid = dict(gamma=gammas, max_depth=max_depths, learning_rate=learning_rates)

## Base estimator shared by every candidate in the grid
xgboost = XGBClassifier(
    random_state=1,
    tree_method='gpu_hist',
    objective='binary:logistic',
)

## Exhaustive search with 10-fold cross-validation on the SMOTE-resampled training data.
# NOTE(review): the folds are cut from data that was oversampled beforehand, so synthetic
# points can land in the validation folds -- the CV scores are likely optimistic; confirm
# the model on the held-out fold.
grid_search = GridSearchCV(estimator=xgboost, param_grid=param_grid, cv=10, n_jobs=-1)
XGboostModel = grid_search.fit(X_train_smote, y_train_smote.ravel())



In [11]:
# Tabulate the grid-search results, best-ranked parameter sets first.
summary_columns = ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']
best_xgboostdf = (
    pd.DataFrame(XGboostModel.cv_results_)[summary_columns]
    .sort_values(by='rank_test_score', ascending=True)
    .reset_index(drop=True)
)
best_xgboostdf.head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
0,1,0.808333,0.217466,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth..."
1,1,0.808333,0.217466,"{'gamma': 0, 'learning_rate': 0.01, 'max_depth..."
2,1,0.808333,0.217466,"{'gamma': 0, 'learning_rate': 0.1, 'max_depth'..."
3,1,0.808333,0.217466,"{'gamma': 0, 'learning_rate': 0.1, 'max_depth'..."
4,1,0.808333,0.217466,"{'gamma': 0.5, 'learning_rate': 0.01, 'max_dep..."


In [12]:
cv_results = XGboostModel.cv_results_

# First parameter combination in the grid and its mean CV score
print("**examine the first result","\n")

print(cv_results['params'][0], cv_results['mean_test_score'][0], sep="\n")

# Mean CV score of every parameter combination
print("\n","**print the array of mean scores only","\n")

grid_mean_scores = cv_results['mean_test_score']
print(grid_mean_scores)

# Best score, the parameters that achieved it, and the corresponding estimator
print("\n","**examine the best model","\n")

print(
    XGboostModel.best_score_,
    XGboostModel.best_params_,
    XGboostModel.best_estimator_,
    sep="\n",
)

**examine the first result 

{'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5}
0.8083333333333332

 **print the array of mean scores only 

[0.80833333 0.80833333 0.80833333 0.80833333 0.80833333 0.80833333
 0.80833333 0.80833333 0.80833333 0.80833333 0.80833333 0.80833333]

 **examine the best model 

0.8083333333333332
{'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=24,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbo

In [13]:
# Print the tuned hyperparameters and the best cross-validation score.
# The label names XGBoost because the tuned estimator is an XGBClassifier.
print("Tuned XGBoost Parameters: {}".format(XGboostModel.best_params_))
print("Best score is {}".format(XGboostModel.best_score_))

Tuned Decision Tree Parameters: {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5}
Best score is 0.8083333333333332


In [14]:
# Pull the winning hyperparameters out of the search into individual names for the final fit.
best_params_ = XGboostModel.best_params_
gamma_, learning_rate_, max_depth_ = (
    best_params_[key] for key in ('gamma', 'learning_rate', 'max_depth')
)

> ## 🚀 Fit the model with the tuned parameters

In [15]:
## Train the final XGBoost model on the oversampled training set with the tuned hyperparameters
tuned_kwargs = dict(gamma=gamma_, learning_rate=learning_rate_, max_depth=max_depth_)
XGBOOST = XGBClassifier(random_state=1, tree_method='gpu_hist', **tuned_kwargs)
modelXGBOOST = XGBOOST.fit(X_train_smote, y_train_smote.ravel())
modelXGBOOST



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=24,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [16]:
### Sanity check: show the full parameter set of the fitted model
fitted_params = modelXGBOOST.get_params()
print(fitted_params)

{'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': 0, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 100, 'n_jobs': 24, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 1, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'gpu_hist', 'validate_parameters': 1, 'verbosity': None}


> ## #️⃣ Save model

In [17]:
# Experiment tag: used in both the output directory and the file name of the saved model.
nameclass = "typeBEvsBM_SmoteClasses"

In [18]:
#Save Model
import os
import pickle
import imageio

# Destination directory for this experiment tag and fold.
save_pathimg = f'/media/tohn/HDD/rheology2023/ML_Model/Blood_Viscosity/{nameclass}/fold{fold}'
## Create the output directory (no error if it already exists)
os.makedirs(save_pathimg, exist_ok=True)
filename = f"XGboost_Model_{nameclass}_fold{fold}.pkl"
Model2Save = os.path.join(save_pathimg, filename)

# Write the pickle first; report success only after the file has actually been written,
# so a failed dump is never announced as done.
with open(Model2Save, 'wb') as file:
    pickle.dump(modelXGBOOST, file)
print(f"[INFO]: Done!! Save Model as : {Model2Save}")

[INFO]: Done!! Save Model as : /media/tohn/HDD/rheology2023/ML_Model/Blood_Viscosity/typeBEvsBM_SmoteClasses/fold3/XGboost_Model_typeBEvsBM_SmoteClasses_fold3.pkl
