In [2]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE

# 📂DATA : 2 Classes

In [50]:
# Held-out fold: rows whose `fold` column equals this value form the validation set;
# all remaining rows are used for training. The value is also embedded in the saved model path.
fold = 3

In [51]:
# Machine-specific absolute path to the 3-fold feature table.
path = "/home/kannika/code/Rheology2023/Rheology_Blood/DataBlood_Viscosity_TrainML_3Fold_EMclass.csv"
data_feature = pd.read_csv(path)
print(data_feature.shape)
print("-"*100)
print(f"All Fold : {set(data_feature.fold)}")
# Fail fast when the requested fold is absent: otherwise the validation set
# would silently be empty and every row would end up in the training set.
if fold not in set(data_feature["fold"]):
    raise ValueError(
        f"fold={fold!r} not found in data; available folds: {sorted(set(data_feature['fold']))}"
    )
## Training set: every row that does NOT belong to the held-out fold
feature_train = data_feature[data_feature["fold"]!=fold].reset_index(drop=True)
print(f"Train Set : Fold ==> {set(feature_train.fold)}")
print("Train = ", feature_train.shape)
## Validation set: only the rows of the held-out fold
feature_test = data_feature[data_feature["fold"]==fold].reset_index(drop=True)
print(f"Validation Set : Fold ==> {set(feature_test.fold)}")
print("Validation = ", feature_test.shape)
## Preview the first training rows
feature_train.head()

(33, 11)
----------------------------------------------------------------------------------------------------
All Fold : {1, 2, 3}
Train Set : Fold ==> {1, 2}
Train =  (22, 11)
Valiadtion Set : Fold ==> {3}
Validation =  (11, 11)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Code,classes,subclass,classes_binary,MCV,MCH,Hb,typeBEvsBM,fold
0,0,42,HN44,HN,Splenectomy,1.0,61.7,19.7,7.0,E,1
1,1,17,HN21,HN,No_Splenectomy,1.0,64.8,19.7,6.5,E,1
2,2,8,HN13,HN,Splenectomy,1.0,72.6,23.4,5.8,E,1
3,3,43,HN29,HN,No_Splenectomy,1.0,64.2,19.1,6.6,E,1
4,4,50,HN39,HN,Splenectomy,1.0,76.4,22.5,6.4,E,1


In [52]:
# Predictors are the three blood indices; the target is the E/M type label.
X_train = feature_train.loc[:, ['MCV', 'MCH', 'Hb']]
y_train = feature_train.loc[:, "typeBEvsBM"]
# Shapes of both pieces, then the label of the first training row as a spot check.
print(X_train.shape, y_train.shape, sep="\n")
print(y_train[0])

(22, 3)
(22,)
E


In [53]:
# Number of distinct target labels, followed by the labels themselves.
print(len(set(y_train)))
print(set(y_train))

2
{'M', 'E'}


In [54]:
# Class balance of the raw training labels, before any resampling.
print("Before OverSampling, counts of label 'BE': {}".format((y_train == 'E').sum()))
print("Before OverSampling, counts of label 'BM': {} \n".format((y_train == 'M').sum()))

Before OverSampling, counts of label 'BE': 16
Before OverSampling, counts of label 'BM': 6 



# 💡Dealing with Class Imbalance with SMOTE

In [55]:
# Oversample the minority class with SMOTE so both labels are equally represented
# in the training data. The seed is fixed so the synthetic rows -- and therefore
# the grid search and the saved model built on them -- are reproducible after a
# kernel restart.
# NOTE(review): SMOTE's default k_neighbors=5 needs at least 6 minority samples;
# confirm every fold satisfies this before reusing the notebook with another `fold`.
sm = SMOTE(random_state=100)
# to_numpy() hands SMOTE a plain 1-D array (Series.ravel() is deprecated in recent
# pandas), so y_train_smote stays a NumPy array for the later cells.
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_smote.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_smote.shape))

print("After OverSampling, counts of label 'BE': {}".format(sum(y_train_smote=='E')))
print("After OverSampling, counts of label 'BM': {}".format(sum(y_train_smote=='M')))

After OverSampling, the shape of train_X: (32, 3)
After OverSampling, the shape of train_y: (32,) 

After OverSampling, counts of label 'BE': 16
After OverSampling, counts of label 'BM': 16


## 🩸 Hyperparameter search: Random Forest

In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
import warnings
# Silence every warning for the rest of the session; note this also hides
# deprecation and other library warnings raised by the cells that follow.
warnings.filterwarnings('ignore')

In [57]:
# Candidate values for each hyperparameter; the grid search scores every combination.
n_estimators = [400]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]

# Search space keyed by RandomForestClassifier argument name.
hyperF = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)

# Fixed seed on the base estimator so each candidate is trained deterministically.
forest = RandomForestClassifier(random_state=1)

# 10-fold cross-validated exhaustive search, using all available cores.
# NOTE(review): the search is run on the already-oversampled data, so validation
# folds can contain SMOTE-synthesised rows; the CV scores may therefore be optimistic.
gridF = GridSearchCV(
    estimator=forest,
    param_grid=hyperF,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
bestF = gridF.fit(X_train_smote, y_train_smote.ravel())

Fitting 10 folds for each of 25 candidates, totalling 250 fits


In [58]:
# Tabulate the cross-validation results, best mean score first, keeping only
# the score summary and the parameter combination of each candidate.
bestF_df = (
    pd.DataFrame(bestF.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
    .sort_values(by='mean_test_score', ascending=False)
    .reset_index(drop=True)
)
bestF_df.head()

Unnamed: 0,mean_test_score,std_test_score,params
0,0.816667,0.152753,"{'max_depth': 5, 'min_samples_split': 2, 'n_es..."
1,0.816667,0.152753,"{'max_depth': 5, 'min_samples_split': 5, 'n_es..."
2,0.816667,0.152753,"{'max_depth': 30, 'min_samples_split': 5, 'n_e..."
3,0.816667,0.152753,"{'max_depth': 30, 'min_samples_split': 2, 'n_e..."
4,0.816667,0.152753,"{'max_depth': 8, 'min_samples_split': 2, 'n_es..."


In [59]:
# Mean test score of every candidate, in the order the grid search stored them.
grid_mean_scores = bestF.cv_results_['mean_test_score']

# First candidate of the grid: its parameters and its mean test score.
print("**examine the first result","\n")
print(bestF.cv_results_['params'][0], grid_mean_scores[0], sep="\n")

# All mean test scores at once.
print("\n","**print the array of mean scores only","\n")
print(grid_mean_scores)

# Winner of the search: score, parameter combination and the refitted estimator.
print("\n","**examine the best model","\n")
print(bestF.best_score_, bestF.best_params_, bestF.best_estimator_, sep="\n")

**examine the first result 

{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 400}
0.8166666666666668

 **print the array of mean scores only 

[0.81666667 0.81666667 0.78333333 0.75       0.36666667 0.81666667
 0.81666667 0.78333333 0.75       0.36666667 0.81666667 0.81666667
 0.78333333 0.75       0.36666667 0.81666667 0.81666667 0.78333333
 0.75       0.36666667 0.81666667 0.81666667 0.78333333 0.75
 0.36666667]

 **examine the best model 

0.8166666666666668
{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 400}
RandomForestClassifier(max_depth=5, n_estimators=400, random_state=1)


In [60]:
# Print the tuned parameters and the best mean cross-validation score.
# The searched estimator is a RandomForestClassifier, so the label says so.
print("Tuned Random Forest Parameters: {}".format(bestF.best_params_))
print("Best score is {}".format(bestF.best_score_))

Tuned Decision Tree Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 400}
Best score is 0.8166666666666668


In [61]:
# Pull the winning hyperparameter values out of the grid search result.
depth, samples_split, estimators = (
    bestF.best_params_[key]
    for key in ('max_depth', 'min_samples_split', 'n_estimators')
)

> ## 🚀 Fit the model with the tuned parameters

In [62]:
# Build a forest from the tuned values and train it on the whole oversampled training set.
forestOpt = RandomForestClassifier(
    n_estimators=estimators,
    max_depth=depth,
    min_samples_split=samples_split,
    random_state=1,
)
modelOpt = forestOpt.fit(X_train_smote, y_train_smote.ravel())

In [63]:
### Sanity check: print the full parameter set of the fitted forest to confirm
### that the tuned values (max_depth, min_samples_split, n_estimators) were applied
params = modelOpt.get_params()
print(params)

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 400, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}


> ## #️⃣ Save model

In [64]:
# Experiment tag; it is interpolated into the output directory and into the
# pickle filename when the model is saved.
nameclass = "typeBEvsBM_SmoteClasses"

In [65]:
# Save the fitted model to disk as a pickle file

import pickle
import imageio

# Machine-specific output directory, one sub-folder per experiment tag and fold.
save_pathimg = f'/media/tohn/HDD/rheology2023/ML_Model/Blood_Viscosity/{nameclass}/fold{fold}'
# Create the directory tree if it is missing; do nothing when it already exists.
os.makedirs(save_pathimg, exist_ok=True)
filename = f"RFModel_Blood_{nameclass}_fold{fold}.pkl"
Model2Save = os.path.join(save_pathimg, filename)
print(f"[INFO]: Save Model as : [{Model2Save}]")

# NOTE: pickles should only be loaded from trusted sources, and the file is
# expected to be read back with a compatible scikit-learn version.
with open(Model2Save, 'wb') as file:
    pickle.dump(modelOpt, file)

[INFO]: Save Nodel as : [/media/tohn/HDD/rheology2023/ML_Model/Blood_Viscosity/typeBEvsBM_SmoteClasses/fold3/RFModel_Blood_typeBEvsBM_SmoteClasses_fold3.pkl]
