<center><h2>
<a href="https://rebrand.ly/github-of-masum">Kindly visit my portfolio to see more of my works</a>
</h2></center>

In [4]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dask import dataframe as dd
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score, StratifiedKFold
import joblib
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import ExtraTreesClassifier

warnings.filterwarnings("ignore")

In [5]:
def readCSV_Function(file_path):
    dask_df = dd.read_csv(file_path, blocksize=1e6)
    dask_df = dask_df.repartition(npartitions=8)
    return dask_df.compute(scheduler='threads')


filePath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\For_Model_Creation\\"
fileName = "Repeated_Edited_Nearest_Neighbors_Under-sampled_Dataset.csv"
df = readCSV_Function(filePath + fileName)
df = shuffle(df)
df.shape

(206860, 43)

In [6]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

SEED = 23
optuna_trial_num = 50
weights = df['Label'].value_counts() / len(df)
y = df['Label']
X = df.drop(columns='Label')

X_std = StandardScaler().fit_transform(X)  # data standardized / scaled here
X_norm = preprocessing.normalize(X_std)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=SEED,
                                                    test_size=0.2,
                                                    shuffle=True)

#### before applying hyperparameter tuning, let just see without how much accuracy it will provide

In [17]:
et_classifier = ExtraTreesClassifier(random_state=SEED,
                                    max_depth=10,
                                    bootstrap=True,
                                    oob_score=True,
                                    criterion='entropy',
                                    class_weight='balanced',
                                    warm_start=True,
                                    n_estimators=1000)

sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=23)
# Performing cross-validation
cv_accuracy_wth = cross_val_score(estimator=et_classifier,
                         X=X_train, y=y_train,
                         cv=sKfold, scoring='accuracy',
                         n_jobs=-1, verbose=0).mean()

print("Extra Tree mean accuracy using cross validation of 8 fold: %0.4f " % cv_accuracy_wth)

Extra Tree mean accuracy using cross validation of 8 fold: 0.9332 


In [7]:
import gc

gc.collect()

42

<center><h3 style="background:yellow;color:black">
Finding out best hyper-parameter for Random Forest
</h3></center>

In [8]:
def objective_etc(trial):
    params = {
        "random_state": SEED,
        "bootstrap": True,
        "oob_score": True,
        "class_weight": 'balanced',
        "criterion": trial.suggest_categorical("criterion", ["entropy", "gini", "log_loss"]),
        "n_estimators": trial.suggest_int("n_estimators", 800, 1500, step=100),
        "max_depth": trial.suggest_int("max_depth", 8, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"])
    }

    classifier = ExtraTreesClassifier(**params)
    sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)
    cv_accuracy = cross_val_score(classifier, X_train, y_train,
                                  cv=sKfold,
                                  verbose=1,
                                  n_jobs=-1,
                                  scoring='accuracy').mean()

    return -cv_accuracy

In [9]:
sampler_etc = TPESampler(seed=SEED)
study_etc = optuna.create_study(study_name="Extra_Tree", 
                                direction="minimize", 
                                sampler=sampler_etc,
                               load_if_exists=True)
study_etc.optimize(objective_etc, n_trials=optuna_trial_num)

[32m[I 2023-02-24 02:42:23,918][0m A new study created in memory with name: Extra_Tree[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:  1.2min remaining:   44.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  1.2min finished
[32m[I 2023-02-24 02:43:38,916][0m Trial 0 finished with value: -0.9240670018369912 and parameters: {'criterion': 'gini', 'n_estimators': 1000, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: -0.9240670018369912.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.4s finished
[33m[W 2023-02-24 02:43:40,783][0m Trial 1 failed with parameters: {'criterion': 'log_loss', 'n_estimators': 1500, 'max_depth': 11, 'min_samples_split': 7, 'mi

In [10]:
print(f"Best parameters: \n{study_etc.best_params}\n\n'Best value: {study_etc.best_value}")

Best parameters: 
{'criterion': 'entropy', 'n_estimators': 1200, 'max_depth': 14, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt'}

'Best value: -0.9455730929130813


#### After Hyper parameter tuning, now Extra Tree model training & onwards

In [11]:
from sklearn.metrics import classification_report

params_etc_best = {
        "random_state": SEED,
        "bootstrap": True,
        "oob_score": True,
        "class_weight": 'balanced',
        "criterion": 'entropy',
        "n_estimators": 1200,
        "max_depth": 14,
        "min_samples_split": 6,
        "min_samples_leaf": 2,
        "max_features": 'sqrt'
    }

et_classifier_best = ExtraTreesClassifier(**params_etc_best)
sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)
cv_accuracy_rf = cross_val_score(et_classifier_best,
                                 X_train, y_train,
                                 cv=sKfold,
                                 n_jobs=-1,
                                 scoring='accuracy').mean()
print(f"Stratified 8 fold cross validated mean accuracy: {cv_accuracy_rf}")

Stratified 8 fold cross validated mean accuracy: 0.9455730929130813


In [12]:
modelPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\Generated_Models\\"
modelName = "Extra_Tree_model_on_RENN.pkl"

# Fitting the model on the full dataset
et_classifier_best.fit(X, y)

joblib.dump(value=et_classifier_best, filename=modelPath + modelName)

['D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\Generated_Models\\Extra_Tree_model_on_RENN.pkl']