<center><h2>
<a href="https://masumbhai.me">Kindly visit my portfolio to see more of my works</a>
</h2></center>

In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dask import dataframe as dd
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score,StratifiedKFold
import joblib
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")

In [None]:
def readCSV_Function(file_path):
    """Load a CSV file through Dask and return the computed result.

    The file is read in blocks (``blocksize=1e6``), repartitioned into
    8 partitions, and then evaluated with Dask's threaded scheduler.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    The object produced by calling ``compute()`` on the Dask dataframe.
    """
    lazy_frame = dd.read_csv(file_path, blocksize=1e6).repartition(npartitions=8)
    materialised = lazy_frame.compute(scheduler='threads')
    return materialised

# Location of the RENN under-sampled dataset (machine-specific absolute path).
filePath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\For_Model_Creation\\"
fileName = "Repeated_Edited_Nearest_Neighbors_Under-sampled_Dataset.csv"
df = readCSV_Function(filePath+fileName)
# Shuffle with a fixed seed so that Restart & Run All reproduces the same row
# order; 23 is the seed value used everywhere else in this notebook.
df = shuffle(df, random_state=23)
df.shape

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Notebook-wide settings: random seed and number of Optuna trials.
SEED = 23
optuna_trial_num = 20

# Target column and feature matrix (every column except 'Label').
y = df['Label']
X = df.drop(columns='Label')

# Share of the rows taken by each distinct label value.
weights = y.value_counts()/len(df)

# Standardise the features first, then normalise the standardised samples.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_norm = preprocessing.normalize(X_std)

# Hold out 20 % of the rows; the split is made on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
)

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

imgPath = "G:\\Brig_Gen_Razzak_Sir_Thesis_Group\\Thesis-on-DDOS-main\\images\\"
imgName = "Visualizing_DDoS_using_PCA_tSNE_in_RENN_dataset.png"

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (99.9 %) of the variance.
pca = PCA(n_components=0.999)
principalComponents = pca.fit_transform(X_norm)

# 'warn' was only scikit-learn's internal deprecation sentinel for learning_rate
# and is rejected by releases that validate parameters. 200.0 is the value that
# sentinel resolved to, so the embedding settings stay the same.
# NOTE(review): very recent scikit-learn releases rename n_iter to max_iter.
tsne_ = TSNE(random_state=SEED,
             n_components=2,
             verbose=0,
             perplexity=40,
             learning_rate=200.0,
             n_iter=500,
             n_jobs=-1).fit_transform(principalComponents)

plt.figure(figsize=(16,16))
# Colours come from hue + palette and the marker size from s; the former
# cmap= and sizes= arguments were ignored in this configuration and are dropped.
sns.scatterplot(x=tsne_[:, 0], y=tsne_[:, 1], s=100,
                hue=df['Label'],
                alpha=0.7,
                legend="full", palette="deep")
plt.title('Visualizing DDoS through t-SNE (in RENN dataset)', fontsize=24)
plt.savefig(imgPath+imgName)

# Keep a deep copy of the frame as it is before the label column is replaced.
before_encoding_dataframe = df.copy(deep=True)

# Encode the label column as integer codes and store them in the smallest
# integer dtype that can hold them.
# NOTE(review): y was assigned from df['Label'] before this point; confirm
# whether the models below are meant to see the encoded or the original labels.
le = preprocessing.LabelEncoder()
label_codes = le.fit_transform(df["Label"])
df["Label"] = pd.to_numeric(label_codes, downcast='integer')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,StratifiedKFold

# max_features='auto' is deprecated and later removed in scikit-learn; for a
# DecisionTreeClassifier it meant sqrt(n_features), so 'sqrt' keeps the model
# unchanged while working on every release.
dt_classifier = DecisionTreeClassifier(criterion="entropy",
                                       min_samples_split=5,
                                       random_state=SEED,
                                       max_depth=20,
                                       max_features='sqrt')

# Stratified 8-fold split, shuffled with the notebook seed.
sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)
# Performing cross-validation (on the complete X / y, not only the training split)
scores = cross_val_score(estimator=dt_classifier, X=X, y=y, cv=sKfold,
                         scoring='accuracy', n_jobs=-1, verbose=0)

print("Decision Tree mean accuracy using cross validation of 8 fold: %0.4f " % (scores.mean()))

In [None]:
modelPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\Generated_Models\\"
modelName = "Decision_Tree_model_on_RENN.pkl"
dt_model_file = modelPath+modelName

# Train the tree on all of X / y, then write it to disk.
dt_classifier.fit(X,y)
joblib.dump(value=dt_classifier,filename=dt_model_file)

# Read the stored model back and list its attributes as a quick sanity check.
with open(dt_model_file,'rb') as model_file:
    loaded_model = joblib.load(model_file)

print(dir(loaded_model))

### Plotting the decision tree

In [None]:
# %%capture
# for not showing the huge plot inside the notebook
from sklearn import tree
import graphviz

imgPath = "G:\\Brig_Gen_Razzak_Sir_Thesis_Group\\Thesis-on-DDOS-main\\images\\"
imgName = "decision_tree_of_RENN.png"

# export_graphviz pairs class_names with the classifier's classes in ascending
# order, so the names are read from the fitted model rather than from a
# hand-written list whose order does not match dt_classifier.classes_.
labels = [str(class_label) for class_label in dt_classifier.classes_]
# A plain list of column names is accepted by every scikit-learn release.
features = list(X.columns)

# Generate a visualization of the decision tree
dot_data = tree.export_graphviz(decision_tree=dt_classifier,
                                out_file=None,
                                feature_names=features,
                                class_names=labels,
                                filled=True,
                                rounded=True,
                                proportion=True,
                                special_characters=True)
graph = graphviz.Source(dot_data,format='png')
graph
# graph.render(directory=imgPath,filename="Visualizing_DDoS_through_Decision_Tree.png")
#
# # fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
# fig = plt.figure(figsize=(15,10))
# # plt.figure(figsize=(60, 60))
# # plt.title('Visualizing DDoS through Decision Tree', fontsize=24)
# tree.plot_tree(decision_tree=dt_classifier,
#           feature_names=features,
#           class_names=labels,
#           max_depth=5,
#           rounded=True, # Rounded node edges
#           filled=True, # Adds color according to class
#           proportion=True) # Displays the proportions of class samples instead of the whole number of samples
# fig.savefig(imgPath+imgName)

In [None]:
import gc
# Run a garbage-collection pass now; because this call is the last expression
# of the cell, the value returned by gc.collect() is shown as the cell output.
gc.collect()

### Before applying hyperparameter tuning, let's see how much accuracy the model achieves without it

In [None]:
# Baseline random forest with hand-picked settings (no tuning yet).
baseline_rf_settings = dict(
    random_state=SEED,
    max_depth=10,
    bootstrap=True,
    oob_score=True,
    criterion='entropy',
    class_weight='balanced',
    n_estimators=1000,
)
rf_classifier = RandomForestClassifier(**baseline_rf_settings)

# Stratified 8-fold cross-validation on the training split.
sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)
fold_accuracies = cross_val_score(estimator=rf_classifier,
                                  X=X_train,
                                  y=y_train,
                                  scoring='accuracy',
                                  cv=sKfold,
                                  n_jobs=-1,
                                  verbose=1)
cv_accuracy = fold_accuracies.mean()
cv_accuracy

<center><h3 style="background:yellow;color:black">
Finding the best hyperparameters for Random Forest
</h3></center>

In [None]:
def objective_rf(trial):
    """Optuna objective for tuning the random forest.

    Builds a ``RandomForestClassifier`` from fixed settings plus the values
    suggested by ``trial`` and scores it with stratified 8-fold
    cross-validation on ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``n_estimators``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf`` and ``max_features``.

    Returns
    -------
    float
        Mean cross-validated accuracy over the 8 folds.
    """
    params = {
        "random_state": SEED,
        "bootstrap" : True,
        "oob_score": True,
        "verbose": 2,
        "criterion": 'entropy',
        "class_weight": 'balanced',
        "n_estimators": trial.suggest_int("n_estimators", 800, 1500, step=100),
        "max_depth" : trial.suggest_int("max_depth", 10, 20),
        "min_samples_split" : trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf" : trial.suggest_int("min_samples_leaf", 1, 10),
        # "auto" is left out: for classifiers it was an alias of "sqrt" (so it
        # only duplicated a choice) and it is rejected by scikit-learn releases
        # that removed the alias.
        "max_features" : trial.suggest_categorical("max_features", ["sqrt", "log2"])
    }

    classifier = RandomForestClassifier(**params)
    sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)
    cv_accuracy = cross_val_score(classifier, X_train, y_train,
                                  cv=sKfold,
                                  verbose=1,
                                  n_jobs=-1,
                                  scoring='accuracy').mean()

    return cv_accuracy

In [None]:
# Seeded TPE sampler for the hyper-parameter search.
sampler_rf = TPESampler(seed=SEED)

# The objective returns an accuracy, so the study maximises it.
study_rf = optuna.create_study(
    sampler=sampler_rf,
    direction="maximize",
    study_name="Random_Forest",
    load_if_exists=True,
)
study_rf.optimize(objective_rf, n_trials=optuna_trial_num)

In [None]:
# Report the best hyper-parameters and the best score found by the study
# (a stray apostrophe before "Best value" was removed from the message).
print(f"Best parameters: \n{study_rf.best_params}\nBest value: {study_rf.best_value}")

#### After hyperparameter tuning: Random Forest model training and onwards

In [None]:
from sklearn.metrics import classification_report

# Hyper-parameters used for the final random forest (fixed values, written
# out explicitly rather than read from the study object).
rf_best_params = dict(
    random_state=SEED,
    bootstrap=True,
    oob_score=True,
    criterion='entropy',
    class_weight='balanced',
    n_estimators=1000,
    max_depth=20,
    min_samples_split=8,
    min_samples_leaf=1,
    max_features='log2',
)

rf_classifier_best = RandomForestClassifier(**rf_best_params)

# Stratified 8-fold cross-validation on the training split.
sKfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=SEED)
tuned_fold_accuracies = cross_val_score(estimator=rf_classifier_best,
                                        X=X_train,
                                        y=y_train,
                                        scoring='accuracy',
                                        cv=sKfold,
                                        n_jobs=-1)
cv_accuracy_rf = tuned_fold_accuracies.mean()
print(f"Stratified 8 fold cross validated mean accuracy: {cv_accuracy_rf}")

In [None]:
modelPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\Generated_Models\\"
modelName = "Random_Forest_model_on_RENN.pkl"
rf_model_file = modelPath+modelName

# Train the tuned forest on all of X / y before writing it to disk.
rf_classifier_best.fit(X,y)

joblib.dump(value=rf_classifier_best,filename=rf_model_file)