<a href="https://colab.research.google.com/github/PinKem253/Baseline_framework/blob/main/merged_framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [None]:
import os
import time
import random
import warnings
from collections import Counter
from typing import Dict, List

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision.transforms import ToTensor, Lambda, Compose

import tensorflow as tf
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc,
    confusion_matrix, classification_report, matthews_corrcoef, average_precision_score
)
from sklearn.metrics import ConfusionMatrixDisplay

# Machine Learning Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.mixture import GaussianMixture

# Outlier Detection
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Advanced ML Models
!pip install xgboost
!pip install lightgbm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Imbalanced Data Handling
!pip install imblearn
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

warnings.filterwarnings('ignore')

try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    pass


try:
  from imblearn.over_sampling import SMOTE
except:
  !pip install imblearn
  from imblearn.over_sampling import SMOTE

from collections import Counter

Mounted at /content/drive


# Data preprocessing

In [None]:
def OC_preprocess(datapath:str,
                  train_size:float,
                  corr_threshold:float = 0.05,
                  ):
  """Load a CSV and build a one-class (novelty detection) train/test split.

  The last column of the file is used as the label. Rows whose label contains
  "BENIGN" (case-insensitive) are encoded as 1, every other row as -1. The
  training set holds benign rows only; the test set holds all attack rows
  plus the benign rows held out by ``train_test_split``.

  Args:
    datapath: Path of the CSV file to read.
    train_size: ``train_size`` passed to ``train_test_split`` for benign rows.
    corr_threshold: Features whose absolute correlation with the encoded
      label is below this value are dropped.

  Returns:
    ``(X_train, X_test, y_train, y_test)``. The feature sets are standardized
    with a scaler fitted on the training features only; the labels are pandas
    Series containing 1 / -1.

  Raises:
    ValueError: If no row is labelled as benign.
  """
  target_column = "Label"

  # Read data and rename the last column
  df = pd.read_csv(datapath)
  df = df.rename(columns={df.columns[-1]: target_column})

  # Clean data. drop_duplicates() returns a new frame, so its result must be
  # assigned back; calling reset_index(inplace=True) on it changes nothing.
  df = df.drop_duplicates()
  df = df.replace([-np.inf, np.inf], np.nan).dropna().reset_index(drop=True)

  # Drop categorical feature columns (the label column is kept)
  cate_col = [col for col in df.columns if df[col].dtype == 'object' and col != target_column]
  df = df.drop(columns=cate_col)

  # Encoding for the one-class task: benign -> 1, anything else -> -1.
  # astype(str) keeps this working when the label column is not textual.
  is_benign = df[target_column].astype(str).str.upper().str.contains("BENIGN", regex=False)
  if not is_benign.any():
    raise ValueError(f"No BENIGN rows found in {datapath}")
  df[target_column] = np.where(is_benign, 1, -1)

  # Remove unimportant features. Only the correlation with the label is
  # needed, so the full feature-by-feature matrix is not computed.
  features = df.drop(columns=target_column)
  target_correlations = features.corrwith(df[target_column])
  unimportant_features = target_correlations[target_correlations.abs() < corr_threshold].index
  df = df.drop(columns=unimportant_features)

  # Train set: benign only. Test set: all attacks plus held-out benign rows.
  benign_rows = df[df[target_column] == 1]
  attack_rows = df[df[target_column] == -1]

  X_train, X_benign_test, y_train, y_benign_test = train_test_split(
      benign_rows.drop(columns=target_column),
      benign_rows[target_column],
      random_state=42,
      train_size=train_size,
      shuffle=True,
  )
  X_test = pd.concat([attack_rows.drop(columns=target_column), X_benign_test], axis=0).reset_index(drop=True)
  y_test = pd.concat([attack_rows[target_column], y_benign_test], axis=0).reset_index(drop=True)

  # Scaling data (statistics come from the training features only)
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train,X_test,y_train,y_test


In [None]:
def MND_preprocess(datapath:str,
                   train_size:float,
                   corr_threshold:float = 0.05,
                   ):
  """Load a CSV and build a multiclass train/test split.

  The last column of the file is used as the label. Rows whose label contains
  "BENIGN" (case-insensitive) are encoded as 0; the remaining upper-cased
  labels are encoded as 1, 2, ... in sorted order.

  Args:
    datapath: Path of the CSV file to read.
    train_size: ``train_size`` passed to ``train_test_split``.
    corr_threshold: Features whose absolute correlation with the encoded
      label is below this value are dropped.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` as unscaled pandas objects; the
    labels are integers. No resampling or scaling is applied here.
  """
  target_column = "Label"

  # Read data and rename the last column
  df = pd.read_csv(datapath)
  df = df.rename(columns={df.columns[-1]: target_column})

  # Clean data. drop_duplicates() returns a new frame, so its result must be
  # assigned back; calling reset_index(inplace=True) on it changes nothing.
  df = df.drop_duplicates()
  df = df.replace([-np.inf, np.inf], np.nan).dropna().reset_index(drop=True)

  # Drop categorical feature columns (the label column is kept)
  cate_col = [col for col in df.columns if df[col].dtype == 'object' and col != target_column]
  df = df.drop(columns=cate_col)

  # Encode Benign as 0, attacks from 1 onwards. The codes are written into a
  # fresh integer Series so the label column ends up with a numeric dtype
  # instead of an object column mixing ints and strings.
  labels = df[target_column].astype(str).str.upper()
  is_attack = ~labels.str.contains("BENIGN", regex=False)
  encoded = pd.Series(0, index=df.index, dtype=int)
  if is_attack.any():
    label_encoder = LabelEncoder()
    encoded[is_attack] = label_encoder.fit_transform(labels[is_attack]) + 1
  df[target_column] = encoded

  # Remove unimportant features. Only the correlation with the label is
  # needed, so the full feature-by-feature matrix is not computed.
  features = df.drop(columns=target_column)
  target_correlations = features.corrwith(df[target_column])
  unimportant_features = target_correlations[target_correlations.abs() < corr_threshold].index
  df = df.drop(columns=unimportant_features)

  # Separate train and test
  X = df.drop(columns=target_column)
  y = df[target_column]

  X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,train_size=train_size,shuffle=True)

  return X_train,X_test,y_train,y_test


# Evaluation metrics and model factory

In [None]:
def evaluation_metrics(model, X_test, y_test, y_pred):
    """Score predictions against the true labels.

    ``model`` and ``X_test`` are accepted but not used; only ``y_test`` and
    ``y_pred`` enter the computation.

    Returns:
        A dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1' and
        'MCC'. Precision, recall and F1 are macro-averaged.
    """
    # The three scores that share the macro-averaging setting
    macro_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )

    scores = {'Accuracy': accuracy_score(y_test, y_pred)}
    for label, scorer in macro_scorers:
        scores[label] = scorer(y_test, y_pred, average="macro")
    scores['MCC'] = matthews_corrcoef(y_test, y_pred)
    return scores


In [None]:
def get_model(model_name):
  """Return a new, unfitted estimator for ``model_name``.

  Only the requested estimator is constructed: the table stores zero-argument
  factories instead of instances, so a call does not build every model.

  Args:
    model_name: One of the keys of the factory table below.

  Returns:
    A freshly constructed estimator with default parameters
    (``LocalOutlierFactor`` is created with ``novelty=True``).

  Raises:
    ValueError: If ``model_name`` is not a known key.
  """
  model_factories = {
      #OneClass task
      "oneclass_svm": lambda: OneClassSVM(),
      "iso_forest": lambda: IsolationForest(),
      "lof": lambda: LocalOutlierFactor(novelty=True),
      "robust_covariance": lambda: EllipticEnvelope(),
      "gaussian_mixture": lambda: GaussianMixture(),

      #MND
      "random_forest": lambda: RandomForestClassifier(),
      "svm": lambda: SVC(),
      "knn": lambda: KNeighborsClassifier(),
      "xgb": lambda: XGBClassifier(),
      "lgb": lambda: LGBMClassifier(),
      "logistic_regression": lambda: LogisticRegression(),
      "decision_tree": lambda: DecisionTreeClassifier(),
      "gradient_boost": lambda: GradientBoostingClassifier(),
      "naive_bayes": lambda: GaussianNB(),
  }

  try:
    factory = model_factories[model_name]
  except KeyError:
    raise ValueError(f"{model_name} not found") from None
  return factory()


# Training functions for each task

In [None]:
def oneclass_train(dataset:Dict,
                  train_size:float,
                  models:List,
                  output_result:str,
                  task: str,
                  ):
  """Fit and score every one-class model on every dataset.

  For each ``title -> path`` entry in ``dataset`` the data is prepared with
  ``OC_preprocess``, each model named in ``models`` is fitted and used to
  predict the test set, and one row per model (metrics, timings, dataset,
  task and model name) is printed and written to ``output_result``. The file
  is created with a header if it does not exist, otherwise rows are appended.
  """
  separator = "-"*100

  for dataset_title, dataset_path in dataset.items():
    rows = []

    #Benign:1 and Attack:-1
    X_train, X_test, y_train, y_test = OC_preprocess(dataset_path, train_size)

    for model_name in models:
      estimator = get_model(model_name)
      print(separator)
      print(f"Start training using model {model_name}")

      # Wall-clock time spent fitting
      tic = time.time()
      estimator.fit(X_train, y_train)
      fit_seconds = time.time() - tic

      # Wall-clock time spent predicting
      tic = time.time()
      predictions = estimator.predict(X_test)
      predict_seconds = time.time() - tic
      print("End training")

      row = evaluation_metrics(estimator, X_test, y_test, predictions)
      row["Dataset"] = dataset_title
      row["Task"] = task
      row["Model"] = model_name
      row["Time train"] = fit_seconds
      row["Time test"] = predict_seconds
      rows.append(row)

    summary = pd.DataFrame(rows)
    print(summary)

    # New file: write with header. Existing file: append without header.
    already_there = os.path.exists(output_result)
    summary.to_csv(
        output_result,
        mode='a' if already_there else 'w',
        index=False,
        header=not already_there,
    )


In [None]:
def multiclass_novelty(dataset: Dict,
                      train_size: float,
                      models: List,
                      output_result: str,
                      task: str,
                      ):
  """Leave-one-attack-class-out experiment for multiclass classifiers.

  For each dataset the data is prepared with ``MND_preprocess``. Then, for
  every attack class (label other than 0) present in the training split, that
  class is removed from the training data, relabelled as -1 in the evaluation
  labels, and each model in ``models`` is fitted and scored. The training
  data is oversampled with SMOTE and both feature sets are standardized with
  a scaler fitted on the (resampled) training features.

  NOTE: the models are fitted without any -1 label, so they cannot predict
  -1; every sample of the held-out class is therefore scored as an error.

  Args:
    dataset: Mapping of dataset title to CSV path.
    train_size: ``train_size`` forwarded to ``MND_preprocess``.
    models: Model names understood by ``get_model``.
    output_result: CSV file the result rows are written/appended to.
    task: Task name stored in the "Task" column.
  """
  for dataset_title,dataset_path in dataset.items():
    X_train,X_test,y_train,y_test = MND_preprocess(dataset_path,train_size)

    # Work on integer labels throughout (cast once instead of per model)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    # Iterate over the attack labels that really occur in the training split
    # rather than assuming they form the contiguous range 1..K.
    attack_classes = sorted(label for label in y_train.unique() if label != 0)

    for attack in attack_classes:
      results = []
      print("-"*100)

      #exclude attack from trainset
      keep = y_train != attack
      X_train_fit = X_train[keep]
      y_train_fit = y_train[keep]

      #mark excluded attack as -1
      y_valid = y_test.copy()
      y_valid[y_valid == attack] = -1

      #Resampling
      smote = SMOTE()
      X_train_fit,y_train_fit = smote.fit_resample(X_train_fit,y_train_fit)

      # Removing a class leaves a gap in the label values; XGBClassifier
      # requires labels 0..n-1, so models are fitted on re-encoded labels and
      # their predictions are mapped back to the original label values.
      label_encoder = LabelEncoder()
      y_train_encoded = label_encoder.fit_transform(y_train_fit)

      #Scale data
      scaler = StandardScaler()
      X_train_fit = scaler.fit_transform(X_train_fit)
      X_valid = scaler.transform(X_test)

      for model_name in models:
        model = get_model(model_name)
        print(f"\nStart training with class {attack} dropped using model {model_name}")

        start_train = time.time()
        model.fit(X_train_fit,y_train_encoded)
        end_train = time.time()
        time_train = end_train - start_train

        start_test = time.time()
        y_pred_encoded = model.predict(X_valid)
        end_test = time.time()
        time_test = end_test - start_test
        print("End training")

        y_pred = label_encoder.inverse_transform(y_pred_encoded.astype(int))

        metrics = evaluation_metrics(model,X_valid,y_valid,y_pred)
        metrics.update({
          "Dataset": dataset_title,
          "Task": task,
          "Model": model_name,
          "Time train": time_train,
          "Time test": time_test,
        })
        results.append(metrics)

      df_results = pd.DataFrame(results)
      print(df_results)

      # New file: write with header. Existing file: append without header.
      if not os.path.exists(output_result):
        df_results.to_csv(output_result, index=False)
      else:
        df_results.to_csv(output_result, mode='a', index=False, header=False)



In [None]:
def multiclass_q_novelty(dataset:Dict,
                        train_size:float,
                        models: List,
                        output_result: str,
                        task: str,
                        ):
  """Leave-Q-random-attack-classes-out experiment, repeated several times.

  The number of repetitions and the number Q of attack classes to remove are
  read interactively with ``input``. In every repetition and for every
  dataset, Q attack classes are drawn at random from those present in the
  training split, removed from the training data and relabelled as -1 in the
  evaluation labels. The training data is oversampled with SMOTE, features
  are standardized, and each model in ``models`` is fitted and scored. One
  row per model is written/appended to ``output_result``.

  NOTE: the models are fitted without any -1 label, so they cannot predict
  -1; every sample of a removed class is therefore scored as an error.

  Args:
    dataset: Mapping of dataset title to CSV path.
    train_size: ``train_size`` forwarded to ``MND_preprocess``.
    models: Model names understood by ``get_model``.
    output_result: CSV file the result rows are written/appended to.
    task: Task name stored in the "Task" column.

  Raises:
    ValueError: If Q is larger than the number of attack classes available
      in a dataset's training split.
  """
  loop_time = int(input("Number of experiments: "))
  remove_number = int(input("Q classes to remove each run: "))

  for i in range(1,loop_time+1):
    print("-"*100)
    print(f"Running {i} time")

    for dataset_title,dataset_path in dataset.items():
      # Start a fresh result list per dataset so rows of an earlier dataset
      # are not written to the CSV a second time.
      results = []

      X_train,X_test,y_train,y_test = MND_preprocess(dataset_path,train_size)
      y_train = y_train.astype(int)
      y_test = y_test.astype(int)

      #Generate a random Q label list to remove from training
      attack_classes = sorted(label for label in y_train.unique() if label != 0)
      if remove_number > len(attack_classes):
        raise ValueError(
            f"Cannot remove {remove_number} classes: only {len(attack_classes)} "
            f"attack classes available in {dataset_title}"
        )
      remove_list = np.random.choice(attack_classes,size=remove_number,replace=False).tolist()
      print(f"Remove {len(remove_list)} class {remove_list} in {i} time")

      #Remove all selected attack from training
      keep = ~y_train.isin(remove_list)
      X_train_fit = X_train[keep]
      y_train_fit = y_train[keep]

      #Mark selected remove list as -1. The evaluation features are the whole
      #test set; only the labels change.
      y_valid = y_test.copy()
      y_valid[y_valid.isin(remove_list)] = -1

      smote = SMOTE()
      X_train_fit,y_train_fit = smote.fit_resample(X_train_fit,y_train_fit)

      # Removing classes leaves gaps in the label values; XGBClassifier
      # requires labels 0..n-1, so models are fitted on re-encoded labels and
      # their predictions are mapped back to the original label values.
      label_encoder = LabelEncoder()
      y_train_encoded = label_encoder.fit_transform(y_train_fit)

      scaler = StandardScaler()
      X_train_fit = scaler.fit_transform(X_train_fit)
      X_valid = scaler.transform(X_test)

      for model_name in models:
        model = get_model(model_name)

        print(f"\nStart fitting using model {model_name}")
        start_train = time.time()
        model.fit(X_train_fit,y_train_encoded)
        end_train = time.time()
        time_train = end_train - start_train

        start_test = time.time()
        y_pred_encoded = model.predict(X_valid)
        end_test = time.time()
        time_test = end_test - start_test
        print("End testing \n")

        y_pred = label_encoder.inverse_transform(y_pred_encoded.astype(int))

        metrics = evaluation_metrics(model,X_valid,y_valid,y_pred)
        metrics.update({
          "Dataset": dataset_title,
          "Task": task,
          "Model": model_name,
          "Time train": time_train,
          "Time test": time_test,
        })
        results.append(metrics)

      df_results = pd.DataFrame(results)
      print(df_results)

      # New file: write with header. Existing file: append without header.
      if not os.path.exists(output_result):
        df_results.to_csv(output_result, index=False)
      else:
        df_results.to_csv(output_result, mode='a', index=False, header=False)

    print(f"End training {i} time")



# Experiment dispatcher

In [None]:
def run_experiment(datasets:Dict,
                   train_size:float,
                   task:str,
                   output_result:str
                   ):
  """Run the training routine that belongs to ``task``.

  Accepted tasks are "one class", "multiclass novelty" and
  "multiclass q novelty"; each is run with its own fixed list of model names.

  Raises:
    ValueError: If ``task`` is none of the accepted names.
  """
  oneclass_models = [
      "oneclass_svm",
      "iso_forest",
      "lof",
      "robust_covariance",
      #"gaussian_mixture",
  ]
  MND_models = [
      "random_forest",
      "svm",
      "knn",
      "xgb",
      "lgb",
      "logistic_regression",
      "decision_tree",
      "gradient_boost",
      "naive_bayes",
  ]

  # Model names to use for each accepted task
  models_by_task = {
      "one class": oneclass_models,
      "multiclass novelty": MND_models,
      "multiclass q novelty": MND_models,
  }

  # Reject unknown tasks before touching any trainer
  if task not in models_by_task:
    raise ValueError("Invalid task")

  selected_models = models_by_task[task]

  if task == "one class":
    return oneclass_train(datasets, train_size, selected_models, output_result, task)

  if task == "multiclass novelty":
    trainer = multiclass_novelty
  else:
    trainer = multiclass_q_novelty
  trainer(datasets, train_size, selected_models, output_result, task)
  return None

# Run experiment

In [None]:
# Datasets to evaluate: title (stored in the "Dataset" result column) -> CSV path.
# Uncomment an entry to include that dataset in the run.
datasets = {}
#datasets["CIC IOT"] = "/content/drive/MyDrive/Colab Notebooks/CIC IoT dataset 2023.csv"
#datasets["Attack Type"] = "/content/drive/MyDrive/Colab Notebooks/AttackType.csv"
datasets["IOT Intrusion"] = "/content/drive/MyDrive/Colab Notebooks/IoT_Intrusion.csv"

# CSV file that receives the result rows (created, or appended to if present)
output_result = "/content/drive/MyDrive/Colab Notebooks/merge_result_mmb.csv"

# Split fraction and the task to dispatch
train_size = 0.8
task = "multiclass novelty"

run_experiment(
    datasets=datasets,
    train_size=train_size,
    task=task,
    output_result=output_result,
)

----------------------------------------------------------------------------------------------------

Start training with class 1 dropped using model random_forest
