<a href="https://colab.research.google.com/github/Noob919/IoT_Security_Project/blob/main/IoT_Security_logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [3]:
# Set the dataset directory
DATASET_DIRECTORY = '../content/drive/MyDrive/Data/'

# Importing Dataset

df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()
training_sets = df_sets[:int(len(df_sets) * 0.8)]
test_sets = df_sets[int(len(df_sets) * 0.8):]

X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

In [5]:
# Classification: 2 (1+1) classes
scaler = StandardScaler()

for train_set in tqdm(training_sets):
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_2classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_2classes['DDoS-SYN_Flood'] = 'DDoS'
dict_2classes['DDoS-UDP_Flood'] = 'DDoS'
dict_2classes['DDoS-TCP_Flood'] = 'DDoS'
dict_2classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_2classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_2classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_2classes['DDoS-SlowLoris'] = 'DDoS'
dict_2classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_2classes['DoS-UDP_Flood'] = 'DoS'
dict_2classes['DoS-SYN_Flood'] = 'DoS'
dict_2classes['DoS-TCP_Flood'] = 'DoS'
dict_2classes['DoS-HTTP_Flood'] = 'DoS'

dict_2classes['Mirai-greeth_flood'] = 'other'
dict_2classes['Mirai-greip_flood'] = 'other'
dict_2classes['Mirai-udpplain'] = 'other'

dict_2classes['Recon-PingSweep'] = 'other'
dict_2classes['Recon-OSScan'] = 'other'
dict_2classes['Recon-PortScan'] = 'other'
dict_2classes['VulnerabilityScan'] = 'other'
dict_2classes['Recon-HostDiscovery'] = 'other'

dict_2classes['DNS_Spoofing'] = 'other'
dict_2classes['MITM-ArpSpoofing'] = 'other'

dict_2classes['BenignTraffic'] = 'other'

dict_2classes['BrowserHijacking'] = 'other'
dict_2classes['Backdoor_Malware'] = 'other'
dict_2classes['XSS'] = 'other'
dict_2classes['Uploading_Attack'] = 'other'
dict_2classes['SqlInjection'] = 'other'
dict_2classes['CommandInjection'] = 'other'

dict_2classes['DictionaryBruteForce'] = 'other'

ML_models = [
    LogisticRegression(n_jobs=-1),
]

ML_names = [
    "LogisticRegression",
]

for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = new_y

    for model in ML_models:
        model.fit(d[X_columns], d[y_column])
    del d

y_test = []
preds = {i: [] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred

for k, v in preds.items():
    y_pred = v
    print(f"##### {ML_names[k]} (8 classes) #####")
    print('accuracy_score = ', accuracy_score(y_pred, y_test))
    print('recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score = ', f1_score(y_pred, y_test, average='macro'))
    print()

100%|██████████| 135/135 [06:21<00:00,  2.83s/it]
100%|██████████| 135/135 [31:44<00:00, 14.11s/it]
100%|██████████| 34/34 [02:26<00:00,  4.30s/it]


##### LogisticRegression (8 classes) #####
accuracy_score =  0.843560559646992
recall_score =  0.8694509312660809
precision_score =  0.7053377685297316
f1_score =  0.7127350547547285

