# Centralised Learning and Federated Learning on the CICIoT2023 dataset

This notebook extends on the functionality of the CICIoT2023 example notebook, to account for improvement to the centralised training of all data instances.

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


In [3]:
DATASET_DIRECTORY = './Train/'

In [4]:
df_sets = [k for i, k in enumerate(os.listdir(DATASET_DIRECTORY)) if k.endswith('.csv') and i % 3 == 0]
df_sets.sort()
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

In [5]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

# Create a new DataFrame that consists of all CSV data

This is **memory intensive** as it will create a DataFrame with 36 million rows.

In [6]:
# x values only

df = []

count = 0
for train_set in tqdm(training_sets):
    if count == 0:
        df = pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns]
    else:
        df_new = pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns]
        df = pd.concat([df, df_new], ignore_index=True)
    count = count + 1
    
df.to_pickle('training_data-X_values.pkl')

100%|██████████| 18/18 [00:40<00:00,  2.26s/it]


In [7]:
# y values only

y_df = []

count = 0
for train_set in tqdm(training_sets):
    if count == 0:
        y_df = pd.read_csv(DATASET_DIRECTORY + train_set)[y_column]
    else:
        y_df_new = pd.read_csv(DATASET_DIRECTORY + train_set)[y_column]
        y_df = pd.concat([y_df, y_df_new], ignore_index=True)
    count = count + 1
  

100%|██████████| 18/18 [00:21<00:00,  1.18s/it]


In [8]:
y_df.to_pickle('training_data-y_value.pkl')

In [9]:
df = []

count = 0
for train_set in tqdm(training_sets):
    if count == 0:
        df = pd.read_csv(DATASET_DIRECTORY + train_set)
    else:
        df_new = pd.read_csv(DATASET_DIRECTORY + train_set)
        df = pd.concat([df, df_new], ignore_index=True)
    count = count + 1

100%|██████████| 18/18 [00:24<00:00,  1.37s/it]


In [None]:
df

# Save this output to a Pickle file

In [10]:
df.to_pickle('training_data-new_copy.pkl')

In [11]:
df = pd.read_pickle('./training_data-new_copy.pkl')

In [12]:
y_df = pd.read_pickle('./training_data-y_value.pkl')

In [None]:
df

In [None]:
y_df



---

# Scale the input features

In [13]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()
df[X_columns] = scaler.fit_transform(df[X_columns])

In [None]:
df

# Classification Problem (2-class, 8-class, or 34-class)

In [17]:
binary_classifier = False
group_classifier = False
individual_classifier = True

if group_classifier:
    
    dict_7classes = {}
    dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
    dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
    dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
    dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
    dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
    dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
    dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
    dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
    dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
    dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
    dict_7classes['DDoS-SlowLoris'] = 'DDoS'
    dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'
    dict_7classes['DoS-UDP_Flood'] = 'DoS'
    dict_7classes['DoS-SYN_Flood'] = 'DoS'
    dict_7classes['DoS-TCP_Flood'] = 'DoS'
    dict_7classes['DoS-HTTP_Flood'] = 'DoS'
    dict_7classes['Mirai-greeth_flood'] = 'Mirai'
    dict_7classes['Mirai-greip_flood'] = 'Mirai'
    dict_7classes['Mirai-udpplain'] = 'Mirai'
    dict_7classes['Recon-PingSweep'] = 'Recon'
    dict_7classes['Recon-OSScan'] = 'Recon'
    dict_7classes['Recon-PortScan'] = 'Recon'
    dict_7classes['VulnerabilityScan'] = 'Recon'
    dict_7classes['Recon-HostDiscovery'] = 'Recon'
    dict_7classes['DNS_Spoofing'] = 'Spoofing'
    dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'
    dict_7classes['BenignTraffic'] = 'Benign'
    dict_7classes['BrowserHijacking'] = 'Web'
    dict_7classes['Backdoor_Malware'] = 'Web'
    dict_7classes['XSS'] = 'Web'
    dict_7classes['Uploading_Attack'] = 'Web'
    dict_7classes['SqlInjection'] = 'Web'
    dict_7classes['CommandInjection'] = 'Web'
    dict_7classes['DictionaryBruteForce'] = 'BruteForce'

    new_y = [dict_7classes[k] for k in y_df]
    y_df = new_y
    
elif binary_classifier:
    dict_2classes = {}
    dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
    dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
    dict_2classes['DDoS-SYN_Flood'] = 'Attack'
    dict_2classes['DDoS-UDP_Flood'] = 'Attack'
    dict_2classes['DDoS-TCP_Flood'] = 'Attack'
    dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
    dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
    dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
    dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
    dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
    dict_2classes['DDoS-SlowLoris'] = 'Attack'
    dict_2classes['DDoS-HTTP_Flood'] = 'Attack'
    dict_2classes['DoS-UDP_Flood'] = 'Attack'
    dict_2classes['DoS-SYN_Flood'] = 'Attack'
    dict_2classes['DoS-TCP_Flood'] = 'Attack'
    dict_2classes['DoS-HTTP_Flood'] = 'Attack'
    dict_2classes['Mirai-greeth_flood'] = 'Attack'
    dict_2classes['Mirai-greip_flood'] = 'Attack'
    dict_2classes['Mirai-udpplain'] = 'Attack'
    dict_2classes['Recon-PingSweep'] = 'Attack'
    dict_2classes['Recon-OSScan'] = 'Attack'
    dict_2classes['Recon-PortScan'] = 'Attack'
    dict_2classes['VulnerabilityScan'] = 'Attack'
    dict_2classes['Recon-HostDiscovery'] = 'Attack'
    dict_2classes['DNS_Spoofing'] = 'Attack'
    dict_2classes['MITM-ArpSpoofing'] = 'Attack'
    dict_2classes['BenignTraffic'] = 'Benign'
    dict_2classes['BrowserHijacking'] = 'Attack'
    dict_2classes['Backdoor_Malware'] = 'Attack'
    dict_2classes['XSS'] = 'Attack'
    dict_2classes['Uploading_Attack'] = 'Attack'
    dict_2classes['SqlInjection'] = 'Attack'
    dict_2classes['CommandInjection'] = 'Attack'
    dict_2classes['DictionaryBruteForce'] = 'Attack'

    new_y = [dict_2classes[k] for k in y_df]
    y_df = new_y
else:
    print ("Assuming individual_classifier...")
    pass
    

Assuming individual_classifier...


# Model Creation (LR, RF, MLP)

In [20]:
import pickle
from datetime import datetime

logreg = True
perceptron = True
adaboost = True
random_forest = True
mlp = True

logreg_model_filename = "./Models-34class/new-logreg-34class-model.pkl"
perceptron_model_filename = "./Models-34class/new-perceptron-34class-model.pkl"
adaboost_model_filename = "./Models-34class/new-adaboost-34class-model.pkl"
rf_model_filename = "./Models-34class/new-rf-34class-model.pkl"
mlp_model_filename = "./Models-2class/new-mlp-34class-model.pkl"

In [None]:
if logreg:
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(random_state=42)

    print (datetime.now(), " : Fit LR model...")
    model.fit(df[X_columns], y_df)
    print (datetime.now(), " : Fit LR model complete...")
    
    with open(logreg_model_filename, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(1)}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    

In [None]:
if perceptron:
    from sklearn.linear_model import Perceptron
    model = Perceptron(random_state=42)

    print (datetime.now(), " : Fit Perceptron model...")
    model.fit(df[X_columns], y_df)
    print (datetime.now(), " : Fit Perceptron model complete...")
    
    with open(perceptron_model_filename, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(1)}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    

In [None]:
if adaboost:
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(random_state=42)

    print (datetime.now(), " : Fit AdaBoost model...")
    model.fit(df[X_columns], y_df)
    print (datetime.now(), " : Fit AdaBoost model complete...")
    
    with open(adaboost_model_filename, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(1)}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    

In [22]:
if random_forest:
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=200, class_weight="balanced")

    print (datetime.now(), " : Fit RF model...")
    model.fit(df[X_columns], y_df)
    print (datetime.now(), " : Fit RF model complete...")
    
    

    with open(rf_model_filename, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(1)}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
    for k,v in preds.items():
        y_pred = v
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print('Confusion Matrix:\n', cm)
        print('\n')
 

2024-01-08 00:41:05.387334  : Fit RF model...
2024-01-08 01:11:14.377028  : Fit RF model complete...


100%|██████████| 5/5 [00:18<00:00,  3.70s/it]


accuracy_score:  0.0
recall_score:  0.0
precision_score:  0.0
f1_score:  0.0
Confusion Matrix:
 [[   0    0    0 ...    0    0    0]
 [  84    0   14 ...    0    0    0]
 [   0    0    0 ...    0    0    0]
 ...
 [  36    0    6 ...    0    0    0]
 [1358    0    1 ...    0    0    0]
 [ 129    0   24 ...    0    0    0]]




In [None]:
if mlp:
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(random_state=42)
    print (datetime.now(), " : Fit MLP model...")
    model.fit(df[X_columns], y_df)
    print (datetime.now(), " : Fit MLP model complete...")
    
    with open(mlp_model_filename, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(1)}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))

In [None]:
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    #d_test[X_columns] = scaler.transform(d_test[X_columns])


    y_test += list(d_test[y_column].values)

    #y_pred = list(model.predict(d_test[X_columns]))
    #preds[0] = preds[0] + y_pred

In [None]:
len(y_test)


---
* 43 minutes to complete LR for 34 class - 14 minutes for 8 class - 7 minutes 2class
* 1 hour 56 minutes to complete RF for 34 class - 2 hours 21 for 8 class.
* 2 hours 39 minutes to complete MLP for 34 class - over 5 hours for 8 class - 
---