In [1]:
# @title Setup

competition = "IoT Attack"  # @param
# @markdown ---

from google.colab import userdata
import json

# Get the Kaggle credentials from Colab's userdata
username = userdata.get("KAGGLE_USER")
key = userdata.get("KAGGLE_KEY")

# Echo the credentials into the kaggle.json file
!mkdir -p ~/.kaggle
!echo '{{"username":"{username}","key":"{key}"}}' > ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

competition_id = "super-ai-engineer-5-internet-of-things-attack"
!kaggle competitions download -c {competition_id}
!unzip /content/{competition_id}.zip

Archive:  /content/super-ai-engineer-5-internet-of-things-attack.zip
  inflating: IOT/sample_submission.csv  
  inflating: IOT/test.csv            
  inflating: IOT/train.csv           


In [2]:
!pip install pycaret[full]

Collecting pycaret[full]
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting numpy<1.27,>=1.21 (from pycaret[full])
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m567.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<2.2.0 (from pycaret[full])
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret[full])
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret[full])
  Downloading pyod-2.0.3.tar.gz (169 kB)
[2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled training split, print its (rows, columns) shape and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/IOT/train.csv')
print(df.shape)
df.head(n=5)

(53600, 47)


Unnamed: 0,flow_duration,header_length,protocol_type,duration,rate,srate,drate,fin_flag_number,syn_flag_number,rst_flag_number,...,std,tot_size,iat,number,magnitue,radius,covariance,variance,weight,attack_type
0,17.831617,10207.2,8.2,87.3,9.326997,9.326997,0.0,0.0,0.0,0.0,...,411.641148,446.3,0.01486859,5.5,29.321747,582.148495,234266.940305,0.9,38.5,Benign
1,1.159867,664511.92,17.0,64.0,1689.541039,1689.541039,0.0,0.0,0.0,0.0,...,0.0,554.0,83711490.0,9.5,33.286634,0.0,0.0,0.0,141.55,Mirai
2,0.0,0.0,1.0,64.0,261.784047,261.784047,0.0,0.0,0.0,0.0,...,0.0,42.0,83124510.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS
3,0.146458,643.3,6.0,64.0,51.782747,51.782747,0.0,1.0,0.0,0.0,...,13.428908,82.8,0.0001641989,5.5,12.049703,18.991344,453.551556,0.4,38.5,BruteForce
4,39.307538,1125.9,7.6,154.9,1.13736,1.13736,0.0,0.0,0.0,0.0,...,55.837477,106.4,0.0250088,5.5,15.275293,78.966117,3508.229337,0.9,38.5,BruteForce


In [7]:
# Row count per attack class, shown as a single column labelled 'N'.
pd.crosstab(index=df['attack_type'], columns='N')

col_0,N
attack_type,Unnamed: 1_level_1
Benign,6700
BruteForce,6700
DDoS,6700
DoS,6700
Mirai,6700
Recon,6700
Spoofing,6700
Web-based,6700


In [3]:
# Missing-value count per column; display only the columns that have any.
null_col = df.isna().sum()
null_col.loc[null_col > 0]

Unnamed: 0,0


## Data Preprocessing

In [4]:
# Collect every column that takes at most two distinct values (this also
# catches constant columns) and print the values seen for each.
bin_col = []
for name, series in df.items():
    values = series.unique()
    if len(values) > 2:
        continue
    bin_col.append(name)
    print(f'{name}: {values}')

drate: [0.]
fin_flag_number: [0. 1.]
syn_flag_number: [0. 1.]
rst_flag_number: [0. 1.]
psh_flag_number: [0. 1.]
ack_flag_number: [1. 0.]
ece_flag_number: [0.]
cwr_flag_number: [0.]
http: [0. 1.]
https: [1. 0.]
dns: [0. 1.]
telnet: [0.]
smtp: [0. 1.]
ssh: [0. 1.]
irc: [0.]
tcp: [1. 0.]
udp: [0. 1.]
dhcp: [0.]
arp: [0. 1.]
icmp: [0. 1.]
ipv: [1. 0.]
llc: [1. 0.]


In [5]:
# Build the modelling frame without touching `df`: recast each low-cardinality
# column as an integer-coded categorical, then remove the hard-coded list of
# columns below.
df_prep = (
    df
    .assign(**{col: df[col].astype('int').astype('category') for col in bin_col})
    .drop(columns=['drate', 'telnet', 'irc', 'dhcp', 'ece_flag_number', 'cwr_flag_number'])
)

df_prep.head()

Unnamed: 0,flow_duration,header_length,protocol_type,duration,rate,srate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,std,tot_size,iat,number,magnitue,radius,covariance,variance,weight,attack_type
0,17.831617,10207.2,8.2,87.3,9.326997,9.326997,0,0,0,0,...,411.641148,446.3,0.01486859,5.5,29.321747,582.148495,234266.940305,0.9,38.5,Benign
1,1.159867,664511.92,17.0,64.0,1689.541039,1689.541039,0,0,0,0,...,0.0,554.0,83711490.0,9.5,33.286634,0.0,0.0,0.0,141.55,Mirai
2,0.0,0.0,1.0,64.0,261.784047,261.784047,0,0,0,0,...,0.0,42.0,83124510.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS
3,0.146458,643.3,6.0,64.0,51.782747,51.782747,1,0,0,0,...,13.428908,82.8,0.0001641989,5.5,12.049703,18.991344,453.551556,0.4,38.5,BruteForce
4,39.307538,1125.9,7.6,154.9,1.13736,1.13736,0,0,0,0,...,55.837477,106.4,0.0250088,5.5,15.275293,78.966117,3508.229337,0.9,38.5,BruteForce


## Model Selection

In [6]:
# NOTE(review): the star import is what puts setup/compare_models/create_model/
# evaluate_model in scope here, and tune_model/finalize_model/predict_model in
# the cells below; narrowing it requires listing every name those cells use.
from pycaret.classification import *
import torch

# Initialise the PyCaret experiment on the preprocessed training frame:
# 80% of the rows for training, fixed session seed, GPU only when CUDA is
# available.
clf_setup = setup(
    data=df_prep,
    target='attack_type',
    session_id=42,
    train_size=0.8,
    use_gpu=torch.cuda.is_available(),
)

# Compare the four tree-based candidates, ranked by accuracy.
best_model = compare_models(
    sort='Accuracy',
    include=['dt', 'xgboost', 'lightgbm', 'catboost'],
)

# Pass the winner back through create_model, then open the interactive
# evaluation widget for it.
best_model = create_model(best_model)
evaluate_model(best_model)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Description,Value
0,Session id,42
1,Target,attack_type
2,Target type,Multiclass
3,Target mapping,"Benign: 0, BruteForce: 1, DDoS: 2, DoS: 3, Mirai: 4, Recon: 5, Spoofing: 6, Web-based: 7"
4,Original data shape,"(53600, 41)"
5,Transformed data shape,"(53600, 41)"
6,Transformed train set shape,"(42880, 41)"
7,Transformed test set shape,"(10720, 41)"
8,Numeric features,24
9,Categorical features,16


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9247,0.9946,0.9247,0.9264,0.9245,0.914,0.9143,2.914
lightgbm,Light Gradient Boosting Machine,0.9229,0.9943,0.9229,0.9251,0.9226,0.9119,0.9123,6.382
dt,Decision Tree Classifier,0.9042,0.9453,0.9042,0.9039,0.9038,0.8905,0.8906,2.022
catboost,CatBoost Classifier,0.9037,0.9913,0.9037,0.9052,0.9032,0.8899,0.8903,10.359


Processing:   0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9219,0.9942,0.9219,0.924,0.9216,0.9107,0.9111
1,0.9235,0.9942,0.9235,0.9254,0.9231,0.9126,0.913
2,0.9212,0.9944,0.9212,0.9232,0.9209,0.9099,0.9103
3,0.9207,0.9945,0.9207,0.9223,0.9203,0.9094,0.9097
4,0.9291,0.9954,0.9291,0.9301,0.9288,0.919,0.9192
5,0.9296,0.9947,0.9296,0.9309,0.9293,0.9195,0.9198
6,0.9319,0.9952,0.9319,0.9328,0.9318,0.9222,0.9223
7,0.9205,0.9941,0.9205,0.923,0.9202,0.9091,0.9096
8,0.9251,0.9943,0.9251,0.9271,0.925,0.9144,0.9148
9,0.9237,0.9949,0.9237,0.9257,0.9236,0.9128,0.9132


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
# Run PyCaret's hyper-parameter search on the selected estimator. tune_model
# may hand back the untuned estimator when the search finds nothing better.
tuned_model = tune_model(estimator=best_model)
# Produce the model that the prediction cells below use.
final_model = finalize_model(estimator=tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8687,0.9864,0.8687,0.8731,0.8686,0.8499,0.8506
1,0.8734,0.9875,0.8734,0.8788,0.8732,0.8553,0.8561
2,0.8692,0.9864,0.8692,0.8736,0.8691,0.8505,0.8511
3,0.8729,0.9872,0.8729,0.877,0.8732,0.8547,0.8552
4,0.8839,0.9886,0.8839,0.8875,0.8836,0.8673,0.8678
5,0.8738,0.9868,0.8738,0.8784,0.8739,0.8558,0.8565
6,0.8864,0.9891,0.8864,0.8895,0.8865,0.8702,0.8706
7,0.8736,0.9868,0.8736,0.8773,0.8734,0.8555,0.8561
8,0.8703,0.9865,0.8703,0.8744,0.8705,0.8518,0.8523
9,0.8776,0.9878,0.8776,0.8819,0.8777,0.8601,0.8606


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## Prediction

In [None]:
# Load the unlabelled test split keyed by its `id` column, then print its
# shape and preview the first rows.
test_df = pd.read_csv('/content/IOT/test.csv').set_index('id')
print(test_df.shape)
test_df.head(n=5)

(26400, 46)


Unnamed: 0_level_0,flow_duration,header_length,protocol_type,duration,rate,srate,drate,fin_flag_number,syn_flag_number,rst_flag_number,...,avg,std,tot_size,iat,number,magnitue,radius,covariance,variance,weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.693902,505.3,7.6,115.3,65.834868,65.834868,0.0,0.0,0.0,0.0,...,74.218095,12.698493,79.2,0.04859049,5.5,12.178254,17.958382,198.110765,0.9,38.5
1,0.0,54.0,6.0,64.0,1.721838,1.721838,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83067430.0,9.5,10.392305,0.0,0.0,0.0,141.55
2,0.002493,6.51,45.63,64.53,23.285224,23.285224,0.0,0.0,0.0,0.0,...,571.703766,42.884295,571.57,83693880.0,9.5,33.787513,60.741769,8028.38097,0.24,141.55
3,23.733227,427931.3,6.6,70.0,67.341624,67.341624,0.0,0.0,0.0,0.0,...,1464.023452,845.605854,956.9,0.009741879,5.5,52.809756,1195.867266,810047.434873,0.9,38.5
4,13.081981,25450.0,7.6,93.7,48.811474,48.811474,0.0,0.0,0.0,0.0,...,104.971627,96.566181,122.9,0.00507009,5.5,14.23665,136.565202,16964.596451,0.9,38.5


In [None]:
# Apply the same preprocessing as the training frame, leaving `test_df`
# untouched: recast the `bin_col` columns as integer-coded categoricals and
# remove the same hard-coded columns.
test_prep = (
    test_df
    .assign(**{col: test_df[col].astype('int').astype('category') for col in bin_col})
    .drop(columns=['drate', 'telnet', 'irc', 'dhcp', 'ece_flag_number', 'cwr_flag_number'])
)
test_prep.head()

Unnamed: 0_level_0,flow_duration,header_length,protocol_type,duration,rate,srate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,avg,std,tot_size,iat,number,magnitue,radius,covariance,variance,weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.693902,505.3,7.6,115.3,65.834868,65.834868,0,0,0,0,...,74.218095,12.698493,79.2,0.04859049,5.5,12.178254,17.958382,198.110765,0.9,38.5
1,0.0,54.0,6.0,64.0,1.721838,1.721838,0,0,0,0,...,54.0,0.0,54.0,83067430.0,9.5,10.392305,0.0,0.0,0.0,141.55
2,0.002493,6.51,45.63,64.53,23.285224,23.285224,0,0,0,0,...,571.703766,42.884295,571.57,83693880.0,9.5,33.787513,60.741769,8028.38097,0.24,141.55
3,23.733227,427931.3,6.6,70.0,67.341624,67.341624,0,0,0,0,...,1464.023452,845.605854,956.9,0.009741879,5.5,52.809756,1195.867266,810047.434873,0.9,38.5
4,13.081981,25450.0,7.6,93.7,48.811474,48.811474,0,0,0,0,...,104.971627,96.566181,122.9,0.00507009,5.5,14.23665,136.565202,16964.596451,0.9,38.5


In [None]:
# Score the preprocessed test rows with the finalized model and extract the
# predicted class labels as a plain list (in test-row order).
holdout_pred = predict_model(final_model, data=test_prep)
y_pred = holdout_pred['prediction_label'].to_list()
y_pred[:5]

['BruteForce', 'DDoS', 'Mirai', 'Benign', 'Spoofing']

## Submission

In [None]:
# Fill the sample submission with the model's predictions.
# The earlier `submission.loc[2:, ...] = y_pred[2:]` never wrote rows 0 and 1,
# so they kept whatever placeholder the sample file shipped with. Predictions
# are now attached by `id` (test_df is indexed by `id`, and y_pred is in
# test-row order), so every row is covered and row order in the sample file
# does not matter.
submission = pd.read_csv('/content/IOT/sample_submission.csv')
assert len(y_pred) == len(test_df), "one prediction expected per test row"
pred_by_id = pd.Series(y_pred, index=test_df.index)
submission['attack_type'] = submission['id'].map(pred_by_id)
assert submission['attack_type'].notna().all(), "some submission ids have no prediction"
submission.head()

Unnamed: 0,id,attack_type
0,0,BruteForce
1,1,DDoS
2,2,Mirai
3,3,Benign
4,4,Spoofing


In [None]:
# Write the submission file without the DataFrame index column.
# NOTE(review): the file name says "catboost_tuned", but the submitted model is
# whichever estimator compare_models/tune_model selected — confirm the name
# still describes it.
submission.to_csv(path_or_buf='catboost_tuned.csv', index=False)