In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score


In [3]:
df=pd.read_csv('F:\\Cyprus\\CSV-01-12\\01-12\\Syn.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
df.columns

Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,281052,172.16.0.5-192.168.50.1-53058-53058-6,172.16.0.5,53058,192.168.50.1,53058,6,2018-12-01 13:30:30.741451,115799309,19,...,646237.483665,1709809.0,1.0,14261170.0,3220326.0,21714933.0,11043464.0,0,1,Syn
1,450424,172.16.0.5-192.168.50.1-32237-32237-6,172.16.0.5,32237,192.168.50.1,32237,6,2018-12-01 13:30:30.741452,113973933,16,...,19.595918,49.0,1.0,16281980.0,2573891.0,20019405.0,11993631.0,0,1,Syn
2,182979,172.16.0.5-192.168.50.1-60495-9840-6,172.16.0.5,60495,192.168.50.1,9840,6,2018-12-01 13:30:30.741501,112,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn
3,41540,172.16.0.5-192.168.50.1-59724-59724-6,172.16.0.5,59724,192.168.50.1,59724,6,2018-12-01 13:30:30.741563,105985004,16,...,17.705259,48.0,1.0,15140710.0,3077366.0,20954123.0,11120336.0,0,1,Syn
4,358711,172.16.0.5-192.168.50.1-60496-32538-6,172.16.0.5,60496,192.168.50.1,32538,6,2018-12-01 13:30:30.741565,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn


In [6]:
ack = df[' ACK Flag Count']
pd.unique(ack)


array([1, 0], dtype=int64)

In [7]:
init = df['Init_Win_bytes_forward']
pd.unique(init)
print(max(init))
print(min(init))

65535
-1


In [8]:
seg = df[' min_seg_size_forward']
pd.unique(seg)

array([20,  0, 28, 32, 40, 24], dtype=int64)

In [9]:
iat = df['Fwd IAT Total']
pd.unique(iat)
print(max(iat))
print(min(iat))
print(len(iat))

119999653.0
0.0
1582681


In [10]:
flow = df[' Flow Duration']
pd.unique(flow)
print(max(flow))
print(min(flow))
print(len(flow))

119999653
0
1582681


In [11]:
label = df[' Label']
pd.unique(label)
print(max(label))
print(min(label))
print(len(label))

Syn
BENIGN
1582681


In [12]:
data = [df[" ACK Flag Count"], df["Init_Win_bytes_forward"],df[" min_seg_size_forward"], df["Fwd IAT Total"],df[" Flow Duration"], df[" Label"]]

In [13]:
headers = ["ack", "init","seg","iat","flow","label"]

In [14]:
df1 = pd.concat(data, axis=1, keys=headers)

In [15]:
df1

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,115799309.0,115799309,Syn
1,1,5840,20,113973933.0,113973933,Syn
2,1,5840,20,1.0,112,Syn
3,1,5840,20,105985004.0,105985004,Syn
4,1,5840,20,1.0,1,Syn
...,...,...,...,...,...,...
1582676,1,5840,20,1.0,1,Syn
1582677,1,5840,20,1.0,1,Syn
1582678,1,5840,20,1.0,1,Syn
1582679,1,5840,20,1.0,1,Syn


In [16]:
df1.to_csv("syn_attributes.csv")


## Model Creation

In [17]:
df1=pd.read_csv('syn_attributes.csv')

In [18]:
from pycaret.classification import *
from sklearn.preprocessing import LabelEncoder

In [19]:
# label encoding for Y values
Encoder = LabelEncoder()
le = Encoder.fit(df1["label"])
df1["label"] = le.transform(df1["label"])

In [20]:
pd.unique(df1['label'])
# 1 = Syn
# 0 = Benign

array([1, 0])

In [21]:
df1

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow,label
0,0,1,5840,20,115799309.0,115799309,1
1,1,1,5840,20,113973933.0,113973933,1
2,2,1,5840,20,1.0,112,1
3,3,1,5840,20,105985004.0,105985004,1
4,4,1,5840,20,1.0,1,1
...,...,...,...,...,...,...,...
1582676,1582676,1,5840,20,1.0,1,1
1582677,1582677,1,5840,20,1.0,1,1
1582678,1582678,1,5840,20,1.0,1,1
1582679,1582679,1,5840,20,1.0,1,1


In [22]:
# sample 10000 points from majority class to create new dataset
df_maj = df1[df1['label']==1]
df_min = df1[df1['label']==0]
df_maj = df_maj.sample(n=10000)
df_maj = pd.concat([df_maj,df_min], axis=0)


In [23]:
len(df_maj)

10392

In [24]:
X = df_maj.drop('label',axis=1)
y = df_maj['label']

In [25]:
X

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow
1496683,1496683,1,5840,20,0.0,0
207732,207732,1,5840,20,1.0,1
963187,963187,1,5840,20,1.0,1
790578,790578,1,5840,20,1.0,99
1326914,1326914,1,5840,20,1.0,1
...,...,...,...,...,...,...
1479185,1479185,0,4253,20,499664.0,499664
1482435,1482435,0,4253,20,0.0,217
1482436,1482436,0,4253,20,0.0,194
1502348,1502348,0,-1,20,2.0,20808


In [26]:
X['init'].value_counts()

 5840     9995
-1         166
 8192       42
 256        18
 252        15
 64240      14
 254        13
 255        10
 246         9
 258         8
 1024        6
 253         6
 16385       5
 237         5
 4253        5
 1892        4
 16402       4
 0           4
 16224       4
 1959        3
 2093        3
 65535       3
 257         3
 238         3
 16362       2
 2160        2
 581         2
 268         2
 315         2
 16425       2
 16508       2
 16476       2
 325         2
 4272        2
 16321       2
 16275       2
 16439       2
 259         1
 419         1
 16346       1
 16560       1
 243         1
 16369       1
 245         1
 1825        1
 119         1
 2026        1
 114         1
 122         1
 121         1
 16324       1
 16127       1
 297         1
 335         1
 607         1
Name: init, dtype: int64

In [27]:
# train and test split from newly created dataset 
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=32, shuffle=True)

In [28]:
y_test.value_counts()

1    1985
0      94
Name: label, dtype: int64

In [29]:
# perform smote for the dataset to solve imbalance
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
x_train, y_train = oversample.fit_resample(x_train, y_train)


In [30]:
y_train.value_counts()

1    8015
0    8015
Name: label, dtype: int64

In [31]:
x_train

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow
0,1350122,1,5840,20,1.000000e+00,63
1,909726,1,8192,20,5.908991e+07,59089908
2,921297,1,5840,20,1.000000e+00,1
3,136541,1,5840,20,0.000000e+00,0
4,358172,1,5840,20,0.000000e+00,0
...,...,...,...,...,...,...
16025,836784,0,241,20,3.842346e-01,0
16026,609067,0,252,20,3.094220e-01,171
16027,70899,0,38,20,6.574314e-01,13692
16028,291775,0,15283,20,9.136592e-01,17


In [32]:
# concatenate dataset for passing to pycaret
train_dataset = pd.concat([x_train,y_train],axis=1)
test_dataset = pd.concat([x_test,y_test],axis=1)

In [33]:
train_dataset.drop(['Unnamed: 0'],axis=1,inplace=True)
test_dataset.drop(['Unnamed: 0'],axis=1,inplace=True)

In [34]:
train_dataset

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,1.000000e+00,63,1
1,1,8192,20,5.908991e+07,59089908,0
2,1,5840,20,1.000000e+00,1,1
3,1,5840,20,0.000000e+00,0,1
4,1,5840,20,0.000000e+00,0,1
...,...,...,...,...,...,...
16025,0,241,20,3.842346e-01,0,0
16026,0,252,20,3.094220e-01,171,0
16027,0,38,20,6.574314e-01,13692,0
16028,0,15283,20,9.136592e-01,17,0


In [35]:
test_dataset

Unnamed: 0,ack,init,seg,iat,flow,label
904807,1,5840,20,80232156.0,80232156,1
1093463,1,5840,20,1.0,1,1
1126558,1,5840,20,1.0,1,1
1536489,1,5840,20,1.0,1,1
842556,1,5840,20,1.0,1,1
...,...,...,...,...,...,...
1462950,1,5840,20,1.0,1,1
96285,1,5840,20,1.0,1,1
928965,1,5840,20,1.0,1,1
484783,1,5840,20,1.0,53,1


In [36]:
len(train_dataset['init'].value_counts())

1851

In [37]:
# pycaret init
exp = setup(data = train_dataset, target='label', train_size = 1.0, preprocess = True, test_data = test_dataset, session_id=1, use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,1
1,Target,label
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(16030, 6)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [38]:
xgb_model = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9996,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
tuned_xgb = tune_model(xgb_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9998,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9999,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
ada_model = create_model('ada')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9963
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9992,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9993,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
tuned_ada = tune_model(ada_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9998,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9999,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
rf_model = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9994,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9994,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
tuned_rf = tune_model(rf_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,1.0,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
lgbm_model=create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9963
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9988,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [45]:
tuned_lgbm = tune_model(lgbm_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9996,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,0.9999,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988


In [46]:
lr_model=create_model('lr') #JUST TO CHECK IF ITS SHOWING GOOD RESULTS FOR ALL

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7049,0.7713,0.9102,0.6454,0.7553,0.4097,0.4493
1,0.6999,0.7564,0.9214,0.6387,0.7545,0.3997,0.4459
2,0.7031,0.7585,0.9339,0.6391,0.7589,0.4059,0.4577
3,0.6887,0.7677,0.914,0.6303,0.7461,0.3772,0.4226
4,0.8334,0.8879,0.7282,0.9226,0.8139,0.6669,0.6823
5,0.6725,0.764,0.9176,0.6156,0.7368,0.3452,0.396
6,0.6744,0.7516,0.9226,0.6163,0.739,0.3489,0.4019
7,0.8135,0.8729,0.6941,0.9115,0.7881,0.6269,0.6455
8,0.6837,0.7619,0.9026,0.6276,0.7404,0.3676,0.4089
9,0.8203,0.8794,0.7154,0.9052,0.7992,0.6406,0.6552


## Manual training

In [47]:
x_test.drop(['Unnamed: 0'],axis=1,inplace=True)
x_train.drop(['Unnamed: 0'],axis=1,inplace=True)

In [48]:
x_test

Unnamed: 0,ack,init,seg,iat,flow
904807,1,5840,20,80232156.0,80232156
1093463,1,5840,20,1.0,1
1126558,1,5840,20,1.0,1
1536489,1,5840,20,1.0,1
842556,1,5840,20,1.0,1
...,...,...,...,...,...
1462950,1,5840,20,1.0,1
96285,1,5840,20,1.0,1
928965,1,5840,20,1.0,1
484783,1,5840,20,1.0,53


In [49]:
x_train

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,1.000000e+00,63
1,1,8192,20,5.908991e+07,59089908
2,1,5840,20,1.000000e+00,1
3,1,5840,20,0.000000e+00,0
4,1,5840,20,0.000000e+00,0
...,...,...,...,...,...
16025,0,241,20,3.842346e-01,0
16026,0,252,20,3.094220e-01,171
16027,0,38,20,6.574314e-01,13692
16028,0,15283,20,9.136592e-01,17


### Training Different Models

In [50]:
xgb = XGBClassifier(scale_pos_weight=1)
xgb_clf = xgb.fit(x_train,y_train)
predictions_xgb = xgb_clf.predict(x_test)



In [51]:
x_train

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,1.000000e+00,63
1,1,8192,20,5.908991e+07,59089908
2,1,5840,20,1.000000e+00,1
3,1,5840,20,0.000000e+00,0
4,1,5840,20,0.000000e+00,0
...,...,...,...,...,...
16025,0,241,20,3.842346e-01,0
16026,0,252,20,3.094220e-01,171
16027,0,38,20,6.574314e-01,13692
16028,0,15283,20,9.136592e-01,17


In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions_xgb))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      1.00      1.00      2079
weighted avg       1.00      1.00      1.00      2079



In [53]:
print("xgb Score -> ",accuracy_score(predictions_xgb, y_test)*100)
print("xgb roc_auc Score -> ",roc_auc_score(y_test,predictions_xgb))
print("f1 score ->", f1_score(y_test,predictions_xgb))

xgb Score ->  99.95189995189995
xgb roc_auc Score ->  0.9997481108312343
f1 score -> 0.999748047367095


# ADA BOOST

In [54]:
from sklearn.ensemble import AdaBoostClassifier

In [55]:
ada = AdaBoostClassifier()
ada_clf = ada.fit(x_train,y_train)
predictions_ada = ada_clf.predict(x_test)

In [56]:
print(classification_report(y_test,predictions_ada))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      1.00      1.00      2079
weighted avg       1.00      1.00      1.00      2079



In [57]:
print("ada Score -> ",accuracy_score(predictions_ada, y_test)*100)
print("ada roc_auc Score -> ",roc_auc_score(y_test,predictions_ada))
print("ada f1 score ->", f1_score(y_test,predictions_ada))

ada Score ->  99.95189995189995
ada roc_auc Score ->  0.9997481108312343
ada f1 score -> 0.999748047367095


# RF

In [58]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
rf_clf = rf.fit(x_train,y_train)
predictions_rf = rf_clf.predict(x_test)

In [59]:
print(classification_report(y_test,predictions_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      0.99      0.99      2079
weighted avg       1.00      1.00      1.00      2079



In [60]:
print("rf Score -> ",accuracy_score(predictions_rf, y_test)*100)
print("rf roc_auc Score -> ",roc_auc_score(y_test,predictions_rf))
print("rf f1 score ->", f1_score(y_test,predictions_rf))

rf Score ->  99.90379990379991
rf roc_auc Score ->  0.9944289618950641
rf f1 score -> 0.9994962216624685


# LGBM

In [61]:
import lightgbm

In [62]:
lgbm= lightgbm.LGBMClassifier()
lgbm_clf = lgbm.fit(x_train,y_train)
predictions_lgbm = lgbm_clf.predict(x_test)

In [63]:
print(classification_report(y_test,predictions_lgbm))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      1.00      1.00      2079
weighted avg       1.00      1.00      1.00      2079



In [64]:
print("rf Score -> ",accuracy_score(predictions_lgbm, y_test)*100)
print("rf roc_auc Score -> ",roc_auc_score(y_test,predictions_lgbm))
print("rf f1 score ->", f1_score(y_test,predictions_lgbm))

rf Score ->  99.95189995189995
rf roc_auc Score ->  0.9997481108312343
rf f1 score -> 0.999748047367095


# Running on Whole Dataset


In [65]:
whole_df = df1.drop(['Unnamed: 0'],axis=1)
whole_df

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,115799309.0,115799309,1
1,1,5840,20,113973933.0,113973933,1
2,1,5840,20,1.0,112,1
3,1,5840,20,105985004.0,105985004,1
4,1,5840,20,1.0,1,1
...,...,...,...,...,...,...
1582676,1,5840,20,1.0,1,1
1582677,1,5840,20,1.0,1,1
1582678,1,5840,20,1.0,1,1
1582679,1,5840,20,1.0,1,1


In [66]:
whole_y = whole_df['label']
whole_x = whole_df.drop(['label'],axis=1)

In [67]:
whole_x

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,115799309.0,115799309
1,1,5840,20,113973933.0,113973933
2,1,5840,20,1.0,112
3,1,5840,20,105985004.0,105985004
4,1,5840,20,1.0,1
...,...,...,...,...,...
1582676,1,5840,20,1.0,1
1582677,1,5840,20,1.0,1
1582678,1,5840,20,1.0,1
1582679,1,5840,20,1.0,1


In [68]:
whole_y

0          1
1          1
2          1
3          1
4          1
          ..
1582676    1
1582677    1
1582678    1
1582679    1
1582680    1
Name: label, Length: 1582681, dtype: int32

In [69]:
whole_predict_xgb = xgb_clf.predict(whole_x)

In [70]:
print(classification_report(whole_y,whole_predict_xgb))

              precision    recall  f1-score   support

           0       0.32      1.00      0.48       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.66      1.00      0.74   1582681
weighted avg       1.00      1.00      1.00   1582681



In [71]:
whole_predict_ada = ada_clf.predict(whole_x)
print(classification_report(whole_y,whole_predict_ada))

              precision    recall  f1-score   support

           0       0.50      1.00      0.66       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.75      1.00      0.83   1582681
weighted avg       1.00      1.00      1.00   1582681



In [72]:
whole_predict_rf = rf_clf.predict(whole_x)
print(classification_report(whole_y,whole_predict_rf))

              precision    recall  f1-score   support

           0       0.48      1.00      0.65       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.74      1.00      0.82   1582681
weighted avg       1.00      1.00      1.00   1582681



In [73]:
whole_predict_lgbm = lgbm_clf.predict(whole_x)
print(classification_report(whole_y,whole_predict_lgbm))

              precision    recall  f1-score   support

           0       0.49      1.00      0.66       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.75      1.00      0.83   1582681
weighted avg       1.00      1.00      1.00   1582681

