In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score


In [3]:
df=pd.read_csv('F:\\Cyprus\\CSV-01-12\\01-12\\Syn.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
df.columns

Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,281052,172.16.0.5-192.168.50.1-53058-53058-6,172.16.0.5,53058,192.168.50.1,53058,6,2018-12-01 13:30:30.741451,115799309,19,...,646237.483665,1709809.0,1.0,14261170.0,3220326.0,21714933.0,11043464.0,0,1,Syn
1,450424,172.16.0.5-192.168.50.1-32237-32237-6,172.16.0.5,32237,192.168.50.1,32237,6,2018-12-01 13:30:30.741452,113973933,16,...,19.595918,49.0,1.0,16281980.0,2573891.0,20019405.0,11993631.0,0,1,Syn
2,182979,172.16.0.5-192.168.50.1-60495-9840-6,172.16.0.5,60495,192.168.50.1,9840,6,2018-12-01 13:30:30.741501,112,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn
3,41540,172.16.0.5-192.168.50.1-59724-59724-6,172.16.0.5,59724,192.168.50.1,59724,6,2018-12-01 13:30:30.741563,105985004,16,...,17.705259,48.0,1.0,15140710.0,3077366.0,20954123.0,11120336.0,0,1,Syn
4,358711,172.16.0.5-192.168.50.1-60496-32538-6,172.16.0.5,60496,192.168.50.1,32538,6,2018-12-01 13:30:30.741565,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn


In [6]:
ack = df[' ACK Flag Count']
pd.unique(ack)


array([1, 0], dtype=int64)

In [7]:
init = df['Init_Win_bytes_forward']
pd.unique(init)
print(max(init))
print(min(init))

65535
-1


In [8]:
seg = df[' min_seg_size_forward']
pd.unique(seg)

array([20,  0, 28, 32, 40, 24], dtype=int64)

In [9]:
iat = df['Fwd IAT Total']
pd.unique(iat)
print(max(iat))
print(min(iat))
print(len(iat))

119999653.0
0.0
1582681


In [10]:
flow = df[' Flow Duration']
pd.unique(flow)
print(max(flow))
print(min(flow))
print(len(flow))

119999653
0
1582681


In [11]:
label = df[' Label']
pd.unique(label)
print(max(label))
print(min(label))
print(len(label))

Syn
BENIGN
1582681


In [12]:
data = [df[" ACK Flag Count"], df["Init_Win_bytes_forward"],df[" min_seg_size_forward"], df["Fwd IAT Total"],df[" Flow Duration"], df[" Label"]]

In [13]:
headers = ["ack", "init","seg","iat","flow","label"]

In [14]:
df1 = pd.concat(data, axis=1, keys=headers)

In [15]:
df1

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,115799309.0,115799309,Syn
1,1,5840,20,113973933.0,113973933,Syn
2,1,5840,20,1.0,112,Syn
3,1,5840,20,105985004.0,105985004,Syn
4,1,5840,20,1.0,1,Syn
...,...,...,...,...,...,...
1582676,1,5840,20,1.0,1,Syn
1582677,1,5840,20,1.0,1,Syn
1582678,1,5840,20,1.0,1,Syn
1582679,1,5840,20,1.0,1,Syn


In [16]:
df1.to_csv("syn_attributes.csv")


## Model Creation

In [46]:
df1=pd.read_csv('syn_attributes.csv')

In [3]:
from pycaret.classification import *
from sklearn.preprocessing import LabelEncoder

In [47]:
# label encoding for Y values
Encoder = LabelEncoder()
le = Encoder.fit(df1["label"])
df1["label"] = le.transform(df1["label"])

In [48]:
pd.unique(df1['label'])
# 1 = Syn
# 0 = Benign

array([1, 0])

In [49]:
df1

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow,label
0,0,1,5840,20,115799309.0,115799309,1
1,1,1,5840,20,113973933.0,113973933,1
2,2,1,5840,20,1.0,112,1
3,3,1,5840,20,105985004.0,105985004,1
4,4,1,5840,20,1.0,1,1
...,...,...,...,...,...,...,...
1582676,1582676,1,5840,20,1.0,1,1
1582677,1582677,1,5840,20,1.0,1,1
1582678,1582678,1,5840,20,1.0,1,1
1582679,1582679,1,5840,20,1.0,1,1


In [51]:
# sample 10000 points from majority class to create new dataset
df_maj = df1[df1['label']==1]
df_min = df1[df1['label']==0]
df_maj = df_maj.sample(n=10000)
df_maj = pd.concat([df_maj,df_min], axis=0)


In [52]:
len(df_maj)

10392

In [53]:
X = df_maj.drop('label',axis=1)
y = df_maj['label']

In [54]:
X

Unnamed: 0,ack,init,seg,iat,flow
712130,1,5840,20,1.0,1
206107,1,5840,20,1.0,1
763942,1,5840,20,1.0,1
1038239,1,5840,20,1.0,1
299493,1,5840,20,1.0,1
...,...,...,...,...,...
1479185,0,4253,20,499664.0,499664
1482435,0,4253,20,0.0,217
1482436,0,4253,20,0.0,194
1502348,0,-1,20,2.0,20808


In [55]:
X['init'].value_counts()

 5840     9995
-1         169
 8192       42
 256        18
 252        15
 64240      14
 254        13
 255        10
 246         9
 258         8
 1024        6
 253         6
 4253        5
 16385       5
 237         5
 16224       4
 16402       4
 1892        4
 0           3
 238         3
 2093        3
 1959        3
 65535       3
 257         3
 16362       2
 325         2
 16425       2
 2160        2
 16439       2
 16321       2
 581         2
 16508       2
 16275       2
 315         2
 16476       2
 4272        2
 268         2
 245         1
 2026        1
 16127       1
 607         1
 335         1
 16560       1
 119         1
 16369       1
 121         1
 114         1
 297         1
 1825        1
 419         1
 16346       1
 122         1
 16324       1
Name: init, dtype: int64

In [56]:
# train and test split from newly created dataset 
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=32, shuffle=True)

In [57]:
y_test.value_counts()

1    1985
0      94
Name: label, dtype: int64

In [58]:
# perform smote for the dataset to solve imbalance
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
x_train, y_train = oversample.fit_resample(x_train, y_train)


In [59]:
y_train.value_counts()

0    8015
1    8015
Name: label, dtype: int64

In [60]:
x_train

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,1.000000e+00,110
1,1,8192,20,5.908991e+07,59089908
2,1,5840,20,1.000000e+00,1
3,1,5840,20,1.000000e+00,1
4,1,5840,20,1.019999e+08,101999853
...,...,...,...,...,...
16025,0,-1,20,1.000000e+00,20992
16026,0,16352,20,0.000000e+00,54839
16027,1,8192,20,1.089716e+07,15885725
16028,0,7324,20,5.992063e+07,59925068


In [61]:
# concatenate dataset for passing to pycaret
train_dataset = pd.concat([x_train,y_train],axis=1)
test_dataset = pd.concat([x_test,y_test],axis=1)

In [63]:
# train_dataset.drop(['Unnamed: 0'],axis=1,inplace=True)
# test_dataset.drop(['Unnamed: 0'],axis=1,inplace=True)

In [64]:
train_dataset

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,1.000000e+00,110,1
1,1,8192,20,5.908991e+07,59089908,0
2,1,5840,20,1.000000e+00,1,1
3,1,5840,20,1.000000e+00,1,1
4,1,5840,20,1.019999e+08,101999853,1
...,...,...,...,...,...,...
16025,0,-1,20,1.000000e+00,20992,0
16026,0,16352,20,0.000000e+00,54839,0
16027,1,8192,20,1.089716e+07,15885725,0
16028,0,7324,20,5.992063e+07,59925068,0


In [65]:
test_dataset

Unnamed: 0,ack,init,seg,iat,flow,label
827752,1,5840,20,1.0,1,1
258170,1,5840,20,1.0,1,1
253367,1,5840,20,1.0,1,1
1139216,1,5840,20,1.0,1,1
538790,1,5840,20,1.0,1,1
...,...,...,...,...,...,...
1541531,1,5840,20,1.0,1,1
138172,1,5840,20,1.0,1,1
970390,1,5840,20,0.0,0,1
1470023,1,5840,20,1.0,1,1


In [21]:
len(train_dataset['init'].value_counts())

1852

In [22]:
# pycaret init
exp = setup(data = train_dataset, target='label', train_size = 1.0, preprocess = True, test_data = test_dataset, session_id=1, use_gpu=True)

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
ack,Categorical
init,Numeric
seg,Numeric
iat,Numeric
flow,Numeric
label,Label


 quit


SystemExit: Read the documentation of setup to learn how to overwrite data types over the inferred types. setup function must run again before you continue modeling.

In [38]:
xgb_model = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9996,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
tuned_xgb = tune_model(xgb_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9998,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9999,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
ada_model = create_model('ada')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9963
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9992,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9993,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
tuned_ada = tune_model(ada_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9998,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9999,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
rf_model = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9994,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,0.9994,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
tuned_rf = tune_model(rf_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,1.0,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
lgbm_model=create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9963
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9988,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [45]:
tuned_lgbm = tune_model(lgbm_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9988,0.9996,0.9975,1.0,0.9988,0.9975,0.9975
4,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9994,0.9999,0.9988,1.0,0.9994,0.9988,0.9988
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988


In [46]:
lr_model=create_model('lr') #JUST TO CHECK IF ITS SHOWING GOOD RESULTS FOR ALL

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7049,0.7713,0.9102,0.6454,0.7553,0.4097,0.4493
1,0.6999,0.7564,0.9214,0.6387,0.7545,0.3997,0.4459
2,0.7031,0.7585,0.9339,0.6391,0.7589,0.4059,0.4577
3,0.6887,0.7677,0.914,0.6303,0.7461,0.3772,0.4226
4,0.8334,0.8879,0.7282,0.9226,0.8139,0.6669,0.6823
5,0.6725,0.764,0.9176,0.6156,0.7368,0.3452,0.396
6,0.6744,0.7516,0.9226,0.6163,0.739,0.3489,0.4019
7,0.8135,0.8729,0.6941,0.9115,0.7881,0.6269,0.6455
8,0.6837,0.7619,0.9026,0.6276,0.7404,0.3676,0.4089
9,0.8203,0.8794,0.7154,0.9052,0.7992,0.6406,0.6552


## Manual training

In [47]:
x_test.drop(['Unnamed: 0'],axis=1,inplace=True)
x_train.drop(['Unnamed: 0'],axis=1,inplace=True)

In [48]:
x_test

Unnamed: 0,ack,init,seg,iat,flow
904807,1,5840,20,80232156.0,80232156
1093463,1,5840,20,1.0,1
1126558,1,5840,20,1.0,1
1536489,1,5840,20,1.0,1
842556,1,5840,20,1.0,1
...,...,...,...,...,...
1462950,1,5840,20,1.0,1
96285,1,5840,20,1.0,1
928965,1,5840,20,1.0,1
484783,1,5840,20,1.0,53


In [44]:
x_train

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow
0,529784,1,5840,20,1.000000e+00,1
1,909726,1,8192,20,5.908991e+07,59089908
2,1139835,1,5840,20,1.000000e+00,1
3,1341910,1,5840,20,1.000000e+00,1
4,646999,1,5840,20,1.000000e+00,1
...,...,...,...,...,...,...
16025,1481320,0,2960,20,0.000000e+00,131
16026,337934,0,-1,20,1.558596e+00,20826
16027,965751,0,100,20,1.587907e+00,12262
16028,851445,1,8192,20,1.072399e+07,15713638


### Training Different Models

In [66]:
xgb = XGBClassifier(scale_pos_weight=1)
xgb_clf = xgb.fit(x_train,y_train)
predictions_xgb = xgb_clf.predict(x_test)





In [67]:
x_train

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,1.000000e+00,110
1,1,8192,20,5.908991e+07,59089908
2,1,5840,20,1.000000e+00,1
3,1,5840,20,1.000000e+00,1
4,1,5840,20,1.019999e+08,101999853
...,...,...,...,...,...
16025,0,-1,20,1.000000e+00,20992
16026,0,16352,20,0.000000e+00,54839
16027,1,8192,20,1.089716e+07,15885725
16028,0,7324,20,5.992063e+07,59925068


In [68]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions_xgb))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.98      1.00      0.99      2079
weighted avg       1.00      1.00      1.00      2079



In [69]:
print("xgb Score -> ",accuracy_score(predictions_xgb, y_test)*100)
print("xgb roc_auc Score -> ",roc_auc_score(y_test,predictions_xgb))
print("f1 score ->", f1_score(y_test,predictions_xgb))

xgb Score ->  99.85569985569985
xgb roc_auc Score ->  0.9992443324937028
f1 score -> 0.9992437610284849


In [70]:
print(classification_report(y_test,predictions_xgb))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.98      1.00      0.99      2079
weighted avg       1.00      1.00      1.00      2079



In [71]:
from sklearn.metrics import confusion_matrix

In [72]:
conf_mat_xgb = confusion_matrix(y_test,predictions_xgb)
print(conf_mat_xgb)

[[  94    0]
 [   3 1982]]


# ADA BOOST

In [84]:
from sklearn.ensemble import AdaBoostClassifier

In [85]:
ada = AdaBoostClassifier()
ada_clf = ada.fit(x_train,y_train)
predictions_ada = ada_clf.predict(x_test)

In [86]:
print(classification_report(y_test,predictions_ada))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      0.99      0.99      2079
weighted avg       1.00      1.00      1.00      2079



In [87]:
print("ada Score -> ",accuracy_score(predictions_ada, y_test)*100)
print("ada roc_auc Score -> ",roc_auc_score(y_test,predictions_ada))
print("ada f1 score ->", f1_score(y_test,predictions_ada))

ada Score ->  99.85569985569985
ada roc_auc Score ->  0.9941770727262983
ada f1 score -> 0.9992441421012849


# RF

In [92]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
rf_clf = rf.fit(x_train,y_train)
predictions_rf = rf_clf.predict(x_test)

In [93]:
print(classification_report(y_test,predictions_rf))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      0.99      0.99      2079
weighted avg       1.00      1.00      1.00      2079



In [94]:
print("rf Score -> ",accuracy_score(predictions_rf, y_test)*100)
print("rf roc_auc Score -> ",roc_auc_score(y_test,predictions_rf))
print("rf f1 score ->", f1_score(y_test,predictions_rf))

rf Score ->  99.85569985569985
rf roc_auc Score ->  0.9941770727262983
rf f1 score -> 0.9992441421012849


# LGBM

In [95]:
import lightgbm

In [96]:
lgbm= lightgbm.LGBMClassifier()
lgbm_clf = lgbm.fit(x_train,y_train)
predictions_lgbm = lgbm_clf.predict(x_test)

In [97]:
print(classification_report(y_test,predictions_lgbm))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      1.00      0.99      2079
weighted avg       1.00      1.00      1.00      2079



In [98]:
print("rf Score -> ",accuracy_score(predictions_lgbm, y_test)*100)
print("rf roc_auc Score -> ",roc_auc_score(y_test,predictions_lgbm))
print("rf f1 score ->", f1_score(y_test,predictions_lgbm))

rf Score ->  99.90379990379991
rf roc_auc Score ->  0.9994962216624685
rf f1 score -> 0.9994959677419354


# Running on Whole Dataset


In [74]:
whole_df = df1
whole_df

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,115799309.0,115799309,1
1,1,5840,20,113973933.0,113973933,1
2,1,5840,20,1.0,112,1
3,1,5840,20,105985004.0,105985004,1
4,1,5840,20,1.0,1,1
...,...,...,...,...,...,...
1582676,1,5840,20,1.0,1,1
1582677,1,5840,20,1.0,1,1
1582678,1,5840,20,1.0,1,1
1582679,1,5840,20,1.0,1,1


In [75]:
whole_y = whole_df['label']
whole_x = whole_df.drop(['label'],axis=1)

In [76]:
whole_x

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,115799309.0,115799309
1,1,5840,20,113973933.0,113973933
2,1,5840,20,1.0,112
3,1,5840,20,105985004.0,105985004
4,1,5840,20,1.0,1
...,...,...,...,...,...
1582676,1,5840,20,1.0,1
1582677,1,5840,20,1.0,1
1582678,1,5840,20,1.0,1
1582679,1,5840,20,1.0,1


In [77]:
whole_x

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,115799309.0,115799309
1,1,5840,20,113973933.0,113973933
2,1,5840,20,1.0,112
3,1,5840,20,105985004.0,105985004
4,1,5840,20,1.0,1
...,...,...,...,...,...
1582676,1,5840,20,1.0,1
1582677,1,5840,20,1.0,1
1582678,1,5840,20,1.0,1
1582679,1,5840,20,1.0,1


In [78]:
whole_predict_xgb = xgb_clf.predict(whole_x)

In [89]:
print(classification_report(whole_y,whole_predict_xgb))
conf_mat_xgb = confusion_matrix(whole_y,whole_predict_xgb)
print(conf_mat_xgb)

              precision    recall  f1-score   support

           0       0.31      1.00      0.48       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.66      1.00      0.74   1582681
weighted avg       1.00      1.00      1.00   1582681

[[    392       0]
 [    863 1581426]]


In [91]:
whole_predict_ada = ada_clf.predict(whole_x)
print(classification_report(whole_y,whole_predict_ada))
conf_mat_xgb = confusion_matrix(whole_y,whole_predict_ada)
print(conf_mat_xgb)

              precision    recall  f1-score   support

           0       0.46      1.00      0.63       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.73      1.00      0.81   1582681
weighted avg       1.00      1.00      1.00   1582681

[[    391       1]
 [    466 1581823]]


In [99]:
whole_predict_rf = rf_clf.predict(whole_x)
print(classification_report(whole_y,whole_predict_rf))
conf_mat_xgb = confusion_matrix(whole_y,whole_predict_rf)
print(conf_mat_xgb)

              precision    recall  f1-score   support

           0       0.45      1.00      0.62       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.72      1.00      0.81   1582681
weighted avg       1.00      1.00      1.00   1582681

[[    391       1]
 [    479 1581810]]


In [100]:
whole_predict_lgbm = lgbm_clf.predict(whole_x)
print(classification_report(whole_y,whole_predict_lgbm))
conf_mat_xgb = confusion_matrix(whole_y,whole_predict_lgbm)
print(conf_mat_xgb)

              precision    recall  f1-score   support

           0       0.46      1.00      0.63       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.73      1.00      0.81   1582681
weighted avg       1.00      1.00      1.00   1582681

[[    392       0]
 [    467 1581822]]
