In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score


In [2]:
df=pd.read_csv('F:\\Cyprus\\CSV-01-12\\01-12\\Syn.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
df.columns

Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,281052,172.16.0.5-192.168.50.1-53058-53058-6,172.16.0.5,53058,192.168.50.1,53058,6,2018-12-01 13:30:30.741451,115799309,19,...,646237.483665,1709809.0,1.0,14261170.0,3220326.0,21714933.0,11043464.0,0,1,Syn
1,450424,172.16.0.5-192.168.50.1-32237-32237-6,172.16.0.5,32237,192.168.50.1,32237,6,2018-12-01 13:30:30.741452,113973933,16,...,19.595918,49.0,1.0,16281980.0,2573891.0,20019405.0,11993631.0,0,1,Syn
2,182979,172.16.0.5-192.168.50.1-60495-9840-6,172.16.0.5,60495,192.168.50.1,9840,6,2018-12-01 13:30:30.741501,112,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn
3,41540,172.16.0.5-192.168.50.1-59724-59724-6,172.16.0.5,59724,192.168.50.1,59724,6,2018-12-01 13:30:30.741563,105985004,16,...,17.705259,48.0,1.0,15140710.0,3077366.0,20954123.0,11120336.0,0,1,Syn
4,358711,172.16.0.5-192.168.50.1-60496-32538-6,172.16.0.5,60496,192.168.50.1,32538,6,2018-12-01 13:30:30.741565,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,Syn


In [5]:
ack = df[' ACK Flag Count']
pd.unique(ack)


array([1, 0], dtype=int64)

In [6]:
init = df['Init_Win_bytes_forward']
pd.unique(init)
print(max(init))
print(min(init))

65535
-1


In [7]:
seg = df[' min_seg_size_forward']
pd.unique(seg)

array([20,  0, 28, 32, 40, 24], dtype=int64)

In [8]:
iat = df['Fwd IAT Total']
pd.unique(iat)
print(max(iat))
print(min(iat))
print(len(iat))

119999653.0
0.0
1582681


In [9]:
flow = df[' Flow Duration']
pd.unique(flow)
print(max(flow))
print(min(flow))
print(len(flow))

119999653
0
1582681


In [10]:
label = df[' Label']
pd.unique(label)
print(max(label))
print(min(label))
print(len(label))

Syn
BENIGN
1582681


In [11]:
data = [df[" ACK Flag Count"], df["Init_Win_bytes_forward"],df[" min_seg_size_forward"], df["Fwd IAT Total"],df[" Flow Duration"], df[" Label"]]

In [12]:
headers = ["ack", "init","seg","iat","flow","label"]

In [13]:
df1 = pd.concat(data, axis=1, keys=headers)

In [14]:
df1

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,115799309.0,115799309,Syn
1,1,5840,20,113973933.0,113973933,Syn
2,1,5840,20,1.0,112,Syn
3,1,5840,20,105985004.0,105985004,Syn
4,1,5840,20,1.0,1,Syn
...,...,...,...,...,...,...
1582676,1,5840,20,1.0,1,Syn
1582677,1,5840,20,1.0,1,Syn
1582678,1,5840,20,1.0,1,Syn
1582679,1,5840,20,1.0,1,Syn


In [15]:
df1.to_csv("syn_attributes.csv")


## Model Creation

In [16]:
df1=pd.read_csv('syn_attributes.csv')

In [17]:
from pycaret.classification import *
from sklearn.preprocessing import LabelEncoder

In [18]:
# label encoding for Y values
Encoder = LabelEncoder()
le = Encoder.fit(df1["label"])
df1["label"] = le.transform(df1["label"])

In [19]:
pd.unique(df1['label'])
# 1 = Syn
# 0 = Benign

array([1, 0])

In [49]:
df1

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow,label
0,0,1,5840,20,115799309.0,115799309,1
1,1,1,5840,20,113973933.0,113973933,1
2,2,1,5840,20,1.0,112,1
3,3,1,5840,20,105985004.0,105985004,1
4,4,1,5840,20,1.0,1,1
...,...,...,...,...,...,...,...
1582676,1582676,1,5840,20,1.0,1,1
1582677,1582677,1,5840,20,1.0,1,1
1582678,1582678,1,5840,20,1.0,1,1
1582679,1582679,1,5840,20,1.0,1,1


In [21]:
# sample 10000 points from majority class to create new dataset
df_maj = df1[df1['label']==1]
df_min = df1[df1['label']==0]
df_maj = df_maj.sample(n=10000)
df_maj = pd.concat([df_maj,df_min], axis=0)


In [22]:
len(df_maj)

10392

In [23]:
X = df_maj.drop('label',axis=1)
y = df_maj['label']

In [24]:
X

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow
1423032,1423032,1,5840,20,1.0,1
50600,50600,1,5840,20,1.0,1
488924,488924,1,5840,20,1.0,1
1079307,1079307,1,5840,20,1.0,1
523604,523604,1,5840,20,1.0,1
...,...,...,...,...,...,...
1479185,1479185,0,4253,20,499664.0,499664
1482435,1482435,0,4253,20,0.0,217
1482436,1482436,0,4253,20,0.0,194
1502348,1502348,0,-1,20,2.0,20808


In [25]:
X['init'].value_counts()

 5840     9990
-1         170
 8192       42
 256        18
 252        15
 64240      14
 254        13
 255        10
 246         9
 258         8
 0           7
 253         6
 1024        6
 16385       5
 4253        5
 237         5
 16402       4
 1892        4
 16224       4
 257         3
 1959        3
 2093        3
 238         3
 65535       3
 315         2
 2160        2
 581         2
 268         2
 16508       2
 16362       2
 16321       2
 16275       2
 16439       2
 325         2
 4272        2
 16476       2
 16425       2
 245         1
 419         1
 16346       1
 16560       1
 114         1
 16369       1
 2026        1
 16127       1
 607         1
 16324       1
 122         1
 121         1
 119         1
 297         1
 1825        1
 335         1
Name: init, dtype: int64

In [26]:
# train and test split from newly created dataset 
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=32, shuffle=True)

In [27]:
y_test.value_counts()

1    1985
0      94
Name: label, dtype: int64

In [28]:
# perform smote for the dataset to solve imbalance
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
x_train, y_train = oversample.fit_resample(x_train, y_train)


In [29]:
y_train.value_counts()

1    8015
0    8015
Name: label, dtype: int64

In [47]:
x_train

Unnamed: 0.1,Unnamed: 0,ack,init,seg,iat,flow
0,718149,1,5840,20,2.000000e+00,2
1,909726,1,8192,20,5.908991e+07,59089908
2,427660,1,5840,20,1.000000e+00,1
3,213114,1,5840,20,0.000000e+00,0
4,1364370,1,5840,20,1.000000e+00,1
...,...,...,...,...,...,...
16025,965645,0,-1,20,1.766674e+00,20793
16026,682444,0,-1,20,2.907547e-01,20803
16027,598054,0,250,20,8.235029e-01,34
16028,664113,0,-1,21,1.739008e+00,20824


In [31]:
# concatenate dataset for passing to pycaret
train_dataset = pd.concat([x_train,y_train],axis=1)
test_dataset = pd.concat([x_test,y_test],axis=1)

In [32]:
train_dataset.drop(['Unnamed: 0'],axis=1,inplace=True)
test_dataset.drop(['Unnamed: 0'],axis=1,inplace=True)

In [33]:
train_dataset

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,2.000000e+00,2,1
1,1,8192,20,5.908991e+07,59089908,0
2,1,5840,20,1.000000e+00,1,1
3,1,5840,20,0.000000e+00,0,1
4,1,5840,20,1.000000e+00,1,1
...,...,...,...,...,...,...
16025,0,-1,20,1.766674e+00,20793,0
16026,0,-1,20,2.907547e-01,20803,0
16027,0,250,20,8.235029e-01,34,0
16028,0,-1,21,1.739008e+00,20824,0


In [34]:
test_dataset

Unnamed: 0,ack,init,seg,iat,flow,label
1170874,1,5840,20,1.0,1,1
355771,1,5840,20,1.0,1,1
600192,1,5840,20,1.0,1,1
1231156,1,5840,20,1.0,1,1
5059,1,5840,20,102466405.0,102466405,1
...,...,...,...,...,...,...
968551,1,5840,20,0.0,0,1
1439621,1,5840,20,1.0,1,1
1419072,1,5840,20,1.0,1,1
641380,1,5840,20,1.0,1,1


In [35]:
len(train_dataset['init'].value_counts())

1859

In [36]:
# pycaret init
exp = setup(data = train_dataset, target='label', train_size = 1.0, preprocess = True, test_data = test_dataset, session_id=1, use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,1
1,Target,label
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(16030, 6)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [37]:
xgb_model = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9981,1.0,0.9988,0.9975,0.9981,0.9963,0.9963
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.9988,1.0,0.9975,1.0,0.9988,0.9975,0.9975
3,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
4,0.9988,1.0,0.9975,1.0,0.9988,0.9975,0.9975
5,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [38]:
tuned_xgb = tune_model(xgb_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9963
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
5,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
6,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988


In [39]:
ada_model = create_model('ada')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
4,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
5,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
tuned_ada = tune_model(ada_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
rf_model = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
3,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
4,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
tuned_rf = tune_model(rf_model, optimize = 'AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.9988,1.0,0.9975,1.0,0.9988,0.9975,0.9975
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9988,1.0,0.9975,1.0,0.9988,0.9975,0.9975
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
lgbm_model=create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9988,1.0,1.0,0.9975,0.9988,0.9975,0.9975
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
3,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
4,0.9994,1.0,1.0,0.9988,0.9994,0.9988,0.9988
5,0.9994,1.0,0.9988,1.0,0.9994,0.9988,0.9988
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
tuned_lgbm = tune_model(lgbm_model, optimize = 'AUC')

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.








KeyboardInterrupt: 

In [45]:
lr_model=create_model('lr') #JUST TO CHECK IF ITS SHOWING GOOD RESULTS FOR ALL

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7024,0.7737,0.919,0.6414,0.7555,0.4047,0.449
1,0.6918,0.763,0.9202,0.6318,0.7492,0.3835,0.4311
2,0.8185,0.8557,0.7007,0.9168,0.7943,0.637,0.6555
3,0.6962,0.7931,0.9377,0.6325,0.7554,0.3922,0.4479
4,0.6837,0.758,0.9065,0.6273,0.7415,0.3673,0.4102
5,0.6981,0.7554,0.9213,0.6368,0.7531,0.3963,0.4429
6,0.6831,0.7672,0.9226,0.6236,0.7442,0.3664,0.4174
7,0.6625,0.7623,0.9201,0.6071,0.7315,0.3252,0.3794
8,0.6962,0.7905,0.9101,0.6372,0.7496,0.3926,0.4343
9,0.6812,0.7411,0.9201,0.6225,0.7426,0.3626,0.4128


## Manual training

In [61]:
x_test.drop(['Unnamed: 0'],axis=1,inplace=True)
x_train.drop(['Unnamed: 0'],axis=1,inplace=True)

In [None]:
x_test

In [None]:
x_train

### Training Different Models

In [62]:
xgb = XGBClassifier(scale_pos_weight=1)
xgb_clf = xgb.fit(x_train,y_train)
predictions_xgb = xgb_clf.predict(x_test)



In [63]:
x_train

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,2.000000e+00,2
1,1,8192,20,5.908991e+07,59089908
2,1,5840,20,1.000000e+00,1
3,1,5840,20,0.000000e+00,0
4,1,5840,20,1.000000e+00,1
...,...,...,...,...,...
16025,0,-1,20,1.766674e+00,20793
16026,0,-1,20,2.907547e-01,20803
16027,0,250,20,8.235029e-01,34
16028,0,-1,21,1.739008e+00,20824


In [66]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions_xgb))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        94
           1       1.00      1.00      1.00      1985

    accuracy                           1.00      2079
   macro avg       0.99      1.00      1.00      2079
weighted avg       1.00      1.00      1.00      2079



In [None]:
print("xgb Score -> ",accuracy_score(predictions_xgb, y_test)*100)
print("xgb roc_auc Score -> ",roc_auc_score(y_test,predictions_xgb))
print("f1 score ->", f1_score(y_test,predictions_xgb))

# ADA BOOST

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada = AdaBoostClassifier()
ada_clf = ada.fit(x_train,y_train)
predictions_ada = ada_clf.predict(x_test)

In [None]:
print(classification_report(y_test,predictions_ada))

In [None]:
print("ada Score -> ",accuracy_score(predictions_ada, y_test)*100)
print("ada roc_auc Score -> ",roc_auc_score(y_test,predictions_ada))
print("ada f1 score ->", f1_score(y_test,predictions_ada))

# RF

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
rf_clf = rf.fit(x_train,y_train)
predictions_rf = rf_clf.predict(x_test)

In [None]:
print(classification_report(y_test,predictions_rf))

In [None]:
print("rf Score -> ",accuracy_score(predictions_rf, y_test)*100)
print("rf roc_auc Score -> ",roc_auc_score(y_test,predictions_rf))
print("rf f1 score ->", f1_score(y_test,predictions_rf))

# LGBM

In [None]:
import lightgbm

In [None]:
lgbm= lightgbm.LGBMClassifier()
lgbm_clf = lgbm.fit(x_train,y_train)
predictions_lgbm = lgbm_clf.predict(x_test)

In [None]:
print(classification_report(y_test,predictions_lgbm))

In [None]:
print("rf Score -> ",accuracy_score(predictions_lgbm, y_test)*100)
print("rf roc_auc Score -> ",roc_auc_score(y_test,predictions_lgbm))
print("rf f1 score ->", f1_score(y_test,predictions_lgbm))

# Running on Whole Dataset


In [52]:
whole_df = df1.drop(['Unnamed: 0'],axis=1)
whole_df

Unnamed: 0,ack,init,seg,iat,flow,label
0,1,5840,20,115799309.0,115799309,1
1,1,5840,20,113973933.0,113973933,1
2,1,5840,20,1.0,112,1
3,1,5840,20,105985004.0,105985004,1
4,1,5840,20,1.0,1,1
...,...,...,...,...,...,...
1582676,1,5840,20,1.0,1,1
1582677,1,5840,20,1.0,1,1
1582678,1,5840,20,1.0,1,1
1582679,1,5840,20,1.0,1,1


In [54]:
whole_y = whole_df['label']
whole_x = whole_df.drop(['label'],axis=1)

In [55]:
whole_x

Unnamed: 0,ack,init,seg,iat,flow
0,1,5840,20,115799309.0,115799309
1,1,5840,20,113973933.0,113973933
2,1,5840,20,1.0,112
3,1,5840,20,105985004.0,105985004
4,1,5840,20,1.0,1
...,...,...,...,...,...
1582676,1,5840,20,1.0,1
1582677,1,5840,20,1.0,1
1582678,1,5840,20,1.0,1
1582679,1,5840,20,1.0,1


In [56]:
whole_y

0          1
1          1
2          1
3          1
4          1
          ..
1582676    1
1582677    1
1582678    1
1582679    1
1582680    1
Name: label, Length: 1582681, dtype: int32

In [64]:
whole_predict_xgb = xgb_clf.predict(whole_x)

In [68]:
print(classification_report(whole_y,whole_predict_xgb))

              precision    recall  f1-score   support

           0       0.55      0.99      0.70       392
           1       1.00      1.00      1.00   1582289

    accuracy                           1.00   1582681
   macro avg       0.77      1.00      0.85   1582681
weighted avg       1.00      1.00      1.00   1582681

