In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

le = LabelEncoder()

In [4]:
st_path = 'C:/Users/lenovo/DataSet/CIC-IDS-2017/GeneratedLabelledFlows/TrafficLabelling/'
st_file = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
encoding = 'utf_8'
df_data = pd.read_csv(os.path.join(st_path, st_file), encoding=encoding)
df_data.head()

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-104.16.207.165-54865-443-6,104.16.207.165,443,192.168.10.5,54865,6,7/7/2017 3:30,3,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,192.168.10.5-104.16.28.216-55054-80-6,104.16.28.216,80,192.168.10.5,55054,6,7/7/2017 3:30,109,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,192.168.10.5-104.16.28.216-55055-80-6,104.16.28.216,80,192.168.10.5,55055,6,7/7/2017 3:30,52,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,192.168.10.16-104.17.241.25-46236-443-6,104.17.241.25,443,192.168.10.16,46236,6,7/7/2017 3:30,34,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,192.168.10.5-104.19.196.102-54863-443-6,104.19.196.102,443,192.168.10.5,54863,6,7/7/2017 3:30,3,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [5]:
df_data.columns

Index(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Pa

In [7]:
df_features = df_data.drop(['Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp', ' Label', 'Flow Bytes/s', ' Flow Packets/s'], axis=1)  # Features
df_target = df_data[' Label']  # Target variable

In [8]:
# Iterate over the columns in the dataframe to check if they are strings
for st_col in df_features.columns:
    if df_features[st_col].dtypes not in ['int64', 'float64']:
        print(df_features[st_col].dtypes)
        df_features[st_col] = le.fit_transform(df_features[st_col])

In [9]:
lt_columns = df_features[df_features.columns[df_features.max() == np.inf]].columns

In [10]:
# modify infinite values (10 x max)
for st_column_inf in lt_columns:
    print(st_column_inf)
    df_column_aux = df_features[st_column_inf]
    # identify the max value
    vl_max_aux = df_column_aux[df_column_aux < np.inf].max()
    print(vl_max_aux)
    # .loc is important to modify the value in the dataframe
    df_features.loc[df_features[st_column_inf] == np.inf, st_column_inf] = 10*vl_max_aux

In [11]:
lt_columns = df_features[df_features.columns[df_features.max() == np.inf]].columns
print('columns inf', lt_columns)

columns inf Index([], dtype='object')


In [12]:

# Search for the columns with NaN values
for st_column_nan in df_features.columns:
    df_column_aux = df_features[df_features[st_column_nan].isna()].copy()
    if len(df_column_aux) > 0:
        print(df_column_aux.transpose())
        print(df_target[df_features[st_column_nan].isna()].transpose())
        print(st_column_nan)
        print('The total amount of NaNs are', len(df_features[df_features[st_column_nan].isna()]))
        print(df_features[st_column_nan].describe())
# Drop the rows with NaN values
df_features.dropna(inplace=True)
df_target = df_target[df_target.index.isin(df_features.index)]

In [13]:
# Scale numerical features
scaler = StandardScaler()
mt_features_scaled = scaler.fit_transform(df_features)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(mt_features_scaled,
                                                    df_target,
                                                    test_size=0.2,
                                                    random_state=42)

In [15]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [16]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      BENIGN       1.00      1.00      1.00     19405
        DDoS       1.00      1.00      1.00     25744

    accuracy                           1.00     45149
   macro avg       1.00      1.00      1.00     45149
weighted avg       1.00      1.00      1.00     45149



In [17]:
# Confusion Matrix
mt_results = np.concatenate((np.matrix(y_pred), np.matrix(y_test)))
df_results = pd.DataFrame(mt_results, index=['pred', 'test']).transpose()
df_results['equals'] = df_results['test'] == df_results['pred']

vl_equals = len(df_results[df_results['equals'] == True])
vl_len_data = len(df_results)
print('total', vl_equals, vl_len_data, vl_equals/vl_len_data)

df_ddos = df_results[df_results['test'] != 'BENIGN'].copy()
vl_equals = len(df_ddos[df_ddos['equals'] == True])
vl_len_data = len(df_ddos)
print('Attack', vl_equals, vl_len_data, vl_equals/vl_len_data)

total 45143 45149 0.9998671066911781
Attack 25741 25744 0.9998834679925419


In [18]:
df_results = pd.DataFrame(y_pred, columns=['pred'])
df_results = pd.concat((df_results, pd.DataFrame(y_test, columns=['test'])), axis=1)
df_results

Unnamed: 0,pred,test
0,DDoS,
1,BENIGN,
2,BENIGN,
3,DDoS,
4,BENIGN,
...,...,...
45144,DDoS,
45145,BENIGN,
45146,BENIGN,
45147,DDoS,
