### Check input files



In [143]:
import numpy as np
import pandas as pd

import os
# List every file found under the datasets directory, one full path per line.
for dirname, _, filenames in os.walk('../datasets'):
    paths = (os.path.join(dirname, filename) for filename in filenames)
    for path in paths:
        print(path)


../datasets/sample.csv
../datasets/firstDataset.csv


### Import train data and create DataFrame object

In [144]:
# Load the raw CSV into the DataFrame that every later cell rebinds and transforms.
numeric_data = pd.read_csv('../datasets/firstDataset.csv')
print("The files have been imported")


The files have been imported


### Find and display unique data
#### This function searches a DataFrame and displays the unique values of each feature. Thanks to this, you can find features that share the same set of values, e.g. `OnlineSecurity`, which contains the answers 'Yes', 'No' and 'No internet service'.

In [None]:
def printUniqueValues(dataToCheck):
    """Print, for every column, its distinct values and how many there are.

    One line is written per column in the form
    ``<column> = <array of unique values> = <count>``. Nothing is returned.
    """
    for name in dataToCheck:
        distinct = dataToCheck[name].unique()
        print(f"{name} = {distinct} = {len(distinct)}")


# Print the distinct values of every column of the loaded dataset.
printUniqueValues(numeric_data)

### DataFrame information
#### The `info()` method provides basic information about the features, such as each feature's amount of data (how many values are NaN) and its data type (e.g. int or float). This is very important for later data processing because, for example, features that contain values of type 'int' will be labeled differently than features that contain values of type 'float'.

In [None]:
# Summarise the raw frame: column names, non-null counts and dtypes.
numeric_data.info()


### Preprocess training data.

In [None]:
# Show the frame as loaded, before any column is dropped.
display(numeric_data)

Drop unnecessary features

In [145]:
def dropUnnecessaryFeatures(dataToTransform):
    """Return a new DataFrame without the timestamp and the Min/Max columns.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame that contains every column listed below. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The input without the listed columns; remaining columns keep their
        original order.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``dataToTransform``.
    """
    unnecessaryFeatures = [
        'Timestamp',
        'Fwd Pkt Len Min', 'Fwd Pkt Len Max',
        'Bwd Pkt Len Min', 'Bwd Pkt Len Max',
        'Flow IAT Min', 'Flow IAT Max',
        'Fwd IAT Max', 'Fwd IAT Min',
        'Bwd IAT Max', 'Bwd IAT Min',
        'Pkt Len Min', 'Pkt Len Max',
        'Idle Max', 'Idle Min',
        'Active Max', 'Active Min',
    ]

    # NOTE(review): the original listed the following columns in a comment
    # without dropping them - presumably further candidates; confirm:
    # "Protocol","PSH Flag Cnt","Init Fwd Win Byts","Flow Byts/s","Flow Pkts/s"

    # A single drop builds one new frame instead of seventeen intermediate
    # copies of the whole data set.
    return dataToTransform.drop(columns=unnecessaryFeatures)


# Rebind numeric_data to the reduced frame and show the result.
numeric_data = dropUnnecessaryFeatures(numeric_data)
display(numeric_data)


Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Mean,...,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Idle Mean,Idle Std,Label
0,0,0,112641719,3,0,0,0,0.000000,0.000000,0.000000,...,0,-1,-1,0,0,0.0,0.0,56320859.5,139.300036,Benign
1,0,0,112641466,3,0,0,0,0.000000,0.000000,0.000000,...,0,-1,-1,0,0,0.0,0.0,56320733.0,114.551299,Benign
2,0,0,112638623,3,0,0,0,0.000000,0.000000,0.000000,...,0,-1,-1,0,0,0.0,0.0,56319311.5,301.934596,Benign
3,22,6,6453966,15,10,1239,2273,82.600000,196.741237,227.300000,...,2273,65535,233,6,32,0.0,0.0,0.0,0.000000,Benign
4,22,6,8804066,14,11,1143,2209,81.642857,203.745545,200.818182,...,2209,5808,233,6,32,0.0,0.0,0.0,0.000000,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,80,6,10156986,5,5,1089,1923,217.800000,299.745225,384.600000,...,1923,8192,31111,2,20,0.0,0.0,0.0,0.000000,Benign
1048571,80,6,117,2,0,0,0,0.000000,0.000000,0.000000,...,0,64240,-1,0,20,0.0,0.0,0.0,0.000000,Benign
1048572,80,6,5095331,3,1,0,0,0.000000,0.000000,0.000000,...,0,8192,29200,0,20,0.0,0.0,0.0,0.000000,Benign
1048573,80,6,5235511,3,1,0,0,0.000000,0.000000,0.000000,...,0,8192,42780,0,20,0.0,0.0,0.0,0.000000,Benign


In [None]:

# print(numeric_data['Init Bwd Win Byts'])


In [None]:
# Inspect the raw values of a single feature.
print(numeric_data['Init Fwd Win Byts'])


### Heatmap
#### The heatmap shows how strongly the individual features are interdependent. Based on the heatmap, we dropped features that are highly correlated with each other, because such data do not provide additional information and can increase the model load. For example, we drop `TotalCharges` because it has a high correlation with `MonthlyCharges`.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlate the numeric columns only. At this point the frame still contains
# the string-valued `Label` column, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions silently skipped such columns).
f, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(numeric_data.select_dtypes(include='number').corr().abs(), annot=True,
            linewidths=0.5, fmt='.2f', ax=ax)


#### Process labels

#### Before preprocess

In [147]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
import plotly.express as px
init_notebook_mode(connected=True)


def showAttackDistribution(data):
    """Draw a bar chart with the number of rows per label.

    Parameters
    ----------
    data : pandas.Series
        Label column; one bar is drawn for each distinct value it contains.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    # Count once and keep every count paired with its own label.
    # value_counts() orders its result by frequency, so attaching hard-coded
    # names to positions 0, 1 and 2 mislabels the bars whenever the ranking
    # differs from the assumed one (for example on balanced data).
    labelCounts = data.value_counts()
    fig = go.Figure(data=[
        go.Bar(name=str(label),
               y=[int(count)],
               x=[str(label)],
               text=[int(count)],
               orientation='v',
               textposition='outside',)
        for label, count in labelCounts.items()
    ])
    fig.update_layout(
        width=800,
        height=600,
        title='Labels Distribution',
        yaxis_title='Number of attacks',
        xaxis_title='Attack Name',)
    iplot(fig)


In [None]:
# Label distribution of the full data set, before it is shortened.
showAttackDistribution(numeric_data['Label'])

In [146]:
def shortData(data, featureName, maxPerClass=5000,
              classLabels=("Benign", "FTP-BruteForce", "SSH-Bruteforce")):
    """Keep at most ``maxPerClass`` rows for each requested class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shorten. It is not modified.
    featureName : str
        Name of the column holding the class label.
    maxPerClass : int, optional
        Upper bound on the rows kept per class (default 5000).
    classLabels : iterable, optional
        Label values to keep, in the order they should appear in the result.
        Rows with any other label are discarded.

    Returns
    -------
    pandas.DataFrame
        The selected rows, grouped by class in ``classLabels`` order, with
        their original index values.

    Notes
    -----
    The FIRST ``maxPerClass`` rows of each class are taken (no shuffling).
    """
    subsets = [data[data[featureName] == label][:maxPerClass]
               for label in classLabels]
    if not subsets:
        # pd.concat rejects an empty list; return an empty frame with the
        # same columns instead.
        return data.iloc[0:0]
    return pd.concat(subsets, axis=0)


#### After preprocess

In [148]:
# Shorten the data set per class, then plot the label distribution again.
numeric_data = shortData(numeric_data, 'Label')
showAttackDistribution(numeric_data['Label'])

In [149]:
from sklearn.preprocessing import LabelEncoder


def makeOnlyAttackOrNot(dataToTransform):
    """Return a copy of the frame with ``Label`` reduced to a binary target.

    ``'Benign'`` becomes 0 and every other value becomes 1.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame with a ``Label`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``Label`` column holds 0/1 integers.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    encoded = dataToTransform.copy()
    encoded['Label'] = encoded['Label'].map(
        lambda value: 0 if value == 'Benign' else 1)
    return encoded

In [150]:
# Collapse the labels into 0 (benign) / 1 (attack).
# NOTE: run this cell only once - on already-encoded labels neither 0 nor 1
# equals 'Benign', so a second run would turn every label into 1.
numeric_data = makeOnlyAttackOrNot(numeric_data)



# unique, counts = np.unique(numeric_data['Label'], return_counts=True)
# dict(zip(unique, counts))

In [None]:
# numeric_data.info()

# numeric_data.to_csv('test.csv', index=False)
# print("Ready")


In [None]:
# numeric_data['Label']

Save label

In [155]:
# Zero out every missing or non-finite entry. Both +inf and -inf are first
# turned into NaN so that negative infinity is handled too, then all NaN
# values are filled with 0.
numeric_data = numeric_data.replace([np.inf, -np.inf], np.nan).fillna(0)
numeric_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 184445
Data columns (total 62 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dst Port           15000 non-null  int64  
 1   Protocol           15000 non-null  int64  
 2   Flow Duration      15000 non-null  int64  
 3   Tot Fwd Pkts       15000 non-null  int64  
 4   Tot Bwd Pkts       15000 non-null  int64  
 5   TotLen Fwd Pkts    15000 non-null  int64  
 6   TotLen Bwd Pkts    15000 non-null  int64  
 7   Fwd Pkt Len Mean   15000 non-null  float64
 8   Fwd Pkt Len Std    15000 non-null  float64
 9   Bwd Pkt Len Mean   15000 non-null  float64
 10  Bwd Pkt Len Std    15000 non-null  float64
 11  Flow Byts/s        15000 non-null  float64
 12  Flow Pkts/s        15000 non-null  float64
 13  Flow IAT Mean      15000 non-null  float64
 14  Flow IAT Std       15000 non-null  float64
 15  Fwd IAT Tot        15000 non-null  int64  
 16  Fwd IAT Mean       15

In [151]:
# Keep the target in its own Series and remove it from the feature frame.
labels = numeric_data['Label']
numeric_data = numeric_data.drop(['Label'], axis=1)

In [None]:
# labels = LabelEncoder().fit_transform(labels)

In [156]:
# numeric_data = numeric_data.drop(['Label'], axis=1)
# Sanity check on the feature frame: prints whether any NaN is present and
# whether every value is finite.
print(np.any(np.isnan(numeric_data)))
print(np.all(np.isfinite(numeric_data)))

False
True


In [157]:
from sklearn.preprocessing import StandardScaler


def scale(dataToTransform, columntoTransform):
    """Standardise the given columns and return the result as a new DataFrame.

    Every listed column is fitted and transformed with its own
    ``StandardScaler``; the fitted scalers are not kept.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame holding the columns to scale. It is not modified.
    columntoTransform : iterable
        Names of the columns to standardise.

    Returns
    -------
    pandas.DataFrame
        Copy of the input in which the listed columns are replaced by their
        standardised values.
    """
    # Work on a copy so the caller's frame is not rewritten in place.
    scaled = dataToTransform.copy()

    for colKey in columntoTransform:
        columnValues = np.asarray(scaled[colKey]).reshape(-1, 1)
        # The scaler works on a 2-D (n, 1) array; flatten its result back to
        # 1-D before storing it as a column.
        scaled[colKey] = StandardScaler().fit_transform(columnValues).ravel()

    return scaled


In [158]:
# numericColumntoTransform = ['Flow Duration',
#                             'Init Fwd Win Byts', 'Init Bwd Win Byts']
# Standardise every column that is still in the feature frame.
numericColumntoTransform = numeric_data.keys()
numeric_data = scale(numeric_data, numericColumntoTransform)


In [159]:
def lowerTypes(dataToTransform, columnToFloat):
    """Return a new DataFrame with the given columns cast to ``float32``.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame holding the columns to cast. It is not modified.
    columnToFloat : iterable
        Names of the columns to convert; all other columns keep their dtype.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the listed columns stored as ``float32``.

    Raises
    ------
    KeyError
        If a listed column does not exist in ``dataToTransform``.
    """
    # One astype call with a column -> dtype mapping converts all columns at
    # once and leaves the caller's frame untouched.
    return dataToTransform.astype({colKey: 'float32' for colKey in columnToFloat})

# Store every scaled feature as float32.
numeric_data = lowerTypes(numeric_data, numericColumntoTransform)

In [160]:
# def removeNan(dataToTransform, columntoTransform):
#     for colKey in columntoTransform:
#         dataToTransform[colKey] = pd.to_numeric(
#             dataToTransform[colKey], downcast="float", errors='coerce').fillna(dataToTransform[colKey][100])
#     return dataToTransform

# numeric_data = removeNan(numeric_data, numeric_data.keys())

# numeric_data = numeric_data.drop(['11'], axis=1)
# numeric_data = numeric_data.drop(['12'], axis=1)
# Show column dtypes and non-null counts after the float32 cast.
numeric_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 184445
Data columns (total 62 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dst Port           15000 non-null  float32
 1   Protocol           15000 non-null  float32
 2   Flow Duration      15000 non-null  float32
 3   Tot Fwd Pkts       15000 non-null  float32
 4   Tot Bwd Pkts       15000 non-null  float32
 5   TotLen Fwd Pkts    15000 non-null  float32
 6   TotLen Bwd Pkts    15000 non-null  float32
 7   Fwd Pkt Len Mean   15000 non-null  float32
 8   Fwd Pkt Len Std    15000 non-null  float32
 9   Bwd Pkt Len Mean   15000 non-null  float32
 10  Bwd Pkt Len Std    15000 non-null  float32
 11  Flow Byts/s        15000 non-null  float32
 12  Flow Pkts/s        15000 non-null  float32
 13  Flow IAT Mean      15000 non-null  float32
 14  Flow IAT Std       15000 non-null  float32
 15  Fwd IAT Tot        15000 non-null  float32
 16  Fwd IAT Mean       15

Scale numeric values

In [161]:
from sklearn.model_selection import train_test_split


# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    numeric_data, labels, test_size=0.33, random_state=42)


In [162]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest (bootstrap samples and feature sub-sampling) is
# reproducible between runs, matching the seed used for the data split.
model = RandomForestClassifier(criterion='entropy', random_state=42)

In [None]:
# Display the target Series.
labels

In [163]:
# Fit on the training split only. Fitting on the full data set would let the
# model see the held-out rows, making the test accuracy meaningless.
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy')

In [164]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Predict class labels for both splits so their accuracy can be compared.
X_train_preds_acc = model.predict(X_train)
X_test_preds_acc = model.predict(X_test)

# NOTE(review): the AUC lines below are disabled and refer to X_train_preds /
# X_test_preds, names this cell does not define.
# print('Train auc:', roc_auc_score(y_train, X_train_preds))
# print('Test auc:', roc_auc_score(y_test, X_test_preds))
print('\n')
print('Train accuracy: ', accuracy_score(y_train, X_train_preds_acc))
print('Test accuracy: ', accuracy_score(y_test, X_test_preds_acc))



Train accuracy:  1.0
Test accuracy:  1.0


In [166]:
# Display the training targets.
y_train

274795    0
181481    1
184289    1
183423    1
3069      1
         ..
286       1
182856    1
485       1
313513    0
2365      1
Name: Label, Length: 10050, dtype: int64

In [165]:
# Display the labels predicted for the training split.
X_train_preds_acc

array([0, 1, 1, ..., 1, 0, 1])