### Check input files



In [375]:
import numpy as np
import pandas as pd

import os
# Lazily build the path of every file found below the datasets directory,
# then print the paths one per line.
dataset_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('../datasets')
    for name in names
)
for path in dataset_paths:
    print(path)


../datasets/sample.csv
../datasets/firstDataset.csv


### Import train data and create DataFrame object

In [376]:
# Load the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
numeric_data = pd.read_csv('../datasets/firstDataset.csv')
print("The files have been imported")


The files have been imported


### Find and display unique data
#### This function walks through a DataFrame and prints the unique values of each feature together with their count. This makes it easy to spot low-cardinality features, e.g. `Protocol`, which only takes the values 0, 6 and 17.

In [377]:
def printUniqueValues(dataToCheck):
    """Print the distinct values of every column, followed by their count.

    One line is written per column, in the form
    ``<column> = <unique values> = <number of unique values>``.

    Parameters
    ----------
    dataToCheck : pandas.DataFrame
        Frame whose columns are inspected one after another.
    """
    for name in dataToCheck:
        distinct = dataToCheck[name].unique()
        print(f"{name} = {distinct} = {len(distinct)}")


# Print the unique values and their count for every column of the loaded frame.
printUniqueValues(numeric_data)


Dst Port = [    0    22    80 ... 46898  2041  2178] = 18567
Protocol = [ 0  6 17] = 3
Timestamp = ['14/02/2018 08:31:01' '14/02/2018 08:33:50' '14/02/2018 08:36:39' ...
 '14/02/2018 01:51:56' '14/02/2018 01:49:38' '14/02/2018 01:47:38'] = 32043
Flow Duration = [112641719 112641466 112638623 ...   5095331   5235511   5807256] = 389493
Tot Fwd Pkts = [   3   15   14   16    5    1   91   11    6    4   41   23   10  975
 1038  125 1041  555   25   22   21   26   24   20   27   19   28    2
   52   17   12   13    9    7    8   51  533 4909   54   18   29   31
   39   97   35   58   32   79   55  120   43   36  126  129  130   99
  178  205   50   74   33   77   47  226   71   44   87   30   40  162
   42  105   60   49  112   37   96  123   34   48  151  310  153   53
   70  106   85  636   90   59   57   76  496  149   69   46   72   68
   45  133  155  134  390   38   67  180   73   86   83  282  234  107
  633  667   56  425  230   93   82 4352  597  341  271   62  115  405
   61  11

### DataFrame information
#### The `info()` method gives us basic information about the features, such as the number of non-null values in each feature (i.e. how many values are NaN) and the data type (e.g. int or float). This is very important for later data processing because, e.g., features that contain values of type 'int' will be labeled differently from features that contain values of type 'float'.

In [378]:
# Summarise the frame: column names, non-null counts and dtypes.
numeric_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

### Preprocess training data.

Drop unnecessary features

In [379]:
def dropUnnecessaryFeatures(dataToTransform, featuresToDrop=None):
    """Return a new frame without the features that are not used further on.

    By default ``Timestamp`` and the Min/Max columns listed below are removed.
    All columns are dropped in a single call, so only one new frame is built
    instead of one intermediate copy per column.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    featuresToDrop : iterable of str, optional
        Column names to remove. When omitted, the default list below is used.

    Returns
    -------
    pandas.DataFrame
        New frame without the requested columns.

    Raises
    ------
    KeyError
        If any of the requested columns is missing from the frame.
    """
    if featuresToDrop is None:
        featuresToDrop = [
            'Timestamp',
            'Fwd Pkt Len Min', 'Fwd Pkt Len Max',
            'Bwd Pkt Len Min', 'Bwd Pkt Len Max',
            'Flow IAT Min', 'Flow IAT Max',
            'Fwd IAT Max', 'Fwd IAT Min',
            'Bwd IAT Max', 'Bwd IAT Min',
            'Pkt Len Min', 'Pkt Len Max',
            'Idle Max', 'Idle Min',
            'Active Max', 'Active Min',
        ]

    # Note kept from the original author (these columns are NOT dropped):
    # "Protocol","PSH Flag Cnt","Init Fwd Win Byts","Flow Byts/s","Flow Pkts/s"
    return dataToTransform.drop(columns=list(featuresToDrop))


# Replace the frame with the reduced version.
# NOTE: the dropped columns no longer exist afterwards, so running this cell a
# second time on the already reduced frame raises KeyError.
numeric_data = dropUnnecessaryFeatures(numeric_data)


### Heatmap
#### The heatmap shows how strongly the individual features are interdependent (their pairwise correlation).

In [380]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# f, ax = plt.subplots(figsize=(15, 8))
# sns.heatmap(abs(numeric_data.corr()), annot=True,
#             linewidths=0.5, fmt='.2f', ax=ax)


#### Process labels

#### Before preprocess

In [381]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly as py
import plotly.express as px
# Set up plotly's offline notebook mode before any figure is shown with iplot.
init_notebook_mode(connected=True)


def showAttackDistribution(data, labelNames=('Benign', 'FTP-BruteForce', 'SSH-Bruteforce')):
    """Show a bar chart with the number of rows for each traffic label.

    Every bar is looked up by its label name, so a bar always displays the
    count of the label it is named after, regardless of the order in which
    ``value_counts()`` returns the classes. A label that does not occur in
    ``data`` is drawn with a count of 0.

    Parameters
    ----------
    data : pandas.Series
        Label column to summarise.
    labelNames : sequence of str, optional
        Labels to draw, in left-to-right order. Defaults to the three labels
        used in this notebook.
    """
    # Count once and reuse the result for every bar.
    counts = data.value_counts()

    bars = []
    for label in labelNames:
        count = int(counts.get(label, 0))
        bars.append(
            go.Bar(name=label,
                   y=[count],
                   x=[label],
                   text=[count],
                   orientation='v',
                   textposition='outside',))

    fig = go.Figure(data=bars)
    fig.update_layout(
        width=800,
        height=600,
        title='Labels Distribution',
        yaxis_title='Number of attacks',
        xaxis_title='Attack Name',)
    iplot(fig)


In [382]:
# Plot the label distribution before any rows are removed.
showAttackDistribution(numeric_data['Label'])


In [383]:
def shortData(data, featureName, limits=None):
    """Keep at most a fixed number of leading rows for each label.

    For every label the first ``n`` matching rows are taken in their existing
    order (no shuffling). Rows whose label is not listed are discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shorten; it is not modified.
    featureName : str
        Name of the column that holds the labels.
    limits : mapping of label to int, optional
        Maximum number of rows to keep per label. Defaults to 10000 rows of
        "Benign", 5000 of "FTP-BruteForce" and 5000 of "SSH-Bruteforce".

    Returns
    -------
    pandas.DataFrame
        The selected rows, grouped label by label in the order of ``limits``;
        the original index values are kept.
    """
    if limits is None:
        limits = {"Benign": 10000, "FTP-BruteForce": 5000, "SSH-Bruteforce": 5000}

    labelColumn = data[featureName]
    parts = [
        data[labelColumn == label].iloc[:maxRows]
        for label, maxRows in limits.items()
    ]
    if not parts:
        # pd.concat rejects an empty list; return an empty frame with the
        # same columns instead.
        return data.iloc[0:0]
    return pd.concat(parts, axis=0)


In [384]:
# Shorten the frame to the per-label row limits, then plot the label
# distribution again.
numeric_data = shortData(numeric_data, 'Label')
showAttackDistribution(numeric_data['Label'])


In [385]:

def makeOnlyAttackOrNot(dataToTransform):
    """Encode the ``Label`` column as binary: 0 for benign rows, 1 otherwise.

    The input frame is left untouched and a converted copy is returned.
    A value that is already 0 stays 0, so applying the function to its own
    output (for example when a notebook cell is run twice) does not turn
    benign rows into attacks.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame containing a ``Label`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the frame whose ``Label`` column holds 0 or 1.
    """
    def toBinary(value):
        # 'Benign' is the raw benign label; 0 is its already encoded form.
        return 0 if value == 'Benign' or value == 0 else 1

    converted = dataToTransform.copy()
    columnToTransform = ['Label']
    for colKey in columnToTransform:
        converted[colKey] = converted[colKey].map(toBinary)

    return converted


In [386]:
# Collapse the labels into a binary target (0 = benign, 1 = attack).
numeric_data = makeOnlyAttackOrNot(numeric_data)


Save label

In [387]:
# Replace missing and infinite values with 0. Both +inf and -inf are covered:
# they are first turned into NaN and then every NaN is filled with 0.
numeric_data = numeric_data.replace([np.inf, -np.inf], np.nan).fillna(0)


In [388]:
# Set the target column aside, then continue with a frame that holds only
# the features.
labels = numeric_data.loc[:, 'Label']
numeric_data = numeric_data.drop(columns='Label')


In [389]:
# Sanity check: report whether any NaN is left and whether every value is finite.
nanMask = np.isnan(numeric_data)
print('Is any Nan:', np.any(nanMask))
finiteMask = np.isfinite(numeric_data)
print('Is all finite:', np.all(finiteMask))


Is any Nan: False
Is all finite: True


In [390]:
from sklearn.preprocessing import StandardScaler


def scale(dataToTransform, columntoTransform):
    """Standardise the given columns, each with its own ``StandardScaler``.

    The input frame is not modified; a scaled copy is returned. Columns that
    are not listed are copied over unchanged.

    Parameters
    ----------
    dataToTransform : pandas.DataFrame
        Frame holding the columns to scale.
    columntoTransform : iterable of str
        Names of the columns to standardise.

    Returns
    -------
    pandas.DataFrame
        Copy of the frame with the listed columns replaced by their scaled
        values.
    """
    scaled = dataToTransform.copy()

    for colKey in columntoTransform:
        # The scaler works on 2-D input, hence the (n_rows, 1) reshape.
        column = np.array(dataToTransform[colKey]).reshape(-1, 1)
        # ravel() flattens the (n_rows, 1) result back into a plain column.
        scaled[colKey] = StandardScaler().fit_transform(column).ravel()

    return scaled


Scale numeric values

In [391]:
# Standardise every remaining column of the feature frame.
# NOTE(review): the scalers are fitted on the whole data set before the
# train/test split further down, so test rows influence the scaling
# statistics -- consider fitting on the training part only.
numericColumntoTransform = numeric_data.keys()
numeric_data = scale(numeric_data, numericColumntoTransform)

In [392]:
# def lowerTypes(dataToTransform, columnToFloat):
#     for colKey in columnToFloat:
#         dataToTransform[colKey] = dataToTransform[colKey].astype('float32')
#     return dataToTransform

In [393]:
# numeric_data = lowerTypes(numeric_data, numericColumntoTransform)

#### After preprocess

In [395]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state is fixed so the split
# can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    numeric_data, labels, test_size=0.33, random_state=32)


In [401]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

def model(n_estimators=100, random_state=32):
    """Build the (unfitted) classifier used in this notebook.

    Parameters
    ----------
    n_estimators : int, optional
        Number of trees in the forest. Defaults to 100.
    random_state : int, optional
        Seed passed to the forest so that repeated runs build the same
        model. Defaults to 32, the seed also used for the train/test split.

    Returns
    -------
    RandomForestClassifier
        A new, unfitted random forest.
    """
    # Alternative left by the author: MLPClassifier(random_state=32)
    classifier = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state)
    return classifier

In [402]:
# NOTE(review): this rebinds the name `model` from the factory function to the
# classifier instance, so running this cell a second time (without re-running
# the cell that defines `model()`) tries to call the classifier and fails.
model = model()

In [403]:
# Fit the classifier on the training split.
model.fit(X_train, y_train)

RandomForestClassifier()

In [404]:
from sklearn.metrics import  accuracy_score

# Predict on both splits and print the accuracy of each.
# NOTE(review): the recorded run reports 1.0 on both splits -- worth checking
# whether some feature gives the label away (possibly `Dst Port`); TODO confirm.
X_train_preds_acc = model.predict(X_train)
X_test_preds_acc = model.predict(X_test)

print('Train accuracy: ', accuracy_score(y_train, X_train_preds_acc))
print('Test accuracy: ', accuracy_score(y_test, X_test_preds_acc))


Train accuracy:  1.0
Test accuracy:  1.0


In [400]:
# Write the feature frame to test.csv without the index column.
numeric_data.to_csv('test.csv', index=False)
print("Ready")


Ready
