### Check input files



In [None]:
import numpy as np
import pandas as pd

import os
# Walk the dataset directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('../datasets'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)


### Import train data and create DataFrame object

In [None]:
# Load the dataset CSV into a DataFrame; every later cell works on numeric_data.
numeric_data = pd.read_csv('../datasets/firstDataset.csv')
print("The files have been imported")


### Find and display unique data
#### This function searches a DataFrame and displays the unique values of each feature. This makes it easy to spot features that share the same set of values, e.g. `OnlineSecurity`, which contains the answers 'Yes', 'No' and 'No internet service'.

In [None]:
def printUniqueValues(dataToCheck):
    """Print the distinct values of every column together with their count.

    One line is printed per column, formatted as
    ``<column> = <unique values> = <number of unique values>``.

    Args:
        dataToCheck: DataFrame whose columns are inspected.
    """
    for name in dataToCheck:
        distinct = dataToCheck[name].unique()
        print(f"{name} = {distinct} = {len(distinct)}")


# Print the unique values (and how many there are) for every column of the dataset.
printUniqueValues(numeric_data)

### DataFrame information
#### The `info()` method provides basic information about the features, such as the number of non-null values per feature (which shows how many values are NaN) and the data type of each feature (e.g. int or float). This is very important for later data processing because, e.g., features that contain values of type 'int' will be labeled differently than features that contain values of type 'float'.

In [None]:
# Print the DataFrame summary: column names, non-null counts and dtypes.
numeric_data.info()


### Preprocess training data.

In [None]:
# Show the DataFrame before any preprocessing.
# `display` is not imported in this file — presumably the IPython/Jupyter built-in.
display(numeric_data)

Drop unnecessary features

In [None]:
def dropUnnecessaryFeatures(dataToTransform):
    """Return a copy of ``dataToTransform`` without the unneeded columns.

    Removes ``Timestamp`` and every ``Min``/``Max`` column listed below in a
    single ``drop`` call, so the frame is copied once instead of once per
    column. The input frame is left untouched.

    Args:
        dataToTransform: DataFrame containing all of the listed columns.

    Returns:
        A new DataFrame with the listed columns removed.

    Raises:
        KeyError: if any of the listed columns is missing from the input.
    """
    unnecessaryFeatures = [
        'Timestamp',
        'Fwd Pkt Len Min', 'Fwd Pkt Len Max',
        'Bwd Pkt Len Min', 'Bwd Pkt Len Max',
        'Flow IAT Min', 'Flow IAT Max',
        'Fwd IAT Max', 'Fwd IAT Min',
        'Bwd IAT Max', 'Bwd IAT Min',
        'Pkt Len Min', 'Pkt Len Max',
        'Idle Max', 'Idle Min',
        'Active Max', 'Active Min',
    ]

    # NOTE(review): the original carried this list without explanation —
    # presumably further drop candidates; these columns are currently kept.
    # "Protocol","PSH Flag Cnt","Init Fwd Win Byts","Flow Byts/s","Flow Pkts/s"
    return dataToTransform.drop(columns=unnecessaryFeatures)


# Replace the working frame with the reduced feature set and show the result.
numeric_data = dropUnnecessaryFeatures(numeric_data)
display(numeric_data)


In [None]:

# Inspect the 'Init Bwd Win Byts' column (not scaled yet at this point).
print(numeric_data['Init Bwd Win Byts'])


In [None]:
from sklearn.preprocessing import StandardScaler


def scale(dataToTransform, columntoTransform):
    """Standardize the given columns of ``dataToTransform`` in place.

    Every listed column is fitted and transformed on its own with a fresh
    ``StandardScaler``. The scalers are not kept, so none is stored beyond
    the iteration that uses it.

    Args:
        dataToTransform: DataFrame to modify; the listed columns are
            overwritten with their scaled values.
        columntoTransform: iterable of column names to scale.

    Returns:
        The same ``dataToTransform`` object, for call chaining.
    """
    for colKey in columntoTransform:
        # StandardScaler expects a 2-D (n_samples, 1) input.
        columnValues = np.array(dataToTransform[colKey]).reshape(-1, 1)
        # fit_transform returns an (n_samples, 1) array; flatten it so the
        # column assignment receives a plain 1-D vector.
        dataToTransform[colKey] = StandardScaler().fit_transform(
            columnValues).ravel()

    return dataToTransform


In [None]:
# np.isnan / np.isfinite only accept numeric input, and the string 'Label'
# column is still part of numeric_data at this point, so the sanity checks are
# restricted to the numeric columns.
numericOnly = numeric_data.select_dtypes(include=[np.number])
# True if any numeric cell is NaN.
print(numericOnly.isna().to_numpy().any())
# True only if every numeric cell is finite (neither NaN nor +/-inf).
print(np.isfinite(numericOnly.to_numpy(dtype=float)).all())

In [None]:
# numericColumntoTransform = ['Flow Duration',
#                             'Init Fwd Win Byts', 'Init Bwd Win Byts']

# Split the target off before scaling: the string 'Label' column cannot be
# standardized and must not end up among the features.
labels = numeric_data['Label']
numeric_data = numeric_data.drop(['Label'], axis=1)

# Scale every remaining column.
numericColumntoTransform = numeric_data.keys()

numeric_data = scale(numeric_data, numericColumntoTransform)
# 'Label' was dropped from numeric_data above, so indexing it here would raise
# KeyError; show the separated label series instead.
display(labels)


In [None]:
# Inspect the 'Init Fwd Win Byts' column after the scaling step.
print(numeric_data['Init Fwd Win Byts'])


### Heatmap
#### The heatmap shows how strongly the individual features are interdependent. Based on the heatmap, we dropped features that are highly correlated with each other, because such data does not provide additional information and can increase the model load. For example, we drop `TotalCharges` because it has a high correlation with `MonthlyCharges`.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the absolute pairwise feature correlations.
f, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(numeric_data.corr().abs(), ax=ax, fmt='.2f',
            linewidths=0.5, annot=True)


#### Process labels

#### Before preprocess

In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
import plotly.express as px
init_notebook_mode(connected=True)


def showAttackDistribution(data):
    """Plot a bar chart with the number of samples per label.

    One bar is drawn per distinct value in ``data``. Each bar is named after
    the value it counts, so the chart stays correct whatever the frequency
    order of the classes is.

    Args:
        data: label column exposing ``value_counts()`` (e.g. a pandas
            Series).
    """
    # Count once. value_counts() orders classes by frequency, so bar names
    # must come from the index and not from a fixed position.
    labelCounts = data.value_counts()

    bars = [
        go.Bar(name=str(label),
               y=[int(count)],
               x=[str(label)],
               text=[int(count)],
               orientation='v',
               textposition='outside',)
        for label, count in labelCounts.items()
    ]
    fig = go.Figure(data=bars)

    fig.update_layout(
        width=800,
        height=600,
        title='Labels Distribution',
        yaxis_title='Number of attacks',
        xaxis_title='Attack Name',)
    iplot(fig)


In [None]:
# 'Label' was split off into `labels` before scaling and no longer exists in
# numeric_data, so plot the separated label series.
showAttackDistribution(labels)


In [None]:
def shortData(data, featureName,
              classes=("Benign", "FTP-BruteForce", "SSH-Bruteforce"),
              limit=5000):
    """Keep at most ``limit`` rows of each requested class.

    Rows are taken in their original order, class by class, and the per-class
    subsets are concatenated in the order given by ``classes``. Rows whose
    ``featureName`` value is not in ``classes`` are discarded.

    Args:
        data: DataFrame to sample from.
        featureName: name of the column holding the class value.
        classes: class values to keep, in output order. Defaults to the three
            classes this function originally hard-coded.
        limit: maximum number of rows kept per class; ``None`` keeps all.

    Returns:
        A new DataFrame with the selected rows; empty (same columns) when
        ``classes`` is empty.

    Raises:
        ValueError: if ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        # A negative slice bound would silently drop rows from the end.
        raise ValueError("limit must be non-negative or None")

    subsets = [
        data[data[featureName] == className].iloc[:limit]
        for className in classes
    ]
    if not subsets:
        # pd.concat rejects an empty list; return an empty frame instead.
        return data.iloc[0:0]
    return pd.concat(subsets, axis=0)


In [None]:
# Cap every attack class at its first 5000 rows (see shortData).
# NOTE(review): shortData filters on numeric_data['Label'], but 'Label' is
# dropped from numeric_data in the scaling cell earlier in this file — confirm
# the intended cell order or re-attach the labels before sampling.
numeric_data = shortData(numeric_data, 'Label')


In [None]:
from sklearn.preprocessing import LabelEncoder


def labelEncodeTypeAttack(dataToTransform):
    """Encode the given labels as integers using a freshly fitted LabelEncoder.

    Args:
        dataToTransform: sequence of labels to encode.

    Returns:
        The encoded labels produced by ``LabelEncoder.fit_transform``.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(dataToTransform)


In [None]:
# Replace the 'Label' column with its integer encoding, then count the rows
# per encoded class.
# NOTE(review): this reads numeric_data['Label'], but 'Label' is dropped from
# numeric_data in the scaling cell earlier in this file; the training cell
# below uses the separate `labels` series, which this encoding does not
# touch — confirm which of the two is intended.
numeric_data['Label'] = labelEncodeTypeAttack(numeric_data['Label'])
unique, counts = np.unique(numeric_data['Label'], return_counts=True)
# Bare expression: shown as the cell output when run in a notebook.
dict(zip(unique, counts))


In [None]:
# Write the processed frame to 'test.csv' in the working directory, without
# the index column.
numeric_data.to_csv('test.csv', index=False)
print("Ready")


In [None]:
from sklearn.model_selection import train_test_split



# Replace NaN with 0 and +/-inf with very large finite numbers. From here on
# numeric_data is rebound to whatever np.nan_to_num returns (an array, so
# column names are presumably lost — verify before indexing by name).
numeric_data = np.nan_to_num(numeric_data)

# NOTE(review): test_size=0.98 leaves only 2% of the rows for training —
# confirm this is intentional and not a swapped train/test fraction.
X_train, X_test, y_train, y_test = train_test_split(
    numeric_data, labels, test_size=0.98, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs give the same model; the train/test split
# is already seeded with the same value.
model = RandomForestClassifier(random_state=42)

# Fit on the training portion of the split.
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Predict on the data the model was trained on.
# NOTE(review): only training accuracy is reported; the test-set prediction
# and both AUC lines are commented out (and refer to X_train_preds /
# X_test_preds, which are not defined in this file), so this cell says
# nothing about generalization.
X_train_preds_acc = model.predict(X_train)
# X_test_preds_acc = model.predict(X_test)
    
# print('Train auc:', roc_auc_score(y_train, X_train_preds))
# print('Test auc:', roc_auc_score(y_test, X_test_preds))
print('\n')
print('Train accuracy: ', accuracy_score(y_train, X_train_preds_acc))
# print('Test accuracy: ', accuracy_score(y_test, X_test_preds_acc))