### Check input files



In [None]:
import numpy as np
import pandas as pd
import os

# List every file found anywhere under the 'datasets' directory tree.
for dirname, _, filenames in os.walk('datasets'):
    for filename in filenames:
        # Print each file's path, joined to the directory it was found in.
        print(os.path.join(dirname, filename))


### Import train data and create DataFrame object

In [None]:
def convert_dtype_to_int(x):
    """Parse a raw cell value into an ``int``, mapping unusable values to NaN.

    Used as a per-column ``converters`` callback for ``pd.read_csv``.

    Args:
        x: Raw value to convert (a string when called by ``read_csv``).

    Returns:
        ``int(x)`` when the conversion succeeds; ``np.nan`` when ``x`` is
        ``None``, a blank string, or cannot be interpreted as an integer.
    """
    # Only None and blank strings mean "missing". A plain truthiness test
    # would also discard a legitimate numeric 0.
    if x is None or (isinstance(x, str) and not x.strip()):
        return np.nan
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only, so that KeyboardInterrupt,
        # SystemExit and genuine programming errors still propagate.
        return np.nan


def convert_dtype_to_float(x):
    """Parse a raw cell value into a ``float``, mapping unusable values to NaN.

    Used as a per-column ``converters`` callback for ``pd.read_csv``.

    Args:
        x: Raw value to convert (a string when called by ``read_csv``).

    Returns:
        ``float(x)`` when the conversion succeeds; ``np.nan`` when ``x`` is
        ``None``, a blank string, or cannot be interpreted as a float.
    """
    # Only None and blank strings mean "missing". A plain truthiness test
    # would also discard a legitimate numeric 0.
    if x is None or (isinstance(x, str) and not x.strip()):
        return np.nan
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only, so that KeyboardInterrupt,
        # SystemExit and genuine programming errors still propagate.
        return np.nan


In [None]:
# Load the CSV, routing every listed column through a tolerant converter so
# that unparsable cells become NaN instead of aborting the import. Columns
# are grouped by target type; the resulting column -> converter mapping is
# the same one-to-one mapping as before.
rawsData = pd.read_csv(
    'datasets/Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv',
    converters={
        # Integer-valued columns.
        **dict.fromkeys(
            [
                'Dst Port',
                'Protocol',
                'Flow Duration',
                'Tot Fwd Pkts',
                'Tot Bwd Pkts',
                'TotLen Fwd Pkts',
                'TotLen Bwd Pkts',
                'Fwd Pkt Len Max',
                'Fwd Pkt Len Min',
                'Bwd Pkt Len Max',
                'Bwd Pkt Len Min',
                'Flow IAT Max',
                'Flow IAT Min',
                'Fwd IAT Tot',
                'Fwd IAT Max',
                'Fwd IAT Min',
                'Bwd IAT Tot',
                'Bwd IAT Max',
                'Bwd IAT Min',
                'Fwd PSH Flags',
                'Bwd PSH Flags',
                'Fwd URG Flags',
                'Bwd URG Flags',
                'Fwd Header Len',
                'Bwd Header Len',
                'Pkt Len Min',
                'Pkt Len Max',
                'FIN Flag Cnt',
                'SYN Flag Cnt',
                'RST Flag Cnt',
                'PSH Flag Cnt',
                'ACK Flag Cnt',
                'URG Flag Cnt',
                'CWE Flag Count',
                'ECE Flag Cnt',
                'Down/Up Ratio',
                'Fwd Byts/b Avg',
                'Fwd Pkts/b Avg',
                'Fwd Blk Rate Avg',
                'Bwd Byts/b Avg',
                'Bwd Pkts/b Avg',
                'Bwd Blk Rate Avg',
                'Subflow Fwd Pkts',
                'Subflow Fwd Byts',
                'Subflow Bwd Pkts',
                'Subflow Bwd Byts',
                'Init Fwd Win Byts',
                'Init Bwd Win Byts',
                'Fwd Act Data Pkts',
                'Fwd Seg Size Min',
                'Active Max',
                'Active Min',
                'Idle Max',
                'Idle Min',
            ],
            convert_dtype_to_int),
        # Float-valued columns.
        **dict.fromkeys(
            [
                'Fwd Pkt Len Mean',
                'Fwd Pkt Len Std',
                'Bwd Pkt Len Mean',
                'Bwd Pkt Len Std',
                'Flow Byts/s',
                'Flow Pkts/s',
                'Flow IAT Mean',
                'Flow IAT Std',
                'Fwd IAT Mean',
                'Fwd IAT Std',
                'Bwd IAT Mean',
                'Bwd IAT Std',
                'Fwd Pkts/s',
                'Bwd Pkts/s',
                'Pkt Len Mean',
                'Pkt Len Std',
                'Pkt Len Var',
                'Pkt Size Avg',
                'Fwd Seg Size Avg',
                'Bwd Seg Size Avg',
                'Active Mean',
                'Active Std',
                'Idle Mean',
                'Idle Std',
            ],
            convert_dtype_to_float),
    })

print("The files have been imported")


In [None]:
# Columns removed from the raw frame before any further processing.
columnsToDrop = [
    'Flow Pkts/s',
    'Timestamp',
    'Active Max',
    'Active Min',
    'Active Std',
    'Bwd IAT Max',
    'Bwd IAT Min',
    'Bwd IAT Std',
    'Bwd IAT Tot',
    'Bwd Pkt Len Max',
    'Bwd Pkt Len Min',
    'Bwd Pkt Len Std',
    'Flow IAT Max',
    'Flow IAT Min',
    'Flow IAT Std',
    'Fwd IAT Max',
    'Fwd IAT Min',
    'Fwd IAT Std',
    'Fwd IAT Tot',
    'Fwd Pkt Len Max',
    'Fwd Pkt Len Min',
    'Fwd Pkt Len Std',
    'Idle Max',
    'Idle Min',
    'Idle Std',
    'Pkt Len Max',
    'Pkt Len Min',
    'Pkt Len Std',
]

# Rebind rawsData to a frame without the columns listed above.
rawsData = rawsData.drop(columns=columnsToDrop)


In [None]:
# Display the distinct values present in the 'Label' column.
rawsData['Label'].unique()


In [None]:
# Name of the column that holds the class label.
LABEL = 'Label'
# Label value that is mapped to class 0 (non-attack) later in the notebook.
NORMAL = 'Benign'
# Label value of the attack class.
# NOTE(review): the spelling 'Infilteration' presumably mirrors the label
# text used in the dataset itself -- confirm before "correcting" it.
INFILTRATION = 'Infilteration'


In [None]:
def removeLabelFromLabel(dataToTransform):
    """Relabel rows whose label equals the label column's own name.

    Any value in the ``LABEL`` column that is equal to ``LABEL`` is
    replaced by ``'Benign'``; every other value is kept unchanged. The
    frame is modified in place and also returned.
    """
    def relabel(value):
        if value == LABEL:
            return 'Benign'
        return value

    dataToTransform[LABEL] = dataToTransform[LABEL].map(relabel)
    return dataToTransform


In [None]:
# Replace label values equal to the column name 'Label' with 'Benign'.
rawsData = removeLabelFromLabel(rawsData)


In [None]:
# Split the raw rows by class so each class can be inspected separately.
rawNormal = rawsData[rawsData[LABEL] == NORMAL]
rawInfiltration = rawsData[rawsData[LABEL] == INFILTRATION]


In [None]:
from operator import itemgetter

# Keys of the per-column summary records produced below.
FEATURE = 'Feature'
COUNT = 'Count'


def getNotDuplicatedFeatures(data):
    """Count the distinct values of every column of ``data``.

    Returns a list with one ``{FEATURE: column, COUNT: n}`` record per
    column, where ``n`` is the number of rows left after dropping duplicate
    values, ordered from the highest count to the lowest.
    """
    summary = [
        {FEATURE: column, COUNT: len(data[column].drop_duplicates())}
        for column in data.keys()
    ]
    # A stable descending sort keeps equal counts in their column order.
    summary.sort(key=itemgetter(COUNT), reverse=True)
    return summary


In [None]:
def specialPrint(data):
    """Print one line per record: feature name padded to 17 chars, then count."""
    rowTemplate = '{0:17}  {1}'
    for record in data:
        name = record[FEATURE]
        distinct = record[COUNT]
        print(rowTemplate.format(name, distinct))


In [None]:
# Per-column distinct-value counts, computed separately for each class.
countedRawNormal = getNotDuplicatedFeatures(rawNormal)
countedRawInfiltration = getNotDuplicatedFeatures(rawInfiltration)


In [None]:
# Print the distinct-value counts for the benign rows.
specialPrint(countedRawNormal)


In [None]:
# Print the distinct-value counts for the infiltration rows.
specialPrint(countedRawInfiltration)


In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any iplot() call below.
init_notebook_mode(connected=True)


def showAttackDistribution(data):
    """Plot a bar chart with the number of rows per class label.

    ``data`` is a Series of label values. One bar is drawn for ``NORMAL``
    and one for ``INFILTRATION``, each annotated with its count. Both
    labels must occur in ``data``, because the counts are looked up by
    label.
    """
    counted = data.value_counts()

    bars = [
        go.Bar(name=label,
               y=[counted[label]],
               x=[label],
               text=str(counted[label]),
               orientation='v',
               textposition='outside')
        for label in (NORMAL, INFILTRATION)
    ]
    fig = go.Figure(data=bars)

    # Fixed canvas size plus chart and axis titles.
    layout = dict(
        width=800,
        height=600,
        title='Labels Distribution',
        yaxis_title='Number of attacks',
        xaxis_title='Attack Name',
    )
    fig.update_layout(**layout)
    iplot(fig)


In [None]:
# Class distribution of the raw data, before de-duplication and sampling.
showAttackDistribution(rawsData['Label'])


In [None]:
# Columns kept for modelling: 25 flow features plus the 'Label' column.
selectedFeatures = [
    'Flow IAT Mean',
    'Fwd Pkts/s',
    'Flow Duration',
    'Flow Byts/s',
    'Fwd IAT Mean',
    'Bwd Pkts/s',
    'Bwd IAT Mean',
    'Idle Mean',
    'Active Mean',
    'Pkt Len Var',
    'Pkt Len Mean',
    'Pkt Size Avg',
    'Bwd Pkt Len Mean',
    'TotLen Bwd Pkts',
    'Bwd Seg Size Avg',
    'Subflow Bwd Byts',
    'Fwd Pkt Len Mean',
    'Dst Port',
    'Fwd Seg Size Avg',
    'TotLen Fwd Pkts',
    'Subflow Fwd Byts',
    'Tot Fwd Pkts',
    'Fwd Header Len',
    'Subflow Fwd Pkts',
    'Fwd Act Data Pkts',
    'Label'
]

# De-duplicate while keeping the declared order. list(set(...)) would give
# an arbitrary order that can differ between interpreter runs (string
# hashing is randomised), making the column order of every derived frame
# and of the exported CSV non-reproducible.
selectedFeatures = list(dict.fromkeys(selectedFeatures))


In [None]:
def getSliceFromRawData():
    """Return an independent copy of ``rawsData`` limited to ``selectedFeatures``."""
    selectedColumns = rawsData.loc[:, selectedFeatures]
    return selectedColumns.copy()


In [None]:
# Two independent copies of the selected columns: one is de-duplicated and
# sampled below, the other keeps every row for the full-dataset evaluation.
selectedData = getSliceFromRawData()
allDatasetToTest = getSliceFromRawData()


In [None]:
# Drop fully identical rows, then print the resulting frame's summary.
withoutDuplicates = selectedData.drop_duplicates().copy()
withoutDuplicates.info()


In [None]:
# Class distribution after removing duplicate rows.
showAttackDistribution(withoutDuplicates[LABEL])


In [None]:
def shortData(data, benignCount, infiltrationCount):
    """Draw a fixed-size random sample of each class and stack them.

    Args:
        data: Frame containing a ``LABEL`` column.
        benignCount: Number of ``NORMAL`` rows to sample.
        infiltrationCount: Number of ``INFILTRATION`` rows to sample.

    Returns:
        A new frame holding the sampled benign rows followed by the sampled
        infiltration rows, with a fresh 0..n-1 index.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a requested count
            exceeds the number of rows available for that class (sampling
            is done without replacement).
    """
    benignRows = data[data[LABEL] == NORMAL]
    infiltrationRows = data[data[LABEL] == INFILTRATION]

    # A fixed seed keeps the samples reproducible between runs.
    shortedNormal = benignRows.sample(
        benignCount, ignore_index=True, random_state=32)
    shortedInfiltration = infiltrationRows.sample(
        infiltrationCount, ignore_index=True, random_state=32)

    # Both samples are indexed from 0, so re-index the concatenation.
    # Otherwise every label below min(benignCount, infiltrationCount)
    # appears twice and label-based lookups return two rows.
    return pd.concat([shortedNormal, shortedInfiltration],
                     axis=0, ignore_index=True)


In [None]:
# Fixed-size samples of the de-duplicated rows: one used for model training
# and a second one that is exported to CSV at the end of the notebook.
# NOTE(review): the infiltration counts (35000 / 49760) presumably do not
# exceed the infiltration rows left after de-duplication -- confirm.
toTrainModel = shortData(withoutDuplicates, 100000, 35000).copy()
selectedToSaveDataset = shortData(
    withoutDuplicates, 100000, 49760).copy()
# Class distribution of the training sample.
showAttackDistribution(toTrainModel[LABEL])


In [None]:

def makeOnlyAttackOrNot(dataToTransform):
    """Turn the ``LABEL`` column into a binary attack indicator.

    Values equal to ``NORMAL`` become 0 and every other value becomes 1.
    The frame is modified in place and also returned.
    """
    def toBinary(labelValue):
        if labelValue == NORMAL:
            return 0
        return 1

    dataToTransform[LABEL] = dataToTransform[LABEL].map(toBinary)
    return dataToTransform


In [None]:
# Convert the label column of all three frames to 0 (benign) / 1 (other).
toTrainModel = makeOnlyAttackOrNot(toTrainModel)
selectedToSaveDataset = makeOnlyAttackOrNot(selectedToSaveDataset)
allDatasetToTest = makeOnlyAttackOrNot(allDatasetToTest)


In [None]:
def removeNanInf(data):
    """Drop every row of ``data`` that contains NaN or an infinite value.

    The frame is modified in place and the same object is returned.
    """
    infinities = [np.inf, -np.inf]
    # Turn +/-inf into NaN first so that one dropna() removes both kinds.
    data.replace(to_replace=infinities, value=np.nan, inplace=True)
    data.dropna(inplace=True)
    return data


In [None]:
# Remove rows containing NaN or infinite values from all three frames.
toTrainModel = removeNanInf(toTrainModel)
allDatasetToTest = removeNanInf(allDatasetToTest)
selectedToSaveDataset = removeNanInf(selectedToSaveDataset)


In [None]:
# from sklearn.preprocessing import StandardScaler


# def scale(dataToTransform, columntoTransform):

#     ss_dict = {col: StandardScaler() for col in columntoTransform}

#     for colKey in columntoTransform:
#         dataToTransform[colKey] = ss_dict[colKey].fit_transform(
#             np.array(dataToTransform[colKey]).reshape(-1, 1))

#     return dataToTransform


In [None]:
# numericColumntoTransform = withoutDuplicates.keys()
# withoutDuplicates = scale(withoutDuplicates, numericColumntoTransform)


### Heatmap
#### The heatmap shows how interdependent (correlated) the individual features are.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute pairwise correlation of the training columns (label included),
# drawn as an annotated heatmap with two-decimal cell values.
f, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(
    toTrainModel.corr().abs(),
    annot=True,
    linewidths=0.5,
    fmt='.2f',
    ax=ax,
)


Save label

In [None]:
# Keep the target column aside, then remove it from the feature frame.
labels = toTrainModel[LABEL]
toTrainModel = toTrainModel.drop([LABEL], axis=1)


In [None]:
def printNanFinite(data):
    """Print whether ``data`` holds any NaN and whether all values are finite."""
    checks = (
        ('Is any Nan:', np.any, np.isnan),
        ('Is all finite:', np.all, np.isfinite),
    )
    for caption, reducer, predicate in checks:
        print(caption, reducer(predicate(data)))


In [None]:
# Sanity check: neither frame should report NaN or non-finite values here.
printNanFinite(toTrainModel)
print('\n')
printNanFinite(allDatasetToTest)


Scale numeric values

#### After preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the sampled rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    toTrainModel, labels, test_size=0.33, random_state=32)


In [None]:
# Model selection switches read by model() and by the evaluation cells.
# model() checks isRFC first, then isMLPC, and otherwise builds the Keras
# network; isSequential only selects which metrics the evaluation prints.
# NOTE(review): presumably exactly one flag is meant to be True at a time,
# with isSequential set whenever the other two are False -- confirm.
isRFC = True
isMLPC = False
isSequential = False


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense


def model():
    """Build the unfitted estimator selected by the notebook flags.

    ``isRFC`` is checked first, then ``isMLPC``. When both are false, a
    Keras ``Sequential`` network is built and compiled. ``isSequential``
    is not consulted here.

    Returns:
        A ``RandomForestClassifier``, an ``MLPClassifier`` or a compiled
        Keras ``Sequential`` model.
    """
    if isRFC:
        # Seeded with 32, like every other stochastic step in the notebook,
        # so that repeated runs train the same forest.
        return RandomForestClassifier(
            n_estimators=100, max_depth=5, min_samples_leaf=4,
            random_state=32)

    if isMLPC:
        return MLPClassifier(random_state=32)

    # Fallback: two hidden layers and one sigmoid output unit. The input
    # width is taken from the training matrix.
    network = Sequential([
        Dense(256, activation='softplus', kernel_initializer='glorot_uniform',
              input_dim=X_train.shape[1]),
        Dense(128, activation='softplus',
              kernel_initializer='glorot_uniform'),
        Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid')
    ])
    network.compile(loss='binary_crossentropy',
                    optimizer='Adam')
    return network


In [None]:
# Build the estimator. This rebinds the name `model` from the factory
# function to the estimator instance, so the factory cannot be called again
# without re-running the cell that defines it.
model = model()


In [None]:
# Train the selected estimator on the training split.
model.fit(X_train, y_train)


In [None]:

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def confMatrix(yTrain, xTrainPreds, yTest, xTestPreds):
    """Show the train and test confusion matrices side by side.

    Args:
        yTrain: True labels of the training split.
        xTrainPreds: Predicted labels for the training split.
        yTest: True labels of the test split.
        xTestPreds: Predicted labels for the test split.
    """
    panels = (
        ("Train", yTrain, xTrainPreds),
        ("Test", yTest, xTestPreds),
    )
    # Compute both matrices before any figure is created.
    displays = [
        ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix(truth, predicted))
        for _, truth, predicted in panels
    ]

    _, axes = plt.subplots(1, 2, figsize=(10, 5), dpi=150)

    for axis, display, (title, _, _) in zip(axes, displays, panels):
        display.plot(ax=axis)
        axis.set_title(title)
    plt.show()


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score


def printScores(name, labels, predicts):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        name: Heading printed before the scores.
        labels: True labels.
        predicts: Predicted labels.
    """
    report = (
        ('Accuracy: %.3f ', accuracy_score),
        ('Precision: %.3f ', precision_score),
        ('Recall: %.3f', recall_score),
        ('F1 Score: %.3f', f1_score),
    )
    print(name)
    for template, scorer in report:
        print(template % scorer(labels, predicts))
    print('\n')


In [None]:
# Predict on both splits with the fitted model.
X_train_preds = model.predict(X_train)
X_test_preds = model.predict(X_test)

# With isSequential set, report ROC AUC; otherwise print the
# accuracy / precision / recall / F1 report for each split.
if isSequential:
    print('Train auc:', roc_auc_score(y_train, X_train_preds))
    print('Test auc:', roc_auc_score(y_test, X_test_preds))
else:
    printScores('Train:', y_train, X_train_preds)
    printScores('Test:', y_test, X_test_preds)


In [None]:
# Plot the confusion matrices of the train and test predictions.
confMatrix(y_train, X_train_preds, y_test, X_test_preds)


In [None]:
# Separate the target column from the features of the full dataset.
allDatasetLabels = allDatasetToTest[LABEL]
allDatasetToTest = allDatasetToTest.drop([LABEL], axis=1)


In [None]:
# Predict on every cleaned row of the full dataset. The rows sampled for
# training are drawn from the same raw data.
AllDatasetPreds = model.predict(allDatasetToTest)

# Same metric selection as for the train/test evaluation.
if isSequential:
    print('Auc:', roc_auc_score(allDatasetLabels, AllDatasetPreds))
else:
    printScores('All dataset:', allDatasetLabels, AllDatasetPreds)


In [None]:
# Confusion matrix of the full-dataset predictions, drawn on its own figure.
allDatasetConfusionMatrix = confusion_matrix(allDatasetLabels, AllDatasetPreds)
allDataSetMatrixDisplay = ConfusionMatrixDisplay(
    confusion_matrix=allDatasetConfusionMatrix)
_, ax = plt.subplots(figsize=(8, 4), dpi=130)
ax.set_title("All dataset")
allDataSetMatrixDisplay.plot(ax=ax)
plt.show()


In [None]:
# Export the larger cleaned sample (binary labels) to the working directory.
# The frame's index is written as the first, unnamed CSV column.
selectedToSaveDataset.to_csv('infiltration.csv')
