In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, confusion_matrix, recall_score

# Fraction of all rows assigned to the training split; the remainder is the test split.
train_test_ratio = 0.039

# Seed NumPy's global generator. The previous `np.random.RandomState(seed=42)`
# only built a throw-away generator object, so the `np.random.rand` call below
# was never actually seeded and the split differed on every run.
np.random.seed(42)

# The CSV parts are expected in a `dataset` directory next to the current working directory.
data_path = os.path.abspath(os.path.join(os.getcwd(), '..',  'dataset'))

csv_names = (
    "UNSW-NB15_1_anonymized_new.csv",
    "UNSW-NB15_2_anonymized_new.csv",
    "UNSW-NB15_3_anonymized_new.csv",
    "UNSW-NB15_4_anonymized_new.csv",
)
frames = [pd.read_csv(os.path.join(data_path, csv_name)) for csv_name in csv_names]

df = pd.concat(frames, axis=0, ignore_index=True)

# Row-wise random split: each row lands in `train` with probability train_test_ratio.
mask = np.random.rand(len(df)) < train_test_ratio
train = df[mask]
test = df[~mask]

# The concatenated frame is no longer needed once both splits exist.
del df

train.head()

In [None]:
# Summary of the training split: its columns and their data types
train.info()

In [None]:
# Number of missing values in each column of the test split
missing_per_column = test.isna().sum()
missing_per_column

In [None]:
# Class balance of the training split: number of rows per value of 'Label'
label_counts = train['Label'].value_counts()
label_counts

In [None]:
# Correlation of every feature with the target column 'Label'.
# Only numeric/bool columns are kept before calling corr(): pandas >= 2.0 no
# longer drops non-numeric columns silently and raises on them instead, while
# older versions ignored them, so this selection gives the same result on both.
# NOTE(review): assumes 'Label' itself is numeric, as the original code did.
correlation = (
    train.select_dtypes(include=['number', 'bool'])
    .corr()['Label']
    .drop('Label')
)

# matplotlib renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# removed the old name; use whichever one this installation provides.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)

plt.figure(figsize=(16,8))
plt.bar(correlation.keys(), correlation.tolist())
plt.title('Korelacja z kolumną target', size=20)
plt.xticks(rotation=90)
# Red guide lines at +/-0.2, the same value as the feature-selection threshold.
plt.axhline(y=0.2, linewidth=2, color='r')
plt.axhline(y=-0.2, linewidth=2, color='r')
plt.show()

In [None]:
# Minimum absolute correlation with 'Label' for a feature to be kept
correlation_treshold = 0.2
corr_dict = correlation.to_dict()

# Keep the features whose |correlation| reaches the threshold, in their original order
is_strong = correlation.abs() >= correlation_treshold
column_names = correlation[is_strong].index.tolist()
column_names

In [None]:
# Standardise the selected features. The scaler is fitted on the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(train[column_names])

X_train = scaler.transform(train[column_names])
X_test = scaler.transform(test[column_names])

# Targets are taken straight from the 'Label' column of each split.
y_train = train['Label']
y_test = test['Label']

In [None]:
# Candidate models, keyed by the short name used in the results table and plots.
# Estimators whose training is randomised get a fixed random_state so that
# repeated runs of the notebook produce the same scores.
RANDOM_STATE = 42

classifiers = {
    'logreg': LogisticRegression(),
    'forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'gradboost': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'svc': SVC(),
    'mlp': MLPClassifier(random_state=RANDOM_STATE)
}

In [None]:
# Fit every candidate on the training split and score it on the test split.
results = []
conf_matrix = {}

# Metric functions, listed in the same order as the score columns of df_results
score_functions = (f1_score, precision_score, recall_score, accuracy_score)

for name, clf in classifiers.items():
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # One row per model: its name followed by the four scores
    results.append([name] + [score(y_test, y_pred) for score in score_functions])
    conf_matrix[name] = confusion_matrix(y_test, y_pred)

df_results = pd.DataFrame(results, columns=['model', 'f1', 'precision', 'recall', 'accuracy'])

In [None]:
# Display the per-model scores rounded to four decimal places
df_results.round(4)

In [None]:
# Turn confusion-matrix counts into fractions of all evaluated samples.
# Each matrix is divided by its own total instead of len(test): for raw counts
# the two are equal, but an already-normalised matrix sums to 1 and is left
# unchanged, so re-running this cell no longer shrinks the values again.
for key, item in conf_matrix.items():
    conf_matrix[key] = item / item.sum()

In [None]:
# One heatmap per model, side by side. The grid is sized from the number of
# confusion matrices instead of a hard-coded 5, so adding or removing a model
# in `classifiers` does not break the figure. squeeze=False keeps `axes` a 2-D
# array even when there is a single model.
n_models = len(conf_matrix)
fig, axes = plt.subplots(1, n_models, figsize=(3 * n_models, 6), sharey=True, sharex=True,
                         constrained_layout=True, squeeze=False)
axes = axes.flatten()

for ax, (model, cm) in zip(axes, conf_matrix.items()):
    # Colour scale runs from 0 to the matrix total so every panel is comparable.
    sns.heatmap(cm, ax=ax, annot=True, square=True, cbar=False,
                fmt=".2%", vmin=0, vmax=cm.sum(), annot_kws={'size': 13})

    ax.set_title(model, fontsize=16)
    ax.margins(0)
    ax.grid(False)
    ax.set_xlabel('predicted label')

# The y axis is shared, so only the left-most panel carries its label.
axes[0].set_ylabel('true label')