In [None]:
%pip install pandas
%pip install numpy
%pip install sklearn
%pip install matplotlib

In [None]:
import copy
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, OneClassSVM

In [None]:
# Fraction of rows that goes into the training split; the remainder is the test split.
train_test_ratio = 0.039

# Keep the seeded generator and draw the split mask from it. A bare
# `np.random.RandomState(seed=42)` expression builds a generator and throws it away,
# which leaves the global `np.random` state unseeded and the split non-reproducible.
rng = np.random.RandomState(seed=42)

data_path = os.path.abspath(os.path.join(os.getcwd(), '..',  'dataset'))

# Column names are read from the 'Name' column of the feature-description file.
names = pd.read_csv(os.path.join(data_path, 'NUSW-NB15_features_v2.csv'))['Name'].tolist()

# Load all four raw CSV parts, using `names` as the column header.
frames = [
    pd.read_csv(os.path.join(data_path, f"UNSW-NB15_{part}.csv"), names=names)
    for part in range(1, 5)
]

df = pd.concat(frames, axis=0, ignore_index=True)

# Each row lands in the training split with probability `train_test_ratio`.
mask = rng.rand(len(df)) < train_test_ratio
train = df[mask]
test = df[~mask]

# Clear memory
del df

train.head()

In [None]:
# Print a summary of the training split: column names, dtypes and non-null counts.
train.info()

In [None]:
# Number of missing values per column of the training split.
train.isna().sum()

In [None]:
# Class balance of the training split: number of rows per value of the 'Label' column.
train['Label'].value_counts()

In [None]:
# Correlation of every numeric column with the target column 'Label'.
# `numeric_only=True` skips non-numeric columns; without it pandas >= 2.0 raises
# as soon as the frame contains a column that cannot be converted to float.
# NOTE(review): requires pandas >= 1.5, where `corr` gained the `numeric_only` argument.
correlation = train.corr(numeric_only=True)['Label']

plt.bar(correlation.keys(), correlation.tolist())
# One bar per feature: rotate the tick labels so the feature names do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Minimum absolute correlation with 'Label' for a column to be kept as a feature.
correlation_treshold = 0.2
corr_dict = correlation.to_dict()

# Keep the columns whose |correlation| reaches the threshold ...
column_names = [col for col in corr_dict if abs(corr_dict[col]) >= correlation_treshold]
# ... and drop the target itself, which is perfectly correlated with itself.
column_names.remove('Label')
column_names

In [None]:
# Learn the scaling parameters from the training features only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(train[column_names])

X_train, y_train = scaler.transform(train[column_names]), train['Label']
X_test, y_test = scaler.transform(test[column_names]), test['Label']

In [None]:
# Models to compare, keyed by the short name used in the result table and plots.
# The estimators that draw random numbers during fitting get a fixed `random_state`
# so that repeated runs of the notebook produce the same metrics.
classifiers = {
    'logreg': LogisticRegression(),
    'forest': RandomForestClassifier(random_state=42),
    'gradboost': GradientBoostingClassifier(random_state=42),
    'svc': SVC(),
    'mlp': MLPClassifier(random_state=42)
}

In [None]:
# Fit every classifier on the training split and score it on the test split.
results = []
conf_matrix = {}

# Order matches the column order of `df_results` below.
scorers = (f1_score, precision_score, recall_score, accuracy_score)

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    results.append([name] + [scorer(y_test, y_pred) for scorer in scorers])
    conf_matrix[name] = confusion_matrix(y_test, y_pred)

df_results = pd.DataFrame(results, columns=['model', 'f1', 'precision', 'recall', 'accuracy'])

In [None]:
# Metric table rounded to 4 decimal places for reporting.
reg_data = df_results.round(4)

# Persist the rounded table as a pickle file in the current working directory.
# `pickle` is imported in the notebook's import cell rather than here.
with open('reg_data_results.pickle', 'wb') as f:
    pickle.dump(reg_data, f)

# Show the table as the cell output.
reg_data



In [None]:
# Turn raw counts into fractions of all evaluated samples. Each matrix is divided by
# its own total (which equals len(test) for raw counts), so re-running this cell
# leaves already-normalised matrices unchanged instead of dividing them a second time.
conf_matrix = {key: item / item.sum() for key, item in conf_matrix.items()}

In [None]:
# One heatmap per model, side by side. The number of panels follows the number of
# entries in `conf_matrix` instead of being fixed at 5; `squeeze=False` keeps `axes`
# an array even when there is a single model.
n_models = len(conf_matrix)
fig, axes = plt.subplots(1, n_models, figsize=(3 * n_models, 6), sharey=True, sharex=True,
                         constrained_layout=True, squeeze=False)
axes = axes.flatten()

for i, (model, cm) in enumerate(conf_matrix.items()):
    ax = axes[i]
    # The colour scale runs from 0 to the matrix total, so shades are comparable across panels.
    sns.heatmap(cm, ax=ax, annot=True, square=True, cbar=False,
                fmt=".2%", vmin=0, vmax=cm.sum(), annot_kws={'size': 13})

    ax.set_title(model, fontsize=16)
    ax.margins(0)
    ax.grid(False)

    # The y axis is shared, so only the left-most panel gets a y label.
    if i == 0:
        ax.set_ylabel('true label')
    ax.set_xlabel('predicted label')

**TODO**

- Describe the data and how we prepared it.
- Describe the algorithms.
- Describe the results for SL and OCSVM.
- Future work.
- Add a few sentences about the results compared with the anonymized data.

## Anonymized data

In [None]:
# Fraction of rows that goes into the training split; the remainder is the test split.
train_test_ratio = 0.039

# Keep the seeded generator and draw the split mask from it. A bare
# `np.random.RandomState(seed=42)` expression builds a generator and throws it away,
# which leaves the global `np.random` state unseeded and the split non-reproducible.
rng = np.random.RandomState(seed=42)

data_path = os.path.abspath(os.path.join(os.getcwd(), '..',  'dataset'))

# Column names of the anonymized CSV, which has no header row.
# NOTE(review): "ct_src_ ltm" contains a space; it is kept as is - confirm it is intentional.
cats = ["srcip","dstip","proto","state","dur","sbytes","dbytes","sttl","dttl","sloss","dloss","service","Sload","Dload","Spkts","Dpkts","swin","dwin","stcpb","dtcpb","smeansz","dmeansz","trans_depth","res_bdy_len","Sjit","Djit","Sintpkt","Dintpkt","tcprtt","synack","ackdat","is_sm_ips_ports","ct_state_ttl","ct_flw_http_mthd","is_ftp_login","ct_ftp_cmd","ct_srv_src","ct_srv_dst","ct_dst_ltm","ct_src_ ltm","ct_src_dport_ltm","ct_dst_sport_ltm","ct_dst_src_ltm","attack_cat","Label"]

df = pd.read_csv(os.path.join(data_path, "UNSW-NB15_1_anonymized_no_col_names.csv"), names=cats)

# Each row lands in the training split with probability `train_test_ratio`.
mask = rng.rand(len(df)) < train_test_ratio
train = df[mask]
test = df[~mask]

# Clear memory
del df

train.head()

In [None]:
# Correlation of every numeric column with the target column 'Label'.
# `numeric_only=True` skips non-numeric columns; without it pandas >= 2.0 raises
# as soon as the frame contains a column that cannot be converted to float.
# NOTE(review): requires pandas >= 1.5, where `corr` gained the `numeric_only` argument.
correlation = train.corr(numeric_only=True)['Label']

plt.bar(correlation.keys(), correlation.tolist())
# One bar per feature: rotate the tick labels so the feature names do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Minimum absolute correlation with 'Label' for a column to be kept as a feature.
correlation_treshold = 0.2
corr_dict = correlation.to_dict()

# Keep the columns whose |correlation| reaches the threshold ...
column_names = [col for col in corr_dict if abs(corr_dict[col]) >= correlation_treshold]
# ... and drop the target itself, which is perfectly correlated with itself.
column_names.remove('Label')
column_names

In [None]:
# Fresh, unfitted models for the anonymized data, keyed by the short name used in
# the result table and plots. The estimators that draw random numbers during fitting
# get a fixed `random_state` so that repeated runs produce the same metrics.
classifiers = {
    'logreg': LogisticRegression(),
    'forest': RandomForestClassifier(random_state=42),
    'gradboost': GradientBoostingClassifier(random_state=42),
    'svc': SVC(),
    'mlp': MLPClassifier(random_state=42)
}

In [None]:
# Learn the scaling parameters from the training features only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(train[column_names])

X_train, y_train = scaler.transform(train[column_names]), train['Label']
X_test, y_test = scaler.transform(test[column_names]), test['Label']

In [None]:
# Fit every classifier on the training split and score it on the test split.
results = []
conf_matrix = {}

# Order matches the column order of `df_results` below.
scorers = (f1_score, precision_score, recall_score, accuracy_score)

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    results.append([name] + [scorer(y_test, y_pred) for scorer in scorers])
    conf_matrix[name] = confusion_matrix(y_test, y_pred)

df_results = pd.DataFrame(results, columns=['model', 'f1', 'precision', 'recall', 'accuracy'])

In [None]:
# Display the per-model metrics, rounded to 4 decimal places.
df_results.round(4)




In [None]:
# Turn raw counts into fractions of all evaluated samples. Each matrix is divided by
# its own total (which equals len(test) for raw counts), so re-running this cell
# leaves already-normalised matrices unchanged instead of dividing them a second time.
conf_matrix = {key: item / item.sum() for key, item in conf_matrix.items()}

In [None]:
# One heatmap per model, side by side. The number of panels follows the number of
# entries in `conf_matrix` instead of being fixed at 5; `squeeze=False` keeps `axes`
# an array even when there is a single model.
n_models = len(conf_matrix)
fig, axes = plt.subplots(1, n_models, figsize=(3 * n_models, 6), sharey=True, sharex=True,
                         constrained_layout=True, squeeze=False)
axes = axes.flatten()

for i, (model, cm) in enumerate(conf_matrix.items()):
    ax = axes[i]
    # The colour scale runs from 0 to the matrix total, so shades are comparable across panels.
    sns.heatmap(cm, ax=ax, annot=True, square=True, cbar=False,
                fmt=".2%", vmin=0, vmax=cm.sum(), annot_kws={'size': 13})

    ax.set_title(model, fontsize=16)
    ax.margins(0)
    ax.grid(False)

    # The y axis is shared, so only the left-most panel gets a y label.
    if i == 0:
        ax.set_ylabel('true label')
    ax.set_xlabel('predicted label')

# Save the figure before showing it.
plt.savefig('confusion_matrix.png')
plt.show()

# One Class SVM

Select the features that are highly correlated with the label in order to reduce training time.

In [None]:
# Minimum absolute correlation with 'Label' for a column to be considered for the OCSVM.
correlation_treshold_for_OCSVM = 0.6
corr_dict = correlation.to_dict()

# Columns whose |correlation| reaches the threshold, without the target itself.
column_names_for_OCSVM = [
    col for col in corr_dict
    if abs(corr_dict[col]) >= correlation_treshold_for_OCSVM
]
column_names_for_OCSVM.remove('Label')

# NOTE(review): the correlation-based selection above is immediately replaced by this
# hand-picked list ("changed for testing"); the OCSVM cells use only these three columns.
column_names_for_OCSVM = ['dwin', 'stcpb', 'dtcpb']

## OCSVM trained on full set of data

The OCSVM model is trained on both attack and non-attack data. It should produce a model that labels attacks as outliers (assigns them -1).

In [None]:
# One-Class SVM fitted on the whole training split (attack and normal rows together).

# Feature subsets for both splits
x_train = train[column_names_for_OCSVM]
x_test = test[column_names_for_OCSVM]

# Fit on the training features, then label those same rows (+1 inlier, -1 outlier).
one_class_svm = OneClassSVM(gamma='auto')
one_class_svm.fit(x_train)
output = one_class_svm.predict(x_train)

In [None]:
# Convert OCSVM labels to the dataset's convention: outlier (-1) -> 1 (attack),
# everything else -> 0 (normal). A vectorised comparison replaces the deepcopy plus
# element-by-element Python loop; the result keeps the dtype of `output`.
new_output = (output == -1).astype(output.dtype)

In [None]:
# Ground-truth labels of the training split.
y_train_output = train['Label']

# `confusion_matrix` expects (y_true, y_pred). Passing the predictions first swaps the
# roles of the two arrays, which exchanges FP and FN in the unpacking below.
# `labels=[0, 1]` guarantees a 2x2 matrix even if one class is absent, so the
# four-way unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_train_output, new_output, labels=[0, 1]).ravel()
print(f"TP: {tp}")
print(f"TN: {tn}")
print(f"FP: {fp}")
print(f"FN: {fn} <- attacks not detected")

Model does not detect attacks.

In [None]:
# Label every row of the test split with the OCSVM that was fitted on the full training split.
# NOTE(review): `y_test_output` is not referenced again in the visible notebook - confirm it is still needed.
y_test_output = one_class_svm.predict(test[column_names_for_OCSVM])

In [None]:
# Train a second OCSVM on normal traffic only (rows with Label == 0).

is_normal = train['Label'] == 0
x_train_no_attacts = train.loc[is_normal, column_names_for_OCSVM]

one_class_svm_only_good_data = OneClassSVM(gamma='auto')
# `fit` returns the estimator itself, so this name refers to the fitted model.
output_only_good_data = one_class_svm_only_good_data.fit(x_train_no_attacts)

x_train_no_attacts


In [None]:
# Evaluate the normal-only OCSVM on the first 10000 rows of the test split.

ocsvm_test_sample = test[column_names_for_OCSVM].head(10000)
y_test_output_good_data = one_class_svm_only_good_data.predict(ocsvm_test_sample)

y_test_output_good_data

In [None]:
# Convert OCSVM labels to the dataset's convention: outlier (-1) -> 1 (attack),
# everything else -> 0 (normal). A vectorised comparison replaces the deepcopy plus
# element-by-element Python loop; the result keeps the dtype of the predictions.
new_y_test_output_good_data = (y_test_output_good_data == -1).astype(y_test_output_good_data.dtype)

# Number of evaluated rows, and how many of them were flagged as attacks.
print(len(new_y_test_output_good_data))
print(np.sum(new_y_test_output_good_data))


In [None]:
# Ground-truth labels for the same first 10000 test rows that were predicted.
y_label_good_data = test['Label'].head(10000)

# Arguments are (y_true, y_pred). `labels=[0, 1]` guarantees a 2x2 matrix even if the
# 10000-row slice or the predictions contain a single class, so the four-way
# unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_label_good_data, new_y_test_output_good_data, labels=[0, 1]).ravel()
print(f"TP: {tp}")
print(f"TN: {tn}")
print(f"FP: {fp}")
print(f"FN: {fn} <- attacks not detected")

### Results:
With the tested features, One-Class SVM does not detect attacks and is therefore not suitable for detecting network attacks.
Training also takes a long time when a larger number of features is used.
To build a better OCSVM model, we need to find the features for which the distance between attack and non-attack records is largest; that would improve the model's accuracy.
