In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import LocalOutlierFactor
from numpy import arange
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

In [2]:
# Benign-traffic resample; a later cell assigns this frame to df_train.
benign_csv = r'D:\Kuliah\ProyekAkhir\New Data\resample\Benign_resample.csv'
benign_df = pd.read_csv(benign_csv)
# Preview the first two rows as the cell output.
benign_df.head(2)

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,15.854422,-0.34369,-0.086569,0.096351,0.142,0.002723,0.706619,1.982467,1.798333,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,1
1,-0.060352,5.598377,2.253048,4.681794,3.829148,0.125465,3.569438,-0.039001,2.481753,5.125594,...,-0.059807,1.103561,0.162506,0.515708,1.673959,6.18478,12.942534,8.511326,2.479016,1


In [3]:
# NetBios resample; a later cell concatenates this frame into df_test.
netbios_csv = r'D:\Kuliah\ProyekAkhir\New Data\resample\NetBios_resample_4000.csv'
netbios_df = pd.read_csv(netbios_csv)
# Bare expression so the notebook renders the frame as the cell output.
netbios_df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.019544,-0.034902,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
1,0.019544,-0.034870,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
2,0.019544,-0.034902,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
3,0.019544,-0.034869,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
4,0.019544,-0.034869,-0.014359,-0.00594,0.292062,-0.003071,1.153700,1.486555,1.495316,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6644,0.019544,-0.034901,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
6645,0.019544,-0.034902,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
6646,0.019544,-0.034902,-0.014359,-0.00594,-0.010963,-0.003071,0.010276,0.021431,0.019534,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1
6647,0.019544,-0.034869,-0.014359,-0.00594,0.898111,-0.003071,3.440546,4.416803,4.446881,-0.009313,...,0.155863,-0.003031,-0.001923,-0.003225,-0.002593,-0.034464,-0.002979,-0.034487,-0.034441,-1


In [5]:
# Second benign resample file, loaded separately from benign_df; a later cell
# concatenates it with netbios_df to build the test set.
benign_test_csv = r'D:\Kuliah\ProyekAkhir\New Data\resample\Benign_resample_4000.csv'
benign_test_df = pd.read_csv(benign_test_csv)

In [6]:
# novelty detection
df_test = pd.concat([benign_test_df, netbios_df], ignore_index=True)
df_train = benign_df

In [7]:
# Restrict both frames to the same four feature columns plus the label.
# The column names are written with a leading space; keep them verbatim.
selected_columns = [
    ' Flow Packets/s',
    ' ACK Flag Count',
    ' Bwd Avg Packets/Bulk',
    ' Idle Std',
    ' Label',
]
df_test = df_test[selected_columns]
df_train = df_train[selected_columns]

In [8]:
# training_data
X_train = df_train.drop([' Label'], axis=1)
y_train = df_train[' Label']
# testing_data
X_test = df_test.drop([' Label'], axis=1)
y_test = df_test[' Label']

In [9]:
# Novelty detection with 8000 sample balanced
for a in range(1, 10):
    for i in arange(0.1, 0.5, 0.1):
        clf = LocalOutlierFactor(n_neighbors=a, novelty=True, contamination=i)
        clf.fit(X_train)
        y_pred_test = clf.predict(X_test)
        print('Neighbour : ' + str(a))
        print('contamination : ' + str(i))
        print('---Classification Report---')
        print(classification_report(y_test, y_pred_test))

Neighbour : 1
contamination : 0.1
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6649
           1       1.00      1.00      1.00      4003

    accuracy                           1.00     10652
   macro avg       1.00      1.00      1.00     10652
weighted avg       1.00      1.00      1.00     10652

Neighbour : 1
contamination : 0.2
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6649
           1       1.00      1.00      1.00      4003

    accuracy                           1.00     10652
   macro avg       1.00      1.00      1.00     10652
weighted avg       1.00      1.00      1.00     10652

Neighbour : 1
contamination : 0.30000000000000004
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6649
           1       1.00      1.00  

Neighbour : 6
contamination : 0.2
---Classification Report---
              precision    recall  f1-score   support

          -1       0.93      1.00      0.96      6649
           1       1.00      0.87      0.93      4003

    accuracy                           0.95     10652
   macro avg       0.96      0.93      0.95     10652
weighted avg       0.95      0.95      0.95     10652

Neighbour : 6
contamination : 0.30000000000000004
---Classification Report---
              precision    recall  f1-score   support

          -1       0.89      1.00      0.94      6649
           1       1.00      0.80      0.89      4003

    accuracy                           0.92     10652
   macro avg       0.95      0.90      0.91     10652
weighted avg       0.93      0.92      0.92     10652

Neighbour : 6
contamination : 0.4
---Classification Report---
              precision    recall  f1-score   support

          -1       0.86      1.00      0.93      6649
           1       1.00      0.73  