In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import LocalOutlierFactor
from numpy import arange
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

In [2]:
# Load the resampled benign CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — will only resolve on the
# author's workstation.
benign_df = pd.read_csv(
    filepath_or_buffer=r'D:\Kuliah\ProyekAkhir\New Data\resample\Benign_resample.csv',
)
# Preview just the first two rows so the cell output stays short.
benign_df.head(n=2)

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,15.854422,-0.34369,-0.086569,0.096351,0.142,0.002723,0.706619,1.982467,1.798333,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,1
1,-0.060352,5.598377,2.253048,4.681794,3.829148,0.125465,3.569438,-0.039001,2.481753,5.125594,...,-0.059807,1.103561,0.162506,0.515708,1.673959,6.18478,12.942534,8.511326,2.479016,1


In [3]:
# Load the resampled Syn CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — will only resolve on the
# author's workstation.
syn_df = pd.read_csv(
    filepath_or_buffer=r'D:\Kuliah\ProyekAkhir\New Data\resample\Syn_resample_4000.csv',
)
# Bare expression: the notebook renders the frame itself (pandas abbreviates
# long frames in the repr).
# NOTE(review): consider `.head()` here if only a preview is wanted.
syn_df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.060352,-0.344948,-0.086569,-0.154149,-0.037724,-0.006439,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
1,-0.060352,-0.344948,-0.086569,-0.154149,-0.037724,-0.006439,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
2,-0.060352,-0.344942,-0.086569,0.096351,-0.037724,-0.005474,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
3,-0.060352,-0.344942,-0.086569,0.096351,-0.037724,-0.005474,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
4,-0.060352,-0.344948,-0.086569,-0.154149,-0.037724,-0.006439,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3998,-0.060352,2.164873,0.629950,0.597352,0.085516,-0.003546,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,1.181746,1.541742,1.373369,-0.020880,2.396404,3.511758,2.911271,1.808577,-1
3999,-0.060352,-0.344948,-0.086569,-0.154149,-0.037724,-0.006439,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
4000,-0.060352,-0.344948,-0.086569,-0.154149,-0.037724,-0.006439,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1
4001,-0.060352,-0.344948,-0.086569,-0.154149,-0.037724,-0.006439,-0.046063,-0.039001,-0.049093,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,-1


In [4]:
# Load the second benign CSV (the "_4000" resample) into its own DataFrame,
# separate from benign_df.
# NOTE(review): absolute, machine-specific path.
benign_test_df = pd.read_csv(
    filepath_or_buffer=r'D:\Kuliah\ProyekAkhir\New Data\resample\Benign_resample_4000.csv',
)

In [5]:
# Novelty-detection setup.
# Training frame: benign_df only (this is an alias of the same object, not a copy).
df_train = benign_df
# Test frame: benign_test_df rows followed by syn_df rows, stacked vertically
# and renumbered with a fresh 0..n-1 index.
df_test = pd.concat(
    objs=[benign_test_df, syn_df],
    ignore_index=True,
)

In [6]:
# Restrict both frames to the same five feature columns plus ' Label'.
# The column names are spelled with a leading space and must match the
# DataFrame headers exactly.
# The source tuple (df_test, df_train) is built before either name is rebound,
# so each new frame is derived from its original.
df_test, df_train = [
    frame[[
        ' Flow IAT Mean',
        ' ACK Flag Count',
        ' URG Flag Count',
        ' Average Packet Size',
        ' Bwd Avg Packets/Bulk',
        ' Label',
    ]]
    for frame in (df_test, df_train)
]

In [7]:
# Split each frame into a feature matrix (every column except ' Label')
# and a target vector (the ' Label' column alone).

# training data
y_train = df_train[' Label']
X_train = df_train.drop(columns=[' Label'])

# testing data
y_test = df_test[' Label']
X_test = df_test.drop(columns=[' Label'])

In [8]:
# Novelty detection on the balanced test set (about 8000 samples):
# sweep n_neighbors and contamination, fit LocalOutlierFactor on X_train only,
# then print a per-class classification report on X_test for every combination.
for a in range(1, 5):  # a = n_neighbors: 1, 2, 3, 4
    # Literal grid instead of arange(0.1, 0.5, 0.1): a fractional arange step
    # yields values with float rounding error (the third value came out as
    # 0.30000000000000004), which then reached both the model parameter and
    # the printed label.
    for i in (0.1, 0.2, 0.3, 0.4):  # i = contamination
        clf = LocalOutlierFactor(n_neighbors=a, novelty=True, contamination=i)
        clf.fit(X_train)
        # predict() labels inliers as 1 and outliers as -1, the same encoding
        # used in the ' Label' column, so the two can be compared directly.
        y_pred_test = clf.predict(X_test)
        print('Neighbour : ' + str(a))
        print('contamination : ' + str(i))
        print('---Classification Report---')
        print(classification_report(y_test, y_pred_test))

Neighbour : 1
contamination : 0.1
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      4003
           1       1.00      1.00      1.00      4003

    accuracy                           1.00      8006
   macro avg       1.00      1.00      1.00      8006
weighted avg       1.00      1.00      1.00      8006

Neighbour : 1
contamination : 0.2
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      4003
           1       1.00      1.00      1.00      4003

    accuracy                           1.00      8006
   macro avg       1.00      1.00      1.00      8006
weighted avg       1.00      1.00      1.00      8006

Neighbour : 1
contamination : 0.30000000000000004
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      4003
           1       1.00      1.00  