In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from numpy import arange
from sklearn.metrics import classification_report

In [2]:
# Benign traffic sample; a later cell uses this frame as the training set.
# NOTE(review): the path is absolute and machine-specific — the notebook only
# runs where this exact directory exists.
benign_train_csv = r'D:\Kuliah\ProyekAkhir\New Data\resample\Benign_resample.csv'
benign_df = pd.read_csv(benign_train_csv)
# Preview the first two rows to check the columns that were loaded.
benign_df.head(2)

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,15.854422,-0.34369,-0.086569,0.096351,0.142,0.002723,0.706619,1.982467,1.798333,-0.034669,...,-0.059807,-0.095769,-0.109896,-0.112787,-0.020891,-0.339324,-0.282883,-0.341153,-0.315336,1
1,-0.060352,5.598377,2.253048,4.681794,3.829148,0.125465,3.569438,-0.039001,2.481753,5.125594,...,-0.059807,1.103561,0.162506,0.515708,1.673959,6.18478,12.942534,8.511326,2.479016,1


In [3]:
# DDoS traffic sample; a later cell places these rows in the evaluation set.
# NOTE(review): the path is absolute and machine-specific.
ddos_csv = r'D:\Kuliah\ProyekAkhir\New Data\resample\DDoS_resample.csv'
ddos_df = pd.read_csv(ddos_csv)
# Bare expression so the notebook renders the frame as the cell output.
ddos_df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.045263,-0.015250,-0.005277,-0.005954,0.289320,-0.002643,0.322770,0.323287,0.323284,-0.016174,...,0.226640,-0.007737,-0.005256,-0.007436,-0.006370,-0.013105,-0.005207,-0.013185,-0.012838,-1
1,0.045263,-0.015250,-0.005277,-0.005954,0.309328,-0.002643,0.344531,0.345012,0.345025,-0.016174,...,0.226640,-0.007737,-0.005256,-0.007436,-0.006370,-0.013105,-0.005207,-0.013185,-0.012838,-1
2,0.045263,-0.015250,-0.005277,-0.005954,0.309328,-0.002643,0.344531,0.345012,0.345025,-0.016174,...,0.226640,-0.007737,-0.005256,-0.007436,-0.006370,-0.013105,-0.005207,-0.013185,-0.012838,-1
3,0.045263,-0.015249,-0.005277,-0.005954,0.309328,-0.002643,0.344531,0.345012,0.345025,-0.016174,...,0.226640,-0.007737,-0.005256,-0.007436,-0.006370,-0.013105,-0.005207,-0.013185,-0.012838,-1
4,0.045263,-0.015250,-0.005277,-0.005954,0.289320,-0.002643,0.322770,0.323287,0.323284,-0.016174,...,0.226646,-0.007737,-0.005256,-0.007436,-0.006370,-0.013105,-0.005207,-0.013185,-0.012838,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29029,2.209517,-0.327036,-0.309612,-0.339661,0.820881,-0.010439,2.527200,2.846555,2.732269,-0.198358,...,0.063244,-0.098072,-0.107270,-0.109441,-0.015768,-0.315335,-0.268473,-0.315664,-0.295924,-1
29030,2.209517,-0.327036,-0.309612,-0.339661,0.820881,-0.010439,2.527200,2.846555,2.732269,-0.198358,...,0.063244,-0.098072,-0.107270,-0.109441,-0.015768,-0.315335,-0.268473,-0.315664,-0.295924,-1
29031,2.209517,-0.327036,-0.309612,-0.339661,0.820881,-0.010439,2.527200,2.846555,2.732269,-0.198358,...,0.063244,-0.098072,-0.107270,-0.109441,-0.015768,-0.315335,-0.268473,-0.315664,-0.295924,-1
29032,2.209517,-0.327036,-0.309612,-0.339661,0.820881,-0.010439,2.527200,2.846555,2.732269,-0.198358,...,0.063244,-0.098072,-0.107270,-0.109441,-0.015768,-0.315335,-0.268473,-0.315664,-0.295924,-1


In [4]:
# Second benign sample, read from a separate file; a later cell concatenates
# it with the DDoS rows to form the evaluation set.
# NOTE(review): the path is absolute and machine-specific.
benign_test_csv = r'D:\Kuliah\ProyekAkhir\New Data\resample\Benign_resample_4000.csv'
benign_test_df = pd.read_csv(benign_test_csv)

In [5]:
# Novelty detection setup: the detector is fitted on benign rows only, while
# the evaluation set mixes the second benign sample with the DDoS rows.
df_test = pd.concat([benign_test_df, ddos_df], ignore_index=True)
df_train = benign_df

# Columns that must not be fed to the detector as features:
#   ' Label'     - the target (note the leading space in the column name).
#   'Unnamed: 0' - the name pandas gives to a header-less first CSV column.
#                  In the frames displayed above it holds a row counter
#                  (0, 1, 2, ...), not a traffic measurement. Its values are
#                  far larger than the other (scaled) features, so leaving it
#                  in lets it dominate every Euclidean distance.
# errors='ignore' keeps this cell working if the counter column is absent
# (for example when the CSVs are read with index_col=0).
NON_FEATURE_COLUMNS = [' Label', 'Unnamed: 0']

# training_data
X_train = df_train.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train = df_train[' Label']
# testing_data
X_test = df_test.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_test = df_test[' Label']

# The detector works on positional arrays, so both matrices must expose the
# same features in the same order; fail loudly here instead of silently
# comparing mismatched columns later.
assert list(X_train.columns) == list(X_test.columns), \
    "train and test feature columns differ"

In [6]:
from scipy.spatial.distance import cdist, euclidean, pdist
from sklearn.cluster import KMeans

class OutlierDetection:
    """K-means based novelty detector fitted on benign samples only.

    KMeans is fitted on the benign data. Every cluster gets a distance
    threshold derived from the Euclidean distances between its training
    members and its centroid (the maximum distance by default). At prediction
    time a sample is labelled ``1`` (inlier) when its distance to the nearest
    centroid does not exceed that cluster's threshold, and ``-1`` (outlier)
    otherwise.

    Parameters
    ----------
    benign : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Training data. Converted to a float ndarray.
    cluster_n : int
        Number of k-means clusters.
    random_state : int or None, default=0
        Seed forwarded to ``KMeans`` so repeated runs give the same clusters.
        Pass ``None`` to get unseeded initialisation.
    threshold_quantile : float in (0, 1], default=1.0
        Quantile of the member-to-centroid distances used as the per-cluster
        threshold. ``1.0`` reproduces the maximum-distance rule; lower values
        make the threshold less sensitive to extreme training rows.

    Raises
    ------
    ValueError
        If ``benign`` or ``cluster_n`` is missing, or ``threshold_quantile``
        is outside ``(0, 1]``.
    """

    def __init__(self, benign=None, cluster_n=None, random_state=0,
                 threshold_quantile=1.0):
        if benign is None:
            raise ValueError("benign training data must be provided")
        if cluster_n is None:
            raise ValueError("cluster_n (number of clusters) must be provided")
        if not 0.0 < threshold_quantile <= 1.0:
            raise ValueError("threshold_quantile must be in the interval (0, 1]")

        self.benign = self._as_matrix(benign)
        self.cluster_n = cluster_n
        self.random_state = random_state
        self.threshold_quantile = threshold_quantile
        self.kmeans = KMeans(
            n_clusters=self.cluster_n, random_state=self.random_state
        ).fit(self.benign)

    @staticmethod
    def _as_matrix(data):
        """Return ``data`` (DataFrame or array-like) as a contiguous float ndarray."""
        if isinstance(data, pd.DataFrame):
            data = data.values
        return np.ascontiguousarray(data, dtype=float)

    @staticmethod
    def _row_distances(points, centers):
        """Euclidean distance of each row of ``points`` to ``centers``.

        ``centers`` is either a single vector or one row per point.
        """
        return np.linalg.norm(points - centers, axis=1)

    def cluster(self):
        """Compute the per-cluster distance thresholds.

        Returns
        -------
        maxDistance : dict[int, float]
            Threshold per cluster label that has at least one training
            member (maximum member distance when ``threshold_quantile`` is 1).
        averaged : dict[int, numpy.ndarray]
            Centroid per cluster index.
        """
        labels = np.asarray(self.kmeans.labels_)
        centroids = np.asarray(self.kmeans.cluster_centers_, dtype=float)
        train = self._as_matrix(self.benign)
        quantile = self.threshold_quantile

        averaged = dict(enumerate(centroids))
        maxDistance = {}
        for label in np.unique(labels):
            distances = self._row_distances(train[labels == label], centroids[label])
            if quantile >= 1.0:
                # Exact maximum, without going through quantile interpolation.
                maxDistance[int(label)] = float(distances.max())
            else:
                maxDistance[int(label)] = float(np.quantile(distances, quantile))

        return maxDistance, averaged

    def predict(self, ddos=None):
        """Label every row of ``ddos`` as inlier (``1``) or outlier (``-1``).

        Parameters
        ----------
        ddos : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Samples to score.

        Returns
        -------
        list of int
            One label per input row; an empty list for empty input.

        Raises
        ------
        TypeError
            If ``ddos`` is ``None``.
        ValueError
            If ``ddos`` is not two-dimensional.
        """
        if ddos is None:
            raise TypeError("predict() needs the samples to score, got None")

        data = self._as_matrix(ddos)
        if data.size == 0:
            return []
        if data.ndim != 2:
            raise ValueError("expected a 2D array of shape (n_samples, n_features)")

        thresholds, _ = self.cluster()
        centroids = np.asarray(self.kmeans.cluster_centers_, dtype=float)

        # A centroid without training members has no benign radius; -inf makes
        # every sample assigned to it an outlier instead of raising KeyError.
        radius = np.full(len(centroids), -np.inf)
        for label, value in thresholds.items():
            radius[label] = value

        # One batched nearest-centroid lookup instead of one call per row.
        nearest = np.asarray(self.kmeans.predict(data))
        distances = self._row_distances(data, centroids[nearest])
        return np.where(distances <= radius[nearest], 1, -1).tolist()

In [7]:
# Sweep the number of k-means clusters (1 through 14) and print the test-set
# classification report for each setting.
for n_clusters in range(1, 15):
    clf = OutlierDetection(benign=X_train, cluster_n=n_clusters)
    y_pred_test = clf.predict(X_test)
    print(f'Cluster : {n_clusters}')
    print('---Classification Report---')
    # zero_division=0 reports 0.00 for a class that is never predicted without
    # emitting an undefined-metric warning on every iteration.
    print(classification_report(y_test, y_pred_test, zero_division=0))

Cluster : 1
---Classification Report---
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro avg       0.06      0.50      0.11     33037
weighted avg       0.01      0.12      0.03     33037



  _warn_prf(average, modifier, msg_start, len(result))


Cluster : 2
---Classification Report---
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro avg       0.06      0.50      0.11     33037
weighted avg       0.01      0.12      0.03     33037



  _warn_prf(average, modifier, msg_start, len(result))


Cluster : 3
---Classification Report---
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro avg       0.06      0.50      0.11     33037
weighted avg       0.01      0.12      0.03     33037



  _warn_prf(average, modifier, msg_start, len(result))


Cluster : 4
---Classification Report---
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro avg       0.06      0.50      0.11     33037
weighted avg       0.01      0.12      0.03     33037



  _warn_prf(average, modifier, msg_start, len(result))


Cluster : 5
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro avg       0.56      0.50      0.11     33037
weighted avg       0.89      0.12      0.03     33037

Cluster : 6
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro avg       0.56      0.50      0.11     33037
weighted avg       0.89      0.12      0.03     33037

Cluster : 7
---Classification Report---
              precision    recall  f1-score   support

          -1       1.00      0.00      0.00     29034
           1       0.12      1.00      0.22      4003

    accuracy                           0.12     33037
   macro