In [1]:
# Library
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import sklearn.metrics as metrics
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
import time
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input Dataset 1
dataset1 = pd.read_csv("C:\\Data Raihan\Perkuliahan Semester 8\\SKC\\Dataset\\Cyber Security Attacks\\cybersecurity_attacks.csv")

In [3]:
# Menampilkan nama Kolom yang ada pada dataset 1
dataset1.columns

Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
       'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
       'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
       'Action Taken', 'Severity Level', 'User Information',
       'Device Information', 'Network Segment', 'Geo-location Data',
       'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source'],
      dtype='object')

In [4]:
# Menampilkan fitur yang ada pada dataset 1
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      20000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null

In [5]:
#Feature Selection
X = dataset1.drop(['Payload Data','Alerts/Warnings','Proxy Information', 'Malware Indicators', 'Firewall Logs', 'IDS/IPS Alerts'], axis=1).select_dtypes(include=['float64', 'int64']).values
# Target variable
y = dataset1['Malware Indicators'].values

In [6]:
# Mengisi value yang null dengan mean dari feature tersebut
dataset1['Malware Indicators'].fillna('', inplace=True)

In [7]:
#Data Fitting and choosing the important variables
extratrees = ek.ExtraTreesClassifier().fit(X,y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]

In [8]:
# Ubah label menjadi format numerik
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [9]:
#splitting the data (70% - training and 30% - testing)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.29, stratify = y)

In [10]:
features = []
index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]

In [11]:
#All the required features
for f in range(nbfeatures):
    print("%d. feature %s (%f)" % (f + 1, dataset1.columns[2+index[f]], extratrees.feature_importances_[index[f]]))
    features.append(dataset1.columns[2+f])

1. feature Destination IP Address (0.250872)
2. feature Source Port (0.250438)
3. feature Protocol (0.250289)


In [12]:
estimators = [("Naive Bayes", GaussianNB()),
              ("DecisionTree", DecisionTreeClassifier(max_depth=10)),
              ("RandomForest", ek.RandomForestClassifier(n_estimators=50))
]

In [13]:
# Define the Deep Neural Network (DNN) model
model_dnn = Sequential()
model_dnn.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model_dnn.add(Dense(64, activation='relu'))
model_dnn.add(Dense(1, activation='sigmoid'))

In [14]:
# Assuming X_train and X_test are your input data arrays
X_train_reshaped = np.expand_dims(X_train, axis=-1)
X_test_reshaped = np.expand_dims(X_test, axis=-1)

# Define the Convolutional Neural Network (CNN) model
model_cnn = Sequential()
model_cnn.add(Flatten(input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))

In [15]:
# Testing which Classifier will give better result
model = { 
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
    "Stochastic Gradient Descent": SGDClassifier(loss="hinge", penalty="l2", max_iter=10000),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Stack Ensamble": StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()),
    "Deep Neural Network": model_dnn,
    "Convolutional Neural Network": model_cnn
}

In [16]:
results = {}
for algo in model:
    if algo == "Deep Neural Network" or algo == "Convolutional Neural Network":
        clf = model[algo]
        # Waktu awal training
        start_time = time.time()
        clf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        clf.fit(X_train, y_train , epochs=20)
        # Waktu akhir training
        end_time = time.time()

        # Evaluate the model
        _, score_train = clf.evaluate(X_train, y_train, verbose=0)
        _, score_test = clf.evaluate(X_test, y_test, verbose=0)

        # Waktu awal testing
        start_time_test = time.time()

        # Waktu akhir testing
        end_time_test = time.time()

        print("%s : Training Score: %s, Testing Score: %s" % (algo, score_train, score_test))

        # Menghitung total waktu training
        training_time = end_time - start_time
        print("Total Training Time:", training_time, "seconds")

        # Menghitung total waktu testing
        testing_time = end_time_test - start_time_test
        print("Total Testing Time:", testing_time, "seconds")

        results[algo] = (score_train, score_test, training_time, testing_time)
    else:
        clf = model[algo]
        # Waktu awal training
        start_time = time.time()
        clf.fit(X_train, y_train)
        # Waktu akhir training
        end_time = time.time()

        score_train = clf.score(X_train, y_train)

        # Waktu awal testing
        start_time_test = time.time()

        score_test = clf.score(X_test, y_test)

        # Waktu akhir testing
        end_time_test = time.time()

        print("%s : Training Score: %s, Testing Score: %s" % (algo, score_train, score_test))

        # Menghitung total waktu training
        training_time = end_time - start_time
        print("Total Training Time:", training_time, "seconds")

        # Menghitung total waktu testing
        testing_time = end_time_test - start_time_test
        print("Total Testing Time:", testing_time, "seconds")

        results[algo] = (score_train, score_test, training_time, testing_time)

DecisionTree : Training Score: 0.5363732394366197, Testing Score: 0.4921551724137931
Total Training Time: 0.12833356857299805 seconds
Total Testing Time: 0.0010004043579101562 seconds
RandomForest : Training Score: 1.0, Testing Score: 0.5013793103448276
Total Training Time: 5.223153352737427 seconds
Total Testing Time: 0.12387251853942871 seconds
Logistic Regression : Training Score: 0.5065140845070423, Testing Score: 0.4912068965517241
Total Training Time: 0.025673389434814453 seconds
Total Testing Time: 0.0010004043579101562 seconds
Naive Bayes : Training Score: 0.5089084507042253, Testing Score: 0.4961206896551724
Total Training Time: 0.0035390853881835938 seconds
Total Testing Time: 0.0015873908996582031 seconds
MLP : Training Score: 0.5, Testing Score: 0.5
Total Training Time: 0.8203692436218262 seconds
Total Testing Time: 0.002259492874145508 seconds
Stochastic Gradient Descent : Training Score: 0.5011267605633802, Testing Score: 0.5017241379310344
Total Training Time: 0.36414718

In [17]:
winner = max(results, key=results.get)# Selecting the classifier with good result
print("Using", winner, "for classification, with",len(features), 'features.')

Using RandomForest for classification, with 3 features.


In [18]:
# Input Dataset 2
dataset2 = pd.read_csv("C:\\Data Raihan\Perkuliahan Semester 8\\SKC\\Dataset\\APIsecurityAccessbehavoranomalydataset\supervised_dataset.csv")

In [19]:
# Menampilkan nama Kolom yang ada pada dataset 2
dataset2.columns

Index(['Unnamed: 0', '_id', 'inter_api_access_duration(sec)',
       'api_access_uniqueness', 'sequence_length(count)',
       'vsession_duration(min)', 'ip_type', 'num_sessions', 'num_users',
       'num_unique_apis', 'source', 'classification'],
      dtype='object')

In [20]:
# Menampilkan fitur yang ada pada dataset 2
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1699 entries, 0 to 1698
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      1699 non-null   int64  
 1   _id                             1699 non-null   object 
 2   inter_api_access_duration(sec)  1695 non-null   float64
 3   api_access_uniqueness           1695 non-null   float64
 4   sequence_length(count)          1699 non-null   float64
 5   vsession_duration(min)          1699 non-null   int64  
 6   ip_type                         1699 non-null   object 
 7   num_sessions                    1699 non-null   float64
 8   num_users                       1699 non-null   float64
 9   num_unique_apis                 1699 non-null   float64
 10  source                          1699 non-null   object 
 11  classification                  1699 non-null   object 
dtypes: float64(6), int64(2), object(4)

In [21]:
#Feature Selection
X = dataset2.drop(['Unnamed: 0','_id','ip_type', 'source', 'classification', 'api_access_uniqueness', 'inter_api_access_duration(sec)'], axis=1).select_dtypes(include=['float64', 'int64']).values
# Target variable
y = dataset2['classification'].values

In [22]:
#Data Fitting and choosing the important variables
extratrees = ek.ExtraTreesClassifier().fit(X,y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]

In [23]:
# Ubah label menjadi format numerik
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [24]:
#splitting the data (70% - training and 30% - testing)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.29, stratify = y)

In [25]:
features = []
index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]

In [26]:
#All the required features
for f in range(nbfeatures):
    print("%d. feature %s (%f)" % (f + 1, dataset1.columns[2+index[f]], extratrees.feature_importances_[index[f]]))
    features.append(dataset1.columns[2+f])

1. feature Protocol (0.429146)
2. feature Destination Port (0.328881)


In [27]:
estimators = [("Naive Bayes", GaussianNB()),
              ("DecisionTree", DecisionTreeClassifier(max_depth=10)),
              ("RandomForest", ek.RandomForestClassifier(n_estimators=50))
]

In [28]:
# Define the Deep Neural Network (DNN) model
model_dnn = Sequential()
model_dnn.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model_dnn.add(Dense(64, activation='relu'))
model_dnn.add(Dense(1, activation='sigmoid'))

In [29]:
# Assuming X_train and X_test are your input data arrays
X_train_reshaped = np.expand_dims(X_train, axis=-1)
X_test_reshaped = np.expand_dims(X_test, axis=-1)

# Define the Convolutional Neural Network (CNN) model
model_cnn = Sequential()
model_cnn.add(Flatten(input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))

In [30]:
# Testing which Classifier will give better result
model = { 
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
    "Stochastic Gradient Descent": SGDClassifier(loss="hinge", penalty="l2", max_iter=10000),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Stack Ensamble": StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()),
    "Deep Neural Network": model_dnn,
    "Convolutional Neural Network": model_cnn
}

In [31]:
results = {}
for algo in model:
    if algo == "Deep Neural Network" or algo == "Convolutional Neural Network":
        clf = model[algo]
        # Waktu awal training
        start_time = time.time()
        clf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        clf.fit(X_train, y_train , epochs=20)
        # Waktu akhir training
        end_time = time.time()

        # Evaluate the model
        _, score_train = clf.evaluate(X_train, y_train, verbose=0)
        _, score_test = clf.evaluate(X_test, y_test, verbose=0)

        # Waktu awal testing
        start_time_test = time.time()

        # Waktu akhir testing
        end_time_test = time.time()

        print("%s : Training Score: %s, Testing Score: %s" % (algo, score_train, score_test))

        # Menghitung total waktu training
        training_time = end_time - start_time
        print("Total Training Time:", training_time, "seconds")

        # Menghitung total waktu testing
        testing_time = end_time_test - start_time_test
        print("Total Testing Time:", testing_time, "seconds")

        results[algo] = (score_train, score_test, training_time, testing_time)
    else:
        clf = model[algo]
        # Waktu awal training
        start_time = time.time()
        clf.fit(X_train, y_train)
        # Waktu akhir training
        end_time = time.time()

        score_train = clf.score(X_train, y_train)

        # Waktu awal testing
        start_time_test = time.time()

        score_test = clf.score(X_test, y_test)

        # Waktu akhir testing
        end_time_test = time.time()

        print("%s : Training Score: %s, Testing Score: %s" % (algo, score_train, score_test))

        # Menghitung total waktu training
        training_time = end_time - start_time
        print("Total Training Time:", training_time, "seconds")

        # Menghitung total waktu testing
        testing_time = end_time_test - start_time_test
        print("Total Testing Time:", testing_time, "seconds")

        results[algo] = (score_train, score_test, training_time, testing_time)

DecisionTree : Training Score: 1.0, Testing Score: 1.0
Total Training Time: 0.0010039806365966797 seconds
Total Testing Time: 0.0 seconds
RandomForest : Training Score: 1.0, Testing Score: 1.0
Total Training Time: 0.04875469207763672 seconds
Total Testing Time: 0.0030019283294677734 seconds
Logistic Regression : Training Score: 1.0, Testing Score: 1.0
Total Training Time: 0.006001472473144531 seconds
Total Testing Time: 0.0 seconds
Naive Bayes : Training Score: 0.9950248756218906, Testing Score: 1.0
Total Training Time: 0.0010004043579101562 seconds
Total Testing Time: 0.0 seconds




MLP : Training Score: 0.9875621890547264, Testing Score: 0.9939148073022313
Total Training Time: 0.3653125762939453 seconds
Total Testing Time: 0.0010001659393310547 seconds
Stochastic Gradient Descent : Training Score: 1.0, Testing Score: 1.0
Total Training Time: 0.0010004043579101562 seconds
Total Testing Time: 0.0009996891021728516 seconds
ADA Boost : Training Score: 1.0, Testing Score: 1.0
Total Training Time: 0.0010006427764892578 seconds
Total Testing Time: 0.0 seconds
Stack Ensamble : Training Score: 1.0, Testing Score: 1.0
Total Training Time: 0.2851376533508301 seconds
Total Testing Time: 0.00400090217590332 seconds
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Deep Neural Network : Training Score: 0.9867330193519592, Testing Score: 0.993914783000946
Total Training Time: 1.1205766201019287 secon

In [32]:
winner = max(results, key=results.get)# Selecting the classifier with good result
print("Using", winner, "for classification, with",len(features), 'features.')

Using Stack Ensamble for classification, with 2 features.
