In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Training Dataset
data = pd.read_csv("./dataset/UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")

In [3]:
# Extracting 10 best features
ten_best_features = data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
# The label-encoding cell assigns to columns of target_features; take an explicit
# copy so those writes hit an independent frame instead of a selection of `data`
# (the resulting SettingWithCopyWarning is otherwise hidden by the global warnings filter).
target_features = data[['attack','category','subcategory']].copy()

In [4]:
# Data Preprocessing
# Label Encoding the target columns
# 'category' gets its own encoder so its string<->integer mapping stays available
# (e.g. for inverse_transform); `le` keeps its previous end state: fitted on 'subcategory'.
le_category = LabelEncoder()
le = LabelEncoder()
# assign() builds a new frame rather than writing into a column selection of `data`
target_features = target_features.assign(
    category=le_category.fit_transform(target_features['category']),
    subcategory=le.fit_transform(target_features['subcategory']),
)

Unnamed: 0,attack,category,subcategory
0,1,0,7
1,1,0,6
2,1,0,6
3,1,1,7
4,1,0,7


In [5]:
# Train-test Split
# Fixed seed: without it the hold-out rows (and every metric computed on them)
# change on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(ten_best_features, target_features, random_state=42)

In [6]:
# Scaling the data
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [7]:
# Defining the ML Model Classes
class RandomForest:
    """Chain of three random forests predicting attack, category and subcategory.

    The category model sees the input features plus an attack column; the
    subcategory model sees the features plus attack and category columns.
    During ``fit`` those extra columns are the true labels; during ``predict``
    they are the upstream models' predictions.

    Parameters
    ----------
    max_depth : passed to every ``RandomForestClassifier``.
    random_state : optional seed passed to every ``RandomForestClassifier`` so
        that repeated runs build identical forests. Defaults to ``None``
        (unseeded, the previous behaviour).
    """

    def __init__(self, max_depth, random_state=None):
        self.rfc_attack = RandomForestClassifier(max_depth=max_depth, random_state=random_state)
        self.rfc_category = RandomForestClassifier(max_depth=max_depth, random_state=random_state)
        self.rfc_subcategory = RandomForestClassifier(max_depth=max_depth, random_state=random_state)

    @staticmethod
    def _append_column(features, column):
        """Return ``features`` with ``column`` appended as one extra right-most column."""
        return np.concatenate((features, np.asarray(column).reshape(-1, 1)), axis=1)

    def fit(self, X_train, y_train):
        """Fit the three models in chain order.

        ``X_train`` is a 2-D feature matrix; ``y_train`` must be indexable by
        ``'attack'``, ``'category'`` and ``'subcategory'`` (e.g. a DataFrame).
        Returns ``self`` so calls can be chained.
        """
        self.rfc_attack.fit(X_train, y_train['attack'])

        # Downstream models are trained on the true upstream labels.
        features_category = self._append_column(X_train, y_train['attack'])
        self.rfc_category.fit(features_category, y_train['category'])

        features_subcategory = self._append_column(features_category, y_train['category'])
        self.rfc_subcategory.fit(features_subcategory, y_train['subcategory'])
        return self

    def predict(self, X_test):
        """Predict all three targets for ``X_test``.

        Returns a DataFrame with columns ``attack``, ``category`` and
        ``subcategory`` (default integer index, one row per input row).
        """
        predict_attack = self.rfc_attack.predict(X_test)

        # At prediction time the chain is fed with predicted, not true, labels.
        test_category = self._append_column(X_test, predict_attack)
        predict_category = self.rfc_category.predict(test_category)

        test_subcategory = self._append_column(test_category, predict_category)
        predict_subcategory = self.rfc_subcategory.predict(test_subcategory)

        return pd.DataFrame({'attack': predict_attack, 'category': predict_category, 'subcategory': predict_subcategory})

In [8]:
class NaiveBayes:
    """Three chained Gaussian naive Bayes models: attack -> category -> subcategory.

    Each later model receives the original features plus one extra column per
    earlier target (true labels while fitting, predicted labels while predicting).
    """

    def __init__(self):
        self.nb_attack, self.nb_category, self.nb_subcategory = GaussianNB(), GaussianNB(), GaussianNB()

    def fit(self, X_train, y_train):
        """Fit the chain; ``y_train`` is indexed by 'attack', 'category', 'subcategory'."""
        def as_column(values):
            # 1-D labels -> (n, 1) so they can be glued onto the feature matrix
            return np.reshape(np.array(values), (-1, 1))

        self.nb_attack.fit(X_train, y_train['attack'])

        with_attack = np.concatenate((X_train, as_column(y_train['attack'])), axis=1)
        self.nb_category.fit(with_attack, y_train['category'])

        with_attack_and_category = np.concatenate((with_attack, as_column(y_train['category'])), axis=1)
        self.nb_subcategory.fit(with_attack_and_category, y_train['subcategory'])

    def predict(self, X_test):
        """Return a DataFrame of predicted 'attack', 'category' and 'subcategory'."""
        attack_hat = self.nb_attack.predict(X_test)

        stage_two_input = np.concatenate((X_test, np.reshape(attack_hat, (-1, 1))), axis=1)
        category_hat = self.nb_category.predict(stage_two_input)

        stage_three_input = np.concatenate((stage_two_input, np.reshape(category_hat, (-1, 1))), axis=1)
        subcategory_hat = self.nb_subcategory.predict(stage_three_input)

        return pd.DataFrame({
            'attack': attack_hat,
            'category': category_hat,
            'subcategory': subcategory_hat,
        })

In [10]:
class DecisionTree:
    """Chain of three decision trees predicting attack, category and subcategory.

    The category tree sees the input features plus an attack column; the
    subcategory tree sees the features plus attack and category columns.
    During ``fit`` those extra columns are the true labels; during ``predict``
    they are the upstream trees' predictions.

    Parameters
    ----------
    criterion : split criterion passed to every ``DecisionTreeClassifier``.
    max_depth : depth limit passed to every tree (default 5).
    random_state : optional seed passed to every tree so repeated fits are
        deterministic. Defaults to ``None`` (unseeded, the previous behaviour).
    """

    def __init__(self, criterion, max_depth=5, random_state=None):
        self.dtree_attack = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=random_state)
        self.dtree_category = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=random_state)
        self.dtree_subcategory = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=random_state)

    @staticmethod
    def _append_column(features, column):
        """Return ``features`` with ``column`` appended as one extra right-most column."""
        return np.concatenate((features, np.asarray(column).reshape(-1, 1)), axis=1)

    def fit(self, X_train, y_train):
        """Fit the three trees in chain order.

        ``X_train`` is a 2-D feature matrix; ``y_train`` must be indexable by
        ``'attack'``, ``'category'`` and ``'subcategory'`` (e.g. a DataFrame).
        Returns ``self`` so calls can be chained.
        """
        self.dtree_attack.fit(X_train, y_train['attack'])

        # Downstream trees are trained on the true upstream labels.
        features_category = self._append_column(X_train, y_train['attack'])
        self.dtree_category.fit(features_category, y_train['category'])

        features_subcategory = self._append_column(features_category, y_train['category'])
        self.dtree_subcategory.fit(features_subcategory, y_train['subcategory'])
        return self

    def predict(self, X_test):
        """Predict all three targets for ``X_test``.

        Returns a DataFrame with columns ``attack``, ``category`` and
        ``subcategory`` (default integer index, one row per input row).
        """
        predict_attack = self.dtree_attack.predict(X_test)

        # At prediction time the chain is fed with predicted, not true, labels.
        test_category = self._append_column(X_test, predict_attack)
        predict_category = self.dtree_category.predict(test_category)

        test_subcategory = self._append_column(test_category, predict_category)
        predict_subcategory = self.dtree_subcategory.predict(test_subcategory)

        return pd.DataFrame({'attack': predict_attack, 'category': predict_category, 'subcategory': predict_subcategory})

In [9]:
class GradientBoost:
    """Three chained XGBoost classifiers: attack -> category -> subcategory.

    Every stage after the first is given the original features plus one column
    per earlier target: true labels in ``fit``, predicted labels in ``predict``.
    """

    def __init__(self):
        self.xgb_attack = XGBClassifier()
        self.xgb_category = XGBClassifier()
        self.xgb_subcategory = XGBClassifier()

    def fit(self, X_train, y_train):
        """Fit each stage in order, widening the feature matrix between stages."""
        chain = (
            (self.xgb_attack, 'attack'),
            (self.xgb_category, 'category'),
            (self.xgb_subcategory, 'subcategory'),
        )
        stage_features = X_train
        for position, (model, target) in enumerate(chain):
            model.fit(stage_features, y_train[target])
            if position + 1 < len(chain):
                # the next stage also sees this stage's true labels
                label_column = np.array(y_train[target]).reshape(-1, 1)
                stage_features = np.concatenate((stage_features, label_column), axis=1)

    def predict(self, X_test):
        """Return a DataFrame of predicted 'attack', 'category' and 'subcategory'."""
        chain = (
            (self.xgb_attack, 'attack'),
            (self.xgb_category, 'category'),
            (self.xgb_subcategory, 'subcategory'),
        )
        stage_features = X_test
        predicted = {}
        for position, (model, target) in enumerate(chain):
            predicted[target] = model.predict(stage_features)
            if position + 1 < len(chain):
                # the next stage also sees this stage's predictions
                stage_features = np.concatenate(
                    (stage_features, predicted[target].reshape(-1, 1)), axis=1
                )
        return pd.DataFrame(predicted)

In [11]:
# Validation of ML Models on Training Dataset
# Random Forest

rf = RandomForest(max_depth=3)
rf.fit(X_train, y_train)

predictions_rfc = rf.predict(X_test)

In [12]:
print(confusion_matrix(y_test['attack'], predictions_rfc['attack']))
print(classification_report(y_test['attack'], predictions_rfc['attack']))

[[     8     83]
 [     0 733614]]
              precision    recall  f1-score   support

           0       1.00      0.09      0.16        91
           1       1.00      1.00      1.00    733614

    accuracy                           1.00    733705
   macro avg       1.00      0.54      0.58    733705
weighted avg       1.00      1.00      1.00    733705



In [13]:
print(confusion_matrix(y_test['category'], predictions_rfc['category']))
print(classification_report(y_test['category'], predictions_rfc['category']))

[[368560  16566      0      0      0]
 [ 32668 297396      0      2      0]
 [     0     48      0     43      0]
 [ 11363   1051      0   5985      0]
 [     0      8      0     15      0]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92    385126
           1       0.94      0.90      0.92    330066
           2       0.00      0.00      0.00        91
           3       0.99      0.33      0.49     18399
           4       0.00      0.00      0.00        23

    accuracy                           0.92    733705
   macro avg       0.57      0.44      0.47    733705
weighted avg       0.92      0.92      0.91    733705



In [14]:
print(confusion_matrix(y_test['subcategory'], predictions_rfc['subcategory']))
print(classification_report(y_test['subcategory'], predictions_rfc['subcategory'])) 

[[     0      0      0      0      0      0      0      2]
 [     0      0      0      0      0      2    496      0]
 [     0      0      0      0      0     15      0      6]
 [     0      0      0      0      0     43     46      2]
 [     0      0      0      0      0    142   3468      5]
 [     0      0      0      0      0   5843   8941      0]
 [     0      0      0      0      0      0 318611    231]
 [     0      0      0      0      0      0     19 395833]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00       498
           2       0.00      0.00      0.00        21
           3       0.00      0.00      0.00        91
           4       0.00      0.00      0.00      3615
           5       0.97      0.40      0.56     14784
           6       0.96      1.00      0.98    318842
           7       1.00      1.00      1.00    395852

    accuracy                           

In [15]:
rf.rfc_attack.score(X_test, y_test['attack'])

0.9998868755153638

In [16]:
rf.rfc_category.score(np.concatenate((X_test, np.array(predictions_rfc['attack']).reshape(-1,1)), axis=1), y_test['category'])

0.915819028083494

In [17]:
rf.rfc_subcategory.score(np.concatenate((X_test,np.array(predictions_rfc['attack']).reshape(-1,1), 
                                         np.array(predictions_rfc['category']).reshape(-1,1)), axis=1), y_test['subcategory'])

0.981711995965681

In [18]:
# Naive Bayes
nb = NaiveBayes()
nb.fit(X_train, y_train)

predictions_nb = nb.predict(X_test)

In [19]:
print(confusion_matrix(y_test['attack'],predictions_nb['attack']))
print(classification_report(y_test['attack'],predictions_nb['attack']))

[[    80     11]
 [  2897 730717]]
              precision    recall  f1-score   support

           0       0.03      0.88      0.05        91
           1       1.00      1.00      1.00    733614

    accuracy                           1.00    733705
   macro avg       0.51      0.94      0.53    733705
weighted avg       1.00      1.00      1.00    733705



In [20]:
print(confusion_matrix(y_test['category'],predictions_nb['category']))
print(classification_report(y_test['category'],predictions_nb['category']))

[[368532  16428     36    130      0]
 [183218 145789    758    301      0]
 [     0      8     80      3      0]
 [ 10850   1751   2087   3711      0]
 [     0      0     16      0      7]]
              precision    recall  f1-score   support

           0       0.66      0.96      0.78    385126
           1       0.89      0.44      0.59    330066
           2       0.03      0.88      0.05        91
           3       0.90      0.20      0.33     18399
           4       1.00      0.30      0.47        23

    accuracy                           0.71    733705
   macro avg       0.69      0.56      0.44    733705
weighted avg       0.77      0.71      0.68    733705



In [21]:
print(confusion_matrix(y_test['subcategory'],predictions_nb['subcategory']))
print(classification_report(y_test['subcategory'],predictions_nb['subcategory']))

[[     1      0      0      1      0      0      0      0]
 [     0    329      0      8      2    142     17      0]
 [     4      0      2     15      0      0      0      0]
 [     0      0      0     80      3      0      8      0]
 [     0    216      0     97      1    158   3143      0]
 [     0    614      0   1990    421   3131   8628      0]
 [     0    900      0    775     11    274 316882      0]
 [     0      0      0     11      2      0     24 395815]]
              precision    recall  f1-score   support

           0       0.20      0.50      0.29         2
           1       0.16      0.66      0.26       498
           2       1.00      0.10      0.17        21
           3       0.03      0.88      0.05        91
           4       0.00      0.00      0.00      3615
           5       0.85      0.21      0.34     14784
           6       0.96      0.99      0.98    318842
           7       1.00      1.00      1.00    395852

    accuracy                           

In [22]:
# Decision Tree (Information Gain)
dtree_ig = DecisionTree('entropy')
dtree_ig.fit(X_train,y_train)
predictions_dtree_ig = dtree_ig.predict(X_test)

In [23]:
print(confusion_matrix(y_test['attack'],predictions_dtree_ig['attack']))
print(classification_report(y_test['attack'],predictions_dtree_ig['attack']))

[[    33     58]
 [     0 733614]]
              precision    recall  f1-score   support

           0       1.00      0.36      0.53        91
           1       1.00      1.00      1.00    733614

    accuracy                           1.00    733705
   macro avg       1.00      0.68      0.77    733705
weighted avg       1.00      1.00      1.00    733705



In [24]:
print(confusion_matrix(y_test['category'],predictions_dtree_ig['category']))
print(classification_report(y_test['category'],predictions_dtree_ig['category']))

[[338811  45756      0    559      0]
 [ 11546 318477      0     43      0]
 [     9     16     26     40      0]
 [   365   7517      0  10517      0]
 [     0      0      0     23      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385126
           1       0.86      0.96      0.91    330066
           2       1.00      0.29      0.44        91
           3       0.94      0.57      0.71     18399
           4       0.00      0.00      0.00        23

    accuracy                           0.91    733705
   macro avg       0.75      0.54      0.60    733705
weighted avg       0.92      0.91      0.91    733705



In [25]:
print(confusion_matrix(y_test['subcategory'], predictions_dtree_ig['subcategory']))
print(classification_report(y_test['subcategory'], predictions_dtree_ig['subcategory']))

[[     0      0      2      0      0      0      0      0]
 [     0    369      0      0      0     93     35      1]
 [     0      0      6      0      0     15      0      0]
 [     0      0      0     28      6     32     10     15]
 [     0      0      0      2    892    594   2104     23]
 [     0     20      0      0    263   8768   5724      9]
 [     0     13      0      0      0    491 318337      1]
 [     0      0      0      2      0      0      6 395844]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.92      0.74      0.82       498
           2       0.75      0.29      0.41        21
           3       0.88      0.31      0.46        91
           4       0.77      0.25      0.37      3615
           5       0.88      0.59      0.71     14784
           6       0.98      1.00      0.99    318842
           7       1.00      1.00      1.00    395852

    accuracy                           

In [26]:
dtree_ig.dtree_attack.score(X_test, y_test['attack'])

0.9999209491553145

In [27]:
dtree_ig.dtree_category.score(np.concatenate((X_test, np.array(predictions_dtree_ig['attack'])
                                              .reshape(-1,1)), axis=1), y_test['category'])

0.9102173216756053

In [28]:
dtree_ig.dtree_subcategory.score(np.concatenate((X_test, np.array(predictions_dtree_ig['attack'])
                                                 .reshape(-1,1), np.array(predictions_dtree_ig['category'])
                                                 .reshape(-1,1)), axis=1), y_test['subcategory'])

0.9871051716970717

In [29]:
# Gradient Boost
xgb = GradientBoost()
xgb.fit(X_train, y_train)
predictions_xgb = xgb.predict(X_test)

In [30]:
print(confusion_matrix(y_test['attack'], predictions_xgb['attack']))
print(classification_report(y_test['attack'], predictions_xgb['attack']))

[[    91      0]
 [     1 733613]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        91
           1       1.00      1.00      1.00    733614

    accuracy                           1.00    733705
   macro avg       0.99      1.00      1.00    733705
weighted avg       1.00      1.00      1.00    733705



In [31]:
print(confusion_matrix(y_test['category'], predictions_xgb['category']))
print(classification_report(y_test['category'], predictions_xgb['category']))

[[385115     11      0      0      0]
 [    12 330053      0      1      0]
 [     0      0     91      0      0]
 [     0      0      1  18398      0]
 [     0      1      0      1     21]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    385126
           1       1.00      1.00      1.00    330066
           2       0.99      1.00      0.99        91
           3       1.00      1.00      1.00     18399
           4       1.00      0.91      0.95        23

    accuracy                           1.00    733705
   macro avg       1.00      0.98      0.99    733705
weighted avg       1.00      1.00      1.00    733705



In [32]:
print(confusion_matrix(y_test['subcategory'], predictions_xgb['subcategory']))
print(classification_report(y_test['subcategory'], predictions_xgb['subcategory']))

[[     2      0      0      0      0      0      0      0]
 [     0    497      0      0      1      0      0      0]
 [     0      1     19      0      0      1      0      0]
 [     0      0      0     91      0      0      0      0]
 [     0      0      0      0   3397    218      0      0]
 [     0      0      0      1    240  14543      0      0]
 [     0      0      0      0      0      0 318841      1]
 [     0      0      0      0      0      0      3 395849]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       498
           2       1.00      0.90      0.95        21
           3       0.99      1.00      0.99        91
           4       0.93      0.94      0.94      3615
           5       0.99      0.98      0.98     14784
           6       1.00      1.00      1.00    318842
           7       1.00      1.00      1.00    395852

    accuracy                           

In [33]:
xgb.xgb_attack.score(X_test,y_test['attack'])

0.999998637054402

In [34]:
xgb.xgb_subcategory.score(np.concatenate((X_test, np.array(predictions_xgb['attack'])
                                          .reshape(-1,1), np.array(predictions_xgb['category'])
                                          .reshape(-1,1)),axis=1),y_test['subcategory'])

0.9993648673513197

In [36]:
# Test Data
test_data = pd.read_csv("./dataset/UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")

In [37]:
# Extracting the ten-best features from test set
test_ten_best = test_data[ten_best_features.columns]
# The label-encoding cell assigns to columns of test_labels; copy so those
# writes hit an independent frame rather than a selection of `test_data`.
test_labels = test_data[['attack', 'category', 'subcategory']].copy()

In [38]:
# Standard Scaling
# Refit the existing scaler on the complete training features, then scale the
# training features and the test-set features with those statistics.
sc.fit(ten_best_features)
train_data = sc.transform(ten_best_features)
test = sc.transform(test_ten_best)

In [39]:
# Re-extract features and raw (string) targets from the training data so the
# label-encoding cell below starts from un-encoded labels.
ten_best_features = data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
# Explicit copy: the next cell assigns to columns of target_features.
target_features = data[['attack','category','subcategory']].copy()

In [40]:
# Label Encoding
# One encoder per target column, fitted on the training labels only and then
# reused for the test labels, so both mappings remain available afterwards
# (a single shared encoder only remembers the column it was fitted on last).
le_category = LabelEncoder().fit(target_features['category'])
le_subcategory = LabelEncoder().fit(target_features['subcategory'])

# assign() returns new frames instead of writing into column selections of
# `data` / `test_data`.
target_features = target_features.assign(
    category=le_category.transform(target_features['category']),
    subcategory=le_subcategory.transform(target_features['subcategory']),
)
test_labels = test_labels.assign(
    category=le_category.transform(test_labels['category']),
    subcategory=le_subcategory.transform(test_labels['subcategory']),
)

In [41]:
# Training ML Models on Complete Data and Testing on test set
# Random Forest
rf_clf = RandomForest(max_depth=3)
rf_clf.fit(train_data, target_features)
predictions_rf = rf_clf.predict(test)

In [42]:
print("Random Forest: Attack\n")
print(confusion_matrix(test_labels['attack'], predictions_rf['attack']), "\n")
print(classification_report(test_labels['attack'], predictions_rf['attack']))

Random Forest: Attack

[[     9     98]
 [     0 733598]] 

              precision    recall  f1-score   support

           0       1.00      0.08      0.16       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       1.00      0.54      0.58    733705
weighted avg       1.00      1.00      1.00    733705



In [43]:
print("Random Forest: Category\n")
print(confusion_matrix(test_labels['category'], predictions_rf['category']))
print(classification_report(test_labels['category'], predictions_rf['category']))

Random Forest: Category

[[370454  14855      0      0      0]
 [ 34884 295227      0      1      0]
 [     0     59      0     48      0]
 [ 10101   1197      0   6865      0]
 [     0      5      0      9      0]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.93    385309
           1       0.95      0.89      0.92    330112
           2       0.00      0.00      0.00       107
           3       0.99      0.38      0.55     18163
           4       0.00      0.00      0.00        14

    accuracy                           0.92    733705
   macro avg       0.57      0.45      0.48    733705
weighted avg       0.92      0.92      0.91    733705



In [44]:
print("Random Forest: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'], predictions_rf['subcategory']))
print(classification_report(test_labels['subcategory'], predictions_rf['subcategory']))

Random Forest: Subcategory

[[     0      0      0      0      1    503      0]
 [     0      0      0      0      9      0      5]
 [     0      0      0      0     48     57      2]
 [     0      0      0      0    205   3413      3]
 [     0      0      0      0   6660   7874      8]
 [     0      0      0      0      0 318107    230]
 [     0      0      0      0      0     17 396563]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       504
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00       107
           4       0.00      0.00      0.00      3621
           5       0.96      0.46      0.62     14542
           6       0.96      1.00      0.98    318337
           7       1.00      1.00      1.00    396580

    accuracy                           0.98    733705
   macro avg       0.42      0.35      0.37    733705
weighted avg       0.98      0.98      0.98    733705



In [45]:
# Naive Bayes
# Fit the chained Naive Bayes models on the full scaled training data and
# predict attack / category / subcategory for the scaled test set.
nb_clf = NaiveBayes()
nb_clf.fit(train_data, target_features)
# NOTE(review): this rebinds `predictions_nb`, which earlier held the predictions
# for the validation split (X_test). Re-running the earlier Naive Bayes report
# cells after this one would score y_test against test-set predictions.
predictions_nb = nb_clf.predict(test)

In [46]:
print("Naive Bayes: Attack\n")
print(confusion_matrix(test_labels['attack'], predictions_nb['attack']))
print(classification_report(test_labels['attack'], predictions_nb['attack']))

Naive Bayes: Attack

[[    96     11]
 [  2775 730823]]
              precision    recall  f1-score   support

           0       0.03      0.90      0.06       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       0.52      0.95      0.53    733705
weighted avg       1.00      1.00      1.00    733705



In [47]:
print("Naive Bayes: Category\n")
print(confusion_matrix(test_labels['category'], predictions_nb['category']), "\n")
print(classification_report(test_labels['category'], predictions_nb['category']))

Naive Bayes: Category

[[368894  16264     31    120      0]
 [182703 146381    699    329      0]
 [     0      9     96      2      0]
 [ 10751   1630   2040   3742      0]
 [     0      1      5      0      8]] 

              precision    recall  f1-score   support

           0       0.66      0.96      0.78    385309
           1       0.89      0.44      0.59    330112
           2       0.03      0.90      0.06       107
           3       0.89      0.21      0.33     18163
           4       1.00      0.57      0.73        14

    accuracy                           0.71    733705
   macro avg       0.69      0.62      0.50    733705
weighted avg       0.77      0.71      0.68    733705



In [48]:
print("Naive Bayes: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'], predictions_nb['subcategory']))
print(classification_report(test_labels['subcategory'], predictions_nb['subcategory']))

Naive Bayes: Subcategory

[[     0      0      0      0      0      0      0      0]
 [     0    337      0      7      0    146     14      0]
 [     1      0      7      5      0      0      1      0]
 [     0      0      0     96      1      1      9      0]
 [     0    199      0    128      0    151   3143      0]
 [     0    578      0   1912    612   2979   8461      0]
 [     0    815      0    715      5    296 316506      0]
 [     0      0      0      8      2      0     23 396547]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.17      0.67      0.28       504
           2       1.00      0.50      0.67        14
           3       0.03      0.90      0.06       107
           4       0.00      0.00      0.00      3621
           5       0.83      0.20      0.33     14542
           6       0.96      0.99      0.98    318337
           7       1.00      1.00      1.00    396580

    accuracy 

In [49]:
# Decision Tree (Information Gain)
dtree_ig_clf = DecisionTree('entropy')
dtree_ig_clf.fit(train_data, target_features)
predictions_ig = dtree_ig_clf.predict(test)

In [50]:
print("Decision Tree (Information Gain): Attack\n")
print(confusion_matrix(test_labels['attack'], predictions_ig['attack']), "\n")
print(classification_report(test_labels['attack'], predictions_ig['attack']))

Decision Tree (Information Gain): Attack

[[    40     67]
 [     2 733596]] 

              precision    recall  f1-score   support

           0       0.95      0.37      0.54       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       0.98      0.69      0.77    733705
weighted avg       1.00      1.00      1.00    733705



In [51]:
print("Decision Tree (Information Gain): Category\n")
print(confusion_matrix(test_labels['category'], predictions_ig['category']))
print(classification_report(test_labels['category'], predictions_ig['category']))

Decision Tree (Information Gain): Category

[[339497  45290      0    522      0]
 [ 11703 318367      0     42      0]
 [     9     15     33     50      0]
 [   351   7450      2  10360      0]
 [     0      0      0     14      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385309
           1       0.86      0.96      0.91    330112
           2       0.94      0.31      0.46       107
           3       0.94      0.57      0.71     18163
           4       0.00      0.00      0.00        14

    accuracy                           0.91    733705
   macro avg       0.74      0.54      0.60    733705
weighted avg       0.92      0.91      0.91    733705



In [52]:
print("Decision Tree (Information Gain): Subcategory\n")
print(confusion_matrix(test_labels['subcategory'], predictions_ig['subcategory']), "\n")
print(classification_report(test_labels['subcategory'], predictions_ig['subcategory']))

Decision Tree (Information Gain): Subcategory

[[   384      0      0      0     87     33      0]
 [     0      5      0      0      9      0      0]
 [     0      0     38      4     44      9     12]
 [     0      0      2    942    574   2089     14]
 [    27      0      0    222   8623   5654     16]
 [    23      0      0      0    453 317861      0]
 [     0      0     24      0      0      8 396548]] 

              precision    recall  f1-score   support

           1       0.88      0.76      0.82       504
           2       1.00      0.36      0.53        14
           3       0.59      0.36      0.44       107
           4       0.81      0.26      0.39      3621
           5       0.88      0.59      0.71     14542
           6       0.98      1.00      0.99    318337
           7       1.00      1.00      1.00    396580

    accuracy                           0.99    733705
   macro avg       0.88      0.62      0.70    733705
weighted avg       0.99      0.99      0.99 

In [53]:
# Decision Tree (Gini Index)
dtree_gi_clf = DecisionTree('gini')
dtree_gi_clf.fit(train_data, target_features)
predictions_gi = dtree_gi_clf.predict(test)

In [54]:
print("Decision Tree (Gini Index): Attack\n")
print(confusion_matrix(test_labels['attack'], predictions_gi['attack']),"\n")
print(classification_report(test_labels['attack'], predictions_gi['attack']))

Decision Tree (Gini Index): Attack

[[    39     68]
 [     3 733595]] 

              precision    recall  f1-score   support

           0       0.93      0.36      0.52       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       0.96      0.68      0.76    733705
weighted avg       1.00      1.00      1.00    733705



In [55]:
print("Decision Tree (Gini Index): Category\n")
print(confusion_matrix(test_labels['category'], predictions_gi['category']))
print(classification_report(test_labels['category'], predictions_gi['category']))

Decision Tree (Gini Index): Category

[[338396  46453      0    460      0]
 [  8631 321472      1      8      0]
 [     0     18     33     56      0]
 [    68   7383      2  10710      0]
 [     0      5      0      9      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385309
           1       0.86      0.97      0.91    330112
           2       0.92      0.31      0.46       107
           3       0.95      0.59      0.73     18163
           4       0.00      0.00      0.00        14

    accuracy                           0.91    733705
   macro avg       0.74      0.55      0.61    733705
weighted avg       0.92      0.91      0.91    733705



In [56]:
print("Decision Tree (Gini Index): Subcategory\n")
print(confusion_matrix(test_labels['subcategory'], predictions_gi['subcategory']),"\n")
print(classification_report(test_labels['subcategory'], predictions_gi['subcategory']))

Decision Tree (Gini Index): Subcategory

[[   416      0      0      0     10     78      0]
 [     0      5      0      0      9      0      0]
 [     1      0     17      5     82      2      0]
 [    10      0      5    860    710   2036      0]
 [     6      3      5     71   9069   5387      1]
 [    63      0      0      0    459 317814      1]
 [     0      0      0      0      0      9 396571]] 

              precision    recall  f1-score   support

           1       0.84      0.83      0.83       504
           2       0.62      0.36      0.45        14
           3       0.63      0.16      0.25       107
           4       0.92      0.24      0.38      3621
           5       0.88      0.62      0.73     14542
           6       0.98      1.00      0.99    318337
           7       1.00      1.00      1.00    396580

    accuracy                           0.99    733705
   macro avg       0.84      0.60      0.66    733705
weighted avg       0.99      0.99      0.99    733

In [57]:
# Gradient Boost
xgb_clf = GradientBoost()
xgb_clf.fit(train_data,target_features)
predictions_gb = xgb_clf.predict(test)

In [58]:
print("Gradient Boost: Attack\n")
print(confusion_matrix(test_labels['attack'], predictions_gb['attack']), "\n")
print(classification_report(test_labels['attack'], predictions_gb['attack']))

Gradient Boost: Attack

[[   100      7]
 [     1 733597]] 

              precision    recall  f1-score   support

           0       0.99      0.93      0.96       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       1.00      0.97      0.98    733705
weighted avg       1.00      1.00      1.00    733705



In [59]:
print("Gradient Boost: Category\n")
print(confusion_matrix(test_labels['category'], predictions_gb['category']), "\n")
print(classification_report(test_labels['category'], predictions_gb['category']))

Gradient Boost: Category

[[385296     12      0      1      0]
 [    17 330094      1      0      0]
 [     0      0    100      7      0]
 [     0      1      0  18162      0]
 [     0      0      0      0     14]] 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    385309
           1       1.00      1.00      1.00    330112
           2       0.99      0.93      0.96       107
           3       1.00      1.00      1.00     18163
           4       1.00      1.00      1.00        14

    accuracy                           1.00    733705
   macro avg       1.00      0.99      0.99    733705
weighted avg       1.00      1.00      1.00    733705



In [60]:
print("Gradient Boost: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'], predictions_gb['subcategory']), "\n")
print(classification_report(test_labels['subcategory'], predictions_gb['subcategory']))

Gradient Boost: Subcategory

[[   496      0      1      0      1      6      0]
 [     0     14      0      0      0      0      0]
 [     0      0    100      0      7      0      0]
 [     0      0      0   3350    271      0      0]
 [     1      0      0    227  14314      0      0]
 [     0      0      0      0      0 318337      0]
 [     1      0      0      0      0      5 396574]] 

              precision    recall  f1-score   support

           1       1.00      0.98      0.99       504
           2       1.00      1.00      1.00        14
           3       0.99      0.93      0.96       107
           4       0.94      0.93      0.93      3621
           5       0.98      0.98      0.98     14542
           6       1.00      1.00      1.00    318337
           7       1.00      1.00      1.00    396580

    accuracy                           1.00    733705
   macro avg       0.99      0.98      0.98    733705
weighted avg       1.00      1.00      1.00    733705

