In [1]:
import pandas as pd
import numpy as np

In [2]:
filepath = "malware_detection_dataset.csv"

In [3]:
df = pd.read_csv(filepath)

### Split the Dataset into X and y

In [4]:
X = df.drop('label',axis=1)
y = df['label']

In [5]:
X

Unnamed: 0,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto_icmp,proto_tcp,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
0,0.114184,48.0,48.0,0,1,76,1,76,0,0,...,0,0,0,0,0,0,0,1,0,0
1,160.367579,7536.0,0.0,0,24,8208,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0.016986,48.0,48.0,0,1,76,1,76,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.003497,0.0,0.0,0,5,212,3,144,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.036724,34.0,311.0,0,1,62,1,339,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046906,0.012744,96.0,96.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1046907,0.012744,96.0,96.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1046908,0.012744,96.0,96.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1046909,0.012744,96.0,96.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
y

0          benign
1          benign
2          benign
3          benign
4          benign
            ...  
1046906      DDoS
1046907      DDoS
1046908      DDoS
1046909      DDoS
1046910      DDoS
Name: label, Length: 1046911, dtype: object

### Splitting the X and y Dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

In [8]:
y_test

222402                         Okiru
622557                         Okiru
435746                          DDoS
81474      PartOfAHorizontalPortScan
49984      PartOfAHorizontalPortScan
                     ...            
348333     PartOfAHorizontalPortScan
980993                          DDoS
779507     PartOfAHorizontalPortScan
620243                         Okiru
1027748                         DDoS
Name: label, Length: 209383, dtype: object

In [9]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(837528, 24)
(209383, 24)
(837528,)
(209383,)


## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
from sklearn.naive_bayes import GaussianNB

import timeit
start = timeit.default_timer()


nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
#Predict the response for test dataset
nb_pred  = nb_clf.predict(X_test)
print(nb_pred)
print()

end = timeit.default_timer()
print("total runtime of the program is: ",end-start,'seconds')

['Okiru' 'Okiru' 'DDoS' ... 'Okiru' 'Okiru' 'DDoS']

total runtime of the program is:  5.791716500000007 seconds


In [12]:
##Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, nb_pred)
print(cm)

[[  760     0    17    25     0     0     0     0     1     0     0     0]
 [    0   326     9    10     0     0     3     0     0  2632     0     0]
 [    0     0     2     0     0     0     0     0     0     0     0     0]
 [    0     0     0    42     0     0     0     0     0    30     0     0]
 [    0     0     0     0     1     0     0     0     0     0     0     0]
 [    1     0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     5     0     0]
 [    0     0     0     1     0     0     3 22632     0    20     0     0]
 [    0     0     1     0     0     0     0     0     0     0     0     0]
 [    1   144     0    26     0     0    64     0     0 39622     0     0]
 [   36  1129   168 12401     0     0    77     0    31 95731     0     0]
 [   32   190    80   376     0     0    37    10     5 21096     0 11606]]


In [13]:
### Final Results
from sklearn.metrics import classification_report
print(classification_report(y_test,nb_pred,zero_division=0))

                            precision    recall  f1-score   support

                    Attack       0.92      0.95      0.93       803
                       C&C       0.18      0.11      0.14      2980
          C&C-FileDownload       0.01      1.00      0.01         2
             C&C-HeartBeat       0.00      0.58      0.01        72
C&C-HeartBeat-FileDownload       1.00      1.00      1.00         1
                 C&C-Mirai       0.00      0.00      0.00         1
                 C&C-Torii       0.00      0.00      0.00         5
                      DDoS       1.00      1.00      1.00     22656
              FileDownload       0.00      0.00      0.00         1
                     Okiru       0.25      0.99      0.40     39857
 PartOfAHorizontalPortScan       0.00      0.00      0.00    109573
                    benign       1.00      0.35      0.52     33432

                  accuracy                           0.36    209383
                 macro avg       0.36      0.5

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, nb_pred)

0.3581522855246128