In [15]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sn 
import numpy as np 
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

### Load Dataset 

In [2]:
df_full_feature = pd.read_csv('../Dataset/Murni/processed_dataset_10.csv',sep=',')
df_chi_square = pd.read_csv('../Dataset/Murni/processed_dataset_chi_square.csv', sep=',')
df_feature_selection = pd.read_csv('../Dataset/Murni/processed_dataset_5.csv',sep=',')
df_slice = pd.read_csv('../Dataset/Murni/processed_dataset_slice.csv',sep=',')

### Feature Slicing 

In [3]:
# Dataset Full Feature
full_feature_x = df_full_feature[df_full_feature.columns[:9]]
full_feature_y = df_full_feature['label']

full_feature_x.head(5)

Unnamed: 0,src,dst,Protocol,pktrate,pktperflow,dur,dur_nsec,bytecount,pktcount
0,1.0,11.0,1,0,29,983,235000000,94178,961
1,6.0,12.0,2,511,15344,384,114000000,11954682,221383
2,10.0,15.0,1,1,30,821,243000000,78498,801
3,15.0,16.0,2,330,9919,98,622000000,2145912,32512
4,2.0,11.0,3,274,8223,287,579000000,90935340,87270


In [4]:
# Dataset Chi Square 
chi_square_x = df_chi_square[df_chi_square.columns[:5]]
chi_square_y = df_chi_square['label']

chi_square_x.head(5)

Unnamed: 0,dst,src,Protocol,pktcount,pktcount.1
0,11.0,1.0,1,961,961
1,12.0,6.0,2,221383,221383
2,15.0,10.0,1,801,801
3,16.0,15.0,2,32512,32512
4,11.0,2.0,3,87270,87270


In [5]:
# Dataset Feature Selection
feature_selection_x = df_feature_selection[df_feature_selection.columns[:5]]
feature_selection_y = df_feature_selection['label']

feature_selection_x.head(5)

Unnamed: 0,pktrate,pktperflow,Protocol,src,dst
0,0,29,1,1.0,11.0
1,511,15344,2,6.0,12.0
2,1,30,1,10.0,15.0
3,330,9919,2,15.0,16.0
4,274,8223,3,2.0,11.0


In [6]:
# Dataset Slice
slice_x = df_slice[df_slice.columns[:4]]
slice_y= df_slice['label']

slice_x.head(5)

Unnamed: 0,dst,src,Protocol,bytecount
0,11.0,1.0,1,94178
1,12.0,6.0,2,11954682
2,15.0,10.0,1,78498
3,16.0,15.0,2,2145912
4,11.0,2.0,3,90935340


### Feature Selection Model

Perlu untuk dilakukan diskretisasi karena nilai continous mengurangi performansi dari Naive Bayes

In [7]:
#Splitting Data 

x_train,x_test,y_train,y_test = train_test_split(feature_selection_x,feature_selection_y,test_size=0.30,random_state=5)

In [8]:
nb_classifier = GaussianNB()
nb_classifier

GaussianNB()

In [9]:
nb_classifier.fit(x_train, y_train)

GaussianNB()

In [10]:
y_pred_train = nb_classifier.predict(x_train)
y_pred_test = nb_classifier.predict(x_test)

#### Training

In [16]:
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print("Accuracy:",accuracy_score(y_train, y_pred_train)*100)
print("Precision:", precision_score(y_train, y_pred_train)*100)


[[30910     0]
 [19630     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76     30910
         1.0       0.00      0.00      0.00     19630

    accuracy                           0.61     50540
   macro avg       0.31      0.50      0.38     50540
weighted avg       0.37      0.61      0.46     50540

Accuracy: 61.1594776414721
Precision: 0.0


#### Testing

In [13]:
warnings.filterwarnings('ignore')
y_pred_test = [np.argmax(element) for element in y_pred_test]
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print("Accuracy:",accuracy_score(y_test, y_pred_test)*100)

[[13109     0]
 [ 8552     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.75     13109
         1.0       0.00      0.00      0.00      8552

    accuracy                           0.61     21661
   macro avg       0.30      0.50      0.38     21661
weighted avg       0.37      0.61      0.46     21661

Accuracy: 60.518904944370064


##### Hyperparameter Tuning

In [13]:
import joblib

In [14]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=1000)
}

NB_Feature_Grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, scoring='accuracy' ,verbose=1, cv=10, n_jobs=1)
NB_Feature_Grid.fit(x_train, y_train)
print(NB_Feature_Grid.best_estimator_)


Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GaussianNB(var_smoothing=1.645190587753664e-05)


[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:  4.7min finished


In [15]:
print(NB_Feature_Grid.best_score_)

0.7994459833795013


In [16]:
best_estimator = NB_Feature_Grid.best_estimator_
best_estimator

GaussianNB(var_smoothing=1.645190587753664e-05)

In [17]:
#Save Best Estimator 
joblib.dump(best_estimator, 'naive_bayes_full.pkl')

['naive_bayes_full.pkl']

### Slice Model

In [18]:
#Splitting Data 

x_train,x_test,y_train,y_test = train_test_split(slice_x,slice_y,test_size=0.20,random_state=4)

In [19]:
nb_classifier_slice = GaussianNB()
nb_classifier_slice

GaussianNB()

In [20]:
nb_classifier_slice.fit(x_train, y_train)

GaussianNB()

In [21]:
y_pred_train = nb_classifier_slice.predict(x_train)
y_pred_test = nb_classifier_slice.predict(x_test)

#### Training

In [22]:
import warnings
warnings.filterwarnings('ignore')
y_pred_train = [np.argmax(element) for element in y_pred_train]
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print("Accuracy:",accuracy_score(y_train, y_pred_train)*100)


[[35241     0]
 [22519     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76     35241
         1.0       0.00      0.00      0.00     22519

    accuracy                           0.61     57760
   macro avg       0.31      0.50      0.38     57760
weighted avg       0.37      0.61      0.46     57760

Accuracy: 61.01281163434903


#### Testing

In [23]:
warnings.filterwarnings('ignore')
y_pred_test = [np.argmax(element) for element in y_pred_test]
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print("Accuracy:",accuracy_score(y_test, y_pred_test)*100)

[[8778    0]
 [5663    0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76      8778
         1.0       0.00      0.00      0.00      5663

    accuracy                           0.61     14441
   macro avg       0.30      0.50      0.38     14441
weighted avg       0.37      0.61      0.46     14441

Accuracy: 60.78526417838099


##### Hyperparameter Tuning

In [24]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=1000)
}

NB_Slice_Grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, scoring='accuracy' ,verbose=1, cv=10, n_jobs=1)
NB_Slice_Grid.fit(x_train, y_train)
print(NB_Slice_Grid.best_estimator_)


Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GaussianNB(var_smoothing=0.05711586478126432)


[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:  5.8min finished


In [25]:
print(NB_Slice_Grid.best_score_)

0.6502423822714681


In [26]:
best_estimator = NB_Slice_Grid.best_estimator_
best_estimator

GaussianNB(var_smoothing=0.05711586478126432)

In [27]:
#Save Best Estimator 
joblib.dump(best_estimator, 'naive_bayes_slice.pkl')

['naive_bayes_slice.pkl']

### Chi Model

In [28]:
#Splitting Data 

x_train,x_test,y_train,y_test = train_test_split(chi_square_x,chi_square_y,test_size=0.30,random_state=9)

In [29]:
nb_classifier_chi = GaussianNB()
nb_classifier_chi

GaussianNB()

In [30]:
nb_classifier_chi.fit(x_train, y_train)

GaussianNB()

In [31]:
y_pred_train = nb_classifier_chi.predict(x_train)
y_pred_test = nb_classifier_chi.predict(x_test)

#### Training

In [32]:
import warnings
warnings.filterwarnings('ignore')
y_pred_train = [np.argmax(element) for element in y_pred_train]
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print("Accuracy:",accuracy_score(y_train, y_pred_train)*100)


[[30789     0]
 [19751     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76     30789
         1.0       0.00      0.00      0.00     19751

    accuracy                           0.61     50540
   macro avg       0.30      0.50      0.38     50540
weighted avg       0.37      0.61      0.46     50540

Accuracy: 60.9200633161852


#### Testing 

In [33]:
warnings.filterwarnings('ignore')
y_pred_test = [np.argmax(element) for element in y_pred_test]
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print("Accuracy:",accuracy_score(y_test, y_pred_test)*100)

[[13230     0]
 [ 8431     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76     13230
         1.0       0.00      0.00      0.00      8431

    accuracy                           0.61     21661
   macro avg       0.31      0.50      0.38     21661
weighted avg       0.37      0.61      0.46     21661

Accuracy: 61.07751258021329


##### Hyperparameter Tuning

In [34]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=1000)
}

NB_Chi_Grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, scoring='accuracy' ,verbose=1, cv=10, n_jobs=1)
NB_Chi_Grid.fit(x_train, y_train)
print(NB_Chi_Grid.best_estimator_)


Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GaussianNB(var_smoothing=0.4452958509942655)


[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:  5.8min finished


In [35]:
print(NB_Chi_Grid.best_score_)

0.6885437277404037


In [36]:
best_estimator = NB_Chi_Grid.best_estimator_
best_estimator

GaussianNB(var_smoothing=0.4452958509942655)

In [37]:
#Save Best Estimator 
joblib.dump(best_estimator, 'naive_bayes_chi.pkl')

['naive_bayes_chi.pkl']

### Full Model

In [38]:
#Splitting Data 

x_train,x_test,y_train,y_test = train_test_split(full_feature_x,full_feature_y,test_size=0.30,random_state=9)

In [39]:
nb_classifier_full = GaussianNB()
nb_classifier_full

GaussianNB()

In [40]:
nb_classifier_full.fit(x_train, y_train)

GaussianNB()

In [41]:
y_pred_train = nb_classifier_full.predict(x_train)
y_pred_test = nb_classifier_full.predict(x_test)

#### Training

In [42]:
import warnings
warnings.filterwarnings('ignore')
y_pred_train = [np.argmax(element) for element in y_pred_train]
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print("Accuracy:",accuracy_score(y_train, y_pred_train)*100)


[[30789     0]
 [19751     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76     30789
         1.0       0.00      0.00      0.00     19751

    accuracy                           0.61     50540
   macro avg       0.30      0.50      0.38     50540
weighted avg       0.37      0.61      0.46     50540

Accuracy: 60.9200633161852


#### Testing

In [43]:
warnings.filterwarnings('ignore')
y_pred_test = [np.argmax(element) for element in y_pred_test]
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print("Accuracy:",accuracy_score(y_test, y_pred_test)*100)

[[13230     0]
 [ 8431     0]]
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.76     13230
         1.0       0.00      0.00      0.00      8431

    accuracy                           0.61     21661
   macro avg       0.31      0.50      0.38     21661
weighted avg       0.37      0.61      0.46     21661

Accuracy: 61.07751258021329


##### Hyperparameter Tuning

In [44]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=1000)
}

NB_Full_Grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, scoring='accuracy' ,verbose=1, cv=10, n_jobs=1)
NB_Full_Grid.fit(x_train, y_train)
print(NB_Full_Grid.best_estimator_)


Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GaussianNB(var_smoothing=2.389892566231053e-09)


[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:  7.9min finished


In [45]:
print(NB_Full_Grid.best_score_)

0.6736248516026909


In [46]:
best_estimator = NB_Full_Grid.best_estimator_
best_estimator

GaussianNB(var_smoothing=2.389892566231053e-09)

In [47]:
#Save Best Estimator 
joblib.dump(best_estimator, 'naive_bayes_chi.pkl')

['naive_bayes_chi.pkl']

Perlu pemilihan fitur kembali karena pointnya masih kurang baik akurasinya, dan belum tahu ketika nanti di kombinasikan dengan LSTM, selain itu perlu mengetahui lagi mengenai Gaussian NB dll