# 4. MODEL COMPARISON
    - Logistic Regression
    - Naive Bayes
    - Support Vector Classifier
    - Random Forest
    - XGBoost Classifier

In [5]:
import warnings
import pandas as pd
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

df_train = pd.read_csv('Final_Train_dataset.csv')
X_train  = df_train.iloc[:,:-1]
y_train  = df_train.VirusDetected

df_test = pd.read_csv('Final_Test_dataset.csv')
X_test  = df_test.iloc[:,:-1]
y_test  = df_test.VirusDetected

In [6]:
models = []
models.append(('Naive Bayes', GaussianNB()))
models.append(('Logistic Regression', LogisticRegression(C        = 78.47,
                                                         max_iter = 100,
                                                         solver   = 'lbfgs')))
models.append(('SVC', SVC(C      = 5,
                          degree = 100,
                          gamma  = 'scale',
                          kernel = 'linear')))

models.append(('Random Forest', RandomForestClassifier(n_estimators      = 50,
                                                       min_samples_split = 2,
                                                       min_samples_leaf  = 1,
                                                       max_features      = 'auto',
                                                       max_depth         = 5,
                                                       criterion         = 'gini')))
models.append(('XGBoost',XGBClassifier(colsample_bytree = 0.3,
                                       learning_rate    = 0.05,
                                       max_depth        = 1, 
                                       min_child_weight = 1)))

import time
start_time = time.time()

for name, model in models:
    fitting  = model.fit(X_train, y_train)
    y_pred   = fitting.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)  
    print('Accuracy: ',round(accuracy, 5), name,"--- %s seconds ---" % (time.time() - start_time) )
    
for name, model in models:
    fitting   = model.fit(X_train, y_train)
    y_pred    = fitting.predict(X_test)
    precision = precision_score(y_test, y_pred)   
    print('Precision: ',round(precision, 5), name,"--- %s seconds ---" % (time.time() - start_time) )
    
for name, model in models:
    fitting   = model.fit(X_train, y_train)
    y_pred    = fitting.predict(X_test)
    recall    = recall_score(y_test, y_pred)   
    print('Sensitivity: ',round(recall, 5), name,"--- %s seconds ---" % (time.time() - start_time) )
    
for name, model in models:
    fitting    = model.fit(X_train, y_train)
    y_pred     = fitting.predict(X_test)
    pre_recall = f1_score(y_test, y_pred)   
    print('F1-Score: ',round(pre_recall, 5), name,"--- %s seconds ---" % (time.time() - start_time) )

Accuracy:  0.95745 Naive Bayes --- 0.0060002803802490234 seconds ---
Accuracy:  0.91489 Logistic Regression --- 0.013002634048461914 seconds ---
Accuracy:  0.95745 SVC --- 0.016001224517822266 seconds ---
Accuracy:  0.95745 Random Forest --- 0.06599950790405273 seconds ---
Accuracy:  0.95745 XGBoost --- 0.1120002269744873 seconds ---
Precision:  1.0 Naive Bayes --- 0.11600184440612793 seconds ---
Precision:  0.93333 Logistic Regression --- 0.12299919128417969 seconds ---
Precision:  1.0 SVC --- 0.12599849700927734 seconds ---
Precision:  1.0 Random Forest --- 0.17699885368347168 seconds ---
Precision:  1.0 XGBoost --- 0.2239975929260254 seconds ---
Sensitivity:  0.93333 Naive Bayes --- 0.2279980182647705 seconds ---
Sensitivity:  0.93333 Logistic Regression --- 0.2350177764892578 seconds ---
Sensitivity:  0.93333 SVC --- 0.23801469802856445 seconds ---
Sensitivity:  0.93333 Random Forest --- 0.2900979518890381 seconds ---
Sensitivity:  0.93333 XGBoost --- 0.3379976749420166 seconds ---

- Naive Bayes Technique outperforms all algorithms in terms of execution time with a better or at least equal classification performance 

## Deployment

In [7]:
optimal_model = GaussianNB().fit(X_train, y_train)

In [8]:
import pickle
file = open("naive_bayes.pkl", 'wb')
pickle.dump(optimal_model, file)