In [1]:
from numpy import *
import matplotlib.pyplot as plt 
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import preprocessing
import pandas as pd
from joblib import dump,load
import requests,json

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#df = pd.read_csv("NoSqli_Dataset.csv")
df = pd.read_csv("/content/drive/MyDrive/Thesis Resources/dataset/NoSql_Injection/NoSqli_Dataset.csv")
df.head()

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,ClassLabel
0,0,0,0,1,1,0,0,0,0,0,0
1,1,1,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0


In [4]:

X = df.drop(['ClassLabel'],axis='columns')
Y = df['ClassLabel']

In [5]:
Y.value_counts()

0    801
1    203
Name: ClassLabel, dtype: int64

In [6]:
X.head(2)

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9
0,0,0,0,1,1,0,0,0,0,0
1,1,1,0,1,0,0,0,0,0,0


In [7]:
Y.head(5)

0    0
1    1
2    0
3    0
4    0
Name: ClassLabel, dtype: int64

# Evaluation of Best Classification Model Using GridSearchCV(with handling imbalanced data)

In [8]:
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix , classification_report

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto',class_weight='balanced'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear','poly']
        }  
    },
    'decision_tree': {
        'model': tree.DecisionTreeClassifier(class_weight='balanced'),
        'params' : {
            'criterion':["gini","entropy"],
            'max_depth':[1,2,3,4,5,6,7,None]
        }
    },
    'Adaboost':{
        'model':AdaBoostClassifier(base_estimator = tree.DecisionTreeClassifier(random_state = 11, max_features = "auto",max_depth = None,class_weight='balanced')),
        'params':{
              'base_estimator__criterion' : ["gini", "entropy"],
              'base_estimator__splitter' :   ["best", "random"],
              'n_estimators': [1, 2]
             }
    },

    'random_forest': {
        'model': RandomForestClassifier(class_weight = 'balanced'),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto',class_weight = 'balanced'),
        'params': {
            'C': [1,5,10]
        }
    },
    'Naive_Bayes:BernoulliNB':{
        'model':BernoulliNB(),
        'params':{
            'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
        }
    },
}

Apply transformations to dataset

In [9]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids,NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def transform(transformer,X,Y):
    print("Transforming {}".format(transformer.__class__.__name__))
    X_resampled,Y_resampled = transformer.fit_sample(X.values,Y.values.ravel())
    return transformer.__class__.__name__,pd.DataFrame(X_resampled), pd.DataFrame(Y_resampled)

datasets = []
datasets.append(("base",X,Y))
datasets.append(transform(SMOTE(n_jobs=-1),X,Y))
datasets.append(transform(RandomOverSampler(),X,Y))
datasets.append(transform(NearMiss(n_jobs=-1),X,Y))
datasets.append(transform(RandomUnderSampler(),X,Y))
datasets.append(transform(SMOTEENN(),X,Y))
datasets.append(transform(SMOTETomek(),X,Y))



Transforming SMOTE
Transforming RandomOverSampler
Transforming NearMiss




Transforming RandomUnderSampler
Transforming SMOTEENN
Transforming SMOTETomek




Model Serialization

In [17]:
## As Adaboost is performing best among all the models we are storing Adaboost
## classifier object in the file to deploy it in the server
def saveModel(model_object,model_name):
    dump(model_object,model_name+'.joblib')

def loadModel(model_name):
    model_object= load(model_name)
    return model_object

def checkSavedClassifierResults():
    adaboostModel =loadModel('Adaboost.joblib')
    result= adaboostModel.best_estimator_.predict([[  0, 0, 1, 0, 0, 0, 0, 0, 0, 0  ]])
    print(result)

checkSavedClassifierResults()

[0]


In [12]:


def getBestClassifierObject(model_name,sampling_type,classifierObject):
   
    if(model_name=='Adaboost' and sampling_type=='SMOTEENN'):
        print('helloooooooooooooooooooooooooooooooooooooooo')
        #print(classifierObject)
        saveModel(classifierObject,model_name)

#this method should be updated with more ML models
def benchmark(sampling_type,x,y):
    #lr = LogisticRegression(penalty = 'l2')
    #param_grid = {'C':[0.01,0.1,1,10]}
    scores = []

    target_names = ['Not Malicious', 'Malicious']
    ConfusionMatrix={}

    for model_name, mp in model_params.items():
        grid =  GridSearchCV(estimator=mp['model'],param_grid= mp['params'],scoring='accuracy', cv=10,n_jobs=-1,verbose=2)
        grid = grid.fit(x, y)
        
        getBestClassifierObject(model_name,sampling_type,grid)

        y_pred=grid.best_estimator_.predict(x)
        print('-------------------------------------------')
        print(model_name.upper())  
        print('-------------------------------------------')
        report=classification_report(y, y_pred, target_names=target_names,output_dict=True)
        print(report)
        ConfusionMatrix[model_name]  =  confusion_matrix(y, y_pred)
        
        macro_precision =  report['macro avg']['precision'] 
        macro_recall = report['macro avg']['recall']    
        macro_f1 = report['macro avg']['f1-score']    
        accuracy = report['accuracy']

        
        # print("precision: ")
        # print(macro_precision)
        # print("recall: ")
        # print(macro_recall)
        # print("f1-score: ")
        # print(macro_f1)
        # print("accuracy: ")
        # print(accuracy)
      
        
        scores.append({
            'sampling_type':sampling_type,
            'model': model_name.upper(),
            'best_score': grid.best_score_,
            'precision':macro_precision,
            'recall':macro_recall,
            'f1_score':macro_f1,
            'best_params': grid.best_params_,    
        })
    
        #gs = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='accuracy', cv=10, verbose=2)
        #gs = gs.fit(X.values,Y.values.ravel())
    return scores



In [13]:
benchmark_scores = []
for sample_type,X,Y in datasets:
    print('______________________________________________________________')
    print('{}'.format(sample_type))
    benchmark_scores = benchmark_scores + (benchmark(sample_type,X,Y))
    print('______________________________________________________________')



______________________________________________________________
base
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 0.9634300126103404, 'recall': 0.9538077403245943, 'f1-score': 0.958594730238394, 'support': 801}, 'Malicious': {'precision': 0.8246445497630331, 'recall': 0.8571428571428571, 'f1-score': 0.8405797101449276, 'support': 203}, 'accuracy': 0.9342629482071713, 'macro avg': {'precision': 0.8940372811866868, 'recall': 0.9054752987337257, 'f1-score': 0.8995872201916608, 'support': 1004}, 'weighted avg': {'precision': 0.9353688084689028, 'recall': 0.9342629482071713, 'f1-score': 0.9347331275700935, 'support': 1004}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 0.9698162729658792, 'recall': 0.9225967540574282, 'f1-score': 0.945617402431222, 'support': 801}, 'Malicious': {'precision': 0.743801652892562, 'recall': 0.8866995073891626, 'f1-score': 0.8089887640449438, 'support': 203}, 'accuracy': 0.9153386454183267, 'macro avg': {'precision': 0.8568089629292206, 'recall': 0.9046481307232954, 'f1-score': 0.8773030832380829, 'support': 1004}, 'weighted avg': {'precision': 0.9241180977916926, 'recall': 0.9153386454183267, 'f1-score': 0.917992289291367, 'support': 1004}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 0.9695817490494296, 'recall': 0.9550561797752809, 'f1-score': 0.9622641509433961, 'support': 801}, 'Malicious': {'precision': 0.8325581395348837, 'recall': 0.8817733990147784, 'f1-score': 0.8564593301435407, 'support': 203}, 'accuracy': 0.9402390438247012, 'macro avg': {'precision': 0.9010699442921566, 'recall': 0.9184147893950296, 'f1-score': 0.9093617405434684, 'support': 1004}, 'weighted avg': {'precision': 0.9418767762093372, 'recall': 0.9402390438247012, 'f1-score': 0.9408713435505967, 'support': 1004}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 0.979702300405954, 'recall': 0.9038701622971286, 'f1-score': 0.9402597402597402, 'support': 801}, 'Malicious': {'precision': 0.7094339622641509, 'recall': 0.9261083743842364, 'f1-score': 0.8034188034188033, 'support': 203}, 'accuracy': 0.9083665338645418, 'macro avg': {'precision': 0.8445681313350524, 'recall': 0.9149892683406825, 'f1-score': 0.8718392718392718, 'support': 1004}, 'weighted avg': {'precision': 0.9250564113195138, 'recall': 0.9083665338645418, 'f1-score': 0.9125917022331365, 'support': 1004}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits
-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 0.9730458221024259, 'recall': 0.9013732833957553, 'f1-score': 0.9358392741412831, 'support': 801}, 'Malicious': {'precision': 0.6984732824427481, 'recal

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.9459459459459459, 'recall': 0.9612983770287141, 'f1-score': 0.9535603715170278, 'support': 801}, 'Malicious': {'precision': 0.8368421052631579, 'recall': 0.7832512315270936, 'f1-score': 0.8091603053435116, 'support': 203}, 'accuracy': 0.9252988047808764, 'macro avg': {'precision': 0.8913940256045519, 'recall': 0.8722748042779038, 'f1-score': 0.8813603384302697, 'support': 1004}, 'weighted avg': {'precision': 0.9238861056485296, 'recall': 0.9252988047808764, 'f1-score': 0.9243639437946933, 'support': 1004}}
______________________________________________________________
______________________________________________________________
SMOTE
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.3s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 0.854586129753915, 'recall': 0.9538077403245943, 'f1-score': 0.9014749262536873, 'support': 801}, 'Malicious': {'precision': 0.9477401129943502, 'recall': 0.8377028714107366, 'f1-score': 0.8893306825712392, 'support': 801}, 'accuracy': 0.8957553058676654, 'macro avg': {'precision': 0.9011631213741327, 'recall': 0.8957553058676655, 'f1-score': 0.8954028044124632, 'support': 1602}, 'weighted avg': {'precision': 0.9011631213741327, 'recall': 0.8957553058676654, 'f1-score': 0.8954028044124632, 'support': 1602}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 0.8735224586288416, 'recall': 0.9225967540574282, 'f1-score': 0.8973891924711597, 'support': 801}, 'Malicious': {'precision': 0.917989417989418, 'recall': 0.8664169787765293, 'f1-score': 0.8914579319203597, 'support': 801}, 'accuracy': 0.8945068664169787, 'macro avg': {'precision': 0.8957559383091298, 'recall': 0.8945068664169789, 'f1-score': 0.8944235621957597, 'support': 1602}, 'weighted avg': {'precision': 0.8957559383091299, 'recall': 0.8945068664169787, 'f1-score': 0.8944235621957596, 'support': 1602}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.6s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 0.8653846153846154, 'recall': 0.9550561797752809, 'f1-score': 0.9080118694362018, 'support': 801}, 'Malicious': {'precision': 0.9498607242339833, 'recall': 0.8514357053682896, 'f1-score': 0.8979591836734694, 'support': 801}, 'accuracy': 0.9032459425717853, 'macro avg': {'precision': 0.9076226698092993, 'recall': 0.9032459425717853, 'f1-score': 0.9029855265548357, 'support': 1602}, 'weighted avg': {'precision': 0.9076226698092993, 'recall': 0.9032459425717853, 'f1-score': 0.9029855265548357, 'support': 1602}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.5s finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 0.8653846153846154, 'recall': 0.9550561797752809, 'f1-score': 0.9080118694362018, 'support': 801}, 'Malicious': {'precision': 0.9498607242339833, 'recall': 0.8514357053682896, 'f1-score': 0.8979591836734694, 'support': 801}, 'accuracy': 0.9032459425717853, 'macro avg': {'precision': 0.9076226698092993, 'recall': 0.9032459425717853, 'f1-score': 0.9029855265548357, 'support': 1602}, 'weighted avg': {'precision': 0.9076226698092993, 'recall': 0.9032459425717853, 'f1-score': 0.9029855265548357, 'support': 1602}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits
-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 0.8721351025331725, 'recall': 0.9026217228464419, 'f1-score': 0.8871165644171779, 'support': 801}, 'Malicious': {'precision': 0.8990944372574385, 'reca

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.8539719626168224, 'recall': 0.9126092384519351, 'f1-score': 0.8823174411587206, 'support': 801}, 'Malicious': {'precision': 0.9061662198391421, 'recall': 0.8439450686641697, 'f1-score': 0.8739495798319328, 'support': 801}, 'accuracy': 0.8782771535580525, 'macro avg': {'precision': 0.8800690912279823, 'recall': 0.8782771535580525, 'f1-score': 0.8781335104953267, 'support': 1602}, 'weighted avg': {'precision': 0.8800690912279823, 'recall': 0.8782771535580525, 'f1-score': 0.8781335104953267, 'support': 1602}}
______________________________________________________________
______________________________________________________________
RandomOverSampler
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 0.9025893958076449, 'recall': 0.9138576779026217, 'f1-score': 0.9081885856079405, 'support': 801}, 'Malicious': {'precision': 0.9127686472819216, 'recall': 0.9013732833957553, 'f1-score': 0.907035175879397, 'support': 801}, 'accuracy': 0.9076154806491885, 'macro avg': {'precision': 0.9076790215447832, 'recall': 0.9076154806491885, 'f1-score': 0.9076118807436687, 'support': 1602}, 'weighted avg': {'precision': 0.9076790215447833, 'recall': 0.9076154806491885, 'f1-score': 0.9076118807436688, 'support': 1602}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 0.8883553421368547, 'recall': 0.9238451935081149, 'f1-score': 0.9057527539779682, 'support': 801}, 'Malicious': {'precision': 0.9206762028608583, 'recall': 0.8838951310861424, 'f1-score': 0.9019108280254777, 'support': 801}, 'accuracy': 0.9038701622971286, 'macro avg': {'precision': 0.9045157724988565, 'recall': 0.9038701622971286, 'f1-score': 0.903831791001723, 'support': 1602}, 'weighted avg': {'precision': 0.9045157724988564, 'recall': 0.9038701622971286, 'f1-score': 0.903831791001723, 'support': 1602}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.7s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 0.9282970550576184, 'recall': 0.9051186017478152, 'f1-score': 0.9165613147914032, 'support': 801}, 'Malicious': {'precision': 0.9074299634591961, 'recall': 0.9300873907615481, 'f1-score': 0.9186189889025894, 'support': 801}, 'accuracy': 0.9176029962546817, 'macro avg': {'precision': 0.9178635092584073, 'recall': 0.9176029962546817, 'f1-score': 0.9175901518469963, 'support': 1602}, 'weighted avg': {'precision': 0.9178635092584072, 'recall': 0.9176029962546817, 'f1-score': 0.9175901518469963, 'support': 1602}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 0.8823529411764706, 'recall': 0.9550561797752809, 'f1-score': 0.9172661870503597, 'support': 801}, 'Malicious': {'precision': 0.9510204081632653, 'recall': 0.8726591760299626, 'f1-score': 0.91015625, 'support': 801}, 'accuracy': 0.9138576779026217, 'macro avg': {'precision': 0.9166866746698679, 'recall': 0.9138576779026217, 'f1-score': 0.9137112185251799, 'support': 1602}, 'weighted avg': {'precision': 0.916686674669868, 'recall': 0.9138576779026217, 'f1-score': 0.9137112185251798, 'support': 1602}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 0.9022277227722773, 'recall': 0.9101123595505618, 'f1-score': 0.906152889993785, 'support': 801}, 'Malicious': {'precision': 0.9093198992443325, 'recall': 0.9013732833957553, 'f1-score': 0.9053291536050158, 'support': 801}, 'accuracy': 0.9057428214731585, 'macro avg': {'precision': 0.9057738110083049, 'recall': 0.9057428214731585, 'f1-score': 0.9057410217994004, 'support': 1602}, 'weighted avg': {'precision': 0.9057738110083049, 'recall': 0.9057428214731585, 'f1-score': 0.9057410217994004, 'support': 1602}}
Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.4s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.8618266978922716, 'recall': 0.9188514357053683, 'f1-score': 0.8894259818731117, 'support': 801}, 'Malicious': {'precision': 0.9131016042780749, 'recall': 0.8526841448189763, 'f1-score': 0.8818592640413169, 'support': 801}, 'accuracy': 0.8857677902621723, 'macro avg': {'precision': 0.8874641510851733, 'recall': 0.8857677902621723, 'f1-score': 0.8856426229572143, 'support': 1602}, 'weighted avg': {'precision': 0.8874641510851733, 'recall': 0.8857677902621723, 'f1-score': 0.8856426229572142, 'support': 1602}}
______________________________________________________________
______________________________________________________________
NearMiss
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.6s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 0.9035532994923858, 'recall': 0.8768472906403941, 'f1-score': 0.89, 'support': 203}, 'Malicious': {'precision': 0.8803827751196173, 'recall': 0.9064039408866995, 'f1-score': 0.8932038834951458, 'support': 203}, 'accuracy': 0.8916256157635468, 'macro avg': {'precision': 0.8919680373060015, 'recall': 0.8916256157635468, 'f1-score': 0.8916019417475729, 'support': 406}, 'weighted avg': {'precision': 0.8919680373060016, 'recall': 0.8916256157635468, 'f1-score': 0.8916019417475729, 'support': 406}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 0.8598130841121495, 'recall': 0.9064039408866995, 'f1-score': 0.8824940047961631, 'support': 203}, 'Malicious': {'precision': 0.9010416666666666, 'recall': 0.8522167487684729, 'f1-score': 0.8759493670886076, 'support': 203}, 'accuracy': 0.8793103448275862, 'macro avg': {'precision': 0.8804273753894081, 'recall': 0.8793103448275862, 'f1-score': 0.8792216859423854, 'support': 406}, 'weighted avg': {'precision': 0.880427375389408, 'recall': 0.8793103448275862, 'f1-score': 0.8792216859423854, 'support': 406}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.6s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 0.9035532994923858, 'recall': 0.8768472906403941, 'f1-score': 0.89, 'support': 203}, 'Malicious': {'precision': 0.8803827751196173, 'recall': 0.9064039408866995, 'f1-score': 0.8932038834951458, 'support': 203}, 'accuracy': 0.8916256157635468, 'macro avg': {'precision': 0.8919680373060015, 'recall': 0.8916256157635468, 'f1-score': 0.8916019417475729, 'support': 406}, 'weighted avg': {'precision': 0.8919680373060016, 'recall': 0.8916256157635468, 'f1-score': 0.8916019417475729, 'support': 406}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 0.8786407766990292, 'recall': 0.8916256157635468, 'f1-score': 0.8850855745721272, 'support': 203}, 'Malicious': {'precision': 0.89, 'recall': 0.8768472906403941, 'f1-score': 0.8833746898263026, 'support': 203}, 'accuracy': 0.8842364532019704, 'macro avg': {'precision': 0.8843203883495145, 'recall': 0.8842364532019704, 'f1-score': 0.884230132199215, 'support': 406}, 'weighted avg': {'precision': 0.8843203883495145, 'recall': 0.8842364532019704, 'f1-score': 0.8842301321992149, 'support': 406}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits
-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 0.8872549019607843, 'recall': 0.8916256157635468, 'f1-score': 0.8894348894348895, 'support': 203}, 'Malicious': {'precision': 0.8910891089108911, 'recall': 0.8866995073

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.9243243243243243, 'recall': 0.8423645320197044, 'f1-score': 0.881443298969072, 'support': 203}, 'Malicious': {'precision': 0.8552036199095022, 'recall': 0.9310344827586207, 'f1-score': 0.8915094339622641, 'support': 203}, 'accuracy': 0.8866995073891626, 'macro avg': {'precision': 0.8897639721169133, 'recall': 0.8866995073891626, 'f1-score': 0.8864763664656681, 'support': 406}, 'weighted avg': {'precision': 0.8897639721169134, 'recall': 0.8866995073891626, 'f1-score': 0.8864763664656682, 'support': 406}}
______________________________________________________________
______________________________________________________________
RandomUnderSampler
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.6s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 0.863013698630137, 'recall': 0.9310344827586207, 'f1-score': 0.8957345971563981, 'support': 203}, 'Malicious': {'precision': 0.9251336898395722, 'recall': 0.8522167487684729, 'f1-score': 0.8871794871794871, 'support': 203}, 'accuracy': 0.8916256157635468, 'macro avg': {'precision': 0.8940736942348546, 'recall': 0.8916256157635467, 'f1-score': 0.8914570421679426, 'support': 406}, 'weighted avg': {'precision': 0.8940736942348546, 'recall': 0.8916256157635468, 'f1-score': 0.8914570421679426, 'support': 406}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 0.8761467889908257, 'recall': 0.9408866995073891, 'f1-score': 0.9073634204275534, 'support': 203}, 'Malicious': {'precision': 0.9361702127659575, 'recall': 0.8669950738916257, 'f1-score': 0.9002557544757034, 'support': 203}, 'accuracy': 0.9039408866995073, 'macro avg': {'precision': 0.9061585008783916, 'recall': 0.9039408866995073, 'f1-score': 0.9038095874516283, 'support': 406}, 'weighted avg': {'precision': 0.9061585008783916, 'recall': 0.9039408866995073, 'f1-score': 0.9038095874516283, 'support': 406}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.6s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 0.8867924528301887, 'recall': 0.9261083743842364, 'f1-score': 0.9060240963855422, 'support': 203}, 'Malicious': {'precision': 0.9226804123711341, 'recall': 0.8817733990147784, 'f1-score': 0.9017632241813602, 'support': 203}, 'accuracy': 0.9039408866995073, 'macro avg': {'precision': 0.9047364326006614, 'recall': 0.9039408866995073, 'f1-score': 0.9038936602834512, 'support': 406}, 'weighted avg': {'precision': 0.9047364326006615, 'recall': 0.9039408866995073, 'f1-score': 0.9038936602834511, 'support': 406}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 0.8779342723004695, 'recall': 0.9211822660098522, 'f1-score': 0.8990384615384615, 'support': 203}, 'Malicious': {'precision': 0.917098445595855, 'recall': 0.8719211822660099, 'f1-score': 0.8939393939393939, 'support': 203}, 'accuracy': 0.896551724137931, 'macro avg': {'precision': 0.8975163589481623, 'recall': 0.896551724137931, 'f1-score': 0.8964889277389276, 'support': 406}, 'weighted avg': {'precision': 0.8975163589481623, 'recall': 0.896551724137931, 'f1-score': 0.8964889277389276, 'support': 406}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits
-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 0.8768472906403941, 'recall': 0.8768472906403941, 'f1-score': 0.8768472906403941, 'support': 203}, 'Malicious': {'precision': 0.8768472906403941, 'recall': 0

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.8676470588235294, 'recall': 0.8719211822660099, 'f1-score': 0.8697788697788699, 'support': 203}, 'Malicious': {'precision': 0.8712871287128713, 'recall': 0.8669950738916257, 'f1-score': 0.8691358024691358, 'support': 203}, 'accuracy': 0.8694581280788177, 'macro avg': {'precision': 0.8694670937682003, 'recall': 0.8694581280788178, 'f1-score': 0.8694573361240028, 'support': 406}, 'weighted avg': {'precision': 0.8694670937682003, 'recall': 0.8694581280788177, 'f1-score': 0.8694573361240028, 'support': 406}}
______________________________________________________________
______________________________________________________________
SMOTEENN
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.0s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 733}, 'Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 569}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 733}, 'Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 569}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.6s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


helloooooooooooooooooooooooooooooooooooooooo
-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 733}, 'Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 569}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 733}, 'Malicious': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 569}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1302}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 1.0, 'recall': 0.9945429740791268, 'f1-score': 0.9972640218878249, 'support': 733}, 'Malicious': {'precision': 0.9930191972076788, 'recall': 1.0, 'f1-score': 0.9964973730297723, 'support': 569}, 'accuracy': 0.9969278033794163, 'macro avg': {'precision': 0.9965095986038395, 'recall': 0.9972714870395634, 'f1-score': 0.9968806974587986, 'support': 1302}, 'weighted avg': {'precision': 0.9969492497781638, 'recall': 0.9969278033794163, 'f1-score': 0.9969289810274315, 'support': 1302}}
Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.951885565669701, 'recall': 0.9986357435197817, 'f1-score': 0.9747003994673769, 'support': 733}, 'Malicious': {'precision': 0.99812382739212, 'recall': 0.9349736379613357, 'f1-score': 0.9655172413793103, 'support': 569}, 'accuracy': 0.9708141321044547, 'macro avg': {'precision': 0.9750046965309105, 'recall': 0.9668046907405587, 'f1-score': 0.9701088204233436, 'support': 1302}, 'weighted avg': {'precision': 0.9720926093871021, 'recall': 0.9708141321044547, 'f1-score': 0.9706871760018547, 'support': 1302}}
______________________________________________________________
______________________________________________________________
SMOTETomek
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.3s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
SVM
-------------------------------------------
{'Not Malicious': {'precision': 0.906519065190652, 'recall': 0.920099875156055, 'f1-score': 0.9132589838909542, 'support': 801}, 'Malicious': {'precision': 0.9188846641318125, 'recall': 0.9051186017478152, 'f1-score': 0.9119496855345912, 'support': 801}, 'accuracy': 0.9126092384519351, 'macro avg': {'precision': 0.9127018646612322, 'recall': 0.9126092384519351, 'f1-score': 0.9126043347127727, 'support': 1602}, 'weighted avg': {'precision': 0.9127018646612323, 'recall': 0.9126092384519351, 'f1-score': 0.9126043347127727, 'support': 1602}}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
DECISION_TREE
-------------------------------------------
{'Not Malicious': {'precision': 0.9023199023199023, 'recall': 0.9225967540574282, 'f1-score': 0.9123456790123456, 'support': 801}, 'Malicious': {'precision': 0.9208173690932312, 'recall': 0.9001248439450686, 'f1-score': 0.9103535353535354, 'support': 801}, 'accuracy': 0.9113607990012484, 'macro avg': {'precision': 0.9115686357065668, 'recall': 0.9113607990012484, 'f1-score': 0.9113496071829406, 'support': 1602}, 'weighted avg': {'precision': 0.9115686357065667, 'recall': 0.9113607990012484, 'f1-score': 0.9113496071829406, 'support': 1602}}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.7s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
ADABOOST
-------------------------------------------
{'Not Malicious': {'precision': 0.9067484662576687, 'recall': 0.9225967540574282, 'f1-score': 0.9146039603960398, 'support': 801}, 'Malicious': {'precision': 0.9212198221092758, 'recall': 0.9051186017478152, 'f1-score': 0.9130982367758187, 'support': 801}, 'accuracy': 0.9138576779026217, 'macro avg': {'precision': 0.9139841441834722, 'recall': 0.9138576779026217, 'f1-score': 0.9138510985859292, 'support': 1602}, 'weighted avg': {'precision': 0.9139841441834723, 'recall': 0.9138576779026217, 'f1-score': 0.9138510985859292, 'support': 1602}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.5s finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
RANDOM_FOREST
-------------------------------------------
{'Not Malicious': {'precision': 0.9067484662576687, 'recall': 0.9225967540574282, 'f1-score': 0.9146039603960398, 'support': 801}, 'Malicious': {'precision': 0.9212198221092758, 'recall': 0.9051186017478152, 'f1-score': 0.9130982367758187, 'support': 801}, 'accuracy': 0.9138576779026217, 'macro avg': {'precision': 0.9139841441834722, 'recall': 0.9138576779026217, 'f1-score': 0.9138510985859292, 'support': 1602}, 'weighted avg': {'precision': 0.9139841441834723, 'recall': 0.9138576779026217, 'f1-score': 0.9138510985859292, 'support': 1602}}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


-------------------------------------------
LOGISTIC_REGRESSION
-------------------------------------------
{'Not Malicious': {'precision': 0.8902439024390244, 'recall': 0.9113607990012484, 'f1-score': 0.9006785934608268, 'support': 801}, 'Malicious': {'precision': 0.9092071611253197, 'recall': 0.8876404494382022, 'f1-score': 0.8982943777637398, 'support': 801}, 'accuracy': 0.8995006242197253, 'macro avg': {'precision': 0.8997255317821721, 'recall': 0.8995006242197253, 'f1-score': 0.8994864856122833, 'support': 1602}, 'weighted avg': {'precision': 0.8997255317821721, 'recall': 0.8995006242197253, 'f1-score': 0.8994864856122833, 'support': 1602}}
Fitting 10 folds for each of 5 candidates, totalling 50 fits
-------------------------------------------
NAIVE_BAYES:BERNOULLINB
-------------------------------------------
{'Not Malicious': {'precision': 0.8573099415204678, 'recall': 0.9151061173533084, 'f1-score': 0.8852657004830917, 'support': 801}, 'Malicious': {'precision': 0.9089692101740

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)


In [None]:
benchmark_scores

In [None]:
!python --version

In [None]:
sampling_results = pd.DataFrame(benchmark_scores,columns=['sampling_type','model','best_score','precision','recall','f1_score','best_params'])
sampling_results.to_csv("report(with_resampling).csv")
sampling_results

Train model with weighted class

In [None]:
# lr = LogisticRegression(penalty = 'l2',class_weight="balanced")
# lr.fit(X.values,Y.values.ravel())
# scores = []
# pred_test = lr.predict(X.values)
# pred_test_probs = lr.predict_proba(X.values)
# probs = lr.decision_function(X.values)
# fpr, tpr, thresholds = roc_curve(Y.values.ravel(),pred_test)
# p,r,t = precision_recall_curve(Y.values.ravel(),probs)
# scores.append(("weighted_base",
#                            f1_score(Y.values.ravel(),pred_test),
#                            precision_score(Y.values.ravel(),pred_test),
#                            recall_score(Y.values.ravel(),pred_test),
#                            accuracy_score(Y.values.ravel(),pred_test),
#                            auc(fpr, tpr),
#                            #auc(p,r,reorder=True),
#                            confusion_matrix(Y.values.ravel(),pred_test)))

# scores = pd.DataFrame(scores,columns=['Sampling Type','f1','precision','recall','accuracy','auc_roc','confusion_matrix'])
# results = sampling_results.append(scores)
# results

# Evaluation of Best Classification Model Using GridSearchCV(without handling imbalanced data)

In [None]:
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix , classification_report

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto',class_weight='balanced'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear','poly']
        }  
    },
    'decision_tree': {
        'model': tree.DecisionTreeClassifier(class_weight='balanced'),
        'params' : {
            'criterion':["gini","entropy"],
            'max_depth':[1,2,3,4,5,6,7,None]
        }
    },
    'Adaboost':{
        'model':AdaBoostClassifier(base_estimator = tree.DecisionTreeClassifier(random_state = 11, max_features = "auto",max_depth = None,class_weight='balanced')),
        'params':{
              'base_estimator__criterion' : ["gini", "entropy"],
              'base_estimator__splitter' :   ["best", "random"],
              'n_estimators': [1, 2]
             }
    },

    'random_forest': {
        'model': RandomForestClassifier(class_weight = 'balanced'),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto',class_weight = 'balanced'),
        'params': {
            'C': [1,5,10]
        }
    },
    'Naive_Bayes:BernoulliNB':{
        'model':BernoulliNB(),
        'params':{
            'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
        }
    },
}

In [None]:

scores = []

target_names = ['Not Malicious', 'Malicious']
ConfusionMatrix={}

accuracies=[]
precisions=[]
recalls=[]
f1_scores=[]

for model_name, mp in model_params.items():
    grid =  GridSearchCV(mp['model'],param_grid= mp['params'], cv=10, return_train_score=False,n_jobs=-1)
    grid.fit(X, Y)
    y_pred=grid.best_estimator_.predict(X)
    print('-------------------------------------------')
    print(model_name.upper())  
    print('-------------------------------------------')
    report=classification_report(Y, y_pred, target_names=target_names,output_dict=True)
    print(report)
    ConfusionMatrix[model_name]  =  confusion_matrix(Y, y_pred)
    
    macro_precision =  report['macro avg']['precision'] 
    macro_recall = report['macro avg']['recall']    
    macro_f1 = report['macro avg']['f1-score']    
    accuracy = report['accuracy']

    accuracies.append(accuracy)
    precisions.append(macro_precision)
    recalls.append(macro_recall)
    f1_scores.append(macro_f1)

    # print("precision: ")
    # print(macro_precision)
    # print("recall: ")
    # print(macro_recall)
    # print("f1-score: ")
    # print(macro_f1)
    # print("accuracy: ")
    # print(accuracy)
   

    scores.append({
        'model': model_name.upper(),
        'best_score': grid.best_score_,
        'precision':macro_precision,
        'recall':macro_recall,
        'f1_score':macro_f1,
        'best_params': grid.best_params_,
        
    
    })
          

    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','best_params'])
df.to_csv("report(without_resampling).csv")
df



1.   https://www.geeksforgeeks.org/create-a-grouped-bar-plot-in-matplotlib/




In [None]:
# importing package
import matplotlib.pyplot as plt
import numpy as np
  
# '6' means 6 ML models performance are going to be shown
x = np.arange(6)
width = 0.2
  
# plot data in grouped manner of bar type
plt.bar(x-0.2, accuracies, width, color='cyan')
plt.bar(x, precisions, width, color='orange')
plt.bar(x+0.2, recalls, width, color='green')
plt.bar(x+0.4, f1_scores, width, color='red')

plt.xticks(x, ['SVM', 'DECISION_TREE', 'ADABOOST', 'RANDOM_FOREST', 'LOGISTIC_REGRESSION','NAIVE_BAYES'])
plt.xlabel("Machine Learning Models")
plt.ylabel("F-measures")
plt.legend(["Accuracy", "Precision", "Recall","F1-score"])
plt.show()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import seaborn as sn
%matplotlib inline
import matplotlib.pyplot as plt

#Confusion matrix of best classification results of the models
for model_name,matrix in ConfusionMatrix.items():
    #print(model_name)
    plt.figure(figsize = (10,7))
    sn.heatmap(matrix, annot=True)
    plt.title(model_name.upper())
    plt.xlabel('Predicted')
    plt.ylabel('Truth')


# TPOT for Automated Machine Learning

##### Reference


*   https://machinelearningmastery.com/tpot-for-automated-machine-learning-in-python/



In [None]:
pip install tpot

In [None]:
import tpot
print('tpot: %s' % tpot.__version__)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv

In [None]:

# define search
model = TPOTClassifier( generations=10, population_size=50, cv=cv, scoring='f1', verbosity=2, random_state=10, n_jobs=-1)


In [None]:
# perform the search
model.fit(X, Y)
# export the best model
model.export('tpot_sonar_best_model.py')

## AutoML using Pycaret

In [None]:
pip install pycaret

In [None]:
pip install sklearn-genetic

In [None]:
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from genetic_selection import GeneticSelectionCV

mcc = make_scorer(matthews_corrcoef)
estimator = LogisticRegression(solver = "liblinear", C = 6, tol = 1, fit_intercept = True)

from sklearn.model_selection import *
report = pd.DataFrame()
nofeats = [] 
chosen_feats = [] 
cvscore = [] 
rkf = RepeatedStratifiedKFold(n_repeats = 2, n_splits = 10)
for i in range(2,11):
  
    selector = GeneticSelectionCV(estimator,
                                cv = rkf,
                                verbose = 0,
                                scoring = mcc,
                                max_features = i,
                                n_population = 200,
                                crossover_proba = 0.5,
                                mutation_proba = 0.2,
                                n_generations = 10,
                                crossover_independent_proba=0.5,
                                mutation_independent_proba=0.05,
                                #tournament_size = 3,
                                n_gen_no_change=10,
                                caching=True,
                                n_jobs=-1)
  
    selector = selector.fit(X, Y)
    genfeats = X.columns[selector.support_]
    genfeats = list(genfeats)
    print("Chosen Feats:  ", genfeats)

cv_score = selector.generation_scores_[-1]
nofeats.append(len(genfeats)) 
chosen_feats.append(genfeats) 
cvscore.append(cv_score)





report["No of Feats"] = nofeats
report["Chosen Feats"] = chosen_feats
report["Scores"] = cvscore






In [None]:
report["Scores"] = np.round(report["Scores"], 3)
report.sort_values(by = "Scores", ascending = False, inplace = False)
#report.index
ga_feats = report.iloc[0]["Chosen Feats"]

used_feats =  ga_feats.copy()
used_feats.append('ClassLabel')

print(report)
print(ga_feats)


In [None]:
from pycaret.datasets import get_data
from pycaret.classification import *
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids,NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek



ename = setup(data = df[used_feats], target = "ClassLabel", 
              train_size=0.8,
              test_data = None,
              fold_strategy = "stratifiedkfold",
              fold_shuffle = True, 
              use_gpu = True,
              normalize = True,
              categorical_features = None,
              
              #pca = True,
              #pca_method = "kernel",
              #pca_components = 5, 
              preprocess = False,
              html = True,
              #POLYNOMIAL
              #polynomial_features = True,
              #polynomial_degree = 2, 
              #transformation
              
              #transformation = True,
              feature_selection = True, 
              feature_interaction = True,
          
              fix_imbalance = True,
              #fix_imbalance_method  = SMOTE(),
              imputation_type='simple',
              verbose = True,
              )


rskf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 20)
best_model = compare_models(sort = "MCC", round = 2, 
                            fold = rskf,
                            #include = ["rf","catboost"],
                            #n_select = 1
                            )