In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the preprocessed CSV. Set the BOTNET_DATA_CSV environment variable
# to point at your own copy; the fallback is the original author's local path,
# so behaviour is unchanged when the variable is not set.
path = os.environ.get(
    "BOTNET_DATA_CSV",
    r"C:\Users\Rajesh\OneDrive\Documents\finalpreprocessed2.csv",
)
df = pd.read_csv(path)
df.head()

Unnamed: 0,Dur,Proto,Sport,Dport,TotPkts,TotBytes,SrcBytes,Label,State
0,0.055572,1,13485,1,2,128,60,0,1
1,0.097566,1,35155,10,2,474,145,0,1
2,3598.238525,1,43565,23,876,96819,47731,0,1
3,3591.619629,1,43565,23,867,95866,47068,0,1
4,1337.931763,1,44774,23,275,28434,12099,0,1


In [3]:
# Class balance of the dataset: rows with Label == 1 are kept in `Botnet`,
# rows with Label == 0 in `Normal`.
Botnet = df[df['Label'] == 1]
Normal = df[df['Label'] == 0]

# NOTE: despite its name this is the botnet-to-normal ratio (botnet rows per
# normal row), not the botnet share of all rows. The definition is left as-is
# so the value stays the same for anything that reads it.
outlierFraction = len(Botnet) / float(len(Normal))

print('Botnet-to-normal ratio: {}'.format(outlierFraction))
print('Botnet records: {}'.format(len(Botnet)))
print('Normal records: {}'.format(len(Normal)))

0.026917408559906125


Fraud Cases: 26885
Valid Transactions: 998796


In [4]:
# Build the feature matrix X and the target vector y.
# 'State' is excluded from the features; 'Label' is the prediction target.
df1 = df.drop(columns=['State'])
X = df1.drop(columns=['Label'])
y = df1['Label']
print(X.shape)
print(y.shape)

(1025681, 7)
(1025681,)


In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [6]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same fitted transform is then applied to the test split.
# NOTE: X_train / X_test are overwritten with the scaled NumPy arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

array([[ 2.50069123e+00, -4.09003823e-01,  1.27191267e+00, ...,
        -5.33356055e-03, -4.70500385e-03, -1.65548494e-03],
       [ 2.95269579e+00, -4.09003823e-01, -6.08799721e-01, ...,
        -4.26735702e-03, -4.63198144e-03, -1.61960052e-03],
       [-4.49697919e-01, -4.09003823e-01,  8.49697198e-01, ...,
        -5.86666232e-03, -4.69118175e-03, -1.67735734e-03],
       ...,
       [-4.49677298e-01,  1.94643748e+00, -1.82357793e+00, ...,
        -4.00080614e-03, -4.22801106e-03, -1.32979114e-03],
       [-4.49697951e-01, -4.09003823e-01,  4.01702298e-01, ...,
        -5.86666232e-03, -4.71908674e-03, -1.67838261e-03],
       [-4.49698002e-01, -4.09003823e-01,  2.32898603e-01, ...,
        -5.86666232e-03, -4.71908674e-03, -1.67838261e-03]])

In [7]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with scikit-learn's default settings.
model = GaussianNB()

In [8]:
# Fit the classifier on the training split.
model.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [9]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X=X_test)

In [10]:
# Headline test-set metrics from scikit-learn's built-in scorers.
# `y_pred` is reused from the prediction cell instead of being recomputed, and
# no rounding is applied because predict() already returns class labels.
from sklearn import metrics

for metric_name in ("accuracy_score", "recall_score", "precision_score", "f1_score"):
    metric_value = getattr(metrics, metric_name)(y_test, y_pred)
    print(f"Actual {metric_name} : {metric_value: .4f}")

Actual accuracy_score :  0.9443
Actual recall_score :  0.9927
Actual precision_score :  0.3190
Actual f1_score :  0.4829


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix layout: rows are true classes, columns are predicted classes.
# Labels are already integer classes, so the former .round() calls were no-ops.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[188367  11393]
 [    39   5338]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97    199760
           1       0.32      0.99      0.48      5377

    accuracy                           0.94    205137
   macro avg       0.66      0.97      0.73    205137
weighted avg       0.98      0.94      0.96    205137



In [13]:
# Derive the four confusion-matrix cells from the predictions instead of
# hand-copying them from printed output, so they cannot go stale when the data,
# split or model changes.
# labels=[0, 1] pins a 2x2 layout, so ravel() yields (TN, FP, FN, TP) in that order.
TN, FP, FN, TP = (
    int(cell) for cell in confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
)

# Long-form aliases kept for readability and backward compatibility.
true_negatives, false_positives = TN, FP
false_negatives, true_positives = FN, TP

In [14]:
# False positive rate: share of actual negatives that were predicted positive.
FPR = FP / (TN + FP)
print("The false positive rate of this model is {:.4f}".format(FPR))

The flase positive rate of this model is0.0570


In [15]:
# True negative rate (specificity): share of actual negatives predicted negative.
TNR = TN / (TN + FP)
print("The true negative rate of this model is {:.4f}".format(TNR))

The true negative rate of this model is0.9430


In [12]:
# Evaluate the classifier on the held-out test split and print each score.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
)

# n_outliers counts Label == 1 rows in the FULL dataset, whereas n_errors counts
# misclassifications in the test split only. The prints are labelled so the two
# numbers are not read as being on the same basis.
n_outliers = len(Botnet)
n_errors = (y_pred != y_test).sum()
print("The model used is NaiveBayes classifier")
print("Botnet records in the full dataset: {}".format(n_outliers))
print("Misclassified test samples: {}".format(n_errors))

acc = accuracy_score(y_test, y_pred)
print("The accuracy is {}".format(acc))

prec = precision_score(y_test, y_pred)
print("The precision is {}".format(prec))

rec = recall_score(y_test, y_pred)
print("The recall is {}".format(rec))

f1 = f1_score(y_test, y_pred)
print("The F1-Score is {}".format(f1))

MCC = matthews_corrcoef(y_test, y_pred)
print("The Matthews correlation coefficient is {}".format(MCC))

The model used is NaiveBayes classifier
26885
11432
The accuracy is 0.944271389364181
The precision is 0.31904847289462673
The recall is 0.9927468848800446
The F1-Score is 0.4829021168807672
The Matthews correlation coefficient is0.5462089830280986
