In [3]:
# importing relevant libraries
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import seaborn as sns
import warnings
import imblearn
warnings.filterwarnings('ignore')


## Load Data

In [9]:
# Dataset field names
data_columns = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack", "last_flag"]

# Load NSL_KDD train dataset
kdd_train = pd.read_table("NSL_KDD_dataset/KDDTrain.txt", sep=",", names=data_columns) # change path to where the dataset is located.
kdd_train = kdd_train.iloc[:,:-1] # removes an unwanted extra field

# Load NSL_KDD test dataset
kdd_test = pd.read_table("NSL_KDD_dataset/KDDTest.txt", sep=",", names=data_columns)
kdd_test = kdd_test.iloc[:,:-1]

### Train dataset

In [10]:
kdd_train.shape

(125973, 42)

In [11]:
# View train data
kdd_train.head(5)



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [5]:
kdd_train.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.000198,0.022687,0.000111,0.204409,0.001222,0.395736,0.27925,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
std,2604.51531,5870331.0,4021269.0,0.014086,0.25353,0.014366,2.149968,0.045239,0.48901,23.942042,...,99.206213,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Test dataset

In [6]:
# View test data
kdd_test.head(5)



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan


In [7]:
kdd_test.shape

(22544, 42)

## Data Preprocessing

### Map attack field to attack classes.
NSL-KDD dataset has 42 attributes for each connection record. Attribute attack types is mapped into following four attack classes:
1. Denial of service(DoS).
2. Probing attack(Probe).
3. User to root attack(U2R).
4. Remote to local attack(R2L).

In [8]:
mapping = {'loadmodule': 'U2R','rootkit': 'U2R','buffer_overflow': 'U2R','xterm': 'U2R','ps': 'U2R',
        'sqlattack': 'U2R','httptunnel': 'U2R','ipsweep': 'Probe','satan': 'Probe','nmap': 'Probe','portsweep': 'Probe','saint': 'Probe','mscan': 'Probe',
        'teardrop': 'DoS','pod': 'DoS','land': 'DoS','back': 'DoS','neptune': 'DoS','smurf': 'DoS','mailbomb': 'DoS',
        'udpstorm': 'DoS','apache2': 'DoS','processtable': 'DoS','snmpguess': 'R2L','worm': 'R2L','snmpgetattack': 'R2L',
        'xsnoop': 'R2L','xlock': 'R2L','sendmail': 'R2L','perl': 'U2R','ftp_write': 'R2L','phf': 'R2L','guess_passwd': 'R2L','warezmaster': 'R2L','warezclient': 'R2L','imap': 'R2L',
        'spy': 'R2L','multihop': 'R2L','named': 'R2L','normal': 'Normal'}

In [9]:
# Apply attack class mappings to the dataset
kdd_train['attack_class'] = kdd_train['attack'].apply(lambda x: mapping[x])
kdd_test['attack_class'] = kdd_test['attack'].apply(lambda x: mapping[x])

In [10]:
# Drop attack field from both train and test data
kdd_train.drop(['attack'], axis=1, inplace=True)
kdd_test.drop(['attack'], axis=1, inplace=True)

In [11]:
kdd_train.shape

(125973, 42)

In [12]:
# View top 3 train data 
kdd_train.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,Normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,Normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,Normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal


### Exploratory Data Analysis

In [13]:
# Descriptive statistics
kdd_train.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.000198,0.022687,0.000111,0.204409,0.001222,0.395736,0.27925,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
std,2604.51531,5870331.0,4021269.0,0.014086,0.25353,0.014366,2.149968,0.045239,0.48901,23.942042,...,99.206213,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
kdd_train['num_outbound_cmds'].value_counts()


0    125973
Name: num_outbound_cmds, dtype: int64

In [15]:
kdd_test['num_outbound_cmds'].value_counts()

0    22544
Name: num_outbound_cmds, dtype: int64

In [16]:
# 'num_outbound_cmds' field has all 0 values. Hence, it will be removed from both train and test dataset.
kdd_train.drop(['num_outbound_cmds'], axis=1, inplace=True)
kdd_test.drop(['num_outbound_cmds'], axis=1, inplace=True)

In [17]:
# Attack Class Distribution
attack_class_train = kdd_train[['attack_class']].apply(lambda x: x.value_counts())
attack_class_train['frequency_per_train'] = round((100 * attack_class_train / attack_class_train.sum()),2)


In [18]:
attack_class_test = kdd_test[['attack_class']].apply(lambda x: x.value_counts())
attack_class_test['frequency_per_test'] = round((100 * attack_class_test / attack_class_test.sum()),2)


In [19]:
attack_class_distribution = pd.concat([attack_class_train,attack_class_test], axis=1) 
attack_class_distribution

Unnamed: 0,attack_class,frequency_per_train,attack_class.1,frequency_per_test
DoS,45927,36.46,7458,33.08
Normal,67343,53.46,9711,43.08
Probe,11656,9.25,2421,10.74
R2L,995,0.79,2754,12.22
U2R,52,0.04,200,0.89


In [20]:
kdd_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,Normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,Normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,Normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal


In [21]:
kdd_train.shape

(125973, 41)

### Scaling Numerical Attributes

In [22]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

# extracting numerical attributes and scale it to have zero mean and unit variance  
column = kdd_train.select_dtypes(include=['float64','int64']).columns
scale_train = scale.fit_transform(kdd_train.select_dtypes(include=['float64','int64']))
scale_test = scale.fit_transform(kdd_test.select_dtypes(include=['float64','int64']))

# turn the result back to  dataframe
scale_traindf = pd.DataFrame(scale_train, columns = column)
scale_testdf = pd.DataFrame(scale_test, columns = column)

In [23]:
scale_traindf.shape

(125973, 37)

In [24]:
scale_testdf.shape

(22544, 37)

### Encoding of Categorical Attributes

In [25]:
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()

# extracting categorical attributes from both training and test sets 
cat_train = kdd_train.select_dtypes(include=['object']).copy()
cat_test = kdd_test.select_dtypes(include=['object']).copy()

# encoding the categorical attributes
train_cat = cat_train.apply(encode.fit_transform)
test_cat = cat_test.apply(encode.fit_transform)

# separating target column from encoded data 
encode_train = train_cat.drop(['attack_class'], axis=1)
encode_test = test_cat.drop(['attack_class'], axis=1)

cate_Ytrain = train_cat[['attack_class']].copy()
cate_Ytest = test_cat[['attack_class']].copy()

### Data Sampling

In [26]:
from imblearn.over_sampling import RandomOverSampler 
from collections import Counter

# define columns and extracting encoded train set for sampling 
scale_traindf = kdd_train.select_dtypes(include=['float64','int64'])
ref_classcol = pd.concat([scale_traindf, encode_train], axis=1).columns
ref_class = np.concatenate((scale_train, encode_train.values), axis=1)
X = ref_class
s, r = cate_Ytest.values.shape
y_test = cate_Ytest.values.reshape(s,)
s, r = cate_Ytrain.values.shape
y = cate_Ytrain.values.reshape(s,)

# apply the random over-sampling
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_sample(X, y)
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))

Original dataset shape Counter({1: 67343, 0: 45927, 2: 11656, 3: 995, 4: 52})
Resampled dataset shape Counter({1: 67343, 0: 67343, 3: 67343, 2: 67343, 4: 67343})


### Feature Selection

In [27]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier();

# fit random forest classifier on the training set
random_forest_classifier.fit(X_res, y_res);
# extract important features
score = np.round(random_forest_classifier.feature_importances_,3)
importances = pd.DataFrame({'feature':ref_classcol,'importance':score})
importances = importances.sort_values('importance',ascending=False).set_index('feature')


In [28]:
from sklearn.feature_selection import RFE
import itertools
random_forest_classifier = RandomForestClassifier()

# create the RFE model.
rfe = RFE(random_forest_classifier, n_features_to_select=10)
rfe = rfe.fit(X_res, y_res)
feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), ref_classcol)]
selected_features = [v for i, v in feature_map if i==True]

In [29]:
selected_features

['duration',
 'src_bytes',
 'dst_bytes',
 'logged_in',
 'count',
 'srv_count',
 'dst_host_srv_count',
 'dst_host_diff_srv_rate',
 'dst_host_serror_rate',
 'service']

### Dataset  Partition

In [30]:
# define columns to new dataframe
new_cols = list(ref_classcol)
new_cols.append('attack_class')

# add a dimension to target
new_re = y_res[:, np.newaxis]

# create a dataframe from sampled data
res_arr = np.concatenate((X_res, new_re), axis=1)
res_df = pd.DataFrame(res_arr, columns = new_cols) 

# create test dataframe
ref_test = pd.concat([scale_testdf, test_cat], axis=1)
ref_test['attack_class'] = ref_test['attack_class'].astype(np.float64)
ref_test['protocol_type'] = ref_test['protocol_type'].astype(np.float64)
ref_test['flag'] = ref_test['flag'].astype(np.float64)
ref_test['service'] = ref_test['service'].astype(np.float64)

res_df.shape


(336715, 41)

In [31]:
ref_test.shape

(22544, 41)

In [32]:
from collections import defaultdict
class_dict = defaultdict(list)

# create two-target classes (normal class and an attack class)  
attacklist = [('DoS', 0.0), ('Probe', 2.0), ('R2L', 3.0), ('U2R', 4.0)]
normalclass = [('Normal', 1.0)]

def create_classdict():
     for j, k in normalclass: 
        for i, v in attacklist: 
            restrain_set = res_df.loc[(res_df['attack_class'] == k) | (res_df['attack_class'] == v)]
            class_dict[j +'_' + i].append(restrain_set)
            # test labels
            ref_test_set = ref_test.loc[(ref_test['attack_class'] == k) | (ref_test['attack_class'] == v)]
            class_dict[j +'_' + i].append(ref_test_set)
        
create_classdict()

In [33]:
for l, m in class_dict.items():
    l

In [34]:
pretrain = class_dict['Normal_DoS'][0]
pretest = class_dict['Normal_DoS'][1]
grpclass = 'Normal_DoS'

### one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
encode = OneHotEncoder()

X_resdf = pretrain 
new_test = pretest

X_resdfnew = X_resdf[selected_features]
X_resdfnum = X_resdfnew.drop(['service'], axis=1)
X_resdfcat = X_resdfnew[['service']].copy()

Xtest_features = new_test[selected_features]
X_testdfnum = Xtest_features.drop(['service'], axis=1)
X_testcat = Xtest_features[['service']].copy()


# Fit train data
encode.fit(X_resdfcat)

# Transform train data
X_train_hotenc = encode.transform(X_resdfcat).toarray()
       
# Transform test data
X_test_hotenc = encode.transform(X_testcat).toarray()

X_train = np.concatenate((X_resdfnum.values, X_train_hotenc), axis=1)
X_test = np.concatenate((X_testdfnum.values, X_test_hotenc), axis=1) 

y_train = X_resdf[['attack_class']].copy()
s, r = y_train.values.shape
Y_train = y_train.values.reshape(s,)

y_test = new_test[['attack_class']].copy()
s, r = y_test.values.shape
Y_test = y_test.values.reshape(s,)

## Train Models

In [42]:
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB 
from sklearn import tree
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Train KNeighborsClassifier Model
KNN_Classifier_model = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier_model.fit(X_train, Y_train); 

# Train LogisticRegression Model
LGR_Classifier_model = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier_model.fit(X_train, Y_train);

# Train Gaussian Naive Baye Model
BNB_Classifier_model = BernoulliNB()
BNB_Classifier_model.fit(X_train, Y_train)
            
# Train Decision Tree Model
DTC_Classifier_model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier_model.fit(X_train, Y_train);
   


## Evaluate Models

In [43]:
from sklearn import metrics

models = []
models.append(('KNeighborsClassifier', KNN_Classifier_model))
models.append(('Naive Baye Classifier', BNB_Classifier_model))
models.append(('Decision Tree Classifier', DTC_Classifier_model))
models.append(('LogisticRegression', LGR_Classifier_model))
for i, v in models:
    scores = cross_val_score(v, X_train, Y_train, cv=10)
    accuracy = metrics.accuracy_score(Y_train, v.predict(X_train))
    confusion_matrix = metrics.confusion_matrix(Y_train, v.predict(X_train))
    classification = metrics.classification_report(Y_train, v.predict(X_train))
    print('{} Model evaluation'.format(i))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()

KNeighborsClassifier Model evaluation

Cross Validation Mean Score:
 0.9957456691866492

Model Accuracy:
 0.9976092541169832

Confusion matrix:
 [[67243   100]
 [  222 67121]]

Classification report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     67343
         1.0       1.00      1.00      1.00     67343

   micro avg       1.00      1.00      1.00    134686
   macro avg       1.00      1.00      1.00    134686
weighted avg       1.00      1.00      1.00    134686


Naive Baye Classifier Model evaluation

Cross Validation Mean Score:
 0.9744962636833462

Model Accuracy:
 0.9744962356889357

Confusion matrix:
 [[65346  1997]
 [ 1438 65905]]

Classification report:
               precision    recall  f1-score   support

         0.0       0.98      0.97      0.97     67343
         1.0       0.97      0.98      0.97     67343

   micro avg       0.97      0.97      0.97    134686
   macro avg       0.97      0.97      0.97    1346

## Test Models

In [44]:
for i, v in models:
    accuracy = metrics.accuracy_score(Y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(Y_test, v.predict(X_test))
    classification = metrics.classification_report(Y_test, v.predict(X_test))
    print('{} Model Test Results '.format(i))
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print("Confusion matrix:" "\n", confusion_matrix)
    print("Classification report:" "\n", classification) 
          


KNeighborsClassifier Model Test Results 

Model Accuracy:
 0.8257324247189702
Confusion matrix:
 [[5085 2373]
 [ 619 9092]]
Classification report:
               precision    recall  f1-score   support

         0.0       0.89      0.68      0.77      7458
         1.0       0.79      0.94      0.86      9711

   micro avg       0.83      0.83      0.83     17169
   macro avg       0.84      0.81      0.82     17169
weighted avg       0.84      0.83      0.82     17169

Naive Baye Classifier Model Test Results 

Model Accuracy:
 0.8106470965111539
Confusion matrix:
 [[5081 2377]
 [ 874 8837]]
Classification report:
               precision    recall  f1-score   support

         0.0       0.85      0.68      0.76      7458
         1.0       0.79      0.91      0.84      9711

   micro avg       0.81      0.81      0.81     17169
   macro avg       0.82      0.80      0.80     17169
weighted avg       0.82      0.81      0.81     17169

Decision Tree Classifier Model Test Results 

Mod