In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
import seaborn as sb
import flask
import time
from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle

In [None]:
%matplotlib inline
plt.style.use('ggplot')

# Load the dataset from the working directory; a relative path keeps the
# notebook runnable on any machine.
full_df = pd.read_csv("shuffled-noIPV6.csv")
print(f"[*] Shape of dataset: {full_df.shape}")

# Discard the first column by position.
# NOTE(review): presumably a saved row index - confirm against the CSV.
full_df = full_df.drop(columns=full_df.columns[0])

# Shuffle the rows with a fixed seed so that the head() sample taken further
# down, and every metric derived from it, is identical on each run. The seed
# matches the random_state already used for the Random Forest.
full_df = full_df.sample(frac=1, random_state=5)

print(full_df.tail())
print("\n")

In [None]:
# Remove thousands separators and stray spaces from both port columns.
for port_column in ("src_port", "dst_port"):
    full_df[port_column] = full_df[port_column].replace(regex=r"[, ]", value="")

# Map protocol names that ended up in the port columns to numeric strings.
# NOTE(review): "tls" is only rewritten for src_port, never for dst_port -
# confirm that dst_port cannot contain it.
full_df["src_port"] = full_df["src_port"].replace(regex="dns", value="53")
full_df["src_port"] = full_df["src_port"].replace(regex="tls", value="0")
full_df["dst_port"] = full_df["dst_port"].replace(regex="dns", value="53")

# Any destination containing ":" (an IPv6-style address) is collapsed to "0".
# Raw string: "\S" inside a normal literal is an invalid escape sequence.
full_df["dst_ip"] = full_df["dst_ip"].replace(regex=r"\S*:+\S+", value="0")

In [None]:
# --- Result containers shared by the evaluation cells ------------------------

# Accuracy per model label, and the same accuracies in the order scored.
scores = dict()
scores_list = list()

# Model labels in the order they were scored.
k_value = list()

# Weighted metrics keyed by model label.
accuracy_dict = dict()
precision_dict = dict()
recall_dict = dict()
f1_dict = dict()

# NOTE(review): intended contents of these two are inferred from their names.
model_scores = dict()
cm_dict = dict()

# Headline metrics keyed by algorithm name, used by the comparison charts.
algo_accuracy = dict()
algo_precision = dict()
algo_recall = dict()
algo_f1 = dict()

def scoring_metrics(y_test, y_pred, model):
    """Score one set of predictions, record the results and print a summary.

    Computes accuracy plus weighted-average precision, recall and F1, stores
    them in the module-level result containers under ``model`` and prints
    them together with the confusion matrix.

    Args:
        y_test: True class labels (must expose ``.size``).
        y_pred: Predicted class labels aligned with ``y_test`` (must expose
            ``.size``).
        model: Key under which the results are stored; the KNN sweep passes
            the value of k as a string.

    Returns:
        None. Results are written to ``scores``, ``scores_list``, ``k_value``,
        ``accuracy_dict``, ``precision_dict``, ``recall_dict``, ``f1_dict``
        and ``cm_dict``.
    """
    class_labels = ['-', 'nmap_scan', 'port_scan', 'smtp_enumeration',
                    'sql_enumeration', 'web_enumeration']

    print(f"y_test size:{y_test.size} y_pred size:{y_pred.size}")

    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average="weighted")
    recall = metrics.recall_score(y_test, y_pred, average="weighted")
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")
    # Fixed label order so every model's matrix has the same layout.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)

    scores[model] = accuracy
    scores_list.append(accuracy)
    k_value.append(model)
    accuracy_dict[model] = accuracy
    precision_dict[model] = precision
    recall_dict[model] = recall
    f1_dict[model] = f1
    # Keep the matrix as well; it used to be printed and then discarded.
    cm_dict[model] = cm

    print(f"Confusion Matrix: {cm}")

    print(f"\n[*] Model: {model}")
    # The scores are fractions in [0, 1]; the "%" presentation type scales
    # them by 100 so the printed percent sign is truthful.
    print("[*] Precision: {:.3%}".format(precision))
    print("[*] Recall: {:.3%}".format(recall))
    print("[*] Accuracy: {:.3%}".format(accuracy))
    print("[*] F1_score: {:.3%}".format(f1))


In [None]:
# Number of rows taken from the shuffled dataset (e.g. 25000 for a larger run).
SAMPLE_ROWS = 1000

# .copy() gives df its own data, so in-place edits made to df later cannot
# write through to full_df or trigger SettingWithCopyWarning.
df = full_df.head(SAMPLE_ROWS).copy()

# Snapshot of the sample taken before any cleaning is applied.
data_orig = df.copy()

In [None]:
print("Doing")

# Matches a dotted quad (four dot-separated digit groups) anywhere in a string.
_IPV4_PATTERN = re.compile(r"\d+\.\d+\.\d+\.\d+")


def _pad_ipv4_octets(value):
    """Return ``value`` with its dot-separated parts zero-padded and joined.

    If the string form of ``value`` contains a dotted quad, the whole string
    is split on "." and every part is right-justified to three characters
    with "0" (e.g. "10.0.2.15" -> "010000002015"). Any other value is
    returned unchanged.
    """
    text = str(value)
    if _IPV4_PATTERN.search(text) is None:
        return value
    return "".join(part.rjust(3, "0") for part in text.split("."))


# Treat the literal string "None" as missing, then fill every gap with "0".
# Rebinding df (instead of inplace=True) avoids mutating a frame that may be
# a view of full_df.
df = df.replace(to_replace=["None"], value=np.nan)
clean_df = df.fillna(str(0))

# clean_x is rebuilt from the padded frame in the next cell; clean_y holds the
# target labels.
clean_x = clean_df.iloc[:, :13].values
clean_y = clean_df["category"].values
features = df.columns.values[:-1]

# Rewrite IPv4-looking values one column at a time. Assigning whole columns
# replaces the chained ``clean_df[label][index] = ...`` writes, which do not
# reach the frame when pandas copy-on-write is active, and avoids a Python
# loop over every row for every column.
for label in clean_df.columns:
    clean_df[label] = clean_df[label].map(_pad_ipv4_octets)

# Remaining non-numeric leftovers are mapped to "0" (exact-value matches).
clean_df["http_response_code"] = clean_df["http_response_code"].replace('HTTP/1.1"', value="0")
clean_df["src_ip"] = clean_df["src_ip"].replace('::1', value="0")
clean_df["dst_ip"] = clean_df["dst_ip"].replace('::1', value="0")
print("Done")

In [None]:
# The first 13 columns are the model inputs.
clean_x = clean_df.iloc[:, :13]

# Text columns that are one-hot encoded; every other column is passed through.
categorical_columns = ['Protocol', 'http_request_method', 'http_request_referrer',
                       'url_path', 'user_agent_original', 'sql_method', 'sql_query']

# scikit-learn renamed ``sparse`` to ``sparse_output`` (the old keyword was
# later removed), so try the new spelling and fall back for older releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

column_trans = make_column_transformer((one_hot_encoder, categorical_columns),
                                       remainder='passthrough')
test = column_trans.fit_transform(clean_x)

# K-Means

In [None]:
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max normalisation of the encoded feature matrix: the scaler is fitted
# on `test` and applied to it in a single call.
mms = MinMaxScaler()
data_transformed = mms.fit_transform(test)

In [None]:
# Fixed seed so the cluster assignment, the scatter plot and the crosstab are
# reproducible; 5 matches the Random Forest's random_state.
kmeans = KMeans(n_clusters=5, random_state=5)

In [None]:
# Reduce the scaled features with PCA. n_components=0.95 lets PCA choose the
# number of components while preserving 95% of the variability in the data.
pca = PCA(n_components=0.95)
reduced = pca.fit_transform(data_transformed)

# Cluster in the reduced space, then read back the per-row cluster ids and a
# copy of the cluster centres for plotting.
kmeans.fit(reduced)
label = kmeans.labels_
center = np.array(kmeans.cluster_centers_, copy=True)

In [None]:
# Scatter plot of the clusters on the first two PCA components.

# savefig does not create missing folders, so make sure the output one exists.
os.makedirs('static', exist_ok=True)

plt.figure(figsize=(8, 8))
uniq = np.unique(label)
for cluster_id in uniq:
    members = label == cluster_id
    plt.scatter(reduced[members, 0], reduced[members, 1], label=cluster_id)
# Cluster centres drawn as black stars on top of the points.
plt.scatter(center[:, 0], center[:, 1], marker="*", c='black', s=250)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('K-Means clusters (PCA projection)')
plt.legend()
plt.savefig('static/kmeans_scatterplot.png')
plt.show()

# Table

In [None]:
# Relationship between cluster and category: attach each row's cluster id to
# the untouched sample and count rows per (category, cluster) pair.
data_orig = data_orig.assign(label_=kmeans.labels_)
ct = pd.crosstab(index=data_orig['category'], columns=data_orig['label_'])
print(ct)

In [None]:
# import sklearn
# print(sklearn.show_versions())

# Elbow

In [None]:
# Elbow method. WCSS is the sum of squared distances between each point and
# the centroid of its cluster, computed here for k = 1..14.
k_range = range(1, 15)
wcss = [
    # Fixed seed (same as the Random Forest) so the curve is reproducible.
    KMeans(n_clusters=n_clusters, random_state=5).fit(data_transformed).inertia_
    for n_clusters in k_range
]

plt.figure(figsize=(10, 10))
plt.plot(k_range, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow method')

kn = KneeLocator(list(k_range), wcss, curve='convex', direction='decreasing')
# KneeLocator leaves .knee as None when it finds no elbow; drawing a line at
# None would fail, so only mark the elbow when there is one.
if kn.knee is not None:
    y_low, y_high = plt.ylim()
    plt.vlines(kn.knee, y_low, y_high, linestyles='dashed')
else:
    print("[*] No elbow detected in the WCSS curve")
plt.show()

# KNN

In [None]:
# 70/30 split. The fixed seed (same as the Random Forest's random_state) keeps
# the partition, and so every score below, identical between runs.
x_train, x_test, y_train, y_test = train_test_split(test, clean_y, test_size=0.3, random_state=5)

In [None]:
# Accuracy containers are reset for this sweep; the timing dictionaries are
# keyed by k.
scores = {}
scores_list = []
ktrainingtime = {}
kpredictiontime = {}

# Candidate neighbourhood sizes: 100, 111, 122, 133, 144, 155.
range_k = range(100, 160, 11)

for k in range_k:
    knn = KNeighborsClassifier(n_neighbors=k)

    # perf_counter is the high-resolution timer meant for measuring short
    # durations; time.time() can report 0.0 for a sub-millisecond fit.
    fit_started = time.perf_counter()
    knn.fit(x_train, y_train)
    ktrainingtime[k] = time.perf_counter() - fit_started

    predict_started = time.perf_counter()
    y_pred = knn.predict(x_test)
    kpredictiontime[k] = time.perf_counter() - predict_started

    # Results are stored under the string form of k.
    scoring_metrics(y_test, y_pred, f"{k}")

print(ktrainingtime)
print(kpredictiontime)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Elapsed processing times in seconds, keyed by algorithm name.
algotrainingtime = {}
algopredictiontime = {}

clf = RandomForestClassifier(
    criterion="gini",
    min_samples_split=20,
    min_samples_leaf=6,
    max_depth=100,
    n_estimators=500,
    random_state=5,  # any fixed number; keeps the forest reproducible
)

# perf_counter is the high-resolution timer meant for measuring durations.
start_time = time.perf_counter()
clf.fit(x_train, y_train)
rftrainingtimetaken = time.perf_counter() - start_time
algotrainingtime["Random Forest"] = rftrainingtimetaken

start_time = time.perf_counter()
y_pred = clf.predict(x_test)
rfpredictiontimetaken = time.perf_counter() - start_time
algopredictiontime["Random Forest"] = rfpredictiontimetaken

# Processing Time Comparison [RF & KNN]

In [None]:
# Add KNN's timings for the chosen k (133) to the per-algorithm dictionaries.
algotrainingtime["KNN"] = ktrainingtime[133]
algopredictiontime["KNN"] = kpredictiontime[133]

# Grouped bar chart: one group per algorithm, one bar per phase.
plt.title('Processing Time')
plt.xlabel('ML Algorithm')
plt.ylabel('Time (seconds)')

# Read both dictionaries through the same list of names so the two bar
# series always line up.
algorithms = list(algotrainingtime)
trainingtime = [algotrainingtime[name] for name in algorithms]
predictiontime = [algopredictiontime[name] for name in algorithms]

x_axis = np.arange(len(algorithms))
# The offset between the two series equals the bar width, so the bars sit
# side by side instead of overlapping.
width = 0.3
plt.bar(x_axis, trainingtime, color='b', width=width, edgecolor='black', label='Training Time')
plt.bar(x_axis + width, predictiontime, color='g', width=width, edgecolor='black', label='Prediction Time')

# Centre each tick under its pair of bars.
plt.xticks(x_axis + width / 2, algorithms)
plt.legend(loc="upper right")
plt.savefig('static/ProcessingTime_Comparison.png')
plt.show()

# Random Forest's Confusion Matrix

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, f1_score,recall_score
from sklearn.metrics import confusion_matrix

## ==== CONFUSION MATRIX ====
class_names = ['-','nmap_scan', 'port_scan', 'smtp_enumeration', 'sql_enumeration', 'web_enumeration']

# labels= pins the matrix to one row/column per entry of class_names, in that
# order, even when a class is missing from y_test or y_pred, so the tick
# labels below always describe the right cells.
matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Row-normalise so each row shows how one true class was distributed over the
# predictions. Classes with no true samples keep an all-zero row rather than
# producing NaN from a division by zero.
row_totals = matrix.sum(axis=1, keepdims=True)
matrix = np.divide(matrix.astype('float'), row_totals,
                   out=np.zeros(matrix.shape, dtype=float),
                   where=row_totals != 0)

# Build the plot. Letting the heatmap place the tick labels centres them on
# the cells for both axes.
plt.figure(figsize=(16,7))
sb.set(font_scale=1.4)
sb.heatmap(matrix, annot=True, annot_kws={'size':10},
           cmap=plt.cm.Greens, linewidths=0.2,
           xticklabels=class_names, yticklabels=class_names)

plt.xticks(rotation=25)
plt.yticks(rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

# Headline metrics for the algorithm comparison charts.
RF_accuracy = accuracy_score(y_test, y_pred)
algo_accuracy["Random Forest"] = RF_accuracy

RF_precision = precision_score(y_test, y_pred, average="weighted")
algo_precision["Random Forest"] = RF_precision

R1_f1_score = f1_score(y_test, y_pred, average="weighted")
algo_f1["Random Forest"] = R1_f1_score

R1_recall = recall_score(y_test, y_pred, average="weighted")
algo_recall["Random Forest"] = R1_recall

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print("Classification Report \n" , classification_report(y_test, y_pred, labels=class_names, output_dict=True))

# KNN's Confusion Matrix

In [None]:
categories = ['-','nmap_scan', 'port_scan', 'smtp_enumeration', 'sql_enumeration', 'web_enumeration']

# By this point y_pred holds the Random Forest's predictions, so KNN is
# refitted at the chosen k and predicts again; otherwise this figure would
# show the Random Forest under a KNN title.
best_k = 133
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(x_train, y_train)
knn_y_pred = knn_best.predict(x_test)

# A separate name keeps the imported confusion_matrix function usable, and
# labels= fixes the row/column order to match the tick labels.
knn_cm = metrics.confusion_matrix(y_test, knn_y_pred, labels=categories)

fig, ax = plt.subplots(figsize=(11, 9))
sb.heatmap(knn_cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=categories, yticklabels=categories, ax=ax)
# Rotate after drawing: the heatmap sets the tick labels itself.
ax.tick_params(axis='x', labelrotation=45)
plt.title('KNN Confusion Matrix')
plt.savefig('static/KNN_ConfusionMatrix.png')
plt.show()

# KNN's optimal value

In [None]:
# Line chart of the KNN metrics against k.
plt.title('Optimal K')
plt.xlabel('K value')

# x-axis: the k labels in the order they were scored.
k_value = [*accuracy_dict]

# Record the accuracy at the chosen k (133) for the algorithm comparison.
algo_accuracy.update(KNN=accuracy_dict['133'])

# (legend label, score dictionary, line style) for each curve.
metric_curves = (
    ("Accuracy", accuracy_dict, "-"),
    ("Precision", precision_dict, "--"),
    ("F1 Score", f1_dict, "-."),
)
for curve_label, curve_scores, curve_style in metric_curves:
    plt.plot(k_value, [*curve_scores.values()], label=curve_label, linestyle=curve_style)

plt.legend(loc="upper right")
plt.savefig('static/KNN_graph.png')
plt.show()

# Accuracy & Precision Comparison (RF & KNN)

In [None]:
plt.title('Accuracy and Precision Comparison')
plt.xlabel('ML Algorithm')
plt.ylabel('Score')

# KNN is represented by its scores at the chosen k (133).
algo_accuracy["KNN"] = accuracy_dict['133']
algo_precision['KNN'] = precision_dict['133']

# Read both dictionaries through the same list of names so the two bar
# series always line up.
algorithms = list(algo_accuracy)
accuracy = [algo_accuracy[name] for name in algorithms]
precision = [algo_precision[name] for name in algorithms]
x_axis = np.arange(len(algorithms))
# The offset between the two series equals the bar width, so the bars sit
# side by side instead of overlapping.
width = 0.3

# Each group on the x-axis is an algorithm and each series is a metric, so
# the legend names the metrics and the ticks name the algorithms.
plt.bar(x_axis, accuracy, color='b', width=width, edgecolor='black', label='Accuracy')
plt.bar(x_axis + width, precision, color='g', width=width, edgecolor='black', label='Precision')

plt.xticks(x_axis + width / 2, algorithms)
plt.legend(loc='lower right')
plt.savefig('static/AnP_Comparison.png')
plt.show()

# Accuracy Comparison alone (TODO: decide whether this chart is needed)

In [None]:
# One bar per algorithm, bar height = accuracy.
# KNN is represented by its accuracy at the chosen k (133).
algo_accuracy.update(KNN=accuracy_dict['133'])
x_axis, accuracy = algo_accuracy.keys(), algo_accuracy.values()

plt.title('Accuracy')
plt.xlabel('ML Algorithm')
plt.bar(x_axis, accuracy, color=['b', 'g'], edgecolor='black')
plt.savefig('static/Accuracy_comparison.png')
plt.show()

# Precision Comparison alone (TODO: decide whether this chart is needed)

In [None]:
# One bar per algorithm, bar height = weighted precision.
plt.title('Precision Score')
plt.xlabel('ML Algorithm')

# KNN is represented by its precision at the chosen k (133).
algo_precision["KNN"] = precision_dict['133']
precisionscores = algo_precision.values()

x_axis = algo_precision.keys()

# The series label now names the metric that is plotted; it previously read
# "F1 Scores", copied from the F1 chart.
plt.bar(x_axis, precisionscores, edgecolor='black', label="Precision", color=['b', 'g'])

plt.savefig('static/precision_comparison.png')
plt.show()

# F1_Score Comparison

In [None]:
# One bar per algorithm, bar height = weighted F1.
# KNN is represented by its F1 at the chosen k (133).
algo_f1.update(KNN=f1_dict['133'])
f1_scores = algo_f1.values()
x_axis = algo_f1.keys()

# Echo the plotted values and their algorithm names.
print(f1_scores)
print(x_axis)

plt.title('F1 Score')
plt.xlabel('ML Algorithm')
plt.bar(x_axis, f1_scores, color=['b', 'g'], label="F1 Scores", edgecolor='black')
plt.savefig('static/F1_comparison.png')
plt.show()

# Recall Score Comparison

In [None]:
# One bar per algorithm, bar height = weighted recall.
# KNN is represented by its recall at the chosen k (133).
algo_recall.update(KNN=recall_dict['133'])
recall = algo_recall.values()
x_axis = algo_recall.keys()

# Echo the plotted values and their algorithm names.
print(recall)
print(x_axis)

plt.title('Recall Score')
plt.xlabel('ML Algorithm')
plt.bar(x_axis, recall, color=['b', 'g'], label="recall", edgecolor='black')
plt.savefig('static/Recall_comparison.png')
plt.show()

In [None]:
import io
import random
import matplotlib.pyplot as plt
from flask import Flask, render_template, Response, url_for
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure

# Flask application object for this notebook.
app = Flask(__name__)


@app.route("/")
def home():
    """Handle requests to the site root by rendering the ``help.html`` template."""
    landing_page = render_template("help.html")
    return landing_page

In [None]:
# Start the Flask application with its default run settings.
# NOTE(review): this call presumably keeps the cell busy until the server is
# stopped - confirm before adding cells after it.
app.run()