# Deep Learning Based System For Web Attack Detection

# Importing essential libraries

In [1]:
# Importing essential libraries
import numpy as np

import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

# Accessing dataset

In [2]:
db = pd.read_csv('dataset.csv')

In [None]:
db.head()

In [None]:
db.shape

In [None]:
db.columns

# Cleaning and formating data's column names and, removing non-numeric columns

In [3]:
# dataset.columns = dataset.columns.str.strip()
# dataset.columns = dataset.columns.str.lower()

db.columns = db.columns.\
    str.strip().\
    str.lower()

In [4]:
db = db.select_dtypes(['number'])

In [5]:
db.head()

Unnamed: 0,url_length,number_special_characters,content_length,tcp_conversation_exchange,dist_remote_tcp_port,remote_ips,app_bytes,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,app_packets,dns_query_times,type
0,16,7,263.0,7,0,2,700,9,10,1153,832,9,2.0,1
1,16,6,15087.0,17,7,4,1230,17,19,1265,1230,17,0.0,0
2,16,6,324.0,0,0,0,0,0,0,0,0,0,0.0,0
3,17,6,162.0,31,22,3,3812,39,37,18784,4380,39,8.0,0
4,17,6,124140.0,57,2,5,4278,61,62,129889,4586,61,4.0,0


In [None]:
db.columns

# Extracting dependent and independent variable

In [6]:
# Feature extraction

x = db.drop(['type'], axis = 1)
y = db['type']

for cols in x.columns:
    print(cols)
    
# Replace nan with zero and inf with finite numbers
x = np.nan_to_num(x)


url_length
number_special_characters
content_length
tcp_conversation_exchange
dist_remote_tcp_port
remote_ips
app_bytes
source_app_packets
remote_app_packets
source_app_bytes
remote_app_bytes
app_packets
dns_query_times


In [None]:
print(x)
print(y)

# Input data visualization

In [7]:
# Splitting the data into Training and Testing datasets

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/3, random_state = 44)

In [8]:
# Applying Logistic Regression

classifier = LogisticRegression(random_state = 42, max_iter = 50000)
classifier.fit(x_train, y_train)

prediction = classifier.predict(x_test)

In [9]:
# Calculate accuracy for logistic regression

print('For Logistic Regression accuracy score is ', accuracy_score(y_test, prediction))
print('For Logistic Regression confusion_matrix is: \n\n', confusion_matrix(y_test, prediction))
print('For Logistic Regression Classification Report: \n\n ', classification_report(y_test, prediction))

For Logistic Regression accuracy score is  0.9175084175084175
For Logistic Regression confusion_matrix is: 

 [[504  10]
 [ 39  41]]
For Logistic Regression Classification Report: 

                precision    recall  f1-score   support

           0       0.93      0.98      0.95       514
           1       0.80      0.51      0.63        80

    accuracy                           0.92       594
   macro avg       0.87      0.75      0.79       594
weighted avg       0.91      0.92      0.91       594



In [10]:
# Applying KNN

classifier2=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
classifier2.fit(x_train,y_train)

prediction2=classifier2.predict(x_test)

In [11]:
# Calculating accuracy for KNN

print('For KNN accuracy score is ', accuracy_score(y_test, prediction2))
print('For  KNN confusion_matrix is: \n\n', confusion_matrix(y_test, prediction2))
print ('For  KNN Classification Report: \n\n', classification_report(y_test,prediction2))

For KNN accuracy score is  0.9191919191919192
For  KNN confusion_matrix is: 

 [[497  17]
 [ 31  49]]
For  KNN Classification Report: 

               precision    recall  f1-score   support

           0       0.94      0.97      0.95       514
           1       0.74      0.61      0.67        80

    accuracy                           0.92       594
   macro avg       0.84      0.79      0.81       594
weighted avg       0.91      0.92      0.92       594



In [12]:
# Applying Random forest

rfc=RandomForestClassifier(n_estimators = 100)
rfc.fit(x_train, y_train)

prediction3 = rfc.predict(x_test)

In [13]:
# Calculating accuracy for random forest
#Classification Report

print('For Random Forest accuracy score is ', accuracy_score(y_test,prediction3))
print('For Random Forest confusion_matrix is: \n\n', confusion_matrix(y_test,prediction3))
print ('For Random Forest Classification Report: \n\n', classification_report(y_test,prediction3))

For Random Forest accuracy score is  0.9579124579124579
For Random Forest confusion_matrix is: 

 [[507   7]
 [ 18  62]]
For Random Forest Classification Report: 

               precision    recall  f1-score   support

           0       0.97      0.99      0.98       514
           1       0.90      0.78      0.83        80

    accuracy                           0.96       594
   macro avg       0.93      0.88      0.90       594
weighted avg       0.96      0.96      0.96       594



In [14]:
# Logistic Regression - 92%
# KNN - 92%
# Random forest accuracy - 96%

In [16]:
ip_data = (16, 7, 263, 7, 0, 2, 700, 9, 10, 1153, 832, 9, 2) #1
#ip_data = (17,6,162,31,22,3,3812,39,37,18784,4380,39,8) #0


ip_data_np = np.asarray(ip_data)
ip_data_reshape = ip_data_np.reshape(1, -1)

# Assuming rfc is your trained RandomForestClassifier
pred = rfc.predict(ip_data_reshape)
print(pred)

if pred[0] == 0:
    print("Url attack on the website.")
else:
    print("No Url attack on the website.")


[1]
No Url attack on the website.


In [17]:
import pickle

In [18]:
filename = "url_attack.sav"
pickle.dump(rfc, open(filename, "wb"))

In [19]:
#ip_data = (16, 7, 263, 7, 0, 2, 700, 9, 10, 1153, 832, 9, 2) #1
ip_data = (17,6,162,31,22,3,3812,39,37,18784,4380,39,8) #0

ip_data_np = np.asarray(ip_data)
ip_data_reshape = ip_data_np.reshape(1, -1)

prediction = rfc.predict(ip_data_reshape)
print(prediction)

if (prediction[0] == 0.0):
    print("Url attack on the website.")
else:
    print("No Url attack on the website.")

[0]
Url attack on the website.
