In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset CSV is read from this mount later on
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Import required libraries
import nltk
import pickle
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.models import Sequential, load_model
#from data_sanitize import clean_data
from sklearn import tree
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


# download the NLTK stopwords corpus; its English list is passed to CountVectorizer as stop_words
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import chardet

# Sniff the CSV's text encoding from its raw bytes; the reported value is
# what has to be handed to pd.read_csv as `encoding`
with open('/content/drive/MyDrive/Colab Notebooks/sqli.csv', 'rb') as csv_bytes:
    result = chardet.detect(csv_bytes.read())

print(result['encoding'])

UTF-16


In [None]:
# Load the SQL-injection data set from the mounted drive.
# The encoding is hard-coded to the value chardet printed for this file (UTF-16).
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sqli.csv', encoding= 'UTF-16')


In [None]:
# Vectorize the dataset with a bag-of-words model.
# max_features is 4096 (= 64 * 64) so every row can later be viewed as a 64x64 grid.
english_stopwords = nltk.corpus.stopwords.words('english')
vectorizer = CountVectorizer(
    min_df = 2,
    max_df = 0.7,
    max_features = 4096,
    stop_words = english_stopwords,
)

# initialize variables X and y
sentences = df['Sentence'].values.astype('U')  # force every entry to a unicode string
X = vectorizer.fit_transform(sentences).toarray()
y = df['Label']

X.shape


(4200, 4096)

In [None]:
# Reshape every 4096-feature row into a 64x64 single-channel "image" for the CNN.
# The sample count is taken from X itself instead of being hard-coded to 4200, so the
# cell keeps working when the data set size changes; reshape() is used because NumPy
# discourages assigning to `.shape` in place. A feature count other than 4096 still
# raises, which is the desired failure mode.
X = X.reshape(X.shape[0], 64, 64, 1)

In [None]:
# hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# The classical classifiers need 2-D input, so build flattened copies
# (samples, rows * cols) and leave the 4-D arrays untouched for the CNN.
trained = X_train.copy().reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2])
tested = X_test.copy().reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2])

In [None]:
# define classifiers for the research: each one is fitted on the flattened
# training matrix and then used to predict labels for the flattened test matrix

# Naive Bayes
nb = GaussianNB().fit(trained, y_train)
nb_prediction = nb.predict(tested)

# Decision Tree
dt = tree.DecisionTreeClassifier().fit(trained, y_train)
dt_prediction = dt.predict(tested)

# K Nearest Neighbor (3 neighbours)
knn = KNeighborsClassifier(n_neighbors = 3).fit(trained, y_train)
knn_prediction = knn.predict(tested)

# Support Vector Machine
svm = SVC(gamma = 'auto').fit(trained, y_train)
svm_prediction = svm.predict(tested)


In [None]:
# import keras methods
import keras
from keras.models import Model

# CNN training hyper-parameters
epochs = 20
batch_size = 128

# Create model: four conv + max-pool stages followed by a dense head that
# narrows down to a single sigmoid unit (binary output)
model = Sequential([
    layers.Conv2D(32, kernel_size = (3, 3), activation = tf.nn.relu, input_shape = (64, 64, 1)),
    layers.MaxPooling2D(pool_size = (2, 2)),

    layers.Conv2D(64, (3, 3), activation = tf.nn.relu),
    layers.MaxPooling2D(pool_size = (2, 2)),

    layers.Conv2D(128, (3, 3), activation = tf.nn.relu),
    layers.MaxPooling2D(pool_size = (2, 2)),

    layers.Conv2D(256, (3, 3), activation = 'relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),

    layers.Flatten(),
    layers.Dense(256, activation = 'relu'),
    layers.Dense(128, activation = 'relu'),
    layers.Dense(64, activation = 'relu'),
    layers.Dense(32, activation = 'relu'),
    layers.Dense(1, activation = 'sigmoid'),
])

In [None]:
# compile CNN model for binary classification and print its layer summary.
# `metrics` is given as a list, the form documented for Model.compile;
# a bare string is not accepted by every Keras release.
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 62, 62, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 31, 31, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 29, 29, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 14, 14, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 12, 12, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 6, 6, 128)        0

In [None]:
# train the baseline CNN; the held-out test split is also passed as validation data,
# and the returned History object is kept for the learning-curve plots
cnn = model.fit(
    X_train,
    y_train,
    batch_size = batch_size,
    epochs = epochs,
    validation_data = (X_test, y_test),
    verbose = True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# per-epoch metrics recorded while fitting the baseline CNN
accuracy = cnn.history['accuracy']
val_accuracy = cnn.history['val_accuracy']
loss = cnn.history['loss']
val_loss = cnn.history['val_loss']
# x-axis positions. Stored under their own name: rebinding `epochs` to a range would
# overwrite the integer epoch count that model.fit reads when the training cell is re-run.
epoch_range = range(len(accuracy))

# plot accuracy values
plt.plot(epoch_range, accuracy, 'bo', label = 'Training accuracy')
plt.plot(epoch_range, val_accuracy, 'b', label = 'Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
# start a second figure so the loss curves are not drawn on top of the accuracy curves
plt.figure()

# plot loss values
plt.plot(epoch_range, loss, 'bo', label = 'Training loss')
plt.plot(epoch_range, val_loss, 'b', label = 'Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# Same architecture as the baseline CNN, with dropout added to reduce overfitting:
# after every pooling stage and once more before the output unit.
# Note that this rebinds `model`, replacing the baseline network.
model = Sequential([
    layers.Conv2D(32, kernel_size = (3, 3), activation = tf.nn.relu, input_shape = (64, 64, 1)),
    layers.MaxPooling2D(pool_size = (2, 2)),
    layers.Dropout(0.25),

    layers.Conv2D(64, (3, 3), activation = tf.nn.relu),
    layers.MaxPooling2D(pool_size = (2, 2)),
    layers.Dropout(0.25),

    layers.Conv2D(128, (3, 3), activation = tf.nn.relu),
    layers.MaxPooling2D(pool_size = (2, 2)),
    layers.Dropout(0.4),

    layers.Conv2D(256, (3, 3), activation = 'relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Dropout(0.4),

    layers.Flatten(),
    layers.Dense(256, activation = 'relu'),
    layers.Dense(128, activation = 'relu'),
    layers.Dense(64, activation = 'relu'),
    layers.Dense(32, activation = 'relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation = 'sigmoid'),
])

In [None]:
# compile the dropout CNN and show its model summary.
# `metrics` is given as a list, the form documented for Model.compile;
# a bare string is not accepted by every Keras release.
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

In [None]:
# train the dropout CNN for 20 epochs; the held-out test split is also passed as
# validation data, and the History object is kept for the learning-curve plots
cnn_dropout = model.fit(
    X_train,
    y_train,
    batch_size = batch_size,
    epochs = 20,
    validation_data = (X_test, y_test),
    verbose = True,
)

In [None]:
# per-epoch metrics recorded while fitting the dropout CNN
accuracy = cnn_dropout.history['accuracy']
val_accuracy = cnn_dropout.history['val_accuracy']
loss = cnn_dropout.history['loss']
val_loss = cnn_dropout.history['val_loss']
# x-axis positions. Stored under their own name: rebinding `epochs` to a range would
# overwrite the integer epoch count that the baseline training cell passes to model.fit.
epoch_range = range(len(accuracy))

# plot accuracy values
plt.plot(epoch_range, accuracy, 'bo', label = 'Training accuracy')
plt.plot(epoch_range, val_accuracy, 'b', label = 'Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
# start a second figure so the loss curves are not drawn on top of the accuracy curves
plt.figure()

# plot loss values
plt.plot(epoch_range, loss, 'bo', label = 'Training loss')
plt.plot(epoch_range, val_loss, 'b', label = 'Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# sigmoid outputs of the current `model` for every test sample
cnn_prediction = model.predict(X_test)

# Threshold the outputs at 0.5 to obtain hard class labels (1 if > 0.5, else 0).
# This is a binary threshold, not a one-hot encoding. It is done in one vectorised
# step instead of a per-element Python loop; the array shape and dtype are kept
# as returned by predict().
cnn_prediction = (cnn_prediction > 0.5).astype(cnn_prediction.dtype)

In [None]:
# Persist the network currently bound to `model` together with the fitted vectorizer,
# so that new text can later be transformed with the same vocabulary and classified.
model.save('cnn_model.h5')
with open('cnn_vectorizer', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# confusion-matrix helper used by all evaluation cells
def confusion_matrix(true_value, predicted_value):
    """Count the four confusion-matrix cells for a binary classification.

    The two iterables are walked in lockstep. A true label of 1 is treated as
    the positive class and 0 as the negative class; pairs whose true label is
    neither 1 nor 0 are skipped.

    Parameters
    ----------
    true_value : iterable
        Ground-truth labels.
    predicted_value : iterable
        Predicted labels, aligned element-for-element with ``true_value``.

    Returns
    -------
    tuple
        ``(tp, tn, fp, fn)`` -- note the order: true positives, true
        negatives, false positives, false negatives.
    """
    counts = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0}

    for actual, predicted in zip(true_value, predicted_value):
        # pick which counters a match / mismatch feeds for this true label
        if actual == 1:
            on_match, on_mismatch = 'tp', 'fn'
        elif actual == 0:
            on_match, on_mismatch = 'tn', 'fp'
        else:
            continue

        if actual == predicted:
            counts[on_match] += 1
        elif actual != predicted:
            counts[on_mismatch] += 1

    return (counts['tp'], counts['tn'], counts['fp'], counts['fn'])
                
# bar colours for the metric charts: one per plotted metric
# (Accuracy, FAR, Recall, Precision, F1_Score)
colors = [
    '#0069c0',
    '#008ac5',
    '#00a9b5',
    '#00c698',
    '#1fe074',
]

In [None]:
# compute evaluation criteria

# Convolutional Neural Network
tp, tn, fp, fn = confusion_matrix(y_test, cnn_prediction)
total = tp + tn + fp + fn
# Every ratio falls back to 0.0 when its denominator is empty (e.g. the model never
# predicts the positive class) instead of raising ZeroDivisionError.
accuracy = (tp + tn) / total if total else 0.0
# False alarm rate = FP / (FP + TN): the share of true-negative-class samples that were
# flagged as positive. (The previous denominator, FP + FN, only measured what fraction
# of the errors were false positives.)
false_alarm_ratio = fp / (fp + tn) if (fp + tn) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f1_score = 2 * ((recall * precision) / (recall + precision)) if (recall + precision) else 0.0
cnn_dict = {'Accuracy': accuracy, 'FAR': false_alarm_ratio, 'Recall': recall, 'Precision': precision, 'F1_Score': f1_score}

print(" For CNN \n Accuracy : {0} \n False Alarm Ratio : {1} \n Recall : {2} \n Precision : {3} \n F1_Score : {4}".format(accuracy, false_alarm_ratio, recall, precision, f1_score))
# one bar per metric, in dict insertion order
plt.bar(*zip(*cnn_dict.items()), color = colors)
plt.show()

In [None]:
# Naive Bayes
tp, tn, fp, fn = confusion_matrix(y_test, nb_prediction)
total = tp + tn + fp + fn
# Every ratio falls back to 0.0 when its denominator is empty (e.g. the model never
# predicts the positive class) instead of raising ZeroDivisionError.
accuracy = (tp + tn) / total if total else 0.0
# False alarm rate = FP / (FP + TN): the share of true-negative-class samples that were
# flagged as positive. (The previous denominator, FP + FN, only measured what fraction
# of the errors were false positives.)
false_alarm_ratio = fp / (fp + tn) if (fp + tn) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f1_score = 2 * ((recall * precision) / (recall + precision)) if (recall + precision) else 0.0
nb_dict = {'Accuracy': accuracy, 'FAR': false_alarm_ratio, 'Recall': recall, 'Precision': precision, 'F1_Score': f1_score}

print(" For Naive Bayes \n Accuracy : {0} \n False Alarm Ratio : {1} \n Recall : {2} \n Precision : {3} \n F1_Score : {4}".format(accuracy, false_alarm_ratio, recall, precision, f1_score))
# one bar per metric, in dict insertion order
plt.bar(*zip(*nb_dict.items()), color = colors)
plt.show()

In [None]:
# Decision Tree
tp, tn, fp, fn = confusion_matrix(y_test, dt_prediction)
total = tp + tn + fp + fn
# Every ratio falls back to 0.0 when its denominator is empty (e.g. the model never
# predicts the positive class) instead of raising ZeroDivisionError.
accuracy = (tp + tn) / total if total else 0.0
# False alarm rate = FP / (FP + TN): the share of true-negative-class samples that were
# flagged as positive. (The previous denominator, FP + FN, only measured what fraction
# of the errors were false positives.)
false_alarm_ratio = fp / (fp + tn) if (fp + tn) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f1_score = 2 * ((recall * precision) / (recall + precision)) if (recall + precision) else 0.0
dt_dict = {'Accuracy': accuracy, 'FAR': false_alarm_ratio, 'Recall': recall, 'Precision': precision, 'F1_Score': f1_score}

print(" For Decision Tree \n Accuracy : {0} \n False Alarm Ratio : {1} \n Recall : {2} \n Precision : {3} \n F1_Score : {4}".format(accuracy, false_alarm_ratio, recall, precision, f1_score))
# one bar per metric, in dict insertion order
plt.bar(*zip(*dt_dict.items()), color = colors)
plt.show()

In [None]:
# K Nearest Neighbor
tp, tn, fp, fn = confusion_matrix(y_test, knn_prediction)
total = tp + tn + fp + fn
# Every ratio falls back to 0.0 when its denominator is empty (e.g. the model never
# predicts the positive class) instead of raising ZeroDivisionError.
accuracy = (tp + tn) / total if total else 0.0
# False alarm rate = FP / (FP + TN): the share of true-negative-class samples that were
# flagged as positive. (The previous denominator, FP + FN, only measured what fraction
# of the errors were false positives.)
false_alarm_ratio = fp / (fp + tn) if (fp + tn) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f1_score = 2 * ((recall * precision) / (recall + precision)) if (recall + precision) else 0.0
knn_dict = {'Accuracy': accuracy, 'FAR': false_alarm_ratio, 'Recall': recall, 'Precision': precision, 'F1_Score': f1_score}

print(" For K Nearest Neighbor \n Accuracy : {0} \n False Alarm Ratio : {1} \n Recall : {2} \n Precision : {3} \n F1_Score : {4}".format(accuracy, false_alarm_ratio, recall, precision, f1_score))
# one bar per metric, in dict insertion order
plt.bar(*zip(*knn_dict.items()), color = colors)
plt.show()

In [None]:
# Support Vector Machine
tp, tn, fp, fn = confusion_matrix(y_test, svm_prediction)
total = tp + tn + fp + fn
# Every ratio falls back to 0.0 when its denominator is empty (e.g. the model never
# predicts the positive class) instead of raising ZeroDivisionError.
accuracy = (tp + tn) / total if total else 0.0
# False alarm rate = FP / (FP + TN): the share of true-negative-class samples that were
# flagged as positive. (The previous denominator, FP + FN, only measured what fraction
# of the errors were false positives.)
false_alarm_ratio = fp / (fp + tn) if (fp + tn) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f1_score = 2 * ((recall * precision) / (recall + precision)) if (recall + precision) else 0.0
svm_dict = {'Accuracy': accuracy, 'FAR': false_alarm_ratio, 'Recall': recall, 'Precision': precision, 'F1_Score': f1_score}

print(" For Support Vector Machine \n Accuracy : {0} \n False Alarm Ratio : {1} \n Recall : {2} \n Precision : {3} \n F1_Score : {4}".format(accuracy, false_alarm_ratio, recall, precision, f1_score))
# one bar per metric, in dict insertion order
plt.bar(*zip(*svm_dict.items()), color = colors)
plt.show()

In [None]:
#from data_sanitize import clean_data

# detect SQLI with CNN: run one injection-style string through the fitted vectorizer
# and the current `model`
# (the optional clean_data() sanitising step is disabled because data_sanitize is not imported)
sqli_sample = "LIMIT 1-- id=1') OR SLEEP(25)=0 LIMIT 1-- id=1)) OR SLEEP(25)=0 LIMIT 1-- id=SELECT SLEEP(25)--"
# vectorise the single sample and view it as one 64x64 single-channel input
input_val = vectorizer.transform([sqli_sample]).toarray().reshape(1, 64, 64, 1)
result = model.predict(input_val)
# True means the sigmoid output is above the 0.5 decision threshold
print(result > 0.5)

In [None]:
# same check with an ordinary English phrase
# (the optional clean_data() sanitising step is disabled because data_sanitize is not imported)
benign_sample = "a quick brown fox"
# vectorise the single sample and view it as one 64x64 single-channel input
input_val = vectorizer.transform([benign_sample]).toarray().reshape(1, 64, 64, 1)
result = model.predict(input_val)
# True means the sigmoid output is above the 0.5 decision threshold
print(result > 0.5)