In [1]:
import tensorflow as tf

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
import pickle
import nltk
nltk.download('stopwords')

2023-11-27 10:08:31.665501: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-27 10:08:31.665547: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-27 10:08:31.667105: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-27 10:08:31.774961: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to /home

True

In [2]:
def accuracy_function(tp, tn, fp, fn):
    """Return the fraction of samples that were classified correctly.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        (tp + tn) / (tp + tn + fp + fn), or 0.0 when all four counts are zero
        (no samples were evaluated), instead of raising ZeroDivisionError.
    """
    total = tp + tn + fp + fn
    if total == 0:
        return 0.0
    return (tp + tn) / total


def precision_function(tp, fp):
    """Return the fraction of positive predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        tp / (tp + fp), or 0.0 when nothing was predicted positive
        (tp + fp == 0), instead of raising ZeroDivisionError.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive


def recall_function(tp, fn):
    """Return the fraction of actual positives that were detected.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        tp / (tp + fn), or 0.0 when there are no positive samples
        (tp + fn == 0), instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def confusion_matrix(truth, predicted):
    """Compute accuracy, precision and recall for binary 0/1 labels.

    Despite its name, this function does not return the matrix itself: it
    tallies the four confusion-matrix cells (label 1 is the positive class)
    and returns the three metrics derived from them.

    Args:
        truth: Sequence or array of ground-truth labels, each 0/1 or '0'/'1'.
        predicted: Sequence or array of predicted labels in the same order.
            A column vector of shape (n, 1) is accepted and flattened.

    Returns:
        Tuple ``(accuracy, precision, recall)``.
    """
    # Normalise both inputs to flat integer arrays so that '1' and 1 compare
    # equal and (n, 1) prediction columns line up with 1-D truth arrays.
    truth_labels = np.asarray(truth).ravel().astype(int)
    predicted_labels = np.asarray(predicted).ravel().astype(int)

    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for true, pred in zip(truth_labels, predicted_labels):

        # The previous test `true == '1' or 1` was always truthy, which put
        # every sample in the positive branch; compare explicitly instead.
        if true == 1:
            if pred == 1:
                true_positive += 1
            else:
                false_negative += 1

        elif true == 0:
            if pred == 0:
                true_negative += 1
            else:
                false_positive += 1

    accuracy = accuracy_function(true_positive, true_negative, false_positive, false_negative)
    precision = precision_function(true_positive, false_positive)
    recall = recall_function(true_positive, false_negative)

    return accuracy, precision, recall

def build_cnn(num_features=4096):
    """Build the (uncompiled) 1D-CNN binary classifier and print its summary.

    The network is two Conv1D + MaxPooling1D stages followed by a dense
    hidden layer and a single sigmoid output unit.

    Args:
        num_features: Length of each input vector; every sample must have
            shape ``(num_features, 1)``. Defaults to 4096, the value that was
            previously hard-coded.

    Returns:
        The ``tf.keras.Sequential`` model named "CNN".
    """
    model = tf.keras.Sequential(name="CNN")

    model.add(tf.keras.layers.Input(shape=(num_features, 1)))

    model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=4, activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

    model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=4, activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(units=128, activation='relu'))
    model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    return model

In [3]:
df = pd.read_csv("./dataset/SQLiV3.csv")

# Keep only the first occurrence of each distinct sentence
df = df.drop_duplicates('Sentence')

# Remove the stray unnamed columns; errors='ignore' tolerates their absence
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')

# Drop rows that still contain a NaN in any remaining column
df = df.dropna(how='any')

# Keep only rows labelled 0 or 1. Comparing on the string form works whether
# pandas parsed the column as text or as integers.
df = df[df['Label'].astype(str).isin(['0', '1'])]

print(df.head())

# Shuffle the rows and renumber the index. The shuffle is seeded so that the
# seeded train/test split further down yields the same rows on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
y = df['Label'].astype(int).to_numpy()

                                            Sentence Label
0                  " or pg_sleep  (  __TIME__  )  --     1
2   AND 1  =  utl_inaddr.get_host_address   (    ...     1
3   select * from users where id  =  '1' or @ @1 ...     1
4   select * from users where id  =  1 or 1#"  ( ...     1
5   select name from syscolumns where id   =     ...     1


In [4]:
# Bag-of-words counts over unigrams and bigrams, capped at 4096 features.
# (Alternative tried earlier: TfidfVectorizer(min_df=2, max_df=0.7, stop_words='english'))
# NOTE(review): the vocabulary is fitted on every sentence in df, including
# the ones that later end up in the test split.
vectorizer = CountVectorizer(max_features=4096, ngram_range=(1, 2))

sparse_counts = vectorizer.fit_transform(df['Sentence'])
count_matrix = sparse_counts.toarray()

# Persist the fitted vectorizer to disk.
with open('vectorizer_cnn.obj', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    count_matrix,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Each shape is (number of sentences, number of count features).
train_shape_line = "Train shape: {}".format(X_train.shape)
test_shape_line = "Test shape: {}".format(X_test.shape)
print(train_shape_line, test_shape_line, sep="\n")

Train shape: (24472, 4096)
Test shape: (6118, 4096)


## Logistic Regression

In [7]:
# Fit a logistic-regression baseline on the count features and predict the
# held-out split.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# labels=[0, 1] pins the class order so target_names always line up with the
# right class, even if one class happens to be absent from this split.
target_names = ['Not SQLi', 'SQLi']
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

              precision    recall  f1-score   support

    Not SQLi       0.99      1.00      1.00      3874
        SQLi       1.00      0.99      0.99      2244

    accuracy                           0.99      6118
   macro avg       1.00      0.99      0.99      6118
weighted avg       0.99      0.99      0.99      6118



## K-Nearest Neighbors Classification

In [8]:
# Fit a 3-nearest-neighbours classifier on the count features and predict the
# held-out split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# labels=[0, 1] pins the class order so target_names always line up with the
# right class, even if one class happens to be absent from this split.
target_names = ['Not SQLi', 'SQLi']
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

              precision    recall  f1-score   support

    Not SQLi       0.99      0.98      0.99      3874
        SQLi       0.96      0.99      0.98      2244

    accuracy                           0.98      6118
   macro avg       0.98      0.98      0.98      6118
weighted avg       0.98      0.98      0.98      6118



## 1D Convolutional Neural Networks Classification

In [9]:
# build_cnn() declares its input as (4096, 1), so append a trailing axis of
# size 1 to every 4096-wide feature row.
cnn_input_shape = (-1, 4096, 1)
X_train = np.reshape(X_train, cnn_input_shape)
X_test = np.reshape(X_test, cnn_input_shape)

# CNN shape
print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

cnn = build_cnn()

adam = tf.keras.optimizers.Adam(learning_rate=3e-4)
cnn.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

# The test split doubles as the validation data reported after each epoch.
cnn.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=5,
    validation_data=(X_test, y_test),
)

cnn.save("model.keras")

Train shape: (24472, 4096, 1)
Test shape: (6118, 4096, 1)


2023-11-27 10:08:56.300946: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-27 10:08:56.323723: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-27 10:08:56.323757: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-27 10:08:56.325508: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-27 10:08:56.325538: I tensorflow/compile

Model: "CNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 4093, 64)          320       
                                                                 
 max_pooling1d (MaxPooling1  (None, 2046, 64)          0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 2043, 128)         32896     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 1021, 128)         0         
 g1D)                                                            
                                                                 
 flatten (Flatten)           (None, 130688)            0         
                                                                 
 dense (Dense)               (None, 128)               16728192

r/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-27 10:08:56.681204: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1977] Could not identify NUMA node of platform GPU id 0, defaulting to 0.  Your kernel may not have been built with NUMA support.
2023-11-27 10:08:56.681239: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-27 10:08:56.681258: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6109 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2070, pci bus id: 0000:01:00.0, compute capability: 7.5


None


2023-11-27 10:08:57.711874: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 801898496 exceeds 10% of free system memory.
2023-11-27 10:08:57.982978: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 801898496 exceeds 10% of free system memory.


Epoch 1/5


2023-11-27 10:08:58.901068: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-27 10:08:59.884334: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f6cbc27daa0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-27 10:08:59.884375: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-11-27 10:08:59.892553: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-27 10:08:59.988580: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# The network ends in a single sigmoid unit, so predict() yields one score
# per sample in a column of shape (n, 1).
y_score = cnn.predict(X_test)

# Threshold the sigmoid scores at 0.5 and flatten to a 1-D vector of 0/1
# labels so the predictions have the same shape as y_test.
y_pred = np.where(y_score > 0.5, 1, 0).ravel()

# labels=[0, 1] pins the class order so target_names always line up with the
# right class, even if one class happens to be absent from this split.
target_names = ['Not SQLi', 'SQLi']
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

              precision    recall  f1-score   support

    Not SQLi       0.99      1.00      1.00      3874
        SQLi       1.00      0.99      0.99      2244

    accuracy                           0.99      6118
   macro avg       1.00      0.99      0.99      6118
weighted avg       0.99      0.99      0.99      6118

