In [None]:
import time

## Data Collection / Pre-processing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import urllib.parse
from sklearn import tree
from sklearn import metrics
import io
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

In [None]:
normal_file_raw = 'normalTrafficTraining.txt'
anomaly_file_raw = 'anomalousTrafficTest.txt'

normal_file_parse = 'normalRequestTraining.txt'
anomaly_file_parse = 'anomalousRequestTest.txt'

In [None]:
def parse_file(file_in, file_out):
    fin = open(file_in)
    fout = io.open(file_out, "w", encoding="utf-8")
    lines = fin.readlines()
    res = []
    for i in range(len(lines)):
        line = lines[i].strip()
        if line.startswith("GET"):
            res.append("GET" + line.split(" ")[1])
        elif line.startswith("POST") or line.startswith("PUT"):
            url = line.split(' ')[0] + line.split(' ')[1]
            j = 1
            while True:
                if lines[i + j].startswith("Content-Length"):
                    break
                j += 1
            j += 1
            data = lines[i + j + 1].strip()
            url += '?' + data
            res.append(url)
    for line in res:
        line = urllib.parse.unquote(line).replace('\n','').lower()
        fout.writelines(line + '\n')
    print ("finished parse ",len(res)," requests")
    fout.close()
    fin.close()

def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result

In [None]:
parse_file(normal_file_raw,normal_file_parse)
parse_file(anomaly_file_raw,anomaly_file_parse)

In [None]:
bad_requests = loadData('anomalousRequestTest.txt')
good_requests = loadData('normalRequestTraining.txt')

all_requests = bad_requests + good_requests
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

In [None]:
import pandas as pd
malicious_urls = bad_requests
normal_urls = good_requests
data = pd.DataFrame({'URL': malicious_urls + normal_urls, 'Label': [1] * len(malicious_urls) + [0] * len(normal_urls)})

In [None]:
data

Unnamed: 0,URL,Label
0,gethttp://localhost:8080/tienda1/publico/anadi...,1
1,posthttp://localhost:8080/tienda1/publico/anad...,1
2,gethttp://localhost:8080/tienda1/publico/anadi...,1
3,posthttp://localhost:8080/tienda1/publico/anad...,1
4,gethttp://localhost:8080/asf-logo-wide.gif~,1
...,...,...
97060,gethttp://localhost:8080/tienda1/imagenes/2.gif,0
97061,gethttp://localhost:8080/tienda1/imagenes/3.gif,0
97062,gethttp://localhost:8080/tienda1/imagenes/cmen...,0
97063,gethttp://localhost:8080/tienda1/imagenes/logo...,0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97065 entries, 0 to 97064
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     97065 non-null  object
 1   Label   97065 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [None]:
data['Label'].value_counts()

0    72000
1    25065
Name: Label, dtype: int64

In [None]:
data.isnull().sum()

URL      0
Label    0
dtype: int64

# Scikit-learn

In [None]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
X = vectorizer.fit_transform(data['URL'])
X_train, X_test, y_train, y_test = train_test_split(X, data['Label'], test_size=0.2, random_state=21)

##logistic regression

In [None]:
#logistic regression
lgs = LogisticRegression()


start_time = time.time()
lgs.fit(X_train, y_train)
end_time = time.time()
training_time_logistic = end_time - start_time

start_time = time.time()
y_pred = lgs.predict(X_test)
end_time = time.time()
prediction_time_logistic = end_time - start_time


score_test = metrics.accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print ("Score Logistic Regression :",score_test)
print ("Confusion Matrix: ")
print (matrix)

Score Logistic Regression : 0.9743985988770412
Confusion Matrix: 
[[14339    26]
 [  471  4577]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print(training_time_logistic)
print(prediction_time_logistic)

8.875909090042114
0.006536722183227539


In [None]:
import os
from joblib import dump

dump(lgs, 'logistic_regression_model.joblib')

logistic_regression_model_size = os.path.getsize('logistic_regression_model.joblib') / 1024  # Size in KB

In [None]:
logistic_regression_model_size

256.3896484375

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Calculate Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Display Metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

Accuracy: 0.9743985988770412
Precision: 0.9943515098848577
Recall: 0.9066957210776545
F1 Score: 0.9485027458294476
AUC-ROC Score: 0.9524428831632616


##Linear SVM

In [None]:
#Linear SVM
linear_svm=LinearSVC(C=1)


start_time = time.time()
linear_svm.fit(X_train, y_train)
end_time = time.time()
training_time_linear_svm = end_time - start_time

start_time = time.time()
y_pred = linear_svm.predict(X_test)
end_time = time.time()
prediction_time_linear_svm = end_time - start_time


score_test = metrics.accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print ("Score Linear SVM :",score_test)
print ("Confusion Matrix: ")
print (matrix)

Score Linear SVM : 0.9960335857415135
Confusion Matrix: 
[[14347    18]
 [   59  4989]]


In [None]:
print(training_time_logistic)
print(prediction_time_logistic)

8.875909090042114
0.006536722183227539


In [None]:
dump(linear_svm, 'linear_svm_model.joblib')
linear_svm_model_size = os.path.getsize('linear_svm_model.joblib') / 1024

In [None]:
linear_svm_model_size

256.2451171875

In [None]:
# Calculate Metrics
accuracy_svm = accuracy_score(y_test, y_pred)
precision_svm = precision_score(y_test, y_pred)
recall_svm = recall_score(y_test, y_pred)
f1_svm = f1_score(y_test, y_pred)
conf_matrix_svm = confusion_matrix(y_test, y_pred)
roc_auc_svm = roc_auc_score(y_test, y_pred)

# Display Metrics
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)
print("AUC-ROC Score:", roc_auc_svm)

Accuracy: 0.9960335857415135
Precision: 0.9964050329538646
Recall: 0.9883122028526149
F1 Score: 0.99234211834908
AUC-ROC Score: 0.993529578627839


# Tensorflow

## Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf

X = data['URL']
y = data['Label']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Sequence Padding
max_sequence_length = 666
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, y, test_size=0.2, random_state=42
)

# Example: Display the shape of the preprocessed data
print("Padded Sequences Shape:", padded_sequences.shape)
# print("Scaled Features Shape:", scaled_features.shape)
print("X_train Shape:", X_train.shape)
print("X_test Shape:", X_test.shape)
print("y_train Shape:", y_train.shape)
print("y_test Shape:", y_test.shape)

Padded Sequences Shape: (97065, 666)
X_train Shape: (77652, 666)
X_test Shape: (19413, 666)
y_train Shape: (77652,)
y_test Shape: (19413,)


#### 1

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

model_logistic_regression = Sequential()

# Add a single neuron Dense layer with a sigmoid activation function
model_logistic_regression.add(Dense(units=1, activation='sigmoid', input_dim=X_train.shape[1]))

# Compile the model
optimizer = Adam(learning_rate=0.001)
model_logistic_regression.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
start_time = time.time()
model_logistic_regression.fit(X_train, y_train, epochs=50, batch_size=512, validation_split=0.2)
end_time = time.time()

training_time_tensorflow = end_time - start_time

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
training_time_tensorflow

27.220130920410156

In [None]:
import os
from tensorflow.keras.models import load_model

# Assuming model is your trained TensorFlow model
model_logistic_regression.save('tensorflow_model.h5')

tensorflow_model_size = os.path.getsize('tensorflow_model.h5') / 1024

  saving_api.save_model(


In [None]:
tensorflow_model_size

27.5703125

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Evaluate the model
import time

start_time = time.time()
y_pred = (model_logistic_regression.predict(X_test) > 0.5).astype(int)
end_time = time.time()

prediction_time_tensorflow = end_time - start_time

# Calculate metrics
accuracy_tf = accuracy_score(y_test, y_pred)
precision_tf = precision_score(y_test, y_pred)
recall_tf = recall_score(y_test, y_pred)
f1_tf = f1_score(y_test, y_pred)
conf_matrix_tf = confusion_matrix(y_test, y_pred)
roc_auc_tf = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy_tf)
print("Precision:", precision_tf)
print("Recall:", recall_tf)
print("F1 Score:", f1_tf)
print("AUC-ROC:", roc_auc_tf)

Accuracy: 0.8215628702415907
Precision: 0.7152130962461191
Recall: 0.5079174183203047
F1 Score: 0.5939990623534928
AUC-ROC: 0.7189822809848888


In [None]:
prediction_time_tensorflow

1.4538044929504395

In [None]:
# Evaluate the TensorFlow model
score_test_tf = accuracy_score(y_test, y_pred)
matrix_tf = confusion_matrix(y_test, y_pred)

print("Score Logistic Regression (TensorFlow):", score_test_tf)
print("Confusion Matrix (TensorFlow): ")
print(matrix_tf)

Score Logistic Regression (TensorFlow): 0.8215628702415907
Confusion Matrix (TensorFlow): 
[[13415  1009]
 [ 2455  2534]]


#### 2

In [None]:
# Build logistic regression model in TensorFlow
model_tf = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_tf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
import tensorflow as tf
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=20,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

In [None]:
# model_tf.fit(X_train, y_train, epochs=50,batch_size = 128)
model_tf.fit(X_train, y_train, epochs=50, batch_size=512, validation_split=0.2,callbacks = early_stopping)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7a3e3bbbda20>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Evaluate the model
y_pred = (model_tf.predict(X_test) > 0.5).astype(int)

# Calculate metrics
accuracy_tf = accuracy_score(y_test, y_pred)
precision_tf = precision_score(y_test, y_pred)
recall_tf = recall_score(y_test, y_pred)
f1_tf = f1_score(y_test, y_pred)
conf_matrix_tf = confusion_matrix(y_test, y_pred)
roc_auc_tf = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy_tf)
print("Precision:", precision_tf)
print("Recall:", recall_tf)
print("F1 Score:", f1_tf)
print("AUC-ROC:", roc_auc_tf)

Accuracy: 0.9595116674393448
Precision: 0.931076923076923
Recall: 0.9098015634395671
F1 Score: 0.9203163017031629
AUC-ROC: 0.9432535271440764


In [None]:
# Evaluate the TensorFlow model
score_test_tf = accuracy_score(y_test, y_pred)
matrix_tf = confusion_matrix(y_test, y_pred)

print("Score Logistic Regression (TensorFlow):", score_test_tf)
print("Confusion Matrix (TensorFlow): ")
print(matrix_tf)

Score Logistic Regression (TensorFlow): 0.9595116674393448
Confusion Matrix (TensorFlow): 
[[14088   336]
 [  450  4539]]


## Linear SVM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

#### 1

In [None]:
class LinearSVM(tf.keras.Model):
    def __init__(self, input_dim):
        super(LinearSVM, self).__init__()
        self.dense = Dense(units=1, activation='linear', input_dim=input_dim)

    def call(self, inputs):
        return self.dense(inputs)

# Instantiate the model
model_linear_svm = LinearSVM(input_dim=X_train.shape[1])

# Compile the model with hinge loss (SVM loss)
optimizer = Adam(learning_rate=0.001)
model_linear_svm.compile(optimizer=optimizer, loss='hinge', metrics=['accuracy'])

# Train the model
import time

start_time = time.time()
model_linear_svm.fit(X_train, y_train, epochs=50, batch_size=256, validation_split=0.2)
end_time = time.time()

training_time_tensorflow = end_time - start_time


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
training_time_tensorflow

41.68709635734558

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Evaluate the model
import time

start_time = time.time()
y_pred = (model_linear_svm.predict(X_test) > 0.5).astype(int)
end_time = time.time()

prediction_time_tensorflow = end_time - start_time



# Calculate metrics
accuracy_tf = accuracy_score(y_test, y_pred)
precision_tf = precision_score(y_test, y_pred)
recall_tf = recall_score(y_test, y_pred)
f1_tf = f1_score(y_test, y_pred)
conf_matrix_tf = confusion_matrix(y_test, y_pred)
roc_auc_tf = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy_tf)
print("Precision:", precision_tf)
print("Recall:", recall_tf)
print("F1 Score:", f1_tf)
print("AUC-ROC:", roc_auc_tf)

Accuracy: 0.8482460207077731
Precision: 0.9866603144354454
Recall: 0.41511324914812586
F1 Score: 0.5843679458239277
AUC-ROC: 0.7065860200260874


In [None]:
prediction_time_tensorflow

1.4123051166534424

In [None]:
# Evaluate the TensorFlow model
score_test_tf = accuracy_score(y_test, y_pred)
matrix_tf = confusion_matrix(y_test, y_pred)

print("Score LinearSVM (TensorFlow):", score_test_tf)
print("Confusion Matrix (TensorFlow): ")
print(matrix_tf)

Score LinearSVM (TensorFlow): 0.8482460207077731
Confusion Matrix (TensorFlow): 
[[14396    28]
 [ 2918  2071]]


In [None]:
import os
import torch

# Assuming model_pt is your trained PyTorch model
torch.save(model_linear_svm, 'pytorch_model.pth')

pytorch_model_size = os.path.getsize('pytorch_model.pth') / 1024

In [None]:
pytorch_model_size

26.8671875

#### 2

In [None]:
# Create a linear SVM model using TensorFlow
model_svm_tf = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    # tf.keras.layers.Dense(80, activation='linear'),
    tf.keras.layers.Dense(50, activation='linear'),
    tf.keras.layers.Dense(1, activation='linear'),  # Linear activation for SVM
])

# Use Hinge loss for SVM
# model_svm_tf.compile(optimizer='adam', loss='hinge', metrics=['accuracy'])
model_svm_tf.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='hinge', metrics=['accuracy'])

In [None]:
model_svm_tf.fit(X_train, y_train, epochs=50, batch_size=512, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7a3e399abaf0>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Evaluate the model
y_pred = (model_svm_tf.predict(X_test) > 0.5).astype(int)

# Calculate metrics
accuracy_tf = accuracy_score(y_test, y_pred)
precision_tf = precision_score(y_test, y_pred)
recall_tf = recall_score(y_test, y_pred)
f1_tf = f1_score(y_test, y_pred)
conf_matrix_tf = confusion_matrix(y_test, y_pred)
roc_auc_tf = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy_tf)
print("Precision:", precision_tf)
print("Recall:", recall_tf)
print("F1 Score:", f1_tf)
print("AUC-ROC:", roc_auc_tf)

Accuracy: 0.8416525009014578
Precision: 0.9284116331096197
Recall: 0.41591501302866307
F1 Score: 0.5744739756367664
AUC-ROC: 0.7024111948116138


In [None]:
# Evaluate the TensorFlow model
score_test_tf = accuracy_score(y_test, y_pred)
matrix_tf = confusion_matrix(y_test, y_pred)

print("Score LinearSVM (TensorFlow):", score_test_tf)
print("Confusion Matrix (TensorFlow): ")
print(matrix_tf)

Score LinearSVM (TensorFlow): 0.8416525009014578
Confusion Matrix (TensorFlow): 
[[14264   160]
 [ 2914  2075]]


# Pytorch

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# Assuming your data is in a DataFrame named df
# Extract features and labels
X = data['URL']
y = data['Label']

# # TF-IDF Vectorization
# vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
# X_tfidf = vectorizer.fit_transform(X)

# Tokenization and Sequence Padding (similar to TensorFlow)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['URL'])
sequences = tokenizer.texts_to_sequences(data['URL'])
max_sequence_length = 666
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.1, random_state=21)

In [None]:
# Convert y_train to a NumPy array
y_train_array = np.array(y_train)

# # Convert sparse matrices to dense tensors
# X_train_tensor = torch.Tensor(X_train.toarray())
# X_test_tensor = torch.Tensor(X_test.toarray())
# y_train_tensor = torch.Tensor(y_train_array)

# Convert sparse matrices to dense tensors
X_train_tensor = torch.Tensor(X_train)
X_test_tensor = torch.Tensor(X_test)
y_train_tensor = torch.Tensor(y_train_array)

## Logistic Regression

In [None]:
# # Define logistic regression model with an MLP
# class LogisticRegressionModel(nn.Module):
#     def __init__(self, input_size):
#         super(LogisticRegressionModel, self).__init__()
#         self.linear_relu_stack = nn.Sequential(
#             nn.Linear(input_size, 100),
#             nn.ReLU(),
#             nn.Linear(100, 50),
#             nn.ReLU(),
#             nn.Linear(50, 1),
#             nn.ReLU()
#         )
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         out = self.linear_relu_stack(x)
#         out = self.sigmoid(out)
#         return out

In [None]:
# Build logistic regression model in PyTorch
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.linear(x)
        out = self.sigmoid(out)
        return out

input_size_pt = X_train_tensor.shape[1]
model_pt = LogisticRegressionModel(input_size_pt)

criterion_pt = nn.BCELoss()
optimizer_pt = optim.Adam(model_pt.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=256)

In [None]:
import time

start_time = time.time()
# Train the model
num_epochs_pt = 50
for epoch in range(num_epochs_pt):
    for inputs, labels in train_loader:
        optimizer_pt.zero_grad()
        outputs_pt = model_pt(inputs)
        loss_pt = criterion_pt(outputs_pt, labels.view(-1, 1))
        loss_pt.backward()
        optimizer_pt.step()
    print(f'Epoch {epoch + 1}/{num_epochs_pt}, Loss: {loss_pt.item()}')
end_time = time.time()

training_time_pytorch = end_time - start_time

Epoch 1/50, Loss: 22.8384952545166
Epoch 2/50, Loss: 22.767635345458984
Epoch 3/50, Loss: 24.4136905670166
Epoch 4/50, Loss: 18.716535568237305
Epoch 5/50, Loss: 14.863003730773926
Epoch 6/50, Loss: 12.270625114440918
Epoch 7/50, Loss: 21.194494247436523
Epoch 8/50, Loss: 20.725568771362305
Epoch 9/50, Loss: 20.124834060668945
Epoch 10/50, Loss: 19.702016830444336
Epoch 11/50, Loss: 16.953041076660156
Epoch 12/50, Loss: 16.389793395996094
Epoch 13/50, Loss: 19.569683074951172
Epoch 14/50, Loss: 17.95709800720215
Epoch 15/50, Loss: 18.063434600830078
Epoch 16/50, Loss: 16.537277221679688
Epoch 17/50, Loss: 18.222261428833008
Epoch 18/50, Loss: 21.422853469848633
Epoch 19/50, Loss: 21.270177841186523
Epoch 20/50, Loss: 19.740802764892578
Epoch 21/50, Loss: 16.355239868164062
Epoch 22/50, Loss: 16.585840225219727
Epoch 23/50, Loss: 16.46124839782715
Epoch 24/50, Loss: 16.39628791809082
Epoch 25/50, Loss: 19.58124351501465
Epoch 26/50, Loss: 16.464879989624023
Epoch 27/50, Loss: 17.9860172

In [None]:
training_time_pytorch

57.70058870315552

In [None]:
# Predict on the test set
import time

start_time = time.time()

with torch.no_grad():
    outputs_pt = model_pt(X_test_tensor)
    y_pred_pt = (outputs_pt >= 0.5).float()
end_time = time.time()

prediction_time_pytorch = end_time - start_time


# Convert PyTorch tensor to NumPy array
y_pred = y_pred_pt.numpy().astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
# Evaluate the model
with torch.no_grad():
    outputs_test_pt = model_pt(X_test_tensor)
    y_pred = (outputs_test_pt > 0.5).numpy().astype(int)

# Calculate metrics
accuracy_tf = accuracy_score(y_test, y_pred)
precision_tf = precision_score(y_test, y_pred)
recall_tf = recall_score(y_test, y_pred)
f1_tf = f1_score(y_test, y_pred)
conf_matrix_tf = confusion_matrix(y_test, y_pred)
roc_auc_tf = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy_tf)
print("Precision:", precision_tf)
print("Recall:", recall_tf)
print("F1 Score:", f1_tf)
print("AUC-ROC:", roc_auc_tf)

Accuracy: 0.8561862573400638
Precision: 0.6709096553865259
Recall: 0.8654385262314778
F1 Score: 0.7558586918502973
AUC-ROC: 0.8592102478591508


In [None]:
prediction_time_pytorch

0.008027315139770508

In [None]:
# Evaluate the PyTorch model
matrix_pt = confusion_matrix(y_test, y_pred_pt)

print("Confusion Matrix (PyTorch): ")
print(matrix_pt)

Confusion Matrix (PyTorch): 
[[6150 1060]
 [ 336 2161]]


In [None]:
import os
import torch

# Assuming model_pt is your trained PyTorch model
torch.save(model_pt, 'pytorch_model.pth')

pytorch_model_size = os.path.getsize('pytorch_model.pth') / 1024

In [None]:
pytorch_model_size

5.05859375

## Linear SVM

In [None]:
# Define the linear SVM model for binary classification
class LinearSVMModelBinary(nn.Module):
    def __init__(self, input_size):
        super(LinearSVMModelBinary, self).__init__()
        self.linear = nn.Linear(input_size, 1)  # Output size is 1 for binary classification

    def forward(self, x):
        return self.linear(x)

# Use Hinge loss for SVM
model_svm_pytorch_binary = LinearSVMModelBinary(X_train_tensor.shape[1])
criterion_svm_binary = nn.HingeEmbeddingLoss()  # Hinge loss for binary SVM
optimizer_svm_binary = optim.SGD(model_svm_pytorch_binary.parameters(), lr=0.01)

# Convert y_train_tensor to the appropriate format for binary classification
y_train_tensor_binary = 2 * y_train_tensor - 1  # Convert labels to -1 and 1

# Create a DataLoader for efficient batch processing
train_dataset_binary = TensorDataset(X_train_tensor, y_train_tensor_binary)
train_loader_binary = DataLoader(train_dataset_binary, batch_size=256, shuffle=True)


In [None]:
# Train the model
num_epochs = 50
for epoch in range(num_epochs):
    for inputs, labels in train_loader_binary:
        optimizer_svm_binary.zero_grad()
        outputs_binary = model_svm_pytorch_binary(inputs)
        loss_binary = criterion_svm_binary(outputs_binary, labels)
        loss_binary.backward()
        optimizer_svm_binary.step()
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss_binary.item()}')

Epoch 1/50, Loss: 13448.5810546875
Epoch 2/50, Loss: 11860.1201171875
Epoch 3/50, Loss: 113555.1796875
Epoch 4/50, Loss: 29014.673828125
Epoch 5/50, Loss: 10052.255859375
Epoch 6/50, Loss: 17835.671875
Epoch 7/50, Loss: 46286.26171875
Epoch 8/50, Loss: 11794.0087890625
Epoch 9/50, Loss: 51860.7421875
Epoch 10/50, Loss: 41336.12890625
Epoch 11/50, Loss: 9379.369140625
Epoch 12/50, Loss: 123638.296875
Epoch 13/50, Loss: 109784.1953125
Epoch 14/50, Loss: 32989.78125
Epoch 15/50, Loss: 19832.0234375
Epoch 16/50, Loss: 28602.14453125
Epoch 17/50, Loss: 75550.453125
Epoch 18/50, Loss: 139734.0625
Epoch 19/50, Loss: 35905.4453125
Epoch 20/50, Loss: 83025.8125
Epoch 21/50, Loss: 55967.8984375
Epoch 22/50, Loss: 10665.81640625
Epoch 23/50, Loss: 78869.328125
Epoch 24/50, Loss: 12034.494140625
Epoch 25/50, Loss: 14522.9560546875
Epoch 26/50, Loss: 13018.76171875
Epoch 27/50, Loss: 29753.44140625
Epoch 28/50, Loss: 5447.0205078125
Epoch 29/50, Loss: 131350.421875
Epoch 30/50, Loss: 91651.734375
E

In [None]:
# Make predictions on the test set
with torch.no_grad():
    outputs_test_binary = model_svm_pytorch_binary(X_test_tensor)
    predictions_binary = (outputs_test_binary > 0).float()

# Convert predictions to NumPy array
predictions_array_binary = predictions_binary.numpy()

# Calculate accuracy
accuracy_binary = accuracy_score(y_test, predictions_array_binary)
print(f"Accuracy for Binary Linear SVM: {accuracy_binary}")

Accuracy for Binary Linear SVM: 0.29298444421551456


In [None]:
# Predict on the test set
import time

start_time = time.time()

with torch.no_grad():
    outputs_pt = model_svm_pytorch_binary(X_test_tensor)
    y_pred_pt = (outputs_pt >= 0.5).float()
end_time = time.time()

prediction_time_pytorch = end_time - start_time


# Convert PyTorch tensor to NumPy array
y_pred = y_pred_pt.numpy().astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
# Evaluate the model
with torch.no_grad():
    outputs_test_pt = model_svm_pytorch_binary(X_test_tensor)
    y_pred = (outputs_test_pt > 0.5).numpy().astype(int)

# Calculate metrics
accuracy_tf = accuracy_score(y_test, y_pred)
precision_tf = precision_score(y_test, y_pred)
recall_tf = recall_score(y_test, y_pred)
f1_tf = f1_score(y_test, y_pred)
conf_matrix_tf = confusion_matrix(y_test, y_pred)
roc_auc_tf = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy_tf)
print("Precision:", precision_tf)
print("Recall:", recall_tf)
print("F1 Score:", f1_tf)
print("AUC-ROC:", roc_auc_tf)

Accuracy: 0.29298444421551456
Precision: 0.2535561074734703
Recall: 0.8994793752503003
F1 Score: 0.39559665345662703
AUC-ROC: 0.4912098679302819


In [None]:
# Evaluate the PyTorch model
matrix_pt = confusion_matrix(y_test, y_pred_pt)

print("Confusion Matrix (PyTorch): ")
print(matrix_pt)

Confusion Matrix (PyTorch): 
[[ 598 6612]
 [ 251 2246]]


In [None]:
prediction_time_pytorch

0.0038301944732666016

In [None]:
import os
import torch

# Assuming model_pt is your trained PyTorch model
torch.save('LinearSVMModelBinary(nn.Module)', 'pytorch_model.pth')

pytorch_model_size = os.path.getsize('pytorch_model.pth') / 1024

In [None]:
pytorch_model_size

0.8671875