<a href="https://colab.research.google.com/github/PaytonOAnderson/Detection-and-Prevention-of-Phishing-Website-Attacks-using-Machine-Learning/blob/main/Phishing_Detection_429.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://github.com/ebubekirbbr/pdd/blob/master/input/data_phishing_37175.json

import json
import numpy as np
import os

# Load the legitimate entries from the JSON file.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('data_legitimate_36400.json', encoding='utf-8') as f:
    ldata = json.load(f)

# Load the phishing entries from the JSON file
with open('data_phishing_37175.json', encoding='utf-8') as f:
    pdata = json.load(f)

# Build [entry, label] rows: phishing entries first (label 1),
# then legitimate entries (label 0)
combined_data = [[item, 1] for item in pdata] + [[item, 0] for item in ldata]

# Shuffle the rows in place so the two classes are interleaved
np.random.shuffle(combined_data)

# Convert the combined rows to a numpy array.
# NOTE(review): each row mixes an entry with an int label, so NumPy picks a
# common dtype for both columns; the labels are cast back to int later in
# the notebook.
combined_data = np.array(combined_data)

print(combined_data[:10])



In [None]:
import numpy as np

# First column of every (link, label) row is the link itself
links = [link for link, _ in combined_data]

# Length of each link in characters, summarised as the 0th through 100th
# percentile (101 values, one per integer percentile)
percentiles = np.percentile(np.char.str_len(links), np.arange(101))


In [None]:
# Split the (url, label) rows into two parallel lists that share row order
urls = [row[0] for row in combined_data]
labels = [row[1] for row in combined_data]


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


# Fixed number of character ids kept per URL
max_length = 200

# Labels as an integer numpy array (they are not int-typed before this point)
labels = np.array(labels, dtype=int)

# Character-level tokenizer: each distinct character seen in the URLs gets
# its own integer id
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(urls)

# Encode every URL as a list of character ids
sequences = tokenizer.texts_to_sequences(urls)

# Bring every encoded URL to exactly max_length ids
# (pad_sequences defaults decide which side is padded/truncated)
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [None]:
import numpy as np
# Persist the model inputs (padded character-id sequences) as a .npy file
np.save(file='padded_sequences.npy', arr=padded_sequences)

# Persist the matching integer labels as a .npy file
np.save(file='labels.npy', arr=labels)


#Base Model

In [None]:
# Embedding table size. An Embedding layer needs input_dim to be at least
# (largest token id + 1); the previously hard-coded 200 was unrelated to the
# tokenizer's vocabulary and would be out of range for any id >= 200.
# 200 is kept as the floor so the architecture is unchanged whenever the
# data already fits.
vocab_size = max(200, int(padded_sequences.max()) + 1)

# Create the model (baseline: Conv1D + two stacked BiLSTMs + dropout, ELU dense head)
model1 = tf.keras.Sequential([
    # Character-id embedding; id 0 is treated as padding (mask_zero=True)
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    # Local character n-gram features, then temporal down-sampling
    tf.keras.layers.Conv1D(128, 5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(),
    # First BiLSTM returns the full sequence so the second one can consume it
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='elu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='elu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # Single sigmoid unit: probability that the URL is phishing (label 1)
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 20% of the rows for validation
history1 = model1.fit(padded_sequences, labels, epochs=10, validation_split=0.2)
print(history1.history)

In [None]:

# Write the trained base model to disk
model1.save(filepath='model_1.keras')



In [None]:
import tensorflow as tf

# Restore the base model from the file it was saved to
model1 = tf.keras.models.load_model(filepath='model_1.keras')


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One figure per metric (accuracy first, then loss), each showing the
# training curve and the validation curve against 1-based epoch numbers
for metric_key, axis_label, figure_title in (
    ('accuracy', 'Accuracy', 'Model accuracy'),
    ('loss', 'Loss', 'Model loss'),
):
    train_curve = history1.history[metric_key]
    val_curve = history1.history['val_' + metric_key]

    plt.figure()
    plt.plot(np.arange(1, len(train_curve) + 1), train_curve)
    plt.plot(np.arange(1, len(val_curve) + 1), val_curve)
    plt.title(figure_title)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    # One tick per epoch
    plt.xticks(np.arange(1, len(train_curve) + 1, 1))
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:

# Score every encoded URL with the base model.
# NOTE: padded_sequences is the same array that was passed to fit(), so the
# accuracy below includes training samples and is not a held-out test score.
predictions1 = model1.predict(padded_sequences)

# Binarise in place at the 0.5 cut-off: scores strictly above 0.5 become 1,
# everything else becomes 0 (array shape and dtype are left untouched)
predictions1[...] = predictions1 > 0.5

# Fraction of URLs whose thresholded prediction matches the label
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels, predictions1)
print("Accuracy:", accuracy)


In [None]:

# Confusion-matrix counts for the base model, computed with boolean masks.
# Label/prediction value 1 means phishing, 0 means legitimate.
actual_phishing = labels == 1
actual_legit = labels == 0
flat_predictions = predictions1.reshape(-1)
predicted_phishing = flat_predictions == 1
predicted_legit = flat_predictions == 0

tp = int(np.count_nonzero(actual_phishing & predicted_phishing))
fn = int(np.count_nonzero(actual_phishing & predicted_legit))
tn = int(np.count_nonzero(actual_legit & predicted_legit))
fp = int(np.count_nonzero(actual_legit & predicted_phishing))

# Express each count as a percentage of its actual class:
# tp/fn are relative to all phishing rows, tn/fp to all legitimate rows
phishing_total = tp + fn
legit_total = tn + fp
tp_percent = tp / phishing_total * 100
fn_percent = fn / phishing_total * 100
tn_percent = tn / legit_total * 100
fp_percent = fp / legit_total * 100

# Report the four per-class percentages
for caption, value in (
    ("True Positives:", tp_percent),
    ("False Negatives:", fn_percent),
    ("True Negatives:", tn_percent),
    ("False Positives:", fp_percent),
):
    print(caption, value)


# Modifying the model being created, taking out the dropout layers

In [None]:
# Embedding table size. An Embedding layer needs input_dim to be at least
# (largest token id + 1); the previously hard-coded 200 was unrelated to the
# tokenizer's vocabulary and would be out of range for any id >= 200.
# 200 is kept as the floor so the architecture is unchanged whenever the
# data already fits.
vocab_size = max(200, int(padded_sequences.max()) + 1)

# Create the model (variant 2: same stack as the base model, no Dropout layers)
model2 = tf.keras.Sequential([
    # Character-id embedding; id 0 is treated as padding (mask_zero=True)
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    tf.keras.layers.Conv1D(128, 5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(),
    # First BiLSTM returns the full sequence so the second one can consume it
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dense(128, activation='elu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(64, activation='elu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # Single sigmoid unit: probability that the URL is phishing (label 1)
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 20% of the rows for validation
history2 = model2.fit(padded_sequences, labels, epochs=10, validation_split=0.2)
print(history2.history)

In [None]:
# Write the trained no-dropout model to disk ...
model2.save(filepath='model_2.keras')
# ... and read it straight back, rebinding model2 to the restored copy
model2 = tf.keras.models.load_model(filepath='model_2.keras')

In [None]:
# Plot accuracy and loss for model 2: one figure per metric, each showing the
# training curve and the validation curve against 1-based epoch numbers.
# (The validation-accuracy curve previously read from an undefined name,
# `history22`, which raised NameError; every curve now comes from history2.)
for metric_key, axis_label, figure_title in (
    ('accuracy', 'Accuracy', 'Model accuracy'),
    ('loss', 'Loss', 'Model loss'),
):
    train_curve = history2.history[metric_key]
    val_curve = history2.history['val_' + metric_key]

    plt.figure()
    plt.plot(np.arange(1, len(train_curve) + 1), train_curve)
    plt.plot(np.arange(1, len(val_curve) + 1), val_curve)
    plt.title(figure_title)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    # One tick per epoch
    plt.xticks(np.arange(1, len(train_curve) + 1, 1))
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Score every encoded URL with model 2 (previously this called an undefined
# name, `model`, instead of model2).
# NOTE: padded_sequences is the same array that was passed to fit(), so the
# accuracy below includes training samples and is not a held-out test score.
predictions2 = model2.predict(padded_sequences)

# Convert the predictions to binary values in place:
# scores strictly above 0.5 become 1, everything else becomes 0
predictions2[...] = predictions2 > 0.5

# Fraction of URLs whose thresholded prediction matches the label
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels, predictions2)
print("Accuracy:", accuracy)

In [None]:
# Confusion-matrix counts for model 2, computed with boolean masks.
# Label/prediction value 1 means phishing, 0 means legitimate.
actual_phishing = labels == 1
actual_legit = labels == 0
flat_predictions = predictions2.reshape(-1)
predicted_phishing = flat_predictions == 1
predicted_legit = flat_predictions == 0

tp = int(np.count_nonzero(actual_phishing & predicted_phishing))
fn = int(np.count_nonzero(actual_phishing & predicted_legit))
tn = int(np.count_nonzero(actual_legit & predicted_legit))
fp = int(np.count_nonzero(actual_legit & predicted_phishing))

# Express each count as a percentage of its actual class:
# tp/fn are relative to all phishing rows, tn/fp to all legitimate rows
phishing_total = tp + fn
legit_total = tn + fp
tp_percent = tp / phishing_total * 100
fn_percent = fn / phishing_total * 100
tn_percent = tn / legit_total * 100
fp_percent = fp / legit_total * 100

# Report the four per-class percentages
for caption, value in (
    ("True Positives:", tp_percent),
    ("False Negatives:", fn_percent),
    ("True Negatives:", tn_percent),
    ("False Positives:", fp_percent),
):
    print(caption, value)

# Modifying the model being created, removing the layer tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True))

In [None]:
# Embedding table size. An Embedding layer needs input_dim to be at least
# (largest token id + 1); the previously hard-coded 200 was unrelated to the
# tokenizer's vocabulary and would be out of range for any id >= 200.
# 200 is kept as the floor so the architecture is unchanged whenever the
# data already fits.
vocab_size = max(200, int(padded_sequences.max()) + 1)

# Create the model (variant 3: base model with only a single BiLSTM layer)
model3 = tf.keras.Sequential([
    # Character-id embedding; id 0 is treated as padding (mask_zero=True)
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    tf.keras.layers.Conv1D(128, 5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(),
    # Only one recurrent layer here, returning its final state
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='elu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='elu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # Single sigmoid unit: probability that the URL is phishing (label 1)
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 20% of the rows for validation
history3 = model3.fit(padded_sequences, labels, epochs=10, validation_split=0.2)
print(history3.history)

In [None]:
# Write the trained single-BiLSTM model to disk ...
model3.save(filepath='model_3.keras')
# ... and read it straight back, rebinding model3 to the restored copy
model3 = tf.keras.models.load_model(filepath='model_3.keras')

# One figure per metric (accuracy first, then loss), each showing the
# training curve and the validation curve against 1-based epoch numbers
for metric_key, axis_label, figure_title in (
    ('accuracy', 'Accuracy', 'Model accuracy'),
    ('loss', 'Loss', 'Model loss'),
):
    train_curve = history3.history[metric_key]
    val_curve = history3.history['val_' + metric_key]

    plt.figure()
    plt.plot(np.arange(1, len(train_curve) + 1), train_curve)
    plt.plot(np.arange(1, len(val_curve) + 1), val_curve)
    plt.title(figure_title)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    # One tick per epoch
    plt.xticks(np.arange(1, len(train_curve) + 1, 1))
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Score every encoded URL with model 3 (previously this called an undefined
# name, `model`, instead of model3).
# NOTE: padded_sequences is the same array that was passed to fit(), so the
# accuracy below includes training samples and is not a held-out test score.
predictions3 = model3.predict(padded_sequences)

# Convert the predictions to binary values in place:
# scores strictly above 0.5 become 1, everything else becomes 0
predictions3[...] = predictions3 > 0.5

# Fraction of URLs whose thresholded prediction matches the label
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels, predictions3)
print("Accuracy:", accuracy)

#Model4

In [None]:
# Embedding table size. An Embedding layer needs input_dim to be at least
# (largest token id + 1); the previously hard-coded 200 was unrelated to the
# tokenizer's vocabulary and would be out of range for any id >= 200.
# 200 is kept as the floor so the architecture is unchanged whenever the
# data already fits.
vocab_size = max(200, int(padded_sequences.max()) + 1)

# Create the model (variant 4: base architecture with ReLU instead of ELU
# in the two hidden dense layers)
model4 = tf.keras.Sequential([
    # Character-id embedding; id 0 is treated as padding (mask_zero=True)
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    tf.keras.layers.Conv1D(128, 5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(),
    # First BiLSTM returns the full sequence so the second one can consume it
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # Single sigmoid unit: probability that the URL is phishing (label 1)
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 20% of the rows for validation.
# This previously called model1.fit(...), which left model4 untrained and
# recorded ten further epochs of model1 under the name history4.
history4 = model4.fit(padded_sequences, labels, epochs=10, validation_split=0.2)
print(history4.history)

In [None]:
# Save the model
model4.save('model_4.keras')
# Load the model
model4 = tf.keras.models.load_model('model_4.keras')

# Plot accuracy and loss for model 4: one figure per metric, each showing the
# training curve and the validation curve against 1-based epoch numbers.
# (The loss figure previously took its x-ticks from history3; every value now
# comes from history4.)
for metric_key, axis_label, figure_title in (
    ('accuracy', 'Accuracy', 'Model accuracy'),
    ('loss', 'Loss', 'Model loss'),
):
    train_curve = history4.history[metric_key]
    val_curve = history4.history['val_' + metric_key]

    plt.figure()
    plt.plot(np.arange(1, len(train_curve) + 1), train_curve)
    plt.plot(np.arange(1, len(val_curve) + 1), val_curve)
    plt.title(figure_title)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    # One tick per epoch
    plt.xticks(np.arange(1, len(train_curve) + 1, 1))
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Score every encoded URL with model 4 (previously this called an undefined
# name, `model`, instead of model4).
# NOTE: padded_sequences is the same array that was passed to fit(), so the
# accuracy below includes training samples and is not a held-out test score.
predictions4 = model4.predict(padded_sequences)

# Convert the predictions to binary values in place:
# scores strictly above 0.5 become 1, everything else becomes 0
predictions4[...] = predictions4 > 0.5

# Fraction of URLs whose thresholded prediction matches the label
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels, predictions4)
print("Accuracy:", accuracy)