## ***AntiphishX: Exploring advanced hybrid model frameworks that can resist adversarial phishing websites crafted using Generative Adversarial Networks (GANs). This research focuses on improving robustness through adversarial training combined with multi-modal feature extraction (e.g., URL, HTML content, and metadata).***

In [None]:
# Silence all Python warnings raised after this point.
# NOTE(review): this blanket filter also hides library warnings (for example
# convergence or deprecation notices) — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# necessary packages to include for data manipulations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# to shuffle the combined datasets and randomizing the order during training
from sklearn.utils import shuffle

In [None]:
# packages for training the model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import torch.optim as optim
import torch.nn as nn
import torch
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the URL feature dataset (feature columns plus 'Domain' and 'Label') into a DataFrame.
# NOTE(review): the absolute '/content/...' path is environment-specific — confirm it exists before running.
df = pd.read_csv('/content/data_folder/urls_data.csv')

In [None]:
# Preview the first five rows to check the column names and the kind of values they hold.
df.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,graphicriver.net,0,0,1,1,0,0,0,0,0,1,1,1,0,0,1,0,0
1,ecnavi.jp,0,0,1,1,1,0,0,0,0,1,1,1,0,0,1,0,0
2,hubpages.com,0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0
3,extratorrent.cc,0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0,0
4,icicibank.com,0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0,0


In [None]:
# Report the dataset dimensions as (rows, columns).
df.shape

(10000, 18)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Domain,0
Have_IP,0
Have_At,0
URL_Length,0
URL_Depth,0
Redirection,0
https_Domain,0
TinyURL,0
Prefix/Suffix,0
DNS_Record,0


In [None]:
# Build the target vector and feature matrix for the classical ML models.
# 'Label' is the class to predict and 'Domain' is dropped alongside it, so
# neither ends up among the features.
Y = df['Label']
X = df.drop(columns=['Label', 'Domain'])

In [None]:
# Show the feature matrix followed by the target vector.
print(X, Y, sep="\n")

      Have_IP  Have_At  URL_Length  URL_Depth  Redirection  https_Domain  \
0           0        0           1          1            0             0   
1           0        0           1          1            1             0   
2           0        0           1          1            0             0   
3           0        0           1          3            0             0   
4           0        0           1          3            0             0   
...       ...      ...         ...        ...          ...           ...   
9995        0        0           1          5            0             0   
9996        0        0           1          4            0             0   
9997        0        1           1          3            0             0   
9998        0        0           1          5            0             0   
9999        0        1           1          4            0             0   

      TinyURL  Prefix/Suffix  DNS_Record  Web_Traffic  Domain_Age  Domain_End  \
0     

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
print(*(part.shape for part in (X, X_train, X_test)))

(10000, 16) (8000, 16) (2000, 16)


In [None]:
# Helper that fits one classifier and reports its hold-out performance
def train_and_evaluate(model, model_name):
    """Fit ``model`` on the training split and print its test-set metrics.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``
    variables, so the train/test split cell must have been run first.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label printed in the report header.

    Returns:
        None. The accuracy, the classification report and a separator line
        are printed.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(Y_test, predictions)
    report = classification_report(Y_test, predictions)
    print(
        f"Model: {model_name}",
        f"Accuracy: {accuracy:.4f}",
        report,
        "-" * 50,
        sep="\n",
    )

# Baseline classifiers trained on the original (non-augmented) split.
# The tree-based models draw random numbers while fitting, so they get a fixed
# random_state; without it the printed metrics change from run to run.

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
train_and_evaluate(log_reg, "Logistic Regression")

# Decision Tree
dt_classifier = DecisionTreeClassifier(random_state=42)
train_and_evaluate(dt_classifier, "Decision Tree")

# Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_evaluate(rf_classifier, "Random Forest")

# Support Vector Machine
svm_classifier = SVC()
train_and_evaluate(svm_classifier, "Support Vector Machine")

# Naive Bayes
nb_classifier = GaussianNB()
train_and_evaluate(nb_classifier, "Naive Bayes")

# K-Nearest Neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=5)
train_and_evaluate(knn_classifier, "K-Nearest Neighbors")


Model: Logistic Regression
Accuracy: 0.7955
              precision    recall  f1-score   support

           0       0.73      0.95      0.82      1000
           1       0.92      0.64      0.76      1000

    accuracy                           0.80      2000
   macro avg       0.83      0.80      0.79      2000
weighted avg       0.83      0.80      0.79      2000

--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8480
              precision    recall  f1-score   support

           0       0.80      0.93      0.86      1000
           1       0.91      0.77      0.84      1000

    accuracy                           0.85      2000
   macro avg       0.86      0.85      0.85      2000
weighted avg       0.86      0.85      0.85      2000

--------------------------------------------------
Model: Random Forest
Accuracy: 0.8515
              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1000
           1   

In [None]:
# NOTE(review): nothing in the visible cells imports tensorflow (the GAN cells use PyTorch) —
# confirm whether this install is still needed; if it is, pin a version.
!pip install tensorflow




In [None]:
import pandas as pd

# Load the per-class datasets from the data folder used by the other cells.
# NOTE(review): assumes legit_data.csv / phish_data.csv are the same files the
# GAN cells load — confirm before relying on this cell.
legitimate = pd.read_csv('/content/data_folder/legit_data.csv')
phishing = pd.read_csv('/content/data_folder/phish_data.csv')

# 'Domain' is dropped so only the feature columns and 'Label' remain.
legitimate_data = legitimate.drop(columns=['Domain'])
phishing_data = phishing.drop(columns=['Domain'])

# Display the structure of the datasets
print("Legitimate Data:")
print(legitimate_data.head())
print("\nPhishing Data:")
print(phishing_data.head())


Legitimate Data:
   Have_IP  Have_At  URL_Length  URL_Depth  Redirection  https_Domain  \
0        0        0           1          1            0             0   
1        0        0           1          1            1             0   
2        0        0           1          1            0             0   
3        0        0           1          3            0             0   
4        0        0           1          3            0             0   

   TinyURL  Prefix/Suffix  DNS_Record  Web_Traffic  Domain_Age  Domain_End  \
0        0              0           0            1           1           1   
1        0              0           0            1           1           1   
2        0              0           0            1           0           1   
3        0              0           0            1           0           1   
4        0              0           0            1           0           1   

   iFrame  Mouse_Over  Right_Click  Web_Forwards  Label  
0       0        

In [None]:
# Inspect the phishing table after 'Domain' has been dropped.
phishing_data

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,Tiny_URL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,0,0,0,4,0,0,0,0,0,1,0,0,0,0,1,0,1
1,0,0,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1
3,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1
4,0,0,0,2,0,0,0,0,0,1,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,1,5,0,0,1,1,0,1,1,1,0,0,1,0,1
4996,0,0,1,4,0,0,0,0,0,1,0,1,0,0,1,0,1
4997,0,1,1,3,0,0,1,0,0,0,1,1,1,0,1,0,1
4998,0,0,1,5,0,0,1,1,0,1,1,1,0,0,1,0,1


In [None]:
# Check the (rows, columns) size of the legitimate table.
legitimate_data.shape

(5000, 17)

### Implementing a Generative Adversarial Network (GAN) in PyTorch

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the phishing and legitimate feature tables.
phishing = pd.read_csv('/content/data_folder/phish_data.csv')
legitimate = pd.read_csv('/content/data_folder/legit_data.csv')

# Drop the 'Domain' column from both tables before converting to arrays.
phishing_data = phishing.drop(columns=['Domain'])
legitimate_data = legitimate.drop(columns=['Domain'])

# Only the phishing rows are used to train this GAN; the legitimate table is
# loaded here but not used further in this cell.
data = phishing_data.to_numpy()

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# No scaling is applied to `data`.
# NOTE(review): the generator ends in a Sigmoid, so it can only emit values in
# [0, 1]; a column with a wider range (URL_Depth shows values up to 5 in the
# previews above) cannot be reproduced faithfully. Consider MinMax scaling
# (sklearn.preprocessing.MinMaxScaler) and inverting it after generation.

class PhishingDataset(Dataset):
    """In-memory dataset that serves rows of a feature table as float32 tensors."""

    def __init__(self, data):
        # Convert once up front so later indexing is a plain tensor lookup.
        self.data = torch.tensor(data, dtype=torch.float32)

    def __getitem__(self, idx):
        # One sample is whatever the stored tensor returns for `idx`.
        return self.data[idx]

    def __len__(self):
        # Length of the stored tensor along its first dimension.
        return len(self.data)

# Generator: maps latent noise vectors to synthetic feature rows
class Generator(nn.Module):
    """Fully connected generator whose outputs lie in [0, 1].

    Args:
        input_dim: Size of the latent noise vector.
        output_dim: Number of values emitted per generated sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
            nn.Sigmoid(),  # squashes every output into [0, 1]
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return the generated batch for latent input ``z``."""
        generated = self.model(z)
        return generated

# Discriminator: scores how likely a feature row is to be real
class Discriminator(nn.Module):
    """Fully connected binary classifier emitting one probability per sample.

    Args:
        input_dim: Number of values in each input sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),  # output is a probability, as required by BCELoss
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the real-vs-fake score for each row of ``x``."""
        score = self.model(x)
        return score

# Set dimensions
latent_dim = 100  # Size of the latent space (random noise)
data_dim = data.shape[1]  # Number of columns in `data`; the generator emits one value per column

# Seed PyTorch so weight initialisation, batch order and sampled noise repeat
# from run to run on the same setup.
torch.manual_seed(42)

# Instantiate the networks on the selected device. Everything that is fed to
# them (real batches, labels, noise) is created on the same device below;
# mixing CPU and GPU tensors raises a device-mismatch error once CUDA is available.
generator = Generator(latent_dim, data_dim).to(device)
discriminator = Discriminator(data_dim).to(device)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# Loss function (binary cross-entropy)
adversarial_loss = nn.BCELoss()

# Hyperparameters
num_epochs = 10000  # upper bound on the number of epochs
stop_epoch = 1100   # training stops early once this epoch index has finished
batch_size = 64

# Dataloader for phishing data
phishing_dataset = PhishingDataset(data)
dataloader = DataLoader(phishing_dataset, batch_size=batch_size, shuffle=True)

# Training loop for GAN
for epoch in range(num_epochs):
    for real_samples in dataloader:
        real_samples = real_samples.to(device)
        # The final batch can be smaller than `batch_size`; keep its size under
        # a separate name so the hyperparameter is not overwritten.
        current_batch = real_samples.size(0)

        real_labels = torch.ones(current_batch, 1, device=device)
        fake_labels = torch.zeros(current_batch, 1, device=device)

        # --- Train Discriminator ---
        # Real phishing samples should be scored as 1.
        d_loss_real = adversarial_loss(discriminator(real_samples), real_labels)

        # Generated samples should be scored as 0. detach() stops this update
        # from back-propagating into the generator, whose gradients would be
        # discarded anyway.
        z = torch.randn(current_batch, latent_dim, device=device)
        fake_samples = generator(z).detach()
        d_loss_fake = adversarial_loss(discriminator(fake_samples), fake_labels)

        # Total loss for Discriminator
        d_loss = (d_loss_real + d_loss_fake) / 2

        optimizer_D.zero_grad()
        d_loss.backward()
        optimizer_D.step()

        # --- Train Generator ---
        # The generator is rewarded when the discriminator labels its output as real.
        z = torch.randn(current_batch, latent_dim, device=device)
        generated_samples = generator(z)
        g_loss = adversarial_loss(discriminator(generated_samples), real_labels)

        optimizer_G.zero_grad()
        g_loss.backward()
        optimizer_G.step()

    # Losses reported here are those of the last batch of the epoch.
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss D: {d_loss.item()}, Loss G: {g_loss.item()}")
    if epoch == stop_epoch:
        print("Trained successfully")
        break

# After the training loop
# Generate 2000 new phishing samples; no gradients are needed for sampling.
with torch.no_grad():
    z = torch.randn(2000, latent_dim, device=device)
    generated_phishing_samples = generator(z).cpu().numpy()

# Apply threshold to convert probabilities to binary values
# NOTE(review): this makes every column 0/1, including columns such as
# URL_Depth whose real values go above 1.
binary_samples = (generated_phishing_samples > 0.5).astype(int)

# Save the generated binary samples to CSV
# NOTE(review): the file is written relative to the working directory, whereas
# a later cell reads '/content/data_folder/GAN_phishing_samples.csv' — confirm
# the file ends up there.
generated_phishing_df = pd.DataFrame(binary_samples, columns=phishing_data.columns)
generated_phishing_df.to_csv('GAN_phishing_samples.csv', index=False)


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the legitimate dataset
legitimate = pd.read_csv('/content/data_folder/legit_data.csv')

# Drop the 'Domain' column before converting to an array.
legitimate_data = legitimate.drop(columns=['Domain'])

# Convert the data to a NumPy array for easier handling.
# Note: this rebinds `data`, which held the phishing array in the previous GAN cell.
data = legitimate_data.to_numpy()

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LegitimateDataset(Dataset):
    """In-memory dataset that serves rows of a feature table as float32 tensors."""

    def __init__(self, data):
        # Convert once up front so later indexing is a plain tensor lookup.
        self.data = torch.tensor(data, dtype=torch.float32)

    def __getitem__(self, idx):
        # One sample is whatever the stored tensor returns for `idx`.
        return self.data[idx]

    def __len__(self):
        # Length of the stored tensor along its first dimension.
        return len(self.data)

# Generator: maps latent noise vectors to synthetic feature rows
# (same layer stack as the Generator defined in the phishing GAN cell)
class Generator(nn.Module):
    """Fully connected generator whose outputs lie in [0, 1].

    Args:
        input_dim: Size of the latent noise vector.
        output_dim: Number of values emitted per generated sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
            nn.Sigmoid(),  # squashes every output into [0, 1]
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return the generated batch for latent input ``z``."""
        generated = self.model(z)
        return generated

# Discriminator: scores how likely a feature row is to be real
# (same layer stack as the Discriminator defined in the phishing GAN cell)
class Discriminator(nn.Module):
    """Fully connected binary classifier emitting one probability per sample.

    Args:
        input_dim: Number of values in each input sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),  # output is a probability, as required by BCELoss
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the real-vs-fake score for each row of ``x``."""
        score = self.model(x)
        return score

# Set dimensions
latent_dim = 100  # Size of the latent space (random noise)
data_dim = data.shape[1]  # Number of columns in `data`; the generator emits one value per column

# Seed PyTorch so weight initialisation, batch order and sampled noise repeat
# from run to run on the same setup.
torch.manual_seed(42)

# Instantiate the networks on the selected device
generator = Generator(latent_dim, data_dim).to(device)
discriminator = Discriminator(data_dim).to(device)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# Loss function (binary cross-entropy)
adversarial_loss = nn.BCELoss()

# Hyperparameters
num_epochs = 10000  # upper bound on the number of epochs
stop_epoch = 1500   # training stops early once this epoch index has finished
batch_size = 64

# Dataloader for legitimate data
legitimate_dataset = LegitimateDataset(data)
dataloader = DataLoader(legitimate_dataset, batch_size=batch_size, shuffle=True)

# Training loop for GAN
for epoch in range(num_epochs):
    for real_samples in dataloader:
        real_samples = real_samples.to(device)
        # The final batch can be smaller than `batch_size`; keep its size under
        # a separate name so the hyperparameter is not overwritten.
        current_batch = real_samples.size(0)

        # Labels and noise are created directly on the target device instead of
        # being built on the CPU and copied over.
        real_labels = torch.ones(current_batch, 1, device=device)
        fake_labels = torch.zeros(current_batch, 1, device=device)

        # --- Train Discriminator ---
        # Real legitimate samples should be scored as 1.
        d_loss_real = adversarial_loss(discriminator(real_samples), real_labels)

        # Generated samples should be scored as 0. detach() stops this update
        # from back-propagating into the generator, whose gradients would be
        # discarded anyway.
        z = torch.randn(current_batch, latent_dim, device=device)
        fake_samples = generator(z).detach()
        d_loss_fake = adversarial_loss(discriminator(fake_samples), fake_labels)

        # Total loss for Discriminator
        d_loss = (d_loss_real + d_loss_fake) / 2

        optimizer_D.zero_grad()
        d_loss.backward()
        optimizer_D.step()

        # --- Train Generator ---
        # The generator is rewarded when the discriminator labels its output as real.
        z = torch.randn(current_batch, latent_dim, device=device)
        generated_samples = generator(z)
        g_loss = adversarial_loss(discriminator(generated_samples), real_labels)

        optimizer_G.zero_grad()
        g_loss.backward()
        optimizer_G.step()

    # Losses reported here are those of the last batch of the epoch.
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss D: {d_loss.item()}, Loss G: {g_loss.item()}")
    if epoch == stop_epoch:
        print("Trained successfully")
        break

# Generate new legitimate samples; no gradients are needed for sampling.
with torch.no_grad():
    z = torch.randn(2000, latent_dim, device=device)
    generated_legitimate_samples = generator(z).cpu().numpy()

# Apply threshold to convert probabilities to binary values
# NOTE(review): this makes every column 0/1, including columns such as
# URL_Depth whose real values go above 1.
binary_samples = (generated_legitimate_samples > 0.5).astype(int)

# Save the generated binary samples to CSV
# NOTE(review): the file is written relative to the working directory, whereas
# a later cell reads '/content/data_folder/GAN_legitimate_samples.csv' —
# confirm the file ends up there.
generated_legitimate_df = pd.DataFrame(binary_samples, columns=legitimate_data.columns)
generated_legitimate_df.to_csv('GAN_legitimate_samples.csv', index=False)


In [None]:
# Reload the synthetic phishing samples from CSV.
# NOTE(review): the GAN cell writes 'GAN_phishing_samples.csv' relative to the working
# directory, while this reads from '/content/data_folder/' — confirm the file was placed there.
generated_phishing_df = pd.read_csv('/content/data_folder/GAN_phishing_samples.csv')

In [None]:
# Reload the saved synthetic legitimate samples and discard their generated
# 'Label' column; labels are assigned explicitly when the datasets are merged.
generated_legitimate_df = (
    pd.read_csv('/content/data_folder/GAN_legitimate_samples.csv')
    .drop(columns=['Label'])
)

In [None]:
# Remove the generated 'Label' column; labels are assigned explicitly below.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
generated_phishing_df = generated_phishing_df.drop(columns=['Label'], errors='ignore')

# Features and labels from the original dataset ('Domain' and 'Label' are not features).
x_original = df.drop(columns=['Domain', 'Label'])
y_original = df['Label']

# Rows are stacked by position, not by column name (the generated phishing
# table names one column 'Tiny_URL' while the others use 'TinyURL'), so every
# table must have the same number of feature columns in the same order.
assert (
    x_original.shape[1]
    == generated_phishing_df.shape[1]
    == generated_legitimate_df.shape[1]
), "original and generated feature tables must have the same number of columns"

# Original rows first, then synthetic phishing rows, then synthetic legitimate rows.
x_combined = np.vstack((x_original, generated_phishing_df, generated_legitimate_df))
y_combined = np.hstack((
    y_original,
    np.ones(generated_phishing_df.shape[0], dtype=int),     # 1 = phishing
    np.zeros(generated_legitimate_df.shape[0], dtype=int),  # 0 = legitimate
))

# x_combined now holds the features of the original and synthetic samples;
# y_combined holds the matching labels (1 for phishing, 0 for legitimate).

# Now, x_combined contains the features of both phishing and legitimate samples
# y_combined contains labels (1 for phishing, 0 for legitimate)


In [None]:
# Debug check: print a zero vector with one entry per generated phishing row.
# NOTE(review): the phishing rows are labelled with ones when the datasets are combined,
# so this output is not used anywhere in the visible cells.
print(np.zeros(generated_phishing_df.shape[0], dtype=int))

[0 0 0 ... 0 0 0]


In [None]:
# Inspect the synthetic phishing samples (after 'Label' was dropped in the combine cell).
generated_phishing_df

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,Tiny_URL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards
0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0
1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0
2,0,0,0,1,0,0,1,0,0,1,1,1,0,0,1,1
3,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,1,0,0,1,0,0,1,1,1,0,0,1,0
1996,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0
1997,0,0,0,1,0,0,1,0,0,1,1,1,0,0,1,1
1998,0,0,0,1,0,0,1,0,0,1,1,1,0,0,1,1


In [None]:
# Inspect the synthetic legitimate samples (their 'Label' column was dropped on load).
generated_legitimate_df

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards
0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
2,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
3,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
4,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
1996,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
1997,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
1998,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1


In [None]:
# Shuffle features and labels together with a fixed seed so the appended
# synthetic rows are no longer grouped at the end.
# Note: the shuffled features are stored under the capitalised name `X_combined`,
# while the unshuffled array stays in `x_combined`.
X_combined, y_combined = shuffle(x_combined, y_combined, random_state=42)

In [None]:
# Split the shuffled, augmented dataset into 80% train / 20% test, stratified on the label.
x_train_combined, x_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y_combined, test_size=0.2, stratify=y_combined, random_state=2
)
# train_test_split returns the splits as one flat sequence in the order
# (X_train, X_test, y_train, y_test), which is why four names are unpacked above.
# Show the resulting shapes as a sanity check.
x_train_combined.shape, x_test_combined.shape, y_train_combined.shape, y_test_combined.shape

((11200, 16), (2800, 16), (11200,), (2800,))

In [None]:
# Retrain logistic regression on the GAN-augmented training split.
# max_iter=1000 matches the baseline logistic regression trained earlier, so
# the two runs are comparable and the solver has room to converge.
log_reg_upd = LogisticRegression(max_iter=1000)
log_reg_upd.fit(x_train_combined, y_train_combined)

In [None]:
# Score the retrained logistic regression on the held-out augmented split:
# per-class report first, overall accuracy on the following line.
y_pred = log_reg_upd.predict(x_test_combined)
print(
    classification_report(y_test_combined, y_pred),
    accuracy_score(y_test_combined, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.81      0.89      0.85      1400
           1       0.88      0.79      0.83      1400

    accuracy                           0.84      2800
   macro avg       0.84      0.84      0.84      2800
weighted avg       0.84      0.84      0.84      2800

0.8396428571428571


In [None]:
# Retrain the decision tree on the GAN-augmented training split.
# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, repeatable across runs.
decision_upd = DecisionTreeClassifier(random_state=42)
decision_upd.fit(x_train_combined, y_train_combined)

In [None]:
# Score the retrained decision tree on the held-out augmented split:
# per-class report first, overall accuracy on the following line.
y_pred = decision_upd.predict(x_test_combined)
print(
    classification_report(y_test_combined, y_pred),
    accuracy_score(y_test_combined, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      1400
           1       0.94      0.87      0.91      1400

    accuracy                           0.91      2800
   macro avg       0.91      0.91      0.91      2800
weighted avg       0.91      0.91      0.91      2800

0.91


In [None]:
# Retrain the random forest on the GAN-augmented training split.
# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the reported metrics, repeatable across runs.
rf_classifier_upd = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_upd.fit(x_train_combined, y_train_combined)

In [None]:
# Score the retrained random forest on the held-out augmented split:
# per-class report first, overall accuracy on the following line.
y_pred = rf_classifier_upd.predict(x_test_combined)
print(
    classification_report(y_test_combined, y_pred),
    accuracy_score(y_test_combined, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1400
           1       0.94      0.88      0.91      1400

    accuracy                           0.91      2800
   macro avg       0.91      0.91      0.91      2800
weighted avg       0.91      0.91      0.91      2800

0.91


In [None]:
# Fit a support-vector classifier with default settings on the augmented training split.
svm_classifier_upd = SVC()
svm_classifier_upd.fit(X=x_train_combined, y=y_train_combined)

In [None]:
# Score the retrained SVM on the held-out augmented split:
# per-class report first, overall accuracy on the following line.
y_pred = svm_classifier_upd.predict(x_test_combined)
print(
    classification_report(y_test_combined, y_pred),
    accuracy_score(y_test_combined, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.82      0.98      0.89      1400
           1       0.97      0.79      0.87      1400

    accuracy                           0.89      2800
   macro avg       0.90      0.89      0.88      2800
weighted avg       0.90      0.89      0.88      2800

0.885


In [None]:
# Fit Gaussian naive Bayes with default settings on the augmented training split.
nb_classifier_upd = GaussianNB()
nb_classifier_upd.fit(X=x_train_combined, y=y_train_combined)

In [None]:
# Score the retrained naive Bayes model on the held-out augmented split:
# per-class report first, overall accuracy on the following line.
y_pred = nb_classifier_upd.predict(x_test_combined)
print(
    classification_report(y_test_combined, y_pred),
    accuracy_score(y_test_combined, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.77      0.99      0.86      1400
           1       0.99      0.70      0.82      1400

    accuracy                           0.84      2800
   macro avg       0.88      0.84      0.84      2800
weighted avg       0.88      0.84      0.84      2800

0.8442857142857143


In [None]:
# Fit a 5-nearest-neighbours classifier on the augmented training split.
knn_classifier_upd = KNeighborsClassifier(n_neighbors=5)
knn_classifier_upd.fit(X=x_train_combined, y=y_train_combined)

In [None]:
# Score the retrained k-NN model on the held-out augmented split:
# per-class report first, overall accuracy on the following line.
y_pred = knn_classifier_upd.predict(x_test_combined)
print(
    classification_report(y_test_combined, y_pred),
    accuracy_score(y_test_combined, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      1400
           1       0.95      0.83      0.88      1400

    accuracy                           0.89      2800
   macro avg       0.90      0.89      0.89      2800
weighted avg       0.90      0.89      0.89      2800

0.8903571428571428
