In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [81]:
# Load the dataset
data = pd.read_csv("dataset_phishing.csv")

# Selecting features and target
X = data.drop(columns=['url', 'status'])  # Drop non-numerical and target column
y = data['status']  # Target column

# Encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the data into training and testing sets
random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=random_seed, stratify=y_encoded
)

# Converting to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Creating DataLoader for batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [82]:
class PhishingDetectionModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PhishingDetectionModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(0.35),  
            nn.Linear(hidden_size, hidden_size // 2),  # Reduce neurons in second layer
            nn.BatchNorm1d(hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, output_size)
        )
    
    def forward(self, x):
        return self.model(x)

# Model parameters
input_size = X_train.shape[1]
hidden_size = 64
output_size = len(label_encoder.classes_)

model = PhishingDetectionModel(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.01)  # L2 regularization


In [83]:
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, save_path):
    best_f1 = 0.0  # Track the best F1 score

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        all_labels = []
        all_preds = []

        # Training phase
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Collect predictions and true labels for F1 score and accuracy
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(y_batch.numpy())
            all_preds.extend(predicted.numpy())
        
        # Calculate F1 score and accuracy for the epoch
        epoch_f1 = f1_score(all_labels, all_preds, average='macro')
        epoch_accuracy = accuracy_score(all_labels, all_preds)
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, "
              f"F1 Score: {epoch_f1:.4f}, Accuracy: {epoch_accuracy:.4f}")

        # Evaluation phase
        model.eval()
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                outputs = model(X_batch)
                _, predicted = torch.max(outputs, 1)

                # Collect predictions and true labels for F1 score and accuracy
                all_labels.extend(y_batch.numpy())
                all_preds.extend(predicted.numpy())
        
        # Calculate F1 score and accuracy
        f1 = f1_score(all_labels, all_preds, average='macro')
        accuracy = accuracy_score(all_labels, all_preds)

        print(f"Test Accuracy: {accuracy * 100:.2f}%")
        print(f"Test F1 Score: {f1:.4f}")        

        # Save the model if it performs better
        if f1 > best_f1:
            best_f1 = f1
            best_epoch = epoch
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with F1 Score: {f1:.4f} at epoch {epoch}")

    print(f'Best modal saved at epoch {best_epoch} with F1 score of {best_f1}')
# Parameters for training
num_epochs = 30
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
save_path = "best_model.pth"

In [84]:
# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, save_path)

Epoch 1/30, Loss: 0.2883, F1 Score: 0.8902, Accuracy: 0.8903
Test Accuracy: 93.70%
Test F1 Score: 0.9370
New best model saved with F1 Score: 0.9370 at epoch 0
Epoch 2/30, Loss: 0.1692, F1 Score: 0.9389, Accuracy: 0.9389
Test Accuracy: 94.23%
Test F1 Score: 0.9422
New best model saved with F1 Score: 0.9422 at epoch 1
Epoch 3/30, Loss: 0.1527, F1 Score: 0.9442, Accuracy: 0.9442
Test Accuracy: 94.49%
Test F1 Score: 0.9449
New best model saved with F1 Score: 0.9449 at epoch 2
Epoch 4/30, Loss: 0.1488, F1 Score: 0.9486, Accuracy: 0.9486
Test Accuracy: 94.93%
Test F1 Score: 0.9493
New best model saved with F1 Score: 0.9493 at epoch 3
Epoch 5/30, Loss: 0.1405, F1 Score: 0.9488, Accuracy: 0.9488
Test Accuracy: 94.53%
Test F1 Score: 0.9453
Epoch 6/30, Loss: 0.1420, F1 Score: 0.9483, Accuracy: 0.9483
Test Accuracy: 95.10%
Test F1 Score: 0.9510
New best model saved with F1 Score: 0.9510 at epoch 5
Epoch 7/30, Loss: 0.1348, F1 Score: 0.9510, Accuracy: 0.9510
Test Accuracy: 95.32%
Test F1 Score: 0.

In [87]:
model.load_state_dict(torch.load("best_model.pth"))
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)

        # Collect predictions and true labels for F1 score and accuracy
        all_labels.extend(y_batch.numpy())
        all_preds.extend(predicted.numpy())

# Calculate F1 score and accuracy
f1 = f1_score(all_labels, all_preds, average='macro')
accuracy = accuracy_score(all_labels, all_preds)

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test F1 Score: {f1:.4f}")

Test Accuracy: 96.28%
Test F1 Score: 0.9628


  model.load_state_dict(torch.load("best_model.pth"))


In [88]:
# Initialize the model
model = PhishingDetectionModel(input_size, hidden_size, output_size)

# Load the saved weights
model_path = "best_model.pth"  # Replace with your .pth file path
model.load_state_dict(torch.load(model_path))

# Access weights
state_dict = model.state_dict()

# Print weights and biases
for name, param in state_dict.items():
    print(f"Layer: {name}, Shape: {param.shape}")
    print(param)


Layer: model.0.weight, Shape: torch.Size([64, 87])
tensor([[ 0.0830, -0.0645, -0.1006,  ...,  0.0512, -0.2791,  0.0336],
        [-0.0550, -0.1966, -0.0790,  ..., -0.0518, -0.2835,  0.1668],
        [ 0.0317,  0.1991, -0.0049,  ...,  0.0760,  0.1687, -0.3109],
        ...,
        [-0.2633,  0.1097, -0.0089,  ...,  0.1002,  0.1717, -0.2584],
        [-0.0791, -0.0758, -0.0759,  ...,  0.0527, -0.2274,  0.0700],
        [-0.0737,  0.0762, -0.1433,  ..., -0.0618, -0.0219,  0.2484]])
Layer: model.0.bias, Shape: torch.Size([64])
tensor([-0.0373, -0.0236, -0.1029,  0.0289,  0.0922, -0.0760, -0.0890, -0.0039,
        -0.0154, -0.0137,  0.0940, -0.0688,  0.0258,  0.0375,  0.0122,  0.0642,
        -0.0064,  0.0329, -0.0574,  0.0960, -0.0708, -0.0555,  0.0502, -0.1035,
         0.0762, -0.0793, -0.0939, -0.0526, -0.0917, -0.0490,  0.0981, -0.0757,
        -0.0094,  0.0409, -0.1114,  0.0966, -0.0503,  0.0908, -0.1036,  0.0896,
         0.0910,  0.0313,  0.1004,  0.0675, -0.0939, -0.0087, -0.0370,

  model.load_state_dict(torch.load(model_path))
