In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import joblib

In [14]:
# Load the dataset
data = pd.read_csv("dataset_phishing.csv", encoding='unicode_escape')

# Selecting features and target
X = data.drop(columns=['url', 'status'])  # Drop non-numerical and target column
y = data['status']  # Target column

# Encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Standardizing the features
scaler = StandardScaler()
print(X)
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "scaler.pkl")

# Splitting the data into training and testing sets
random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=random_seed, stratify=y_encoded
)

# Converting to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Creating DataLoader for batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


       length_url  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_eq  \
0              37        3           0      0      0       0      0   
1              77        1           0      0      0       0      0   
2             126        4           1      0      1       2      3   
3              18        2           0      0      0       0      0   
4              55        2           2      0      0       0      0   
...           ...      ...         ...    ...    ...     ...    ...   
11425          45        2           0      0      0       0      0   
11426          84        5           0      1      1       0      1   
11427         105        2           6      0      1       0      1   
11428          38        2           0      0      0       0      0   
11429         477       24           0      1      1       9      9   

       nb_underscore  nb_tilde  nb_percent  ...  nb_semicolumn  nb_dollar  \
0                  0         0           0  ...              0        

In [3]:
class PhishingDetectionModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PhishingDetectionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.bn2 = nn.BatchNorm1d(hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, 1)  # Single output for binary classification
        self.dropout = nn.Dropout(0.3)
        self.activation = nn.ReLU()  # Activation for hidden layers

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc3(x)  # Output logits (raw scores)
        return x

# Model parameters
input_size = X_train.shape[1]
hidden_size = 64
model = PhishingDetectionModel(input_size, hidden_size)

In [4]:
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, save_path):
    best_f1 = 0.0  # Track the best F1 score

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        all_labels = []
        all_preds = []

        # Training phase
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)  # Raw logits

            # Calculate loss
            loss = criterion(outputs, y_batch.unsqueeze(1).float())  # Match output and label shapes
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Convert logits to probabilities
            probabilities = torch.sigmoid(outputs)

            # Apply threshold to get binary predictions
            predicted = (probabilities >= 0.5).int()

            # Collect predictions and true labels for F1 score and accuracy
            all_labels.extend(y_batch.numpy())
            all_preds.extend(predicted.numpy().flatten())
        
        # Calculate F1 score and accuracy for the epoch
        epoch_f1 = f1_score(all_labels, all_preds)
        epoch_accuracy = accuracy_score(all_labels, all_preds)
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, "
              f"F1 Score: {epoch_f1:.4f}, Accuracy: {epoch_accuracy:.4f}")

        # Evaluation phase
        model.eval()
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                outputs = model(X_batch)

                # Convert logits to probabilities
                probabilities = torch.sigmoid(outputs)

                # Apply threshold to get binary predictions
                predicted = (probabilities >= 0.5).int()

                # Collect predictions and true labels for F1 score and accuracy
                all_labels.extend(y_batch.numpy())
                all_preds.extend(predicted.numpy().flatten())
        
        # Calculate F1 score and accuracy
        f1 = f1_score(all_labels, all_preds)
        accuracy = accuracy_score(all_labels, all_preds)

        print(f"Test Accuracy: {accuracy * 100:.2f}%")
        print(f"Test F1 Score: {f1:.4f}")        

        # Save the model if it performs better
        if f1 > best_f1:
            best_f1 = f1
            best_epoch = epoch
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with F1 Score: {f1:.4f} at epoch {epoch + 1}")

    print(f'Best model saved at epoch {best_epoch + 1} with F1 score of {best_f1:.4f}')

# Parameters for training
num_epochs = 30
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
save_path = "best_model.pth"

In [5]:
# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, save_path)

Epoch 1/30, Loss: 0.4893, F1 Score: 0.7555, Accuracy: 0.7626
Test Accuracy: 80.80%
Test F1 Score: 0.8133
New best model saved with F1 Score: 0.8133 at epoch 1
Epoch 2/30, Loss: 0.4312, F1 Score: 0.8042, Accuracy: 0.7990
Test Accuracy: 82.02%
Test F1 Score: 0.8281
New best model saved with F1 Score: 0.8281 at epoch 2
Epoch 3/30, Loss: 0.4186, F1 Score: 0.8094, Accuracy: 0.8035
Test Accuracy: 81.67%
Test F1 Score: 0.8299
New best model saved with F1 Score: 0.8299 at epoch 3
Epoch 4/30, Loss: 0.4188, F1 Score: 0.8099, Accuracy: 0.8060
Test Accuracy: 81.76%
Test F1 Score: 0.8203
Epoch 5/30, Loss: 0.4126, F1 Score: 0.8125, Accuracy: 0.8072
Test Accuracy: 81.63%
Test F1 Score: 0.8191
Epoch 6/30, Loss: 0.4114, F1 Score: 0.8143, Accuracy: 0.8100
Test Accuracy: 82.02%
Test F1 Score: 0.8222
Epoch 7/30, Loss: 0.4088, F1 Score: 0.8154, Accuracy: 0.8111
Test Accuracy: 81.98%
Test F1 Score: 0.8105
Epoch 8/30, Loss: 0.4072, F1 Score: 0.8186, Accuracy: 0.8147
Test Accuracy: 82.20%
Test F1 Score: 0.830

In [7]:
model.load_state_dict(torch.load("best_model.pth"))
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        probabilities = torch.sigmoid(outputs)  # Shape: (batch_size, 1)
        predicted = (probabilities >= 0.5).int()  # Shape: (batch_size, 1)
        all_labels.extend(y_batch.numpy())  # True labels
        all_preds.extend(predicted.numpy().flatten())  # Predicted binary labels

# Calculate F1 score and accuracy
f1 = f1_score(all_labels, all_preds)
accuracy = accuracy_score(all_labels, all_preds)

# Calculate confusion matrix
cm = confusion_matrix(all_labels, all_preds)

# Extract TP, TN, FP, FN from the confusion matrix
if cm.shape == (2, 2):  # Binary classification
    TN, FP, FN, TP = cm.ravel()
else:  # Multiclass: calculate separately for each class
    TP = cm.diagonal()  # True positives for each class
    FP = cm.sum(axis=0) - TP  # False positives for each class
    FN = cm.sum(axis=1) - TP  # False negatives for each class
    TN = cm.sum() - (FP + FN + TP)  # True negatives for each class

# Print results
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test F1 Score: {f1:.4f}")
print("\nClassification Metrics:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f'Total Samples: {TP + FP + TN + FN}')



tensor([[ 1.2867],
        [ 1.5979],
        [ 0.4923],
        [ 0.5326],
        [-2.4490],
        [-1.9556],
        [-0.0795],
        [ 4.3311],
        [-1.0993],
        [ 2.5815],
        [ 0.5972],
        [-2.0470],
        [ 1.9960],
        [ 0.6216],
        [ 3.0627],
        [-1.4423],
        [-1.2284],
        [ 2.5661],
        [-0.3619],
        [-0.8787],
        [-0.6366],
        [ 1.7347],
        [ 2.9025],
        [-1.1887],
        [-1.0278],
        [ 2.9196],
        [ 2.4373],
        [ 0.6556],
        [ 0.2340],
        [ 2.9460],
        [-1.7679],
        [ 0.5281]])
tensor([[ 5.1820],
        [ 0.3782],
        [ 1.7139],
        [ 5.0363],
        [-1.2024],
        [ 1.8374],
        [ 0.3117],
        [-1.5422],
        [ 4.2922],
        [-2.4108],
        [-0.1141],
        [ 2.4646],
        [-0.2009],
        [ 2.2826],
        [-2.2606],
        [ 1.2844],
        [-0.4546],
        [ 1.9761],
        [-2.4073],
        [ 1.4483],
        [ 3

  model.load_state_dict(torch.load("best_model.pth"))
