# Neural Network

## Load Data

In [2]:
import pandas as pd

df = pd.read_csv("../Data/phishing_train.csv")
df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,0,1,1
1,-1,-1,1,1,1,-1,0,1,1,1,...,1,1,1,1,0,-1,-1,0,1,-1
2,1,1,1,1,1,1,0,1,-1,1,...,1,1,-1,1,0,-1,-1,0,1,1
3,-1,-1,-1,-1,-1,-1,1,-1,-1,1,...,1,1,1,-1,0,-1,-1,1,1,1
4,-1,-1,1,1,1,-1,-1,-1,-1,1,...,1,1,1,1,1,-1,-1,0,1,-1


## Preprocessing

Because networks initialize with random weights, running the same code twice can generate different results by chance. We created a set_seed() function to ensures that the result is the same everytime we run it.

In [3]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import itertools

def set_seed(seed = 42):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  np.random.seed(seed)
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

set_seed()

The dataset labels are -1 (Phishing) and 1 (Legitimate). Since PyTorch's loss functions expect binary targets to be 0 and 1, we use a lambda function to map -1 to 0. For the tensor Conversion, PyTorch cannot directly read pd Dataframe , so we converted the data into FloatTensors.

In [4]:
X = df.drop('Result', axis=1)
y = df['Result']

# Convert -1 to 0, and keep 1 as 1 -> 0 (Phishing) and 1 (Legitimate)
y = y.apply(lambda x: 0 if x == -1 else 1)

## Neural Network

The dataset has 30 features. We used a Multilayer Perceptron. The input layer has a size of 30 to match the features. The hidden layer is the linear layer, and we will tune the size of this (32, 64, 128) to see which one captures patterns better. The output layer returns 1 value. The loss function we use is BCEWithLogitsLoss using a Sigmoid activation.

In [5]:
class MLP(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(MLP, self).__init__()
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    return x
  
# Hyperparameter Tuning
param_grid = {
    'hidden_size': [32, 64, 128],
    'learning_rate': [0.001, 0.01, 0.1],
    'batch_size': [32, 64, 128]
}

# Combinations
key, values = zip(*param_grid.items())
param_values = [dict(zip(key, v)) for v in itertools.product(*values)]

best_acc = 0.0
best_params = {}

for param in param_values:
  print(f"Validating params: {param}")

  # K-Fold

  k = 5
  kf = KFold(n_splits=k, shuffle=True, random_state=42)
  accuracy_scores = []

  # K-Fold Cross Validation
  for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    X_val = torch.tensor(X_val.values, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    y_val = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)

    # Dataloader
    train_dataset = TensorDataset(X_train, y_train)

    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Call Model
    input_size = X_train.shape[1]
    output_size = 1
    model = MLP(input_size, param['hidden_size'], output_size)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=param['learning_rate'])

    # Training loop
    num_epochs = 100
    for epoch in range(num_epochs):
      model.train()
      for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluation Loop
    model.eval()
    with torch.no_grad():
      y_pred = model(X_val)
      y_test_pred = torch.sigmoid(y_pred).round()
      y_test_pred = y_test_pred.detach().cpu().numpy()
      y_true = y_val.detach().cpu().numpy()

      acc = accuracy_score(y_true, y_test_pred)
      accuracy_scores.append(acc)

    print(f"Fold {fold} - NN Val Accuracy {acc*100:.2f}%")

  avg_acc = np.mean(accuracy_scores)
  print(f"Average Accuracy: {avg_acc*100:.2f}%")
  print()

  if avg_acc > best_acc:
    best_acc = avg_acc
    best_params = param

# Final result
print(f"Best Accuracy: {best_acc*100:.2f}%")
print(f"Best Parameters: {best_params}")

Validating params: {'hidden_size': 32, 'learning_rate': 0.001, 'batch_size': 32}
Fold 1 - NN Val Accuracy 95.03%
Fold 2 - NN Val Accuracy 95.36%
Fold 3 - NN Val Accuracy 95.70%
Fold 4 - NN Val Accuracy 95.59%
Fold 5 - NN Val Accuracy 95.81%
Average Accuracy: 95.50%

Validating params: {'hidden_size': 32, 'learning_rate': 0.001, 'batch_size': 64}
Fold 1 - NN Val Accuracy 95.70%
Fold 2 - NN Val Accuracy 95.53%
Fold 3 - NN Val Accuracy 95.82%
Fold 4 - NN Val Accuracy 95.53%
Fold 5 - NN Val Accuracy 95.87%
Average Accuracy: 95.69%

Validating params: {'hidden_size': 32, 'learning_rate': 0.001, 'batch_size': 128}
Fold 1 - NN Val Accuracy 95.42%
Fold 2 - NN Val Accuracy 95.36%
Fold 3 - NN Val Accuracy 96.33%
Fold 4 - NN Val Accuracy 95.59%
Fold 5 - NN Val Accuracy 95.53%
Average Accuracy: 95.65%

Validating params: {'hidden_size': 32, 'learning_rate': 0.01, 'batch_size': 32}
Fold 1 - NN Val Accuracy 96.10%
Fold 2 - NN Val Accuracy 95.93%
Fold 3 - NN Val Accuracy 97.12%
Fold 4 - NN Val Accura