# **Tak 2: LSTM on Raw URLs**

## **Step 1: Install and Import Required Libraries**

In [1]:
# These installations are only needed if you're using Google Colab or a fresh environment
!pip install torch torchvision torchaudio  # PyTorch for deep learning
!pip install pandas numpy scikit-learn tensorflow keras  # For data handling and tokenization

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch  # Core PyTorch library
import torch.nn as nn  # Tools to build neural networks
import torch.optim as optim  # Optimizers like Adam
import pandas as pd  # DataFrame operations
import numpy as np  # Numerical processing
from torch.utils.data import Dataset, DataLoader  # Efficient data handling for training
from sklearn.model_selection import train_test_split  # Split data into training/testing
from tensorflow.keras.preprocessing.text import Tokenizer  # For character-level tokenizing of URLs
from tensorflow.keras.preprocessing.sequence import pad_sequences  # To pad sequences to same length


## **Step 2: Load and Inspect the Dataset**

In [3]:
df = pd.read_csv('phishing_site_urls.csv')  # Load the dataset containing URLs and labels
print(df.head())  # View the first 5 rows to understand the data format
print(df.info())  # Check for missing values or incorrect data types

# WHY: This step helps ensure the data is correctly loaded and ready for processing.


                                                 URL Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...   bad
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   bad
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....   bad
3  mail.printakid.com/www.online.americanexpress....   bad
4  thewhiskeydregs.com/wp-content/themes/widescre...   bad
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB
None


## **Step 3: Preprocess the Data; Tokenization and Padding**

In [4]:
df['Label'] = df['Label'].apply(lambda x: 1 if x == 'bad' else 0)  # Convert 'bad' to 1 and 'good' to 0

# WHY: Models require numeric labels. We assign 1 for phishing (bad) and 0 for legitimate (good).

In [5]:
df = df.drop_duplicates()  # Remove any duplicate rows

# WHY: Duplicate data can cause overfitting and artificially inflate model accuracy.


In [6]:
X = df['URL'].values  # Extract the URLs as input features
y = df['Label'].values  # Extract the labels (0 or 1) as target

# WHY: Separating features and labels is essential before training a model.


In [7]:
# === STEP 5: Tokenize the URLs ===
tokenizer = Tokenizer(char_level=True)  # Create a tokenizer that works at the character level
tokenizer.fit_on_texts(X)  # Learn character index mapping from the full dataset

# WHY: Tokenizing each URL at the character level helps LSTM learn character patterns (like suspicious substrings).


In [8]:
sequences = tokenizer.texts_to_sequences(X)  # Convert URLs to sequences of integers (characters → numbers)

# WHY: Neural networks can't process raw strings. We convert characters to integer indices.


In [9]:
max_len = 200  # Define maximum length of a URL sequence
X_padded = pad_sequences(sequences, maxlen=max_len, padding='post')  # Pad shorter sequences with zeros

# WHY: LSTMs need fixed-length input. Padding ensures all URLs have the same number of characters.


## **Step 4: Split the Dataset into Training and Testing**

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# WHY: We split data to evaluate how well the model performs on unseen (test) data.

## **Step 5: Convert Data into PyTorch Tensors**

In [11]:
X_train_tensor = torch.tensor(X_train, dtype=torch.long)  # Sequences must be integers (for embedding)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)  # Labels as floats for loss function
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# WHY: PyTorch models work with tensor data. Also, embedding layers need long-type indices.

## **Step 6: Define Custom Dataset and Dataloader for Batching**

In [12]:
class PhishingDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)  # Total number of samples

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]  # Returns one (URL, label) pair

# WHY: This structure allows efficient batch loading using PyTorch's DataLoader.

In [13]:
# Initialize training and testing datasets
train_dataset = PhishingDataset(X_train_tensor, y_train_tensor)
test_dataset = PhishingDataset(X_test_tensor, y_test_tensor)

# Load batches for training and evaluation
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# WHY: DataLoaders automatically handle batching, shuffling, and parallel loading during training.

## **Step 7: Define LSTM Model**

In [14]:
class PhishingClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(PhishingClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Convert token ids to dense vectors
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # Main LSTM layer
        self.fc = nn.Linear(hidden_dim, output_dim)  # Fully connected output layer
        self.sigmoid = nn.Sigmoid()  # Convert raw output to probability (0–1)

    def forward(self, x):
        embedded = self.embedding(x)  # x: [batch_size, seq_len] → embedded: [batch_size, seq_len, emb_dim]
        _, (hidden, _) = self.lstm(embedded)  # Get the final hidden state from LSTM
        out = self.fc(hidden[-1])  # Feed last hidden state to output layer
        return self.sigmoid(out)  # Output a probability between 0 and 1

# WHY: LSTM processes character sequences and learns to recognize phishing patterns. Final sigmoid gives phishing probability.

## **Step 8: Initialize and Train the Model**

In [15]:
# Set model parameters
vocab_size = len(tokenizer.word_index) + 1  # Total number of unique characters + 1 for padding
embedding_dim = 32  # Size of embedding vector for each character
hidden_dim = 64  # Number of hidden units in LSTM
output_dim = 1  # Binary output (phishing or not)

# Initialize model
model = PhishingClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer for efficient learning

# WHY: BCELoss works for binary classification. Adam is a robust and widely used optimizer.

In [16]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()  # Reset gradients
        output = model(data)  # Forward pass
        loss = criterion(output.squeeze(), target)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model weights
        total_loss += loss.item()  # Accumulate loss

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

# WHY: Each epoch trains the model on all training data. We monitor loss to ensure learning.

Epoch 1/10, Loss: 0.1975
Epoch 2/10, Loss: 0.1017
Epoch 3/10, Loss: 0.0812
Epoch 4/10, Loss: 0.0698
Epoch 5/10, Loss: 0.0620
Epoch 6/10, Loss: 0.0563
Epoch 7/10, Loss: 0.0528
Epoch 8/10, Loss: 0.0494
Epoch 9/10, Loss: 0.0469
Epoch 10/10, Loss: 0.0452


## **Step 9: Evaluate the Model on Test Dataset**

In [17]:
model.eval()  # Set model to evaluation mode (disables dropout, etc.)
y_preds = []

with torch.no_grad():  # No gradient computation needed during testing
    for data, _ in test_loader:
        output = model(data)
        predictions = (output.squeeze() > 0.5).int()  # Convert probability to binary label
        y_preds.extend(predictions.numpy())

# WHY: We use 0.5 as threshold to classify output into phishing or not.

In [18]:
# Calculate Accuracy
from sklearn.metrics import accuracy_score
y_test_numpy = y_test_tensor.numpy()
accuracy = accuracy_score(y_test_numpy, y_preds)
print(f"LSTM Model Accuracy: {accuracy * 100:.2f}%")

# WHY: Accuracy tells us how many test samples were correctly classified.

LSTM Model Accuracy: 97.94%
