In [4]:
import pandas as pd

# Load the dataset

In [5]:
df = pd.read_csv("websites.csv")
df.head()

Unnamed: 0,rec_id,url,website,result,created_date
0,1,http://intego3.info/EXEL/index.php,1613573972338075.html,1,2021-02-17 20:29:32
1,2,https://www.mathopenref.com/segment.html,1635698138155948.html,0,2021-10-31 16:35:38
2,3,https://www.computerhope.com/issues/ch000254.htm,1635699228889266.html,0,2021-10-31 16:53:48
3,4,https://www.investopedia.com/terms/n/next-elev...,1635750062162701.html,0,2021-11-01 12:31:02
4,5,https://jobs.emss.org.uk/lcc.aspx,161356510250721.html,0,2021-02-17 18:01:42


In [6]:
df = df[['url', 'result']]
#df = df[['url', 'result']].iloc[:1000]   The whole dataset takes too much time
df.head()

Unnamed: 0,url,result
0,http://intego3.info/EXEL/index.php,1
1,https://www.mathopenref.com/segment.html,0
2,https://www.computerhope.com/issues/ch000254.htm,0
3,https://www.investopedia.com/terms/n/next-elev...,0
4,https://jobs.emss.org.uk/lcc.aspx,0


# Data Augmentation
Implement rule-based augmentation by modifying existing phishing and legitimate URLs. The goal is to increase the dataset by 30% augmented data. The ratio will be maintained. If there are 1000 phishing websites in the dataset, 300 new urls of phishing sites will be created, on the other hand if there are 2000 legit websites in the original dataset, 600 new urls of the legits sites will be created.

In [7]:
import os
import random

# Split phishing and legit URLs
phishing_urls = df[df["result"] == 1]["url"].tolist()
legit_urls = df[df["result"] == 0]["url"].tolist()

# Create the "Augmented Data" folder if it doesn't exist
os.makedirs("Augmented Data", exist_ok=True)

# Rule-based URL Augmentation Functions
def augment_phishing_url(url):
    """Generate variations of phishing URLs."""
    variations = [
        url.replace(".", "-"),  # Replace dots with hyphens
        "secure-" + url,  # Add a deceptive prefix
        url.replace("login", "verify"),  # Swap keywords
        url + "/update-info",  # Append fake path
        url.replace("http://", "https://"),  # Enforce HTTPS
        "www-" + url,  # Add 'www-' prefix
        url.replace("bank", "securebank"),  # Modify known phishing words
    ]
    return random.choice(variations)

def augment_legit_url(url):
    """Generate variations of legitimate URLs."""
    variations = [
        url.replace("www.", "blog."),  # Change subdomain
        url + "?ref=homepage",  # Add query parameter
        url.replace("http://", "https://"),  # Enforce HTTPS
        "https://news-" + url.replace("https://", ""),  # Add a news subdomain
        url + "/contact-us",  # Append an extra path
    ]
    return random.choice(variations)

# Generate Augmented URLs
augmented_phishing_urls = [augment_phishing_url(url) for url in phishing_urls[:len(phishing_urls) // 3]]
augmented_legit_urls = [augment_legit_url(url) for url in legit_urls[:len(legit_urls) // 3]]

# Create DataFrame for Augmented Data
augmented_data = pd.DataFrame({
    "url": augmented_phishing_urls + augmented_legit_urls,
    "result": [1] * len(augmented_phishing_urls) + [0] * len(augmented_legit_urls)
})

# Save Augmented Data Separately
augmented_data.to_csv("Augmented Data/augmented_urls.csv", index=False)

# Save Label Info
with open("Augmented Data/labels_info.txt", "w") as f:
    f.write("Augmented phishing URLs: {}\n".format(len(augmented_phishing_urls)))
    f.write("Augmented legitimate URLs: {}\n".format(len(augmented_legit_urls)))
    f.write("Total augmented URLs: {}\n".format(len(augmented_data)))

print("Augmented data saved in 'Augmented Data/' folder.")


Augmented data saved in 'Augmented Data/' folder.


# Let's take a look at the Augmented Data

In [8]:
augmented_data.head(5)

Unnamed: 0,url,result
0,https://intego3.info/EXEL/index.php,1
1,secure-https://paribas-biznesplanet-logowanie.com,1
2,https://easc.do/abc/pre_qualify.php/update-info,1
3,https://www.tontonfree-getxx8.duckdns.org/,1
4,https://sms-labanquepostale-sms-labanquepostal...,1


# Data Preprocessing

In [9]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer

# Combine Both Datasets
df = pd.concat([df, augmented_data]).reset_index(drop=True)
df.shape



(106666, 2)

**Removing duplicate rows after combining**

In [10]:
# Remove Duplicate URLs
df = df.drop_duplicates(subset=["url"], keep="first").reset_index(drop=True)

df.shape

(98767, 2)

In [11]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

**Tokenizing Inputs**

In [12]:
import pickle
# Initialize BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize URLs
def tokenize_urls(urls):
    return tokenizer(
        urls.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Extract URLs and Labels
X = df["url"].astype(str)
y = df["result"].values

# 5-Fold Cross-Validation Split
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store folds
folds = []
for train_idx, test_idx in kf.split(X, y):
    folds.append((train_idx, test_idx))

# Save preprocessed dataset
df.to_csv("preprocessed_dataset.csv", index=False)

# Save folds using pickle
with open("folds.pkl", "wb") as f:
    pickle.dump(folds, f)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# BERT-based model

In [13]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification

# Define the BERT-based classification model
class BERTClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(BERTClassifier, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

    def forward(self, input_ids, attention_mask):
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model
model = BERTClassifier().to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train the model using 5 fold cross validation

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset

# Function to calculate accuracy, precision, recall, and F1-score
def evaluate_model(predictions, labels):
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    return acc, precision, recall, f1

# 5-Fold Cross-Validation
results = {"accuracy": [], "precision": [], "recall": [], "f1": []}

for fold, (train_idx, test_idx) in enumerate(folds):
    print(f"Training fold {fold+1}/5...")

    # Split data into train and test sets
    train_urls, train_labels = X.iloc[train_idx], y[train_idx]
    test_urls, test_labels = X.iloc[test_idx], y[test_idx]

    # Tokenize the data
    train_encodings = tokenize_urls(train_urls)
    test_encodings = tokenize_urls(test_urls)

    # Convert to DataLoader format
    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
    test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels))

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # Set up the model for this fold
    model = BERTClassifier().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    # Train the model
    model.train()
    for epoch in range(1):  # 1 epochs
        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    acc, precision, recall, f1 = evaluate_model(all_preds, all_labels)
    results["accuracy"].append(acc)
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1"].append(f1)

    print(f"Fold {fold+1} Results - Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Calculate average metrics
avg_acc = sum(results["accuracy"]) / 5
avg_precision = sum(results["precision"]) / 5
avg_recall = sum(results["recall"]) / 5
avg_f1 = sum(results["f1"]) / 5

print("\nAverage Results Across 5 Folds:")
print(f"Accuracy: {avg_acc:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1-Score: {avg_f1:.4f}")


Training fold 1/5...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Results - Accuracy: 0.9774, Precision: 0.9773, Recall: 0.9613, F1: 0.9693
Training fold 2/5...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2 Results - Accuracy: 0.9751, Precision: 0.9657, Recall: 0.9672, F1: 0.9665
Training fold 3/5...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 3 Results - Accuracy: 0.9761, Precision: 0.9569, Recall: 0.9796, F1: 0.9681
Training fold 4/5...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 4 Results - Accuracy: 0.9721, Precision: 0.9782, Recall: 0.9456, F1: 0.9616
Training fold 5/5...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 5 Results - Accuracy: 0.9758, Precision: 0.9618, Recall: 0.9733, F1: 0.9675

Average Results Across 5 Folds:
Accuracy: 0.9753
Precision: 0.9680
Recall: 0.9654
F1-Score: 0.9666


# Saving the trained model

In [15]:
model_save_path = "bert_model.pt"

# Save the trained model
torch.save(model.state_dict(), model_save_path)

print(f"Model saved as {model_save_path}")


Model saved as bert_model.pt
