### LIBRARIES

In [18]:
import torch
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import numpy as np
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader

### DATA LOADING AND VISUALISATION

In [2]:
df = pd.read_csv("Balanced_dataset.csv")
print(df.head())  # Show first 5 rows
print(df.info())  # Get data types and non-null values


                                                 url    type scheme  \
0    https://ethnicelebs.com/eddie-anderson-comedian  benign  https   
1  https://closinglogos.com/page/Sony+Pictures+Ho...  benign  https   
2  https://terezowens.com/golfer-greg-norman-tryi...  benign  https   
3                        https://americannortel.com/  benign  https   
4                               https://welding.org/  benign  https   

               domain subdomain top_level_domain  \
0     ethnicelebs.com      none              com   
1    closinglogos.com      none              com   
2      terezowens.com      none              com   
3  americannortel.com      none              com   
4         welding.org      none              org   

                                                path  path_length  \
0                           /eddie-anderson-comedian           24   
2  /golfer-greg-norman-trying-to-unload-55-millio...           54   
3                                                  / 

In [3]:
df

Unnamed: 0,url,type,scheme,domain,subdomain,top_level_domain,path,path_length,num_path_segments,query_params,num_query_params,has_https,file_extension,has_fragment,has_special_chars_in_path,has_port,port_number,is_ip_address
0,https://ethnicelebs.com/eddie-anderson-comedian,benign,https,ethnicelebs.com,none,com,/eddie-anderson-comedian,24,1,0,0,1,none,0,0,0,0,0
1,https://closinglogos.com/page/Sony+Pictures+Ho...,benign,https,closinglogos.com,none,com,/page/Sony+Pictures+Home+Entertainment+Warning...,53,2,0,0,1,none,0,0,0,0,0
2,https://terezowens.com/golfer-greg-norman-tryi...,benign,https,terezowens.com,none,com,/golfer-greg-norman-trying-to-unload-55-millio...,54,1,0,0,1,none,0,0,0,0,0
3,https://americannortel.com/,benign,https,americannortel.com,none,com,/,1,0,0,0,1,none,0,0,0,0,0
4,https://welding.org/,benign,https,welding.org,none,org,/,1,0,0,0,1,none,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59600,http://archives.li.man.ac.uk/ead/search/?opera...,spam,http,archives.li.man.ac.uk,archives,uk,/ead/search/,12,2,6,6,0,none,0,0,0,0,0
59601,http://amber.ch.ic.ac.uk/archive/all/10253.html,spam,http,amber.ch.ic.ac.uk,amber,uk,/archive/all/10253.html,23,3,0,0,0,html,0,0,0,0,0
59602,http://archive.thisischeshire.co.uk/2000/1/28/...,spam,http,archive.thisischeshire.co.uk,archive,uk,/2000/1/28/225238.html,22,4,0,0,0,html,0,0,0,0,0
59603,http://ads.guardian.co.uk/click.ng/Params.rich...,spam,http,ads.guardian.co.uk,ads,uk,/click.ng/Params.richmedia=yes&location=middle...,128,2,0,0,0,richmedia=yes&location=middle&spacedesc=06&sit...,0,1,0,0,0


In [4]:
df.columns

Index(['url', 'type', 'scheme', 'domain', 'subdomain', 'top_level_domain',
       'path', 'path_length', 'num_path_segments', 'query_params',
       'num_query_params', 'has_https', 'file_extension', 'has_fragment',
       'has_special_chars_in_path', 'has_port', 'port_number',
       'is_ip_address'],
      dtype='object')

In [5]:
df['type'].value_counts()

type
benign        11921
defacement    11921
phishing      11921
malware       11921
spam          11921
Name: count, dtype: int64

In [6]:
df[df.columns][:1]

Unnamed: 0,url,type,scheme,domain,subdomain,top_level_domain,path,path_length,num_path_segments,query_params,num_query_params,has_https,file_extension,has_fragment,has_special_chars_in_path,has_port,port_number,is_ip_address
0,https://ethnicelebs.com/eddie-anderson-comedian,benign,https,ethnicelebs.com,none,com,/eddie-anderson-comedian,24,1,0,0,1,none,0,0,0,0,0


### USING TEXTUAL FEATURES ONLY

In [7]:

df2 = pd.read_csv("Balanced_dataset.csv")
df2 = df2.fillna("")
df2["text"] = df2["url"].astype(str) + " " + df2["scheme"].astype(str) + " " + df2["domain"].astype(str) + " " + \
              df2["subdomain"].astype(str) + " " + df2["top_level_domain"].astype(str) + " " + \
              df2["path"].astype(str) + " " + df2["file_extension"].astype(str)
assert df2["text"].apply(lambda x: isinstance(x, str)).all(), "Error: Some values are not strings!"
print(df2[["text", "type"]].head())


                                                text    type
0  https://ethnicelebs.com/eddie-anderson-comedia...  benign
1  https://closinglogos.com/page/Sony+Pictures+Ho...  benign
2  https://terezowens.com/golfer-greg-norman-tryi...  benign
3  https://americannortel.com/ https americannort...  benign
4  https://welding.org/ https welding.org none or...  benign


In [8]:
df2[["text", "type"]]

Unnamed: 0,text,type
0,https://ethnicelebs.com/eddie-anderson-comedia...,benign
1,https://closinglogos.com/page/Sony+Pictures+Ho...,benign
2,https://terezowens.com/golfer-greg-norman-tryi...,benign
3,https://americannortel.com/ https americannort...,benign
4,https://welding.org/ https welding.org none or...,benign
...,...,...
59600,http://archives.li.man.ac.uk/ead/search/?opera...,spam
59601,http://amber.ch.ic.ac.uk/archive/all/10253.htm...,spam
59602,http://archive.thisischeshire.co.uk/2000/1/28/...,spam
59603,http://ads.guardian.co.uk/click.ng/Params.rich...,spam


### SUBSETING THE DATA

In [21]:

num_samples_per_class = 1000  # Adjust as needed
df_sampled = df2.groupby("type").apply(lambda x: x.sample(min(len(x), num_samples_per_class)))
df_sampled = df_sampled.reset_index(drop=True)

# Check new class distribution
print(Counter(df_sampled["type"]))

Counter({'benign': 1000, 'defacement': 1000, 'malware': 1000, 'phishing': 1000, 'spam': 1000})


In [22]:


# Check if GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


### UTILIZING ALBERTA-BASE-V2 Pre-Trained

In [23]:

model_name = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(text_list):
    return tokenizer(text_list, padding="max_length", truncation=True, return_tensors="pt")
tokenized_data = tokenize_function(df_sampled["text"].tolist())
print(tokenized_data.keys())  # Check available keys (input_ids, attention_mask, etc.)




dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


### DATA LOADER CLASS

In [24]:
class URLDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

label_mapping = {"benign": 0, "defacement": 1, "phishing": 2, "malware": 3, "spam": 4}
df_sampled["label"] = df_sampled["type"].map(label_mapping)

# Create dataset
dataset = URLDataset(tokenized_data, df_sampled["label"].tolist())



### TRAINING AND TESTING SPLIT

In [25]:

from torch.utils.data import random_split

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print(f"Training Samples: {len(train_dataset)}, Validation Samples: {len(test_dataset)}")


Training Samples: 4000, Validation Samples: 1000


In [26]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=5  # Change '2' based on the number of classes in your dataset
).to(device)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### MODEL TRAINER

In [30]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# Define training arguments
training_args = TrainingArguments(
    output_dir="./info sec a2/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./info sec a2/logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=False,  # Don't use fp16 on Ampere GPUs, use bf16 instead
    bf16=True,   # Enable bf16 mixed precision for faster training
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



### MODEL TRAINING

In [31]:
# Train the model
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.311,0.241969,0.917,0.919849,0.917,0.917845
2,0.2315,0.228935,0.921,0.922697,0.921,0.92137
3,0.0837,0.2384,0.938,0.94096,0.938,0.938074


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=750, training_loss=0.24193946584065756, metrics={'train_runtime': 4320.3072, 'train_samples_per_second': 2.778, 'train_steps_per_second': 0.174, 'total_flos': 286862266368000.0, 'train_loss': 0.24193946584065756, 'epoch': 3.0})

### MODEL TESTING

In [32]:
metrics = trainer.evaluate()
print(metrics)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.2384004145860672, 'eval_accuracy': 0.938, 'eval_precision': 0.9409601831770728, 'eval_recall': 0.938, 'eval_f1': 0.9380735249969259, 'eval_runtime': 11.9121, 'eval_samples_per_second': 83.948, 'eval_steps_per_second': 5.289, 'epoch': 3.0}
