In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

In [2]:
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf", num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Preprocessing step
# Check if the tokenizer has a padding token, if not, set one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# IMPORTANT: Update the model configuration to recognize the new padding token
model.config.pad_token_id = tokenizer.pad_token_id

# Load the dataset
data = pd.read_csv("../Dataset_with_Features/dataset_11430.csv")

# Separate features
text_data = data["url"]
numerical_data = data.drop(columns=["url", "label"])

# Preprocess numerical data
scaler = StandardScaler()
X_num = scaler.fit_transform(numerical_data)

# Define a max_length for tokenization
max_length = 128  # Example value, you should choose based on your data

# Preprocess text data with a progress bar
print("Tokenizing text data...")
tokenized_texts = []
for text in tqdm(text_data.tolist()):
    tokenized_text = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    tokenized_texts.append(tokenized_text["input_ids"][0])

# Make sure to tokenize with padding
tokenized_data = tokenizer(
    text_data.tolist(),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)  # Adjust max_length as needed
X_text = tokenized_data["input_ids"]

# Labels
y = data["label"].apply(lambda x: 1 if x == "bad" else 0)

# Split dataset
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_text, X_num, y, test_size=0.2, random_state=42
)

# Create TensorDatasets
train_dataset = TensorDataset(
    X_train_text,
    torch.tensor(X_train_num, dtype=torch.float),
    torch.tensor(y_train.values, dtype=torch.long),
)
test_dataset = TensorDataset(
    X_test_text,
    torch.tensor(X_test_num, dtype=torch.float),
    torch.tensor(y_test.values, dtype=torch.long),
)

Tokenizing text data...


100%|██████████| 11430/11430 [00:03<00:00, 3731.05it/s]


In [4]:
# Define collate function
def collate_batch(batch):
    texts, nums, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    nums = torch.stack(nums)
    labels = torch.tensor(labels)
    return texts, nums, labels

In [5]:
# DataLoader and training setup
batch_size = 16
epochs = 6
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [6]:
# Training settings
device = torch.device("cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

In [7]:
# training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        desc=f"Epoch {epoch + 1}/{epochs}",
    )

    for step, (texts, nums, labels) in progress_bar:
        texts = texts.to(device)
        labels = labels.to(device)

        model.zero_grad()
        outputs = model(texts, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": total_loss / (step + 1)})

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {average_loss}")


Epoch 1/6: 100%|██████████| 572/572 [14:21:06<00:00, 90.33s/it, loss=0.262]  


Epoch 1/6 | Loss: 0.26155641119743184


Epoch 2/6:  74%|███████▍  | 424/572 [10:43:38<3:55:17, 95.39s/it, loss=0.0915]

In [None]:
# Save the model at the end of each epoch
model_save_path = f"../Model/savedModel/savedModel_11430"
model.save_pretrained(model_save_path)

# Optionally, you can also save the tokenizer at the end of each epoch if it's being updated
tokenizer_save_path = f"../Model/savedTokenizer/savedModel_11430"
tokenizer.save_pretrained(tokenizer_save_path)

In [None]:
# Evaluation loop
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

model.eval()
with torch.no_grad():
    predictions = []
    true_labels = []
    for texts, nums, labels in tqdm(test_loader, desc='Evaluating', leave=False):
        texts = texts.to(device)
        labels = labels.to(device)
        nums = nums.to(device)

        outputs = model(texts,nums, labels=labels)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)
        
        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1]],
    columns=["Accuracy", "Precision", "Recall", "F1 Score"],
)
print(metrics_df)
metrics_df.to_csv("Evaluation_dataset_20000.csv", index=False)



In [None]:
# extract_numerical_features function that accept only one url
import re
from urllib.parse import urlparse
from googlesearch import search
from urllib.parse import urlparse

def extract_numerical_features(url):
    features = {}

    # Use of IP or not in domain
    def having_ip_address(url):
        match = re.search(
            "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\."
            "([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|"  # IPv4
            "((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)"  # IPv4 in hexadecimal
            "(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}",
            url,
        )  # Ipv6
        return 1 if match else 0

    features["use_of_ip"] = having_ip_address(url)

    # Abnormal URL
    def abnormal_url(url):
        hostname = urlparse(url).hostname
        match = re.search(hostname, url)
        return 1 if match else 0

    features["abnormal_url"] = abnormal_url(url)

    # Google Index
    def google_index(url):
        site = search(url, 5)
        return 1 if site else 0

    features["google_index"] = google_index(url)

    # Count Dot
    def count_dot(url):
        return url.count(".")

    features["count_dot"] = count_dot(url)

    # Count WWW
    def count_www(url):
        return url.count("www")

    features["count_www"] = count_www(url)

    # Count @
    def count_atrate(url):
        return url.count("@")

    # No. of Directories
    def no_of_dir(url):
        urldir = urlparse(url).path
        return urldir.count("/")

    features["count_dir"] = no_of_dir(url)

    # No. of Embedded Domains
    def no_of_embed(url):
        urldir = urlparse(url).path
        return urldir.count("//")

    features["count_embed_domian"] = no_of_embed(url)

    # Shortening Service
    def shortening_service(url):
        match = re.search(
            "bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
            "yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
            "short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
            "doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|"
            "db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|"
            "q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|"
            "x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
            "tr\.im|link\.zip\.net",
            url,
        )
        return 1 if match else 0

    features["short_url"] = shortening_service(url)

    # Count HTTPS
    def count_https(url):
        return url.count("https")

    features["count-https"] = count_https(url)

    # Count HTTP
    def count_http(url):
        return url.count("http")

    features["count%"] = count_http(url)

    # Count ?
    def count_ques(url):
        return url.count("?")

    features["count?"] = count_ques(url)

    # Count -
    def count_hyphen(url):
        return url.count("-")

    features["count-"] = count_hyphen(url)

    # Count =
    def count_equal(url):
        return url.count("=")

    features["count="] = count_equal(url)

    # URL Length
    def url_length(url):
        return len(str(url))

    features["url_length"] = url_length(url)

    # Hostname Length
    def hostname_length(url):
        return len(urlparse(url).netloc)

    features["hostname_length"] = hostname_length(url)

    # Suspicious Words
    def suspicious_words(url):
        match = re.search(
            "PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr",
            url,
        )
        return 1 if match else 0

    features["sus_url"] = suspicious_words(url)

    # Count Digits
    def digit_count(url):
        return sum(1 for c in url if c.isnumeric())

    features["count-digits"] = digit_count(url)

    # Count Letters
    def letter_count(url):
        return sum(1 for c in url if c.isalpha())

    features["count-letters"] = letter_count(url)

    # First Directory Length
    def fd_length(url):
        urlpath = urlparse(url).path
        try:
            return len(urlpath.split("/")[1])
        except:
            return 0

    features["fd_length"] = fd_length(url)

    return features

# # Example usage:
# url = "https://example.com"
# numerical_features = extract_numerical_features(url)
# print(numerical_features)
