In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
from matplotlib.pyplot import step
import gc

In [15]:
model_name = "meta-llama/Llama-2-7b-hf"

# Tokenizer first: padded batches need a pad token, so register one
# when the checkpoint's tokenizer does not define it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Sequence-classification model with a two-way output.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Tell the model which id marks padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix to the tokenizer's current vocabulary size
# (kept as the last expression so the cell displays the resized layer).
model.resize_token_embeddings(len(tokenizer))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(32001, 4096)

In [16]:
# Read the URL dataset from disk
data = pd.read_csv('../Dataset_with_Features/dataset_420464.csv', low_memory=False)

# Basic data analysis, printed one after another:
#   first rows, statistical summary, missing values per column, class distribution
print(
    data.head(),
    data.describe(),
    data.isnull().sum(),
    data['label'].value_counts(),
    sep='\n',
)

                      url label  use_of_ip  abnormal_url  google_index  \
0  diaryofagameaddict.com   bad          0             0             1   
1        espdesign.com.au   bad          0             0             1   
2      iamagameaddict.com   bad          0             0             1   
3           kalantzis.net   bad          0             0             1   
4   slightlyoffcenter.net   bad          0             0             1   

   count.  count-www  count@  count_dir  count_embed_domian  ...  count-  \
0       1          0       0          0                   0  ...       0   
1       2          0       0          0                   0  ...       0   
2       1          0       0          0                   0  ...       0   
3       1          0       0          0                   0  ...       0   
4       1          0       0          0                   0  ...       0   

   count=  url_length  hostname_length  sus_url  count-digits  count-letters  \
0       0         

In [17]:
# Encode the string labels as integer class ids.
label_to_id = {'good': 0, 'bad': 1}

# Only map while the column still holds the raw strings: mapping an
# already-encoded column would turn every value into NaN, so this guard
# keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].map(label_to_id)
print(data.head())  # View first few rows

# Rows with a missing or unrecognised label cannot be used for supervised
# training. Drop them instead of imputing: the median of a 0/1 column can be
# 0.5 (not a valid class) and imputing also leaves the column as float.
data = data.dropna(subset=['label']).reset_index(drop=True)
data['label'] = data['label'].astype('int64')

                      url  label  use_of_ip  abnormal_url  google_index  \
0  diaryofagameaddict.com      1          0             0             1   
1        espdesign.com.au      1          0             0             1   
2      iamagameaddict.com      1          0             0             1   
3           kalantzis.net      1          0             0             1   
4   slightlyoffcenter.net      1          0             0             1   

   count.  count-www  count@  count_dir  count_embed_domian  ...  count-  \
0       1          0       0          0                   0  ...       0   
1       2          0       0          0                   0  ...       0   
2       1          0       0          0                   0  ...       0   
3       1          0       0          0                   0  ...       0   
4       1          0       0          0                   0  ...       0   

   count=  url_length  hostname_length  sus_url  count-digits  count-letters  \
0       0   

In [18]:
# Split Data: 20% test, then 25% of the remaining 80% as validation
# (i.e. 60% train / 20% validation / 20% test overall).
# A fixed seed makes the partition identical on every run of the notebook.
SEED = 42

train_data, test_data = train_test_split(data, test_size=0.2, random_state=SEED)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=SEED)

# Re-number each subset 0..n-1 so rows can be addressed by position.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


In [19]:
def tokenize_and_extract_features(data, max_length=512):
    """Tokenize the ``url`` column and gather the remaining feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``url`` and a ``label`` column. Every other column is
        treated as a feature.
    max_length : int, default 512
        Each URL is truncated or padded to exactly this many tokens.

    Returns
    -------
    input_ids : list
        One token-id tensor per row of ``data``, in row order.
    attention_masks : list
        One attention-mask tensor per row of ``data``, in row order.
    features : numpy.ndarray
        Values of all columns other than ``url`` and ``label``.
    labels : pandas.Series
        The ``label`` column re-indexed 0..n-1, so ``labels[i]`` is the label
        of the i-th row regardless of the index ``data`` arrived with.
    """
    # NOTE(review): if this branch ever fires here, the model's embedding
    # matrix presumably has to be resized as well — confirm against the cell
    # that loads the model.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    input_ids = []
    attention_masks = []

    for url in tqdm(data['url'].tolist(), desc="Tokenizing"):
        encodings = tokenizer(url, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        # Each call encodes a single URL; keep the first (only) row of the result.
        input_ids.append(encodings['input_ids'][0])
        attention_masks.append(encodings['attention_mask'][0])

    feature_columns = [col for col in data.columns if col not in ['url', 'label']]
    features = data[feature_columns].values

    # Integer lookups on a Series go through the index *labels*. Handing back
    # the caller's (possibly shuffled) index would silently pair inputs with
    # the wrong labels, so return a positionally indexed copy.
    labels = data['label'].reset_index(drop=True)

    return input_ids, attention_masks, features, labels

# Run tokenization / feature extraction once per split (train, validation, test)
(
    train_input_ids,
    train_attention_masks,
    train_features,
    train_labels,
) = tokenize_and_extract_features(train_data)

(
    val_input_ids,
    val_attention_masks,
    val_features,
    val_labels,
) = tokenize_and_extract_features(val_data)

(
    test_input_ids,
    test_attention_masks,
    test_features,
    test_labels,
) = tokenize_and_extract_features(test_data)



Tokenizing:   0%|          | 0/252278 [00:00<?, ?it/s]

Tokenizing: 100%|██████████| 252278/252278 [01:34<00:00, 2669.62it/s]
Tokenizing: 100%|██████████| 84093/84093 [00:33<00:00, 2524.61it/s]
Tokenizing: 100%|██████████| 84093/84093 [00:31<00:00, 2668.50it/s]


In [20]:
class URLDataset(Dataset):
    """Map-style dataset that pairs tokenized URLs with their labels.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. ``features`` is stored on the instance but is not part of the
    returned items.
    """

    def __init__(self, input_ids, attention_masks, features, labels):
        """Keep references to the per-example sequences.

        The four arguments are assumed to be aligned by position (element i of
        each describes the same example).
        """
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        labels = self.labels
        # A pandas Series resolves integer keys against its index labels, which
        # only coincides with the position when the index is 0..n-1. Use
        # positional access when it is available so a shuffled index cannot
        # pair an input with another row's label.
        label = labels.iloc[idx] if hasattr(labels, "iloc") else labels[idx]
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": label
        }

# One dataset object per split
train_dataset = URLDataset(
    input_ids=train_input_ids,
    attention_masks=train_attention_masks,
    features=train_features,
    labels=train_labels,
)
val_dataset = URLDataset(
    input_ids=val_input_ids,
    attention_masks=val_attention_masks,
    features=val_features,
    labels=val_labels,
)
test_dataset = URLDataset(
    input_ids=test_input_ids,
    attention_masks=test_attention_masks,
    features=test_features,
    labels=test_labels,
)



In [21]:
# NOTE(review): this cell defines `URLDataset` a second time; the definition
# here shadows the one from the previous cell, so one of the two can be removed.
class URLDataset(Dataset):
    """Map-style dataset that pairs tokenized URLs with their labels.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. ``features`` is stored on the instance but is not part of the
    returned items.
    """

    def __init__(self, input_ids, attention_masks, features, labels):
        """Keep references to the per-example sequences.

        The four arguments are assumed to be aligned by position (element i of
        each describes the same example).
        """
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        labels = self.labels
        # A pandas Series resolves integer keys against its index labels, which
        # only coincides with the position when the index is 0..n-1. Use
        # positional access when it is available so a shuffled index cannot
        # pair an input with another row's label.
        label = labels.iloc[idx] if hasattr(labels, "iloc") else labels[idx]
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": label,
        }

# Create datasets: one per split, arguments passed by name
train_dataset = URLDataset(
    input_ids=train_input_ids,
    attention_masks=train_attention_masks,
    features=train_features,
    labels=train_labels,
)
val_dataset = URLDataset(
    input_ids=val_input_ids,
    attention_masks=val_attention_masks,
    features=val_features,
    labels=val_labels,
)
test_dataset = URLDataset(
    input_ids=test_input_ids,
    attention_masks=test_attention_masks,
    features=test_features,
    labels=test_labels,
)

In [22]:



class CustomTrainer(Trainer):
    """``Trainer`` subclass that adds GPU-memory housekeeping.

    Forward/backward, mixed precision, gradient accumulation and the optimizer
    update are left to the stock ``Trainer``. This subclass only

    * empties the CUDA cache once per accumulated optimizer step, and
    * runs ``on_epoch_end`` (garbage collection + cache flush) after every epoch.
    """

    def __init__(self, *args, **kwargs):
        """Build the trainer and register the epoch-end cleanup hook.

        Accepts exactly the arguments of ``Trainer``.
        """
        super().__init__(*args, **kwargs)

        # Micro-batches processed so far; used to flush the cache once per
        # ``gradient_accumulation_steps`` micro-batches.
        self._micro_batches_seen = 0

        # Epoch-end events are delivered to callbacks, not to methods of the
        # Trainer itself, so a small callback forwards them to this instance.
        # Function-scope import: the notebook's import cell does not bring in
        # TrainerCallback.
        from transformers import TrainerCallback

        owner = self

        class _EpochEndHook(TrainerCallback):
            """Forward the epoch-end event to ``CustomTrainer.on_epoch_end``."""

            def on_epoch_end(self, args, state, control, **kwargs):
                owner.on_epoch_end()

        self.add_callback(_EpochEndHook())

    def training_step(self, model, inputs, *args, **kwargs):
        """Run one micro-batch through the stock training step.

        The optimizer is deliberately NOT stepped here: the surrounding
        training loop already does that at the accumulation boundaries, so
        stepping here as well would apply every update twice.

        Extra positional/keyword arguments are passed through unchanged so the
        override works with library versions whose ``training_step`` takes
        more parameters than ``(model, inputs)``.

        Returns the loss tensor produced by ``Trainer.training_step``.
        """
        loss = super().training_step(model, inputs, *args, **kwargs)

        self._micro_batches_seen += 1
        if self._micro_batches_seen % self.args.gradient_accumulation_steps == 0:
            torch.cuda.empty_cache()  # Free up GPU memory

        return loss

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Evaluate with autograd disabled and return the metrics of ``Trainer.evaluate``.

        Falls back to ``self.eval_dataset`` when ``eval_dataset`` is None.
        """
        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset

        with torch.no_grad():  # No gradients computed during evaluation
            return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)

    def on_epoch_end(self):
        """Report the model's device and release unused memory.

        Invoked after each epoch through the callback registered in
        ``__init__``; it can also be called directly.
        """
        # Identify the current GPU
        device = next(self.model.parameters()).device
        print(f"End of epoch. Current device: {device}")

        # Free up GPU memory
        gc.collect()  # Collect garbage
        torch.cuda.empty_cache()  # Empty cache



In [23]:
# Where the model lives before the training configuration is built
device = next(model.parameters()).device
print(f"Current device: {device}")

training_args = TrainingArguments(
    # --- output locations ---
    output_dir='./results',
    logging_dir='./logs',
    # --- schedule ---
    num_train_epochs=3,
    warmup_steps=500,
    # --- batching: 2 examples per device, accumulated over 8 steps ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # --- optimisation ---
    weight_decay=0.01,
    fp16=True,
    # --- evaluate and checkpoint once per epoch, restore the best at the end ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# ...and where it lives afterwards
device = next(model.parameters()).device
print(f"Current device: {device}")


Current device: cpu
Current device: cpu


In [24]:
# NOTE(review): the saved output of this cell ends in "CUDA error: out of
# memory". Presumably the model does not fit on the GPU in this configuration;
# confirm, and consider a smaller checkpoint or a parameter-efficient setup.
device = next(model.parameters()).device
print(f"Current device: {device}")
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
device = next(model.parameters()).device
print(f"Current device: {device}")
trainer.train()

# The trainer keeps its own references to the model and the datasets, so it has
# to be deleted together with them; otherwise nothing below is actually freed.
del trainer, model, train_dataset, val_dataset, test_dataset
gc.collect()
torch.cuda.empty_cache()

Current device: cpu


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
