### Import Libraries


In [1]:
import os
import pandas as pd
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

### Model and Tokenizer Setup


In [2]:
# Load the tokenizer and the two-class sequence-classification model from the
# same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    num_labels=2,
)

# Reuse the EOS token for padding when the tokenizer defines no pad token,
# then record the padding id on the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load and Preprocess the Dataset


In [3]:
# Read the labelled URL dataset from disk.
data = pd.read_csv("../Datasets/dataset_13500.csv")

text_data = data["url"]
scaler = StandardScaler()

# Tokenize every URL into a fixed-width row of token ids: shorter inputs are
# padded and longer ones truncated to max_length.
max_length = 128
tokenized_data = tokenizer(
    list(text_data),
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
X_text = tokenized_data["input_ids"]

# Binary target: 1 for rows labelled "bad", 0 for every other value.
y = (data["label"] == "bad").astype("int64")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    random_state=42,
    test_size=0.2,
)

### Dataset and DataLoader Setup


In [4]:
# Pair each split's token-id matrix with its labels converted to int64 tensors.
train_dataset = TensorDataset(
    X_train_text,
    torch.tensor(y_train.to_numpy(), dtype=torch.long),
)
test_dataset = TensorDataset(
    X_test_text,
    torch.tensor(y_test.to_numpy(), dtype=torch.long),
)


def collate_batch(batch):
    """Combine a list of ``(token_ids, label)`` samples into one batch.

    Args:
        batch: Sequence of ``(token_ids, label)`` pairs as produced by the
            dataset; ``token_ids`` is a 1-D tensor and ``label`` a scalar
            tensor (plain Python numbers are accepted as well).

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` stacks the token-id rows into a
        ``(batch, longest_row)`` tensor, right-padding shorter rows with the
        module-level tokenizer's ``pad_token_id``. ``labels`` is a 1-D tensor
        with one entry per sample.
    """
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    # Stack the per-sample labels directly instead of rebuilding a tensor from
    # a tuple of tensors, which goes through Python scalars one by one.
    labels = torch.stack([torch.as_tensor(label) for label in labels])
    return texts, labels


# Mini-batch size shared by the training and test loaders.
batch_size = 16

# Training batches are drawn in shuffled order and assembled by collate_batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
# The test loader keeps dataset order and uses the default collate function.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Training Setup


In [5]:
# All computation in this notebook runs on the CPU.
device = torch.device("cpu")
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# The schedule covers one optimizer step per training batch for every epoch
# and uses no warmup steps.
epochs = 6
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(train_loader),
)

In [6]:
# Output locations for this run's artifacts, relative to the notebook's working directory.
# Directory the model is exported to with save_pretrained.
model_save_path = "../Model/savedModels/savedModel_13500"
# Directory the tokenizer is exported to with save_pretrained.
tokenizer_save_path = "../Model/savedTokenizers/savedTokenizer_13500"
# Single-file training checkpoint written by torch.save and read back when resuming.
checkpoint_path = "../Model/savedCheckPoints/checkpoint_13500.pth"
# CSV file for the training-loss log (presumably the target of append_loss_to_csv — confirm).
losses_csv_file = "../Model/savedLosses/losses_13500.csv"

### Save and Load Checkpoint Functions


In [7]:
def save_checkpoint(epoch, model, optimizer, scheduler, training_loss):
    """Persist the training state, the model and the tokenizer to disk.

    A resumable checkpoint is written to the module-level ``checkpoint_path``;
    the model is then exported to ``model_save_path`` and the module-level
    ``tokenizer`` to ``tokenizer_save_path`` via ``save_pretrained``.

    Args:
        epoch: Epoch counter stored in the checkpoint under ``"epoch"``.
        model: Object providing ``state_dict()`` and ``save_pretrained(path)``.
        optimizer: Object providing ``state_dict()``.
        scheduler: Object providing ``state_dict()``.
        training_loss: Loss value stored under ``"Training_loss"``.
    """
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "Training_loss": training_loss,
    }

    # torch.save does not create missing parent directories, so do it here.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Write to a temporary file and swap it into place, so an interruption
    # mid-write cannot leave a truncated file where the previous checkpoint was.
    temp_path = f"{checkpoint_path}.tmp"
    torch.save(checkpoint, temp_path)
    os.replace(temp_path, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

    model.save_pretrained(model_save_path)
    print(f"Model saved at {model_save_path}")

    tokenizer.save_pretrained(tokenizer_save_path)
    print(f"Tokenizer saved at {tokenizer_save_path}")


def load_checkpoint(model, optimizer, scheduler):
    """Restore training state from the module-level ``checkpoint_path``.

    Args:
        model: Object whose ``load_state_dict`` receives the saved model state.
        optimizer: Object whose ``load_state_dict`` receives the optimizer state.
        scheduler: Object whose ``load_state_dict`` receives the scheduler state.

    Returns:
        ``(epoch, training_loss)`` read from the checkpoint, where
        ``training_loss`` is ``None`` if the checkpoint has no
        ``"Training_loss"`` entry. Returns ``(0, None)`` when no checkpoint
        file exists.
    """
    if not os.path.isfile(checkpoint_path):
        print("No checkpoint found.")
        return 0, None

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location="cpu" keeps the load independent of the device the
    # checkpoint was saved from.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    print(f"Checkpoint loaded from {checkpoint_path}")

    training_loss = checkpoint.get("Training_loss", None)
    return checkpoint["epoch"], training_loss


def append_loss_to_csv(epoch, train_loss, file_path):
    """Append one ``Epoch,Train_Loss`` row to a CSV file.

    The header row is written only when the file is empty, so repeated calls
    build up a single table.

    Args:
        epoch: Value for the ``Epoch`` column.
        train_loss: Value for the ``Train_Loss`` column.
        file_path: Destination CSV path; missing parent directories are created.
    """
    df = pd.DataFrame({"Epoch": [epoch], "Train_Loss": [train_loss]})

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline="" lets pandas control the line terminator; without it, text-mode
    # newline translation can produce blank rows on Windows.
    with open(file_path, "a", newline="") as f:
        # In append mode the position starts at the end of the file, so 0
        # means the file is empty and still needs its header.
        df.to_csv(f, header=f.tell() == 0, index=False)

In [8]:
# Resume from the saved checkpoint when one exists; load_checkpoint returns
# (0, None) when there is none, so training then starts from the first epoch.
start_epoch, training_loss = load_checkpoint(model, optimizer, scheduler)

Checkpoint loaded from ../Model/savedCheckPoints/checkpoint_13500.pth


### Training Loop


In [9]:
# Per-epoch average training losses collected during this session.
train_losses = []

# When resuming, seed the history with the loss stored in the checkpoint.
if training_loss is not None:
    train_losses.append(training_loss)

# Create the loss-log folder up front so a missing directory cannot abort the
# run after an epoch has already been trained.
os.makedirs(os.path.dirname(losses_csv_file) or ".", exist_ok=True)

# start_epoch is the number of epochs already completed, so a resumed run
# continues with the next one.
for epoch in range(start_epoch, epochs):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        desc=f"Epoch {epoch + 1}/{epochs}",
    )

    for step, (texts, labels) in progress_bar:
        texts = texts.to(device)
        labels = labels.to(device)

        model.zero_grad()
        # Passing labels makes the model return the classification loss.
        outputs = model(texts, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

        total_train_loss += loss.item()
        # Show the running mean loss for the current epoch.
        progress_bar.set_postfix({"loss": total_train_loss / (step + 1)})

    average_train_loss = total_train_loss / len(train_loader)
    train_losses.append(average_train_loss)

    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {average_train_loss:.4f}")
    # Checkpoint first (stored epoch = epochs completed), then record the loss
    # in the CSV log so the history survives kernel restarts.
    save_checkpoint(epoch + 1, model, optimizer, scheduler, average_train_loss)
    append_loss_to_csv(epoch + 1, average_train_loss, losses_csv_file)

Epoch 3/6: 100%|██████████| 675/675 [16:28:34<00:00, 87.87s/it, loss=0.0444]  


Epoch [3/6], Train Loss: 0.0444
Checkpoint saved at ../Model/savedCheckPoints/checkpoint_13500.pth
Model saved at ../Model/savedModels/savedModel_13500
Tokenizer saved at ../Model/savedTokenizers/savedTokenizer_13500


Epoch 4/6: 100%|██████████| 675/675 [18:04:40<00:00, 96.42s/it, loss=0.0241]   


Epoch [4/6], Train Loss: 0.0241
Checkpoint saved at ../Model/savedCheckPoints/checkpoint_13500.pth
Model saved at ../Model/savedModels/savedModel_13500
Tokenizer saved at ../Model/savedTokenizers/savedTokenizer_13500


Epoch 5/6: 100%|██████████| 675/675 [16:02:28<00:00, 85.55s/it, loss=0.0114]   


Epoch [5/6], Train Loss: 0.0114
Checkpoint saved at ../Model/savedCheckPoints/checkpoint_13500.pth
Model saved at ../Model/savedModels/savedModel_13500
Tokenizer saved at ../Model/savedTokenizers/savedTokenizer_13500


Epoch 6/6: 100%|██████████| 675/675 [15:57:57<00:00, 85.15s/it, loss=0.00125]  


Epoch [6/6], Train Loss: 0.0013
Checkpoint saved at ../Model/savedCheckPoints/checkpoint_13500.pth
Model saved at ../Model/savedModels/savedModel_13500
Tokenizer saved at ../Model/savedTokenizers/savedTokenizer_13500


### Evaluation Loop


In [10]:
# Score the model on the held-out test split.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for texts, labels in tqdm(test_loader, desc="Evaluating", leave=False):
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts, labels=labels)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        predicted_labels = logits.argmax(dim=1)
        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Binary-averaged metrics over the collected predictions.
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predictions, average="binary"
)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Keep the four metrics as a one-row table and write it to disk.
metrics_df = pd.DataFrame(
    {
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1 Score": [f1],
    }
)
print(metrics_df)
metrics_df.to_csv("../Model/savedMetrices/Evaluation_dataset_13500_testing.csv", index=False)

                                                               

Test Accuracy: 0.9782
Precision: 0.9719
Recall: 0.9710
F1 Score: 0.9714
   Accuracy  Precision    Recall  F1 Score
0  0.978156   0.971899  0.970958  0.971429


### Training Accuracy

In [11]:
# Score the model on the training split to compare against the test metrics.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for texts, labels in tqdm(train_loader, desc="Evaluating", leave=False):
        texts = texts.to(device)
        labels = labels.to(device)
        outputs = model(texts, labels=labels)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        _, predicted_labels = torch.max(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average="binary"
    )
    # These numbers describe the training split, so label them accordingly.
    print(f"Train Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
# Keep the four metrics as a one-row table and write it to disk.
metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1]],
    columns=["Accuracy", "Precision", "Recall", "F1 Score"],
)
print(metrics_df)
metrics_df.to_csv("../Model/savedMetrices/Evaluation_dataset_13500_training.csv", index=False)

Evaluating:  37%|███▋      | 249/675 [1:32:08<3:01:32, 25.57s/it]