### Import Libraries

In [1]:
import os
import pandas as pd
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt

### Model and Tokenizer Setup

In [2]:
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Check if the tokenizer has a padding token, if not, set one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load and Preprocess the Dataset

In [3]:
# Load the dataset
data = pd.read_csv("../Datasets/dataset_12500.csv")

# Separate features
text_data = data["url"]
scaler = StandardScaler()


# Tokenize text data
max_length = 128
tokenized_data = tokenizer(text_data.tolist(), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
X_text = tokenized_data["input_ids"]

# Labels
y = data["label"].apply(lambda x: 1 if x == "bad" else 0)

# Split dataset
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

### Dataset and DataLoader Setup

In [4]:
# Create TensorDatasets
train_dataset = TensorDataset(X_train_text,  torch.tensor(y_train.values, dtype=torch.long))
test_dataset = TensorDataset(X_test_text, torch.tensor(y_test.values, dtype=torch.long))

# Define collate function
def collate_batch(batch):
    texts,labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = torch.tensor(labels)
    return texts, labels

# DataLoader and training setup
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


### Training Setup

In [5]:
# Training settings
device = torch.device("cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
epochs = 6
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

# Paths for saving model, tokenizer, and checkpoint
model_save_path = "../Model/savedModel/savedModel_12500"
tokenizer_save_path = "../Model/savedTokenizer/savedModel_12500"
checkpoint_path = "../Model/checkpoint.pth"

### Save and Load Checkpoint Functions

In [6]:
# Function to save a checkpoint
def save_checkpoint(epoch, model, optimizer, scheduler):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()
    }
    torch.save(checkpoint, checkpoint_path)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(tokenizer_save_path)

# Function to load a checkpoint
def load_checkpoint(model, optimizer, scheduler):
    if os.path.isfile(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        return checkpoint['epoch']
    else:
        return 0

# Load checkpoint if available
start_epoch = load_checkpoint(model, optimizer, scheduler)

### Training and Evaluation Loop

In [7]:
# Initialize lists to store metrics
train_losses = []
test_accuracies = []
test_precisions = []
test_recalls = []
test_f1_scores = []

# Training loop
for epoch in range(start_epoch, epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        desc=f"Epoch {epoch + 1}/{epochs}",
    )

    for step, (texts, labels) in progress_bar:
        texts = texts.to(device)
        labels = labels.to(device)

        model.zero_grad()
        outputs = model(texts, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": total_loss / (step + 1)})

    average_loss = total_loss / len(train_loader)
    train_losses.append(average_loss)

    # Evaluation loop
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for texts, labels in tqdm(test_loader, desc="Evaluating", leave=False):
            texts = texts.to(device)
            labels = labels.to(device)

            outputs = model(texts, labels=labels)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)

            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average="binary"
    )

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    metrics_df = pd.DataFrame(
        [[accuracy, precision, recall, f1]],
        columns=["Accuracy", "Precision", "Recall", "F1 Score"],
    )
    print(metrics_df)
    

    test_accuracies.append(accuracy)
    test_precisions.append(precision)
    test_recalls.append(recall)
    test_f1_scores.append(f1)

    # Save checkpoint after each epoch
    save_checkpoint(epoch + 1, model, optimizer, scheduler)

Epoch 1/6: 100%|██████████| 625/625 [15:28:47<00:00, 89.16s/it, loss=0.201]  
                                                             

Test Accuracy: 0.9824
Precision: 0.9755
Recall: 0.9795
F1 Score: 0.9775
   Accuracy  Precision    Recall  F1 Score
0  0.982407   0.975535  0.979529  0.977528


Epoch 2/6:   8%|▊         | 50/625 [1:14:02<13:57:38, 87.41s/it, loss=0.0377]

In [None]:
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1]],
    columns=["Accuracy", "Precision", "Recall", "F1 Score"],
)
print(metrics_df)
metrics_df.to_csv("Evaluation_dataset_12500.csv", index=False)

### Plotting the Metrics

In [None]:
# Plotting
plt.figure(figsize=(10, 6))
epochs_range = range(1, epochs + 1)

# Plot train loss
plt.plot(epochs_range, train_losses, label='Train Loss')

# Plot evaluation metrics
plt.plot(epochs_range, test_accuracies, label='Test Accuracy')
plt.plot(epochs_range, test_precisions, label='Test Precision')
plt.plot(epochs_range, test_recalls, label='Test Recall')
plt.plot(epochs_range, test_f1_scores, label='Test F1 Score')

plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.title('Training and Test Metrics Over Epochs')
plt.legend()
plt.show()