### Import Libraries

In [15]:
import os
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from torch.cuda.amp import GradScaler, autocast
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import traceback

In [16]:
# Define paths for model, tokenizer, and checkpoints
model_save_path = "../Model/savedModel/savedModel_12500"
tokenizer_save_path = "../Model/savedTokenizer/savedModel_12500"
checkpoint_path = "../Model/checkpoint.pth"

epochs = 6  
batch_size = 32  
gradient_accumulation_steps = 2

In [17]:
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

In [18]:
def model_and_tokenizer():
    model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer


In [19]:
def data_preparation():
    data = pd.read_csv("../Datasets/dataset_12500.csv")
    text_data = data["url"]
    scaler = StandardScaler()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    max_length = 128
    tokenized_data = tokenizer(text_data.tolist(), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    X_text = tokenized_data["input_ids"]
    y = data["label"].apply(lambda x: 1 if x == "bad" else 0)

    X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)
    train_dataset = TensorDataset(X_train_text, torch.tensor(y_train.values, dtype=torch.long))
    test_dataset = TensorDataset(X_test_text, torch.tensor(y_test.values, dtype=torch.long))
    
    return train_dataset, test_dataset


In [20]:
def data_loader_setup(train_dataset, test_dataset, batch_size, rank):
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch, sampler=train_sampler)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = torch.tensor(labels)
    return texts, labels


def save_checkpoint(epoch, model, optimizer, scheduler):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()
    }
    torch.save(checkpoint, checkpoint_path)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(tokenizer_save_path)

def load_checkpoint(model, optimizer, scheduler):
    if os.path.isfile(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        return checkpoint['epoch']
    else:
        return 0

def plot_metrics(epochs, train_losses, test_accuracies, test_precisions, test_recalls, test_f1_scores):
    plt.figure(figsize=(10, 6))
    epochs_range = range(1, epochs + 1)

    plt.plot(epochs_range, train_losses, label='Train Loss')
    plt.plot(epochs_range, test_accuracies, label='Test Accuracy')
    plt.plot(epochs_range, test_precisions, label='Test Precision')
    plt.plot(epochs_range, test_recalls, label='Test Recall')
    plt.plot(epochs_range, test_f1_scores, label='Test F1 Score')

    plt.xlabel('Epochs')
    plt.ylabel('Metrics')
    plt.title('Training and Test Metrics Over Epochs')
    plt.legend()
    plt.show()

In [21]:
def train(rank, world_size):
    try:
        setup(rank, world_size)

        model, tokenizer = model_and_tokenizer()
        train_dataset, test_dataset = data_preparation()
        train_loader, test_loader = data_loader_setup(
            train_dataset, test_dataset, batch_size=16, rank=rank
        )

        device = torch.device("cuda:{}".format(rank))
        model.to(device)
        model = DDP(model, device_ids=[rank])

        optimizer = optim.AdamW(model.parameters(), lr=5e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
        )
        scaler = GradScaler()

        train_losses = []
        test_accuracies = []
        test_precisions = []
        test_recalls = []
        test_f1_scores = []

        # Load checkpoint if available
        start_epoch = load_checkpoint(model, optimizer, scheduler)

        for epoch in range(start_epoch, epochs):
            model.train()
            total_loss = 0

            for step, (texts, labels) in enumerate(train_loader):
                texts = texts.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with autocast():
                    outputs = model(texts, labels=labels)
                    loss = outputs.loss

                scaler.scale(loss).backward()

                # Perform Gradient Accumulation
                if (step + 1) % gradient_accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()

                total_loss += loss.item()

            average_loss = total_loss / len(train_loader)
            train_losses.append(average_loss)

            if rank == 0:
                # Evaluation loop
                model.eval()
                predictions = []
                true_labels = []
                with torch.no_grad():
                    for texts, labels in test_loader:
                        texts = texts.to(device)
                        labels = labels.to(device)

                        with autocast():
                            outputs = model(texts, labels=labels)
                            logits = outputs.logits
                            _, predicted_labels = torch.max(logits, dim=1)

                        predictions.extend(predicted_labels.cpu().numpy())
                        true_labels.extend(labels.cpu().numpy())

                accuracy = accuracy_score(true_labels, predictions)
                precision, recall, f1, _ = precision_recall_fscore_support(
                    true_labels, predictions, average="binary"
                )

                test_accuracies.append(accuracy)
                test_precisions.append(precision)
                test_recalls.append(recall)
                test_f1_scores.append(f1)
                print(f"Test Accuracy: {accuracy:.4f}")
                print(f"Precision: {precision:.4f}")
                print(f"Recall: {recall:.4f}")
                print(f"F1 Score: {f1:.4f}")

                metrics_df = pd.DataFrame(
                    [[accuracy, precision, recall, f1]],
                    columns=["Accuracy", "Precision", "Recall", "F1 Score"],
                )
                print(metrics_df)
                metrics_df.to_csv("Evaluation_dataset_12500.csv", index=False)

                # Save checkpoint after each epoch
                save_checkpoint(epoch + 1, model.module, optimizer, scheduler)

        if rank == 0:
            plot_metrics(
                epochs,
                train_losses,
                test_accuracies,
                test_precisions,
                test_recalls,
                test_f1_scores,
            )
    except Exception as e:
        print(f"Error in process {rank}: {e}")
        traceback.print_exc() 

In [22]:
def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()

ProcessExitedException: process 0 terminated with exit code 1