In [1]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Pick the compute device: CUDA when PyTorch reports one, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Read the raw CSV into a DataFrame (point this at your own file if needed).
data = pd.read_csv("mental_health.csv")

# Fail fast when either of the two columns used downstream is absent.
required_columns = {"text", "label"}
if not required_columns <= set(data.columns):
    raise KeyError("The dataset must contain 'text' and 'label' columns.")

# Wrap the DataFrame in a Hugging Face Dataset for the steps that follow.
dataset = Dataset.from_pandas(data)

# Split into train (80%), validation (10%) and test (remainder) sets.
# Rows are shuffled with a fixed seed before slicing: taking contiguous ranges
# of the file in its on-disk order would carry any ordering present in the CSV
# (for example rows grouped by label) straight into the splits.
SPLIT_SEED = 42
shuffled_dataset = dataset.shuffle(seed=SPLIT_SEED)

total_samples = len(shuffled_dataset)
train_size = max(1, int(0.8 * total_samples))
valid_size = max(1, int(0.1 * total_samples))
test_size = total_samples - train_size - valid_size

# The max(1, ...) floors can overshoot on very small inputs; catch that here
# instead of failing later on an out-of-range select or an empty test split.
if test_size < 1:
    raise ValueError(
        f"Dataset has {total_samples} rows, which is too few for a non-empty "
        "train/validation/test split."
    )

train_end = train_size
valid_end = train_size + valid_size
train_dataset = shuffled_dataset.select(range(train_end))
valid_dataset = shuffled_dataset.select(range(train_end, valid_end))
test_dataset = shuffled_dataset.select(range(valid_end, total_samples))

# Tokenizer for the LLaMA2-7B checkpoint
checkpoint_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Assign a padding token: fall back to the EOS token when none is defined.
has_pad_token = tokenizer.pad_token is not None
if not has_pad_token:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating to 128 tokens.

    No padding is applied here. This notebook hands a DataCollatorWithPadding
    to the Trainer, which pads each batch to the length of its longest member;
    padding every example to 128 tokens up front made that collator a no-op
    and spent memory and compute on pad tokens.

    Args:
        examples: Mapping with a "text" entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The module-level ``tokenizer``'s output for the given texts.
    """
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Tokenize every split and keep the results together under their split names
raw_splits = {
    "train": train_dataset,
    "validation": valid_dataset,
    "test": test_dataset,
}
tokenized_datasets = DatasetDict(
    {
        split_name: split.map(tokenize_function, batched=True)
        for split_name, split in raw_splits.items()
    }
)

# Collator that pads the examples of a batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model (LLaMA2-7B-specific)
# NOTE(review): this notebook's recorded run ended in a CUDA out-of-memory
# error on a 14.75 GiB GPU. A 7B-parameter model loaded without a reduced
# precision dtype is very likely too large to fine-tune in full on such a
# card; consider parameter-efficient fine-tuning, quantized loading, or a
# smaller checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=2)

# The classification head reads the hidden state of the last non-padding token
# and locates it through config.pad_token_id; with that unset, batches larger
# than one are rejected. The pad token was assigned on the tokenizer only, so
# mirror it into the model config.
model.config.pad_token_id = tokenizer.pad_token_id
model = model.to(device)

# Training Arguments, gathered as keyword arguments before construction
training_config = dict(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Batch sizes adjusted for LLaMA2-7B memory requirements
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Emit a log entry every 50 steps
    logging_steps=50,
)
training_args = TrainingArguments(**training_config)

# Metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of its logits along axis 1.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", where the last
        three come from ``precision_recall_fscore_support`` with
        ``average="binary"``.
    """
    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=1).numpy()
    prf_scores = precision_recall_fscore_support(labels, predicted_classes, average="binary")
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }

# Custom Callback for Saving Logs
from transformers import TrainerCallback

class SaveTrainingLogsCallback(TrainerCallback):
    """Trainer callback that persists every logged metrics dict to a text file.

    Each ``on_log`` event appends ``str(logs)`` as one line of
    ``./logs/custom_trainer_state.json``. Despite the extension, the lines are
    Python dict reprs rather than JSON documents.

    The file is emptied when a training run begins, so the entries it holds
    always belong to a single run. Only the process for which
    ``state.is_world_process_zero`` is true touches the file, so a
    multi-process run does not write duplicate lines.
    """

    LOG_DIR = "./logs"
    LOG_PATH = "./logs/custom_trainer_state.json"

    def on_train_begin(self, args, state, control, **kwargs):
        """Create the log directory and start the run with an empty log file."""
        if state.is_world_process_zero:
            os.makedirs(self.LOG_DIR, exist_ok=True)
            # Opening in "w" mode truncates whatever an earlier run left behind.
            with open(self.LOG_PATH, "w", encoding="utf-8"):
                pass

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the non-empty ``logs`` dict as one line of the log file."""
        if logs and state.is_world_process_zero:
            os.makedirs(self.LOG_DIR, exist_ok=True)
            with open(self.LOG_PATH, "a", encoding="utf-8") as f:
                f.write(str(logs) + "\n")

# Trainer, wired up with the custom log-saving callback
log_saver = SaveTrainingLogsCallback()
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    callbacks=[log_saver],
)

# Run fine-tuning
trainer.train()

# Evaluate on test set
# Report under a "test" prefix. With the default "eval" prefix these metrics
# are emitted as eval_loss etc., which makes them indistinguishable from the
# per-epoch validation metrics in anything that consumes the trainer's logs.
results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print("Evaluation Results on Test Set:", results)

# Run the model over the test split and derive hard class predictions
predictions = trainer.predict(tokenized_datasets["test"])
y_true = predictions.label_ids
y_pred = torch.tensor(predictions.predictions).argmax(dim=1).numpy()

# Headline metrics on the test split
test_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
precision, recall, f1 = test_scores[:3]
accuracy = accuracy_score(y_true, y_pred)
print(f"Test Metrics - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

# Confusion matrix for the test split
class_names = ["No Signs", "Signs of Anxiety/Depression"]
conf_matrix = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_names)
disp.plot(cmap="viridis")
plt.title("Confusion Matrix (Test Dataset)")
plt.show()



Using device: cuda


Map:   0%|          | 0/22381 [00:00<?, ? examples/s]

Map:   0%|          | 0/2797 [00:00<?, ? examples/s]

Map:   0%|          | 0/2799 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 1.06 MiB is free. Process 3009360 has 14.74 GiB memory in use. Of the allocated memory 14.65 GiB is allocated by PyTorch, and 1.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Training and Validation Loss Plot
LOG_FILE = "./logs/custom_trainer_state.json"


def parse_log_lines(lines):
    """Turn log lines into a list of dicts, skipping blank lines.

    Each non-blank line is expected to hold the ``str()`` of one metrics dict.
    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` a
    corrupted or tampered line cannot execute code.
    """
    return [ast.literal_eval(line) for line in lines if line.strip()]


def validation_plot_positions(num_train_points, num_valid_points):
    """Return one x position per validation loss, spread over the training points.

    Positions are ``0, stride, 2 * stride, ...`` with
    ``stride = max(1, num_train_points // num_valid_points)``.

    Returns an empty list when there are no validation points (this used to
    divide by zero), and falls back to ``0 .. num_valid_points - 1`` when the
    training points are too few to supply one position per validation loss
    (the x and y lengths would otherwise differ and the plot call would fail).
    """
    if num_valid_points == 0:
        return []
    stride = max(1, num_train_points // num_valid_points)
    positions = list(range(0, num_train_points, stride))[:num_valid_points]
    if len(positions) < num_valid_points:
        positions = list(range(num_valid_points))
    return positions


if os.path.exists(LOG_FILE):
    with open(LOG_FILE, "r") as f:
        logs = parse_log_lines(f)
    training_loss = [log["loss"] for log in logs if "loss" in log]
    validation_loss = [log["eval_loss"] for log in logs if "eval_loss" in log]
    # x values are indices of the logged training losses, not optimizer steps
    steps = range(len(training_loss))

    # Approximate placement: validation points are spaced evenly along the
    # training curve, one x position per logged validation loss.
    validation_steps = validation_plot_positions(len(training_loss), len(validation_loss))

    plt.plot(steps, training_loss, label="Training Loss")
    if validation_loss:
        plt.plot(validation_steps, validation_loss, label="Validation Loss")
    plt.xlabel("Steps")
    plt.ylabel("Loss")
    plt.legend()
    plt.title("Training and Validation Loss")
    plt.show()
else:
    print("Log file not found. Cannot plot training and validation loss.")
