In [43]:
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from evaluate  import load as load_metric

In [None]:
import torch

# Pick the compute device: fall back to the CPU unless the Apple Silicon
# (MPS) backend reports itself as available.
if not torch.backends.mps.is_available():
    device = torch.device("cpu")
    print("MPS not available. Using CPU for computations.")
else:
    device = torch.device("mps")
    print("Using Apple Silicon GPU (MPS) for computations.")


In [14]:
# Checkpoint to fine-tune and location of the preprocessed dataset.
MODEL_ID = "distilbert-base-uncased"
DATASET_PATH = '../Datasets/processed_english_final.parquet'

# Names of the text and label columns expected in the dataset.
TEXT_COLUMN, LABEL_COLUMN = 'text_clean', 'label'

# Label vocabulary. The name -> id mapping is written out once and the
# inverse mapping and class count are derived from it so they cannot drift.
# NOTE(review): ids 1 = "positive" and 2 = "neutral" — confirm this matches
# how the labels were encoded when the parquet file was produced.
LABEL2ID = {"negative": 0, "positive": 1, "neutral": 2}
ID2LABEL = {label_id: label_name for label_name, label_id in LABEL2ID.items()}
NUM_LABELS = len(ID2LABEL)

In [None]:
# Load the preprocessed dataset from the Parquet file named by DATASET_PATH.
df = pd.read_parquet(DATASET_PATH)
# Preview the first rows as the cell's output.
df.head()

In [17]:
# Fail fast if the loaded frame lacks a column the rest of the notebook needs.
# The data is read from a Parquet file, so the message refers to the dataset
# rather than a CSV, and it names exactly which columns are absent.
missing_columns = [
    column for column in (TEXT_COLUMN, LABEL_COLUMN) if column not in df.columns
]
if missing_columns:
    raise ValueError(
        f"Dataset must contain '{TEXT_COLUMN}' and '{LABEL_COLUMN}' columns; "
        f"missing: {missing_columns}."
    )

In [18]:
# Discard rows missing either the text or the label, then cast the label
# column to integers.
df = (
    df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
    .astype({LABEL_COLUMN: int})
)

In [19]:
# Hold out 20% of the rows for evaluation. Stratifying on the label column
# asks the splitter to preserve class proportions; the fixed random_state
# makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    stratify=df[LABEL_COLUMN],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Convert the training split to a Hugging Face Dataset. The split keeps the
# original (shuffled, non-contiguous) row index, which from_pandas would
# otherwise store as an extra `__index_level_0__` column; preserve_index=False
# drops it so only the real data columns are carried forward.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)

In [22]:
# Convert the held-out split to a Hugging Face Dataset. The split keeps the
# original (shuffled, non-contiguous) row index, which from_pandas would
# otherwise store as an extra `__index_level_0__` column; preserve_index=False
# drops it so only the real data columns are carried forward.
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Print the first training example to sanity-check the converted dataset.
print(train_dataset[0])

In [None]:
# Print the first held-out example to sanity-check the converted dataset.
print(test_dataset[0])

In [None]:
# Load the tokenizer associated with the MODEL_ID checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [26]:
def tokenize_function(examples):
    """
    Tokenize the TEXT_COLUMN entries of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 512 tokens,
    so all encoded examples come out the same length.

    Args:
        examples: Mapping holding the batch; its TEXT_COLUMN entry is passed
            to the module-level ``tokenizer``.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = examples[TEXT_COLUMN]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return encoded

In [None]:
# Tokenize both splits. batched=True hands tokenize_function batches of
# examples instead of one example per call.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

In [28]:
# Drop the raw text column from both splits now that it has been tokenized;
# every other column is left in place.
tokenized_train_dataset = tokenized_train_dataset.remove_columns([TEXT_COLUMN])
tokenized_test_dataset = tokenized_test_dataset.remove_columns([TEXT_COLUMN])

In [None]:
# Report which device was selected before the model is loaded onto it.
print(F"using {device} for computations")

In [None]:
# Load the MODEL_ID checkpoint for sequence classification, passing through
# the notebook's class count and label mappings, then move it onto `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    label2id=LABEL2ID,
    id2label=ID2LABEL,
    num_labels=NUM_LABELS,
)
model = model.to(device)

In [None]:
# Show the loaded model's configuration (includes the label mappings passed in).
print(model.config)

In [38]:
# LoRA adapter settings for sequence classification.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    # Module names that receive adapters.
    # NOTE(review): presumably DistilBERT's attention query/value projections
    # — confirm against the model's module names.
    target_modules=["q_lin", "v_lin"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA adapters described by peft_config.
peft_model = get_peft_model(model, peft_config)

In [None]:
# Print the trainable-parameter summary for the PEFT-wrapped model.
peft_model.print_trainable_parameters()

In [None]:
# Load the four evaluation metrics used by compute_metrics, in a fixed order.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

In [45]:
def compute_metrics(eval_pred):
    """
    Turn raw evaluation outputs into a dictionary of scores.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1". The last
        three are computed with ``average="weighted"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy_metric.compute(
            predictions=predicted_classes, references=labels
        )["accuracy"]
    }

    # The remaining metrics share the same call shape and weighted averaging.
    weighted_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for score_name, metric in weighted_metrics:
        scores[score_name] = metric.compute(
            predictions=predicted_classes, references=labels, average="weighted"
        )[score_name]

    return scores

In [None]:
# Training configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./distilbert-sentiment-english-lora-mps-model",
    # 8 examples per step with 4 accumulation steps: gradients are applied
    # after every 32 examples per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate and checkpoint on the same 200-step cadence so that
    # load_best_model_at_end can pair each evaluation with a saved checkpoint.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Name the label input explicitly. The Trainer is handed a PEFT-wrapped
    # model, and some transformers releases cannot infer the label key
    # through that wrapper; evaluation then skips compute_metrics and the
    # `accuracy` value needed by metric_for_best_model is never produced.
    label_names=["labels"],
    # fp16 is not fully supported on MPS, so it stays disabled for stability.
    fp16=False,
    save_safetensors=True,
    report_to="none",
)

In [None]:
# Assemble the Trainer: the LoRA-wrapped model, the run configuration, the
# tokenized splits, and the metric callback used at evaluation time.
# NOTE(review): newer transformers releases rename the `tokenizer` argument
# to `processing_class` — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=peft_model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

In [None]:
# Run the fine-tuning loop with the Trainer configured earlier.
trainer.train()

In [None]:
# Save the trained model to its own directory, separate from the checkpoint
# output_dir used during training.
# NOTE(review): the Trainer wraps a PEFT model, so this presumably stores the
# LoRA adapter rather than full model weights — confirm before deployment.
final_model_path = "./TSA_DistilBERT_Model"
trainer.save_model(final_model_path)

In [None]:
# Evaluate on the eval dataset given to the Trainer and keep the metrics dict.
eval_results = trainer.evaluate()

In [None]:
# Display the final evaluation metrics.
print(eval_results)