In [None]:
# !pip install -q transformers peft datasets evaluate accelerate torch scikit-learn
# !pip install -U git+https://github.com/huggingface/accelerate.git
# !pip install -U git+https://github.com/huggingface/transformers.git
# !pip install tensorflow
# !pip install -U datasets huggingface_hub fsspec
# !pip install peft trl

## Imports

In [None]:
import torch
import tensorflow.compat.v1 as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig, AutoPeftModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Loading and Evaluating a Foundation Model

## Loading the model, tokenizer, dataset

In [None]:
def set_seed(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy's global RNG and PyTorch
    (CPU plus all CUDA devices), forces cuDNN into deterministic mode, and
    exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-scope import keeps this cell runnable on its own.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning so repeated runs pick the same kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Exported for processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

SEED = 42
set_seed(SEED)
print(f"Random seed set to {SEED}")

### Load dataset

In [None]:
# Fetch the IMDb dataset and show which splits/columns came back.
print("Loading IMDb dataset...")
dataset = load_dataset("imdb")
print("IMDb dataset loaded.", f"Dataset structure: {dataset}", sep="\n")

### Load and Configure Tokenizer

In [None]:
print("Initializing GPT2Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse the end-of-sequence token for padding and pad on the left-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print(f"Tokenizer padding side set to: {tokenizer.padding_side}")
print(f"Tokenizer pad token set to: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")

In [None]:
# Tokenization helper applied to each batch of examples by `dataset.map`.
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch, truncating to 512 tokens.

    Args:
        examples: Batch of examples exposing a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, max_length=512, truncation=True)

# Tokenize every split in batches.
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(preprocess_function, batched=True)
print("Dataset tokenization complete.")

# The Trainer looks for a 'labels' column, and the raw 'text' is no longer
# needed once it has been tokenized.
tokenized_dataset = (
    tokenized_dataset
    .rename_columns({"label": "labels"})
    .remove_columns(["text"])
)

# Pad per batch at collation time instead of padding the whole dataset up
# front to the maximum length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("DataCollatorWithPadding initialized.")


In [None]:
# Pull out the two splits used for training and evaluation.
train_dataset, test_dataset = (
    tokenized_dataset[split_name] for split_name in ("train", "test")
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

### Model and Tokenizer Setup

In [None]:
# Load GPT-2 with a two-class sequence-classification head.

label2id = {"NEGATIVE": 0, "POSITIVE": 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)
print("AutoModelForSequenceClassification model loaded.")

# Keep the model's padding ID in step with the tokenizer, which pads with
# its end-of-sequence token.
model.config.pad_token_id = tokenizer.eos_token_id
print(f"Model's pad_token_id set to: {model.config.pad_token_id}")


In [None]:
# Metric callback handed to the Trainer: accuracy, precision, recall and F1.
def compute_metrics(p):
    """Turn a ``(predictions, labels)`` pair into classification metrics.

    Args:
        p: Pair unpacked as ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` entries
        (the last three computed with ``average="binary"``).
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        references, predicted_classes, average="binary"
    )
    return dict(
        accuracy=accuracy_score(references, predicted_classes),
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [None]:
# Define training hyperparameters
# (no trailing whitespace in the path: it would end up in the directory name)
output_dir = "./results/baseline_model_evaluation"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the best F1 once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)
print("Training arguments defined.")

# Initialize the Trainer class
# NOTE(review): newer transformers releases may prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version.
print("Initializing Trainer...")
baseline_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer initialized.")

# Evaluate before any training so there is a reference point to compare with.
baseline_eval_results = baseline_trainer.evaluate()
print(f"Evaluation baseline model results: {baseline_eval_results}")

# Start training
print("Starting training...")
baseline_trainer.train()
print("Training complete.")

## Train the Model

In [None]:
# LoRA adapter settings for sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"],
    # The targeted GPT-2 projections are Conv1D layers whose weights are
    # stored as (fan_in, fan_out); declare that explicitly instead of relying
    # on PEFT to detect it and emit a warning.
    fan_in_fan_out=True,
)

print("LoRA configuration defined:")
print(lora_config)

# Wrap the AutoModelForSequenceClassification model with the LoraConfig using get_peft_model()
# NOTE: this rebinds `model`; re-running the cell would wrap the already
# wrapped model again, so restart from the model-loading cell instead.
model = get_peft_model(model, lora_config)
print("PEFT model created with LoRA.")

# Print the number of trainable parameters
model.print_trainable_parameters()

In [None]:
# Hyperparameters for the LoRA run; checkpoints and logs live under output_dir.
output_dir = "./results/peft"
training_args = TrainingArguments(
    # Where things are written
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    report_to="none",
    # Optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate, log and checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Restore the checkpoint with the highest F1 at the end
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
print("Training arguments defined.")

# Build the Trainer around the current `model`.
print("Initializing Trainer...")
peft_trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
print("Trainer initialized.")

# Evaluate once up front, then train.
print("Starting training...")

peft_eval_results = peft_trainer.evaluate()
print(f"Evaluation peft model results: {peft_eval_results}")

peft_trainer.train()

print("Training complete.")

### Evaluating the model

In [None]:
# Side-by-side recap of the two evaluation passes captured earlier.
print(f"Evaluation base model results: {baseline_eval_results}")
print(f"Evaluation peft model results: {peft_eval_results}")

# Run the test split through the PEFT trainer to obtain per-example outputs.
predictions_output = peft_trainer.predict(test_dataset)
true_labels = predictions_output.label_ids
predictions = np.argmax(predictions_output.predictions, axis=1)

# Per-class precision / recall / F1 table.
print("\nClassification Report:")
print(
    classification_report(
        true_labels,
        predictions,
        target_names=["Negative", "Positive"],
    )
)

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(true_labels, predictions)
plt.figure(figsize=(8, 6))
heatmap_axes = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Negative", "Positive"],
    yticklabels=["Negative", "Positive"],
)
heatmap_axes.set_xlabel("Predicted Label")
heatmap_axes.set_ylabel("True Label")
heatmap_axes.set_title("Confusion Matrix for Sentiment Classification")
plt.show()
print("Confusion matrix plotted.")

## Saving and Loading PEFT Adapter

In [None]:
# Demonstrate how to save the lightweight PEFT adapter
# The target folder is derived from the current value of `output_dir`.
peft_model_id = os.path.join(output_dir, "save_peft")
print(f"Saving PEFT adapter to: {peft_model_id}")
peft_trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id) # Save tokenizer as well for consistency
print("PEFT adapter and tokenizer saved.")

# Nothing is loaded in this cell; only report where the adapter can be
# reloaded from later.
print(f"PEFT adapter can be reloaded later from: {peft_model_id}")

In [None]:
# Define training hyperparameters
output_dir = "./results/finetune"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the best F1 once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)
print("Training arguments defined.")

# Initialize the Trainer class
# NOTE(review): `model` is still the LoRA-wrapped model that `peft_trainer`
# trained above, so this run continues from that state rather than starting
# from a fresh model — confirm this is the intended "finetune" setup.
print("Initializing Trainer...")
finetune_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer initialized.")

# Evaluate first; the label names this run so the output is not confused with
# the earlier PEFT evaluation.
finetune_eval_results = finetune_trainer.evaluate()
print(f"Evaluation finetune model results: {finetune_eval_results}")

# Start training
print("Starting training...")
finetune_trainer.train()
print("Training complete.")

In [None]:
# Load the PEFT adapter
loaded_peft_model = AutoPeftModelForSequenceClassification.from_pretrained(peft_model_id)

# The adapter folder holds adapter and tokenizer files only, so the base
# model's config presumably comes back with default label names and no pad
# token. Restore the settings used during training so predictions map to
# NEGATIVE / POSITIVE and padded batches are handled.
loaded_peft_model.config.id2label = id2label
loaded_peft_model.config.label2id = label2id
loaded_peft_model.config.pad_token_id = tokenizer.pad_token_id

# Make inference mode explicit (disables dropout).
loaded_peft_model.eval()

print("PEFT adapter successfully loaded for inference demonstration.")

In [None]:
# Run a single hand-written review through the reloaded adapter.
text = "This movie was absolutely fantastic! I loved every minute of it."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Gradients are not needed for a forward-only prediction.
with torch.no_grad():
    outputs = loaded_peft_model(**inputs)

logits = outputs.logits
predicted_class_id = int(logits.argmax())
print(f"Review: '{text}' -> Predicted Sentiment: {loaded_peft_model.config.id2label[predicted_class_id]}")
