# QLoRA Finetuning on IMDb Sentiment Dataset using GPT-Neo 1.3B

In [1]:
# 1. Install required libraries 
import subprocess
subprocess.run(["pip", "install", "-U", "bitsandbytes>=0.43.0"], stdout=subprocess.DEVNULL)
subprocess.run(["pip", "install", "-U", "git+https://github.com/huggingface/transformers.git"], stdout=subprocess.DEVNULL)
subprocess.run(["pip", "install", "-U", "git+https://github.com/huggingface/accelerate.git"], stdout=subprocess.DEVNULL)
subprocess.run(["pip", "install", "-U", "peft"], stdout=subprocess.DEVNULL)

print("All required packages installed successfully.")

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-gchuq0ej

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-req-build-4n1q4e9l



All required packages installed successfully.


In [2]:
# 2. Load model and tokenizer (4-bit quantized using BitsAndBytesConfig)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, logging
import torch
from peft import get_peft_model, LoraConfig, TaskType

# Disable unnecessary generation logging
logging.set_verbosity_error()

model_name = "EleutherAI/gpt-neo-1.3B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

print('Model loaded successfully!')

Model loaded successfully!


In [3]:
# 3. Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["attention.q_proj", "attention.v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,317,148,672 || trainable%: 0.1194


In [4]:
# 4. Load TweetEval Sentiment Dataset
from datasets import load_dataset

# Load the sentiment split of TweetEval
dataset = load_dataset("tweet_eval", "sentiment")

label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

def to_tweet_sentiment(example):
    return {
        "tweet": example["text"],
        "sentiment": label_map[example["label"]]
    }

# Step 1: 80-20 split → train + test
split_1 = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_val_raw = split_1["train"]
raw_test_data = split_1["test"]

# Step 2: 80-20 split → train + val (on the 80% training set)
split_2 = train_val_raw.train_test_split(test_size=0.2, seed=42)
raw_train_data = split_2["train"]
raw_val_data = split_2["test"]

# Apply mapping to convert to {"tweet", "sentiment"} format
raw_train_data = raw_train_data.map(to_tweet_sentiment)
raw_val_data = raw_val_data.map(to_tweet_sentiment)
raw_test_data = raw_test_data.map(to_tweet_sentiment)

# Visualize some examples
print("\nSample Tweets and their Sentiments:\n")
for i in range(3):
    print(f"Tweet: {raw_val_data[i]['tweet'][:300]}")
    print(f"Sentiment: {raw_val_data[i]['sentiment']}\n")


Sample Tweets and their Sentiments:

Tweet: [Update] Oct 30th: World Date with SHINee  Key and Jonghyun mentioned in one of cast member's tweet...
Sentiment: Neutral

Tweet: Happy Friday!!!   Watching this gives me hope in our youth. Out of all the rambling  Kanye West did this past...
Sentiment: Positive

Tweet: Murray's in major crisis here - 5 - 1 down in the 2nd set after losing the 1st! scenes!!
Sentiment: Negative



In [5]:
# 5. Tokenize 
def tokenize(example):
    prompt = f"Tweet: {example['tweet']}\nSentiment: {example['sentiment']}"
    
    encoded = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=512
    )
    
    # In next-token prediction, labels = input_ids
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize
train_data = raw_train_data.map(tokenize, remove_columns=raw_train_data.column_names)
val_data = raw_val_data.map(tokenize, remove_columns=raw_val_data.column_names)
test_data = raw_test_data.map(tokenize, remove_columns=raw_test_data.column_names)

Map:   0%|          | 0/29193 [00:00<?, ? examples/s]

Map:   0%|          | 0/7299 [00:00<?, ? examples/s]

Map:   0%|          | 0/9123 [00:00<?, ? examples/s]

In [6]:
# 6. Restrict model output to only the sentiment labels
from transformers import LogitsProcessor

class RestrictVocabLogitsProcessor(LogitsProcessor):
    def __init__(self, allowed_token_ids):
        self.allowed_token_ids = allowed_token_ids

    def __call__(self, input_ids, scores):
        mask = torch.full_like(scores, float("-inf"))
        mask[:, self.allowed_token_ids] = scores[:, self.allowed_token_ids]
        return mask

sentiment_words = ["Positive", "Negative", "Neutral"]
allowed_ids = [tokenizer(word, add_special_tokens=False)["input_ids"][0] for word in sentiment_words]
logits_processor = [RestrictVocabLogitsProcessor(allowed_ids)]

print(tokenizer.convert_ids_to_tokens(allowed_ids))

['Pos', 'Neg', 'Ne']


In [7]:
# 7. Sanity Check
def predict_sentiment(review):
    prompt = f"Review: {review}\nSentiment:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,
        do_sample=False,
        logits_processor=logits_processor
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    prediction = response.replace(prompt, "").strip().split()[0]
    if prediction.lower().startswith("pos"):
        return "Positive"
    elif prediction.lower().startswith("neg"):
        return "Negative"
    else:
        return "Neutral"

prompts = [
    "Tweet: Just finished the new Spider-Man movie — absolutely loved it! Sentiment:",
    "Tweet: The product exceeded my expectations. Would definitely buy again. Sentiment:",
    "Tweet: The event starts at 8 PM and runs until midnight. Sentiment:",
    "Tweet: I received the package yesterday. It was exactly as described. Sentiment:",
    "Tweet: This was one of the most disappointing experiences I've had. Sentiment:",
    "Tweet: Terrible UI, constant crashes, and slow performance. Do better. Sentiment:"
]

for prompt in prompts:
    print(prompt[:-11])
    print("Predicted Sentiment:", predict_sentiment(prompt))

Tweet: Just finished the new Spider-Man movie — absolutely loved it!


Predicted Sentiment: Positive
Tweet: The product exceeded my expectations. Would definitely buy again.
Predicted Sentiment: Positive
Tweet: The event starts at 8 PM and runs until midnight.
Predicted Sentiment: Negative
Tweet: I received the package yesterday. It was exactly as described.
Predicted Sentiment: Positive
Tweet: This was one of the most disappointing experiences I've had.
Predicted Sentiment: Negative
Tweet: Terrible UI, constant crashes, and slow performance. Do better.
Predicted Sentiment: Negative


In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer.pad_token = tokenizer.eos_token

# Load LoRA configuration from checkpoint
checkpoint_dir = "/teamspace/studios/this_studio/checkpoints_tweeteval_lora/checkpoint-200"  # or final model path
peft_config = PeftConfig.from_pretrained(checkpoint_dir)

# Load base model and apply LoRA
model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, checkpoint_dir)

model.eval()
model.to("cuda")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoForCausalLM(
      (transformer): GPTNeoModel(
        (wte): Embedding(50257, 2048)
        (wpe): Embedding(2048, 2048)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPTNeoBlock(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPTNeoAttention(
              (attention): GPTNeoSelfAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
                (resid_dropout): Dropout(p=0.0, inplace=False)
                (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_

In [15]:
# 8. Pretraining Evaluation: Accuracy before fine-tuning
from tqdm import tqdm

correct = 0
for example in tqdm(raw_test_data, desc="Evaluating before fine-tuning"):
    prediction = predict_sentiment(example["tweet"])
    if prediction.lower() == example["sentiment"].lower():
        correct += 1
        
accuracy = correct / len(raw_test_data)
print(f"Accuracy before fine-tuning: {accuracy * 100:.2f}%")

Evaluating before fine-tuning: 100%|██████████| 9123/9123 [06:53<00:00, 22.07it/s]

Accuracy before fine-tuning: 49.42%





In [8]:
# 9. Save pretrained model
pretrained_save_path = "./gptneo_pre_finetuned_lora"

model.save_pretrained(pretrained_save_path)
tokenizer.save_pretrained(pretrained_save_path)
print(f"Pretrained model saved to: {pretrained_save_path}")

Pretrained model saved to: ./gptneo_pre_finetuned_lora


In [9]:
import gc
import torch

# Delete any existing model/data variables if needed
#del trainer
gc.collect()

torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [None]:
# 10. Training

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import TrainerCallback  # Add missing import

class LossPrinterCallback(TrainerCallback):  # Now properly inherits
    def on_epoch_end(self, args, state, control, **kwargs):
        # Find most recent training and validation losses
        train_loss = None
        val_loss = None
        
        # Search log_history in reverse order
        for log in reversed(state.log_history):
            if "loss" in log and train_loss is None:
                train_loss = log["loss"]
            if "eval_loss" in log and val_loss is None:
                val_loss = log["eval_loss"]
            if train_loss is not None and val_loss is not None:
                break
                
        print(f"\nEpoch {state.epoch}/{state.num_train_epochs}")
        print(f"Train Loss: {train_loss:.4f}" if train_loss else "No training loss recorded")
        print(f"Val Loss: {val_loss:.4f}\n" if val_loss else "No validation loss recorded\n")


# Updated training arguments
training_args = TrainingArguments(
    logging_strategy="epoch",  # Log at end of each epoch
    output_dir="./checkpoints_tweeteval_lora",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,
    learning_rate=2e-4,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    remove_unused_columns=False,
    report_to="none",  # Disable external logging
    disable_tqdm=False  # Keep progress bars
)

# Initialize trainer with custom callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[LossPrinterCallback]
)

# Start training
trainer.train()

# After training is complete
# The best model is already loaded because you set load_best_model_at_end=True
best_model_path = "./best_model_tweeteval"

# Save the best model (LoRA adapter only)
model.save_pretrained(best_model_path)
tokenizer.save_pretrained(best_model_path)

print(f"Best model saved to: {best_model_path}")

# To verify which checkpoint was selected as best:
import os
import re

checkpoints = os.listdir("./checkpoints_tweeteval_lora")
checkpoint_steps = [int(re.search(r'checkpoint-(\d+)', cp).group(1)) 
                    for cp in checkpoints if "checkpoint" in cp]
if checkpoint_steps:
    best_step = min(checkpoint_steps)  # Since we're minimizing eval_loss
    best_epoch = best_step // len(train_data) * training_args.per_device_train_batch_size
    print(f"Best model was at step {best_step} (approximately epoch {best_epoch})")

Epoch,Training Loss,Validation Loss
1,3.293,3.321787
2,3.1711,3.283777
3,3.1122,3.268982
4,3.0648,3.264747
5,3.0238,3.265459
6,2.9894,3.270247
7,2.9593,3.273029



Epoch 1.0/10
No training loss recorded
No validation loss recorded


Epoch 2.0/10
Train Loss: 3.2930
Val Loss: 3.3218


Epoch 3.0/10
Train Loss: 3.1711
Val Loss: 3.2838


Epoch 4.0/10
Train Loss: 3.1122
Val Loss: 3.2690


Epoch 5.0/10
Train Loss: 3.0648
Val Loss: 3.2647


Epoch 6.0/10
Train Loss: 3.0238
Val Loss: 3.2655


Epoch 7.0/10
Train Loss: 2.9894
Val Loss: 3.2702

