# Askllama-reddit: Fine-tuning Llama-2-7b on Reddit ML Discussions

This notebook fine-tunes Meta's Llama-2-7b model on Reddit machine learning discussion data
using QLoRA (4-bit quantization + LoRA adapters) for parameter-efficient training.

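**Requirements:** Google Colab with a T4 GPU (free tier works), and a Hugging Face account that has been granted access to the gated `meta-llama/Llama-2-7b-hf` model (you will be asked to log in before the model is downloaded).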

**Steps:**
1. Install dependencies
2. Prepare data (deduplicate, format, split)
3. Load base model with 4-bit quantization
4. Configure LoRA adapters
5. Train with SFTTrainer
6. Evaluate and visualize training loss
7. Merge adapters and save final model
8. Test inference

In [None]:
# Cell 0: Install dependencies
!pip install -q \
    transformers>=4.36.0 \
    trl>=0.7.0 \
    peft>=0.7.0 \
    accelerate>=0.25.0 \
    datasets>=2.14.0 \
    bitsandbytes>=0.41.0 \
    huggingface_hub \
    sentencepiece \
    protobuf \
    einops \
    scipy \
    matplotlib

# Optional: install wandb for experiment tracking (skip if you don't have an account)
# !pip install -q wandb

In [None]:
# Cell 1: Imports and Hugging Face login
import os
import json
import random
import torch
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from huggingface_hub import notebook_login

# Login to Hugging Face (needed for Llama-2 gated model access).
# This must run before the cell that downloads "meta-llama/Llama-2-7b-hf";
# the account used here needs to have been granted access to that model.
notebook_login()

In [None]:
# Cell 2: Upload data and prepare it
# Upload your custjsonl.jsonl file to Colab, or mount Google Drive.
#
# Option A: Upload directly
# from google.colab import files
# uploaded = files.upload()  # upload custjsonl.jsonl
#
# Option B: Mount Google Drive (if data is stored there)
# from google.colab import drive
# drive.mount('/content/drive')

# Path of the input file; it is read as UTF-8 text with one JSON value per line.
RAW_DATA_PATH = "custjsonl.jsonl"  # adjust path if using Drive

# --- Data preparation (inline version of scripts/prepare_data.py) ---
# Records whose "comments" text, after stripping surrounding whitespace, is
# shorter than this many characters are dropped by the filtering step below.
MIN_COMMENT_LENGTH = 10
# Fraction of the filtered records held out for validation. The split below
# always takes at least one record for validation.
VAL_RATIO = 0.1
# Seed passed to random.seed() before shuffling, so the split is reproducible.
RANDOM_SEED = 42


def format_prompt(title, post_content, comments):
    """Build the prompt text for one post.

    The result has three sections, in this order, separated by blank lines:
    ``### Post Title:``, ``### Post Content:`` and ``### Top Comments:``.
    Each value is stripped of surrounding whitespace before being inserted.

    Args:
        title: Post title. ``None`` is treated as an empty string; any other
            non-string value is converted with ``str()``.
        post_content: Post body, handled the same way as ``title``.
        comments: Comment text, handled the same way as ``title``.

    Returns:
        The formatted prompt as a single string.
    """

    def _clean(value):
        # JSON nulls arrive as None; calling .strip() on them would raise
        # AttributeError, so map them to an empty section instead.
        return "" if value is None else str(value).strip()

    return (
        f"### Post Title:\n{_clean(title)}\n\n"
        f"### Post Content:\n{_clean(post_content)}\n\n"
        f"### Top Comments:\n{_clean(comments)}"
    )


# Load
# One JSON value is expected per non-blank line. Lines that are not valid JSON,
# or that decode to something other than an object, are skipped -- but counted
# and reported, so silent data loss is visible in the cell output.
raw_records = []
skipped_lines = 0
with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            skipped_lines += 1
            continue
        if isinstance(record, dict):
            raw_records.append(record)
        else:
            skipped_lines += 1

print(f"Raw records: {len(raw_records)}")
if skipped_lines:
    print(f"Skipped lines (invalid JSON or not a JSON object): {skipped_lines}")

# The three record fields that make up a prompt, in format_prompt() argument order.
PROMPT_FIELDS = ("title", "post_content", "comments")


def _field_text(record, name):
    """Return record[name] as a string; a missing or null field becomes ""."""
    value = record.get(name)
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


# Deduplicate on the exact (title, post_content, comments) triple, keeping the
# first occurrence. Going through _field_text keeps the key hashable even when a
# field is not a plain string.
seen = set()
unique = []
for r in raw_records:
    key = tuple(_field_text(r, name) for name in PROMPT_FIELDS)
    if key not in seen:
        seen.add(key)
        unique.append(r)

print(f"After deduplication: {len(unique)}")

# Filter short comments
filtered = [
    r for r in unique
    if len(_field_text(r, "comments").strip()) >= MIN_COMMENT_LENGTH
]
print(f"After filtering: {len(filtered)}")

# A usable split needs at least one validation and one training record.
# Fail here with a clear message rather than later with an IndexError on an
# empty training set.
if len(filtered) < 2:
    raise ValueError(
        f"Need at least 2 usable records to build train/validation splits, "
        f"got {len(filtered)} from {RAW_DATA_PATH!r}"
    )

# Format prompts (same field access as dedup/filter, so a record with a
# missing field gets an empty section instead of raising KeyError)
for r in filtered:
    r["text"] = format_prompt(*(_field_text(r, name) for name in PROMPT_FIELDS))

# Split: shuffle reproducibly, then take the first val_size records for validation
random.seed(RANDOM_SEED)
random.shuffle(filtered)
val_size = max(1, int(len(filtered) * VAL_RATIO))
val_data = filtered[:val_size]
train_data = filtered[val_size:]

print(f"Train: {len(train_data)}, Validation: {len(val_data)}")

# Convert to HF Datasets
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# Preview (first 500 characters of the first training example)
print("\n--- Sample formatted entry ---")
print(train_dataset[0]["text"][:500])

In [None]:
# Cell 3: Load base model with 4-bit quantization
BASE_MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# Tokenizer: the EOS token doubles as the padding token, and padding is
# applied on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Quantization settings: 4-bit NF4 weights with float16 compute.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Base model, quantized at load time and placed with device_map="auto".
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, **model_load_kwargs)

# Config tweaks applied before training.
base_model.config.pretraining_tp = 1
base_model.config.use_cache = False

allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f"Model loaded. GPU memory: {allocated_gb:.2f} GB")

In [None]:
# Cell 4: Configure LoRA and training

# --- Run-level settings ---
OUTPUT_DIR = "./results"
MAX_SEQ_LENGTH = 512

# Set report_to="wandb" if you have a wandb account, otherwise use "none"
REPORT_TO = "none"  # Change to "wandb" to enable experiment tracking

# --- LoRA adapter settings (rank 64 adapters on q_proj and v_proj) ---
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# --- Trainer settings ---
# Evaluation runs every 50 steps and checkpoints are written every 100 steps;
# the checkpoint with the best eval_loss is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to=REPORT_TO,
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=30,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    # logging / evaluation / checkpointing
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = SFTTrainer(
    model=base_model,
    args=training_args,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)

# Parameter counts, gathered in a single pass over the model's parameters.
param_sizes = [(p.numel(), p.requires_grad) for p in base_model.parameters()]
trainable_param_count = sum(size for size, needs_grad in param_sizes if needs_grad)
total_param_count = sum(size for size, _ in param_sizes)

print("Trainer initialized.")
print(f"  Trainable params: {trainable_param_count:,}")
print(f"  Total params: {total_param_count:,}")

In [None]:
# Cell 5: Train
# Clear PyTorch's cached GPU allocations before the training run starts.
torch.cuda.empty_cache()
train_result = trainer.train()

# Summary of the finished run: the trainer's global step counter and the
# training_loss value returned by trainer.train().
summary_lines = [
    "\nTraining complete!",
    f"  Total steps: {trainer.state.global_step}",
    f"  Final train loss: {train_result.training_loss:.4f}",
]
print("\n".join(summary_lines))

In [None]:
# Cell 6: Visualize training loss
log_history = trainer.state.log_history

# Walk the log once, collecting (step, value) series for both curves.
# Entries carrying "loss" feed the training curve; entries carrying
# "eval_loss" feed the validation curve.
train_steps, train_losses = [], []
eval_steps, eval_losses = [], []
for entry in log_history:
    if "loss" in entry:
        train_steps.append(entry["step"])
        train_losses.append(entry["loss"])
    if "eval_loss" in entry:
        eval_steps.append(entry["step"])
        eval_losses.append(entry["eval_loss"])

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_steps, train_losses, label="Training Loss", alpha=0.8)
# The validation curve is drawn only when at least one evaluation was logged.
if eval_losses:
    ax.plot(eval_steps, eval_losses, label="Validation Loss", marker="o", alpha=0.8)
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.set_title("Training & Validation Loss")
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Cell 7: Save LoRA adapters and merge into base model
#
# Outputs, both under OUTPUT_DIR:
#   final_adapter/  - adapter weights from trainer.model, plus the tokenizer
#   merged/         - base model with the adapters merged in, plus the tokenizer
#
# NOTE: this cell deletes `trainer` and `base_model`. Cells that use them
# (training, the loss plot) cannot be re-run afterwards without re-creating them.
import os
import gc

# Save the LoRA adapters
adapter_dir = os.path.join(OUTPUT_DIR, "final_adapter")
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
print(f"LoRA adapters saved to {adapter_dir}")

# Free GPU memory from training before merging
# (T4 only has 16GB — can't hold both the training model and merge model)
# Order matters: drop the Python references first, then collect garbage, then
# release PyTorch's cached GPU blocks.
del trainer
del base_model
gc.collect()
torch.cuda.empty_cache()
print(f"\nFreed GPU memory. Current usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

# Merge LoRA into base model for easier inference
print("Merging LoRA adapters into base model...")
merged_dir = os.path.join(OUTPUT_DIR, "merged")

# Reload base model in fp16 for merging
# (no quantization_config here, unlike the 4-bit load used for training)
base_model_for_merge = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load and merge adapters
# The adapters saved above are attached to the fp16 base model, then
# merge_and_unload() is called and its result replaces `merged_model`.
merged_model = PeftModel.from_pretrained(base_model_for_merge, adapter_dir)
merged_model = merged_model.merge_and_unload()

# Save merged model
# The tokenizer is saved alongside so merged_dir can be loaded on its own.
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)
print(f"Merged model saved to {merged_dir}")

# Clean up
# Release the merge-time model so the next cell can load from merged_dir.
del merged_model, base_model_for_merge
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Cell 8: Test inference
from transformers import pipeline

# Text-generation pipeline backed by the merged model directory, which also
# holds the saved tokenizer.
merged_dir = os.path.join(OUTPUT_DIR, "merged")
pipe = pipeline(
    task="text-generation",
    model=merged_dir,
    tokenizer=merged_dir,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Test prompts: each follows the "Post Title / Post Content / Top Comments"
# layout and stops right after the "### Top Comments:" header, leaving the
# comments for the model to generate.
test_prompts = [
    (
        "### Post Title:\nWhat is the best way to fine-tune a large language model?\n\n"
        "### Post Content:\nI have a dataset of domain-specific text and want to adapt an LLM. "
        "What approaches work best for a 7B model?\n\n"
        "### Top Comments:"
    ),
    (
        "### Post Title:\nHow does LoRA compare to full fine-tuning?\n\n"
        "### Post Content:\nI'm considering using LoRA for my project. "
        "What are the trade-offs vs full fine-tuning?\n\n"
        "### Top Comments:"
    ),
]

# Sampling settings shared by every test generation.
generation_kwargs = {
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "max_new_tokens": 200,
}

banner = "=" * 60
for test_number, prompt in enumerate(test_prompts, start=1):
    print(f"\n{banner}")
    print(f"Test {test_number}")
    print(banner)
    output = pipe(prompt, **generation_kwargs)
    print(output[0]["generated_text"])