# Take-Home Task - Quantamind

## Converting JSON data to PDF

In [1]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.2-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.2-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.0 MB[0m [31m14.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m42.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.2


In [2]:
import json
from reportlab.platypus import SimpleDocTemplate, Table, Paragraph, Spacer
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet

# Path to your TAT-QA JSON file
json_path = "/content/tatqa_dataset_train.json"  # or "tatqa_dataset_train.json"

# Load TAT-QA data
with open(json_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Prepare PDF
doc = SimpleDocTemplate("tatqa_output.pdf", pagesize=letter)
styles = getSampleStyleSheet()
elements = []

# Process first N samples (adjust as needed)
N = 10
for item in dataset[:N]:
    table_data = item["table"]["table"]
    if not table_data or not isinstance(table_data, list):
        continue

    # Extract header and rows
    header, *rows = table_data
    t = Table([header] + rows, repeatRows=1)
    t.setStyle([
        ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
        ("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),
    ])
    elements.append(t)
    elements.append(Spacer(1, 12))

    for q in item["questions"]:
        question = q["question"]
        answer = q.get("answer", "N/A")
        elements.append(Paragraph(f"<b>Q:</b> {question}", styles["BodyText"]))
        elements.append(Paragraph(f"<b>A:</b> {answer}", styles["BodyText"]))
        elements.append(Spacer(1, 10))

    elements.append(Spacer(1, 24))  # space between tables

# Build PDF
doc.build(elements)
print("✅ PDF saved as: tatqa_output.pdf")

✅ PDF saved as: tatqa_output.pdf


# Fine-tuning LLM

## Fine-tuning setup

In [13]:
!pip install -q transformers datasets peft bitsandbytes accelerate evaluate nltk pandas

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineG

In [5]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="meta-llama/Llama-3.2-1B-Instruct", repo_type="model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/41.7k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

LICENSE.txt:   0%|          | 0.00/7.71k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

original/consolidated.00.pth:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

USE_POLICY.md:   0%|          | 0.00/6.02k [00:00<?, ?B/s]

params.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

original/tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

'/root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6'

## Fine-tuning LLM using LoRa PEFT

In [6]:
import pdfplumber
import json

output_data = []

with pdfplumber.open("tatqa_output.pdf") as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue

        # Optional: split Q/A
        qas = [line.strip() for line in text.split("\n") if line.startswith("Q:") or line.startswith("A:")]

        for i in range(0, len(qas), 2):
            if i + 1 < len(qas):
                question = qas[i].replace("Q:", "").strip()
                answer = qas[i + 1].replace("A:", "").strip()
                output_data.append({
                    "instruction": question,
                    "output": answer
                })

# Save as JSONL
with open("dataset.jsonl", "w") as f:
    for item in output_data:
        f.write(json.dumps(item) + "\n")

print(f"✅ Extracted and saved {len(output_data)} samples")

✅ Extracted and saved 60 samples


In [8]:
import torch
import json
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType

output_dir = "llama3.2-lora-finetuned"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", token=True, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    token=True,
    quantization_config=bnb_config,
    device_map="auto"
)

# Inspect to find target modules — e.g., 'q_proj', 'v_proj'
target_modules = ["q_proj", "v_proj"]  # You can verify via `model`'s named modules

# LoRA config
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules = target_modules
)

model = get_peft_model(model, peft_config)

# Load from raw .jsonl manually to avoid cache issues
with open("dataset.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

# Create HuggingFace Dataset from list of dicts
dataset = Dataset.from_list(data)

# # Load your dataset
# dataset = Dataset.from_json("dataset.jsonl")

# Format prompt: instruction + output (no input field)
def format_prompt(example):
    return {"text": f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"}

dataset = dataset.map(format_prompt)

# Tokenize
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized = dataset.map(tokenize)

# Training setup
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Save LoRA adapter
model.save_pretrained(output_dir)

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,4.5167
20,4.1045


In [9]:
from google.colab import files
import shutil

shutil.make_archive("llama3.2-finetune-weights", 'zip', output_dir)
files.download("llama3.2-finetune-weights.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Fine-tuning using Prefix method

In [11]:
import torch
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import get_peft_model, PrefixTuningConfig, TaskType

output_dir = "llama3.2-prefix-tuned"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", token=True, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization config
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    token=True,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prefix Tuning config
peft_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    encoder_hidden_size=128,
    prefix_projection=True
)

# Apply PEFT
model = get_peft_model(model, peft_config)

# Load from raw .jsonl manually to avoid cache issues
with open("dataset.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

# Create HuggingFace Dataset from list of dicts
dataset = Dataset.from_list(data)

# Format prompt
def format_prompt(example):
    return {"text": f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"}
dataset = dataset.map(format_prompt)

# Tokenize
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
tokenized = dataset.map(tokenize)

# Training config
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train
trainer.train()

# Save
model.save_pretrained(output_dir)


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,5.7213
20,4.3931


In [12]:
from google.colab import files
import shutil

shutil.make_archive("llama3.2-prefix-tune-weights", 'zip', output_dir)
files.download("llama3.2-prefix-tune-weights.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Sample testing

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from peft import PeftModel
import evaluate
import nltk
import pandas as pd
import json
nltk.download("punkt")

# Configuration
base_model = "meta-llama/Llama-3.2-1B-Instruct"
lora_path = "llama3.2-lora-finetuned"
prefix_path = "llama3.2-prefix-finetuned"
test_data_path = "tatqa_dataset_test.json"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token

# Load from standard JSON array file
with open(test_data_path, "r") as f:
    json_list = json.load(f)

# Flatten the nested structure into list of {question, output, context}
flattened = []
for entry in json_list:
    # Create context from table and paragraphs
    context_parts = []

    # Include paragraphs as context
    if "paragraphs" in entry:
        context_parts.extend(p["text"] for p in entry["paragraphs"])

    # Include table rows as text
    if "table" in entry:
        table_rows = entry["table"]["table"]
        for row in table_rows:
            context_parts.append(" | ".join(row))

    context_text = "\n".join(context_parts)

    # For each question in the entry
    for q in entry["questions"]:
        if "question" in q and "uid" in q:
            flattened.append({
                "question": q["question"],
                "output": q.get("answer", "N/A"),  # Or provide correct key for reference answer
                "context": context_text
            })

# Convert to HF dataset
test_dataset = Dataset.from_list(flattened)

# Metrics
exact_match = evaluate.load("exact_match")
bleu = evaluate.load("bleu")

# Formatting prompt
def build_prompt(example):
    return f"### Instruction:\n{example['question']}\n\n### Response:\n"

# Evaluation function
def evaluate_model(peft_path, model_label):
    model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16, device_map="auto", local_files_only=True)
    model = PeftModel.from_pretrained(model, peft_path)
    model.eval()

    predictions = []
    references = []
    prompts = []

    for item in test_dataset:
        prompt = build_prompt(item)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=128)
        decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        response = decoded.split("### Response:")[-1].strip()

        prompts.append(prompt)
        predictions.append(response)
        references.append(item["output"].strip())

    # Compute scores
    em_score = exact_match.compute(predictions=predictions, references=references)["exact_match"]
    bleu_score = bleu.compute(predictions=predictions, references=references)["bleu"]

    print(f"🔍 {model_label} — EM: {em_score:.4f}, BLEU: {bleu_score:.4f}")

    return pd.DataFrame({
        "prompt": prompts,
        f"{model_label}_prediction": predictions,
        "ground_truth": references,
    }), em_score, bleu_score

# Run evaluation
df_lora, em_lora, bleu_lora = evaluate_model(lora_path, "lora")
df_prefix, em_prefix, bleu_prefix = evaluate_model(prefix_path, "prefix")

# Combine results
comparison_df = df_lora.copy()
comparison_df["prefix_prediction"] = df_prefix["prefix_prediction"]

# Optional: Save to CSV
comparison_df.to_csv("finetuning_comparison_results.csv", index=False)

# Print summary
print("\n📊 Final Comparison:")
print(f"LoRA   → EM: {em_lora:.4f}, BLEU: {bleu_lora:.4f}")
print(f"Prefix → EM: {em_prefix:.4f}, BLEU: {bleu_prefix:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_