<a href="https://colab.research.google.com/github/ShouryaBatra/psbs-research-project/blob/main/BorkenPipelene.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# === Install Dependencies ===
!pip install datasets==4.0.0
!pip install einops
!pip install uv
!pip install --upgrade uv
!pip install datasets huggingface_hub scikit-learn pandas --quiet
!pip install transformers
!pip install --upgrade transformers

# === Imports ===
from huggingface_hub import snapshot_download
import pandas as pd
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import os

# === Download the Dataset from Hugging Face ===
# Mirror the dataset repository into ./datasets, skipping any file whose name
# matches one of the ignore patterns.
snapshot_download(
    repo_type="dataset",
    repo_id="parameterlab/leaky_thoughts",
    ignore_patterns=["*.arrow", "*.lock"],
    local_dir="./datasets",
)

# === Read and Clean the JSON File ===
input_path = "./datasets/airgapagent-r.json"
output_path = "./datasets/airgapagent_cleaned.json"

# Step 1: Load the file as raw bytes so a decoding failure can be handled here
with open(input_path, "rb") as f:
    raw = f.read()

try:
    text = str(raw, encoding="utf-8")
except UnicodeDecodeError as e:
    # Report the failure, then decode again substituting the undecodable bytes
    print("UnicodeDecodeError:", e)
    text = str(raw, encoding="utf-8", errors="replace")

# Step 2: Parse, then coerce every `ref_answer` to a string
parsed = json.loads(text)

for entry in parsed:
    val = entry.get("ref_answer")
    if val is None:
        # Missing or null answers become the empty string
        entry["ref_answer"] = ""
    elif isinstance(val, list):
        # Lists are flattened into one comma-separated string
        entry["ref_answer"] = ", ".join(str(item) for item in val)
    elif isinstance(val, (int, float)):
        # Numbers and booleans (bool is a subclass of int) are stringified
        entry["ref_answer"] = str(val)
    # Any other value is left untouched

# Step 3: Persist the cleaned records (optional)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(parsed, indent=2, ensure_ascii=False))

# === Convert to DataFrame and Split ===
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
df = pd.DataFrame(parsed)
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# === Convert to Hugging Face Datasets ===
# The index is reset (old index dropped) on each frame before conversion
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

dataset_dict = DatasetDict(train=train_dataset, test=test_dataset)

# ✅ Done!
print("✅ Dataset cleaned, normalized, and split into train/test!")
dataset_dict


Collecting datasets==4.0.0
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets==4.0.0)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.7.0
    Uninstalling fsspec-2025.7.0:
      Successfully uninstalled fsspec-2025.7.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

airgapagent-r-small.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

(…)irgapagent-r-ablation-swap-original.json: 0.00B [00:00, ?B/s]

airgapagent-r.json: 0.00B [00:00, ?B/s]

airgapagent-r-ablation-swap-flipped.json: 0.00B [00:00, ?B/s]

✅ Dataset cleaned, normalized, and split into train/test!


DatasetDict({
    train: Dataset({
        features: ['profile', 'domain', 'scenario', 'field', 'prompt', 'label', 'ref_answer'],
        num_rows: 3328
    })
    test: Dataset({
        features: ['profile', 'domain', 'scenario', 'field', 'prompt', 'label', 'ref_answer'],
        num_rows: 832
    })
})

In [3]:
# Creating a "debug" dataset for faster iteration
DEBUG_SIZE = 5  # number of leading training rows kept in the debug subset

# Clamp to the size of the training split so the selection cannot index past
# the end when the split holds fewer than DEBUG_SIZE rows.
debug_dataset = train_dataset.select(range(min(DEBUG_SIZE, len(train_dataset))))

from torch.utils.data import DataLoader

# One loader per split, all with the same settings: a single example per batch
# and no shuffling, so rows are yielded in dataset order.
train_dataloader, test_dataloader, debug_dataloader = (
    DataLoader(split, batch_size=1, shuffle=False)
    for split in (train_dataset, test_dataset, debug_dataset)
)



#Grabbing Model


!pip list | grep transformers

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set default device to CUDA (i.e GPU)
torch.set_default_device("cuda")

# Load the model and the corresponding tokenizer
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B", trust_remote_code=True)


# Single-prompt generation smoke test, disabled for now: the code below is wrapped
# in a string literal, so none of it executes.
# It reads 'question' and 'answer' columns, but the airgapagent rows have neither
# (their features are profile, domain, scenario, field, prompt, label, ref_answer),
# so it must be reworked for the columns in our dataset before being re-enabled.
# Because the string is the last expression in the cell, the notebook echoes it
# back as the cell's output.
'''
question, true_answer = debug_dataloader.dataset[0]['question'], debug_dataloader.dataset[0]['answer']

print("Question: ", question)

inputs = tokenizer(question, return_tensors="pt")
inputs = inputs.to("cuda")

# Read up on what max_new_tokens and do_sample do! These are two very important parameters.
outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

output_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(f"Raw output answer: {output_answer}")
'''

Dataset({
    features: ['profile', 'domain', 'scenario', 'field', 'prompt', 'label', 'ref_answer'],
    num_rows: 5
})
sentence-transformers                 4.1.0
transformers                          4.53.3


'\nquestion, true_answer = debug_dataloader.dataset[0][\'question\'], debug_dataloader.dataset[0][\'answer\']\n\nprint("Question: ", question)\n\ninputs = tokenizer(question, return_tensors="pt")\ninputs = inputs.to("cuda")\n\n# Read up on what max_new_tokens and do_sample do! These are two very important parameters.\noutputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)\n\noutput_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]\nprint(f"Raw output answer: {output_answer}")\n'