<a href="https://colab.research.google.com/github/RegNLP/ObliQA-ML/blob/main/ObliQA_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ════════════════════════════════
# Colab Cell 1 ▶️ Mount Drive & Enter Project Folder
# ════════════════════════════════
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable.
drive.mount('/content/drive')

# Change this to wherever your repo lives in Drive. The later cells use bare
# file names, so they read and write inside this folder.
%cd /content/drive/MyDrive/ObliQA-MultiPassage


In [None]:
# ════════════════════════════════
# Colab Cell 2 ▶️ Install Dependencies
# ════════════════════════════════
!pip install --upgrade openai pandas


In [None]:
# ════════════════════════════════
# Colab Cell 3 ▶️ Imports & Configuration
# ════════════════════════════════
import hashlib, json, os, time, random
from datetime import datetime
import openai

# ————— CONFIGURATION —————
# File names are bare, so they resolve against the current working directory.
INPUT_JSON          = "ObliQA_MultiPassage.json"
OUTPUT_JSON         = "ObliQA_Validated_MultiPassage.json"
OUTPUT_JSONL        = "ObliQA_Validated_MultiPassage.jsonl"
CACHE_JSON          = "ObliQA_Validation_Cache.json"
BATCH_PROGRESS_JSON = "Batch_Progress.json"

MODEL        = "gpt-4.1-2025-04-14"
SAMPLE_SIZE  = None        # or an int for debugging
SLEEP_RANGE  = (1, 3)      # (min, max) seconds slept after each question
FLUSH_EVERY  = 5           # rewrite OUTPUT_JSON after every N questions

openai.api_key = os.getenv("OPENAI_API_KEY")  # or set directly here

# Number of questions in the input file; reported as "total" in the progress
# summary. Opened with a context manager so the handle is closed promptly,
# and decoded explicitly as UTF-8 instead of the platform default.
with open(INPUT_JSON, "r", encoding="utf-8") as _input_file:
    TOTAL_QS = len(json.load(_input_file))


In [None]:
# ════════════════════════════════
# Colab Cell 4 ▶️ Utility Functions
# ════════════════════════════════
def compute_cache_key(qid, pid):
    """Return the SHA-256 hex digest of ``"<qid>|<pid>"``.

    The digest is used as the dictionary key for cached validation results.
    """
    hasher = hashlib.sha256()
    hasher.update("{}|{}".format(qid, pid).encode())
    return hasher.hexdigest()

def load_json(path):
    """Load the JSON document stored at *path*.

    Returns the parsed document, or an empty dict when *path* does not exist.
    The file is read as UTF-8 (matching how ``save_json`` writes it) and is
    closed as soon as parsing finishes.
    """
    if not os.path.exists(path):
        return {}
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def save_json(data, path):
    """Write *data* to *path* as UTF-8 JSON, overwriting any existing file.

    Output is indented by two spaces and non-ASCII characters are kept as-is.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    with open(path, mode="w", encoding="utf-8") as sink:
        # Stream the encoder's chunks straight to the file.
        for chunk in encoder.iterencode(data):
            sink.write(chunk)

def append_jsonl(entry, path):
    """Append *entry* to *path* as one line of JSON (UTF-8, non-ASCII kept)."""
    with open(path, mode="a", encoding="utf-8") as sink:
        record = json.dumps(entry, ensure_ascii=False)
        # print() supplies the trailing newline that terminates the record.
        print(record, file=sink)

def flush_jsonl(jsonl_path, json_path):
    """Rebuild the JSON array file at *json_path* from the lines of *jsonl_path*.

    Each non-blank line of *jsonl_path* is parsed as JSON and collected into a
    list, which is then written to *json_path* via ``save_json``. Lines that
    are not valid JSON are skipped (best effort), but they are counted and
    reported instead of being dropped silently.

    Raises:
        FileNotFoundError: if *jsonl_path* does not exist.
    """
    entries = []
    skipped = 0
    # The JSONL file is written as UTF-8, so read it back the same way.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                entries.append(json.loads(line))
            except ValueError:  # json.JSONDecodeError is a ValueError
                skipped += 1
    save_json(entries, json_path)
    print(f"[{datetime.now()}] Flushed {len(entries)} entries → {json_path}")
    if skipped:
        print(f"[{datetime.now()}] ⚠️ Skipped {skipped} malformed line(s) in {jsonl_path}")

def retry(fn, retries=3, backoff=1.0):
    """Call *fn* until it succeeds, for at most *retries* attempts.

    After a failed attempt number ``k`` (0-based) that is not the last one,
    sleeps ``backoff * 2**k`` seconds before trying again. The exception from
    the final attempt is re-raised. Returns whatever *fn* returns.
    """
    final_attempt = retries - 1
    for attempt in range(retries):
        try:
            outcome = fn()
        except Exception:
            if attempt == final_attempt:
                raise
            time.sleep(backoff * 2**attempt)
        else:
            return outcome

def get_done_ids(path):
    """Return the set of ``QuestionID`` values recorded in the JSONL file *path*.

    Returns an empty set when *path* does not exist. Lines that are not valid
    JSON, or that do not carry a usable ``QuestionID``, are ignored so that a
    partially written file does not block resuming.
    """
    done = set()
    if not os.path.exists(path):
        return done
    # The JSONL file is written as UTF-8, so read it back the same way.
    with open(path, encoding="utf-8") as f:
        for line in f:
            try:
                done.add(json.loads(line)["QuestionID"])
            except (ValueError, KeyError, TypeError):
                # ValueError: not JSON; KeyError: no QuestionID;
                # TypeError: record is not a dict, or the id is unhashable.
                continue
    return done

def count_done():
    """Return the number of lines in OUTPUT_JSONL, or 0 if the file is missing.

    Every line is counted, including blank or malformed ones.
    """
    if not os.path.exists(OUTPUT_JSONL):
        return 0
    # Context manager so the handle is closed rather than left to the GC.
    with open(OUTPUT_JSONL, encoding="utf-8") as f:
        return sum(1 for _ in f)

def update_progress(total, done, batch):
    """Write a progress snapshot to BATCH_PROGRESS_JSON.

    The snapshot records *total*, *done* (as ``validated``), their difference
    (``remaining``), *batch* (as ``last_batch``) and the current local time in
    ISO format (``updated``).
    """
    snapshot = {"total": total, "validated": done}
    snapshot["remaining"] = total - done
    snapshot["last_batch"] = batch
    snapshot["updated"] = datetime.now().isoformat()
    save_json(snapshot, BATCH_PROGRESS_JSON)

def show_progress():
    """Print the progress summary stored in BATCH_PROGRESS_JSON.

    Does nothing when the progress file does not exist.
    """
    if not os.path.exists(BATCH_PROGRESS_JSON):
        return
    # Context manager so the handle is closed rather than left to the GC.
    with open(BATCH_PROGRESS_JSON, encoding="utf-8") as f:
        p = json.load(f)
    print(f"✔️ Total: {p['total']}, Validated: {p['validated']}, Remaining: {p['remaining']}, Last Batch: {p['last_batch']} ({p['updated']})")


In [None]:
# ════════════════════════════════
# Colab Cell 5 ▶️ Validation Logic
# ════════════════════════════════
# System prompt sent as the "system" message of every validation request.
# It asks the model to reply with a JSON object holding "Connection" and
# "ShortReason"; validate_conn() parses the reply and checks for "Connection".
SYSTEM = """
You are validating if a Passage answers a Question.
Reply only with JSON:
{ "Connection": "Directly Connected"|"Indirectly Connected"|"Not Connected",
  "ShortReason": "..." }
"""

def validate_conn(question, passage):
    """Ask the chat model whether *passage* answers *question*.

    Sends SYSTEM plus a user message containing the question and passage to
    MODEL (temperature 0, at most 512 tokens), retrying failed calls through
    ``retry``.

    Returns:
        The parsed reply when it is a JSON object containing a "Connection"
        key; otherwise the fallback
        ``{"Connection": "Not Connected", "ShortReason": "Parse failed"}``.

    Raises:
        Whatever the API call raises once ``retry`` has exhausted its attempts.
    """
    prompt = f"Question:\n{question}\n\nPassage:\n{passage}"
    messages = [
        {"role":"system","content":SYSTEM},
        {"role":"user",  "content":prompt}
    ]

    def call():
        # openai>=1.0 removed ``openai.ChatCompletion``; its replacement is the
        # module-level ``openai.chat.completions`` client. Fall back to the
        # legacy entry point only on a pre-1.0 install, which has no
        # ``openai.chat`` attribute.
        chat = getattr(openai, "chat", None)
        if chat is not None:
            create = chat.completions.create
        else:
            create = openai.ChatCompletion.create
        return create(
            model=MODEL,
            messages=messages,
            temperature=0,
            max_tokens=512
        )

    resp = retry(call)
    # The message content may be None; treat that as an empty reply.
    raw = (resp.choices[0].message.content or "").strip()
    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError
        parsed = None
    # Require a JSON object: a bare JSON string such as "Connection" would
    # otherwise satisfy the ``in`` test and be returned as a str.
    if isinstance(parsed, dict) and "Connection" in parsed:
        return parsed
    return {"Connection":"Not Connected","ShortReason":"Parse failed"}


In [None]:
# ════════════════════════════════
# Colab Cell 6 ▶️ Main Loop
# ════════════════════════════════
def main():
    """Validate every not-yet-processed question and write the results.

    For each pending question, every passage is classified with
    ``validate_conn`` (or taken from the on-disk cache), the merged record is
    appended to OUTPUT_JSONL, and OUTPUT_JSON is rebuilt every FLUSH_EVERY
    questions and once more at the end. A progress snapshot is written and
    printed when the loop finishes.

    Raises:
        RuntimeError: if no OpenAI API key is configured.
    """
    if not openai.api_key:
        raise RuntimeError("Set OPENAI_API_KEY!")
    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises
    # FileNotFoundError, so only create directories that are actually named.
    for target in (OUTPUT_JSON, CACHE_JSON):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    cache = load_json(CACHE_JSON)
    with open(INPUT_JSON, encoding="utf-8") as f:
        all_qs = json.load(f)
    # Resume support: skip questions already present in the JSONL output.
    done_ids = get_done_ids(OUTPUT_JSONL)
    to_do = [q for q in all_qs if q["QuestionID"] not in done_ids]
    if SAMPLE_SIZE:
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the sample at the number of pending questions.
        to_do = random.sample(to_do, min(SAMPLE_SIZE, len(to_do)))

    print(f"🔍 {len(to_do)} questions to validate")
    processed = 0

    for i, item in enumerate(to_do, 1):
        new_passages = []
        for p in item["Passages"]:
            key = compute_cache_key(item["QuestionID"], f"{p['DocumentID']}#{p['PassageID']}")
            if key in cache:
                result = cache[key]
            else:
                result = validate_conn(item["Question"], p["Passage"])
                cache[key] = result
                # Persist after every API call so an interrupted run keeps
                # the results it has already obtained.
                save_json(cache, CACHE_JSON)
            # Passage fields first, then the validation result on top.
            new_passages.append({**p, **result})

        append_jsonl({
            "QuestionID": item["QuestionID"],
            "Question": item["Question"],
            "Passages": new_passages
        }, OUTPUT_JSONL)

        processed += 1
        print(f"[{datetime.now()}] ✅ {i}/{len(to_do)}")

        if i % FLUSH_EVERY == 0:
            flush_jsonl(OUTPUT_JSONL, OUTPUT_JSON)

        time.sleep(random.uniform(*SLEEP_RANGE))

    # Nothing has ever been written when there was no work and no earlier
    # run; flushing would then fail on the missing JSONL file.
    if os.path.exists(OUTPUT_JSONL):
        flush_jsonl(OUTPUT_JSONL, OUTPUT_JSON)
    update_progress(TOTAL_QS, count_done(), processed)
    print("\n🏁 Done!")
    show_progress()

if __name__=="__main__":
    main()


In [None]:
# ════════════════════════════════
# Colab Cell 7 ▶️ Dataset Statistics
# ════════════════════════════════
import json
from collections import Counter

# Read the validated dataset back from disk
with open(OUTPUT_JSON, mode="r", encoding="utf-8") as f:
    validated = json.load(f)

# 1) How many questions have 1, 2, 3, ... passages
passage_counts = Counter()
for item in validated:
    passage_counts[len(item["Passages"])] += 1
print("📊 Questions by # of passages:")
for num_passages, n_questions in sorted(passage_counts.items()):
    print(f"- {n_questions:5d} questions have {num_passages} passages")

# 2) Grand totals: passages = sum over (passages per question × questions)
total_qs = len(validated)
total_passages = sum(size * freq for size, freq in passage_counts.items())
print(f"\nTotal Questions: {total_qs}")
print(f"Total Passages  : {total_passages}")


In [None]:
# ════════════════════════════════
# Colab Cell 8 ▶️ Split into Train/Val/Test
# ════════════════════════════════
import random
import json

# Shuffle in place with a fixed seed so that re-running this cell on the same
# validated file reproduces exactly the same train/val/test membership.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(validated)

# 70% train, 15% validation, remainder (about 15%) test.
total = len(validated)
train_end = int(0.70 * total)
val_end   = train_end + int(0.15 * total)

train_set = validated[:train_end]
val_set   = validated[train_end:val_end]
test_set  = validated[val_end:]

# Save splits
for split_path, split_rows in (
    ("ObliQA_MultiPassage_train.json", train_set),
    ("ObliQA_MultiPassage_val.json", val_set),
    ("ObliQA_MultiPassage_test.json", test_set),
):
    with open(split_path, "w", encoding="utf-8") as f:
        json.dump(split_rows, f, ensure_ascii=False, indent=2)

# Avoid ZeroDivisionError in the percentages when the dataset is empty.
denom = total or 1
print("✅ Split complete:")
print(f"- Train:      {len(train_set)} ({len(train_set)/denom:.2%})")
print(f"- Validation: {len(val_set)} ({len(val_set)/denom:.2%})")
print(f"- Test:       {len(test_set)} ({len(test_set)/denom:.2%})")
