In [1]:
import os
import json
import random
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

# -----------------------
# Setup
# -----------------------
load_dotenv()  # load environment variables from .env

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://api.llm7.io/v1"
)

# -----------------------
# Directories
# -----------------------
data_dir = "../data"                    # input files location
output_dir = "../data/questions"        # output directory
os.makedirs(output_dir, exist_ok=True)

# -----------------------
# File groups
# -----------------------
newlaws = ["bns.json", "bnss.json", "bsa.json"]
oldlaws = ["ipc.json", "crpc.json", "iea.json"]
mapping = ["mapping_of_laws.json"]

# -----------------------
# Prompt intro
# -----------------------
llm_prompt_intro = """You are a legal question generator.
For the given chunk of law text, generate exactly 5 questions that are clear, specific, and uniquely tied to the chunk.
Include section numbers, law titles, or abbreviations wherever possible. Avoid vague questions.

Rules:
- Each question must be self-contained and concise.
- Do not copy sentences directly from the chunk.
- Output JSON only, with keys: "chunk_id" and "llm_questions".
- "llm_questions" must be an array of exactly 5 strings.

Return strictly in this format:
{"chunk_id":"<chunk_id>","llm_questions":["q1","q2","q3","q4","q5"]}

Now the chunk JSON follows (do not add any commentary)."""

# -----------------------
# Safe JSON loader
# -----------------------
def safe_json_loads(content):
    try:
        return json.loads(content)
    except:
        return None

# -----------------------
# Extract LLM response content
# -----------------------
def _extract_content_from_response(resp):
    try:
        choices = getattr(resp, "choices", None)
    except Exception:
        choices = None

    if choices is None and isinstance(resp, dict):
        choices = resp.get("choices")

    if choices:
        first = choices[0]
        msg = getattr(first, "message", None)
        if msg:
            return getattr(msg, "content", None)
        if isinstance(first, dict):
            m = first.get("message")
            if isinstance(m, dict):
                return m.get("content")
            if "text" in first:
                return first["text"]
        if hasattr(first, "text"):
            return getattr(first, "text")
    return None

# -----------------------
# LLM question generator
# -----------------------
def generate_llm_questions(chunk):
    chunk_payload = {
        "chunk_id": chunk.get("chunk_id"),
        "doc_id": chunk.get("doc_id", ""),
        "text": chunk.get("text", ""),
        "metadata": chunk.get("metadata", {})
    }
    prompt = llm_prompt_intro + "\n\n" + json.dumps(chunk_payload, ensure_ascii=False)

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            timeout=60
        )

        content = _extract_content_from_response(resp)
        if not content:
            return None

        parsed = safe_json_loads(content)
        if parsed and "llm_questions" in parsed and len(parsed["llm_questions"]) == 5:
            return parsed["llm_questions"]
    except:
        return None

    return None

# -----------------------
# Group processing with real-time JSON update
# -----------------------
def process_group(files, llm_outpath, sample_fraction=0.2, seed=None):
    if seed is not None:
        random.seed(seed)

    all_llm = []

    # If file exists, load existing results
    if os.path.isfile(llm_outpath):
        try:
            with open(llm_outpath, "r", encoding="utf-8") as f:
                all_llm = json.load(f)
        except:
            all_llm = []

    for file in files:
        file_path = os.path.join(data_dir, file)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        sample_count = max(1, int(len(chunks) * sample_fraction))
        available_chunks = chunks.copy()
        random.shuffle(available_chunks)

        processed = 0
        for chunk in tqdm(available_chunks, total=sample_count, desc=f"Processing chunks {file}"):
            if processed >= sample_count:
                break
            llm_qs = generate_llm_questions(chunk)
            if llm_qs:
                all_llm.append({
                    "chunk_id": chunk["chunk_id"],
                    "llm_questions": llm_qs
                })
                processed += 1
                # Save incrementally after each successful chunk
                with open(llm_outpath, "w", encoding="utf-8") as f:
                    json.dump(all_llm, f, indent=2, ensure_ascii=False)
            # silently skip if LLM fails

    print(f"\n✅ Saved {len(all_llm)} LLM question sets to {llm_outpath}\n")

# -----------------------
# Run all groups
# -----------------------
process_group(
    newlaws,
    os.path.join(output_dir, "llm_questions_newlaws.json"),
    sample_fraction=0.2,
    seed=42
)

process_group(
    oldlaws,
    os.path.join(output_dir, "llm_questions_oldlaws.json"),
    sample_fraction=0.2,
    seed=42
)

process_group(
    mapping,
    os.path.join(output_dir, "llm_questions_mapping.json"),
    sample_fraction=0.2,
    seed=42
)


Processing chunks bns.json:   0%|          | 0/90 [00:00<?, ?it/s]

Processing chunks bns.json: 111it [10:53,  5.89s/it]                       
Processing chunks bnss.json: 290it [50:42, 10.49s/it]                         
Processing chunks bsa.json: 41it [07:28, 10.93s/it]                        



✅ Saved 357 LLM question sets to ../data/questions/llm_questions_newlaws.json



Processing chunks ipc.json: 281it [52:38, 11.24s/it]                         
Processing chunks crpc.json: 471it [1:26:54, 11.07s/it]                         
Processing chunks iea.json: 106it [19:49, 11.22s/it]                       



✅ Saved 736 LLM question sets to ../data/questions/llm_questions_oldlaws.json



Processing chunks mapping_of_laws.json: 270it [49:06, 10.91s/it]                         


✅ Saved 247 LLM question sets to ../data/questions/llm_questions_mapping.json






In [1]:
import json

# Define file paths and their respective collection names
files = {
    "collection_map": "../data/questions/llm_questions_mapping.json",
    "collection_new": "../data/questions/llm_questions_newlaws.json",
    "collection_old": "../data/questions/llm_questions_oldlaws.json"
}

for collection_name, path in files.items():
    # Load the JSON file
    with open(path, "r") as f:
        questions = json.load(f)

    # Add the collection field to each question
    for q in questions:
        q["collection"] = collection_name

    # Save it back to the same file (pretty format)
    with open(path, "w") as f:
        json.dump(questions, f, indent=2)

    print(f"✅ Updated {path} with collection='{collection_name}' ({len(questions)} questions)")


✅ Updated ../data/questions/llm_questions_mapping.json with collection='collection_map' (247 questions)
✅ Updated ../data/questions/llm_questions_newlaws.json with collection='collection_new' (357 questions)
✅ Updated ../data/questions/llm_questions_oldlaws.json with collection='collection_old' (736 questions)
