# Stage-1 Data Collection and Preperation

## STEP 1 - Load SQuAD v1 JSON properly

In [None]:
import json

SQUAD_PATH = "/kaggle/input/stanford-question-answering-dataset/train-v1.1.json"

with open(SQUAD_PATH, "r") as f:
    squad_data = json.load(f)

len(squad_data["data"])

## STEP 2 - Extract ALL questions (raw)

In [8]:
questions = []

for article in squad_data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            q = qa["question"].strip()
            questions.append(q)

len(questions)


87599

## STEP 3 - Clean the questions (VERY IMPORTANT)?

In [9]:
import re

def is_good_question(q):
    # Length filter
    if len(q.split()) < 4 or len(q.split()) > 25:
        return False

    # Remove context-dependent questions
    bad_patterns = [
        "according to",
        "in the passage",
        "in the text",
        "mentioned above",
        "the paragraph"
    ]

    q_lower = q.lower()
    if any(p in q_lower for p in bad_patterns):
        return False

    # Must end with question mark
    if not q.endswith("?"):
        return False

    return True

clean_questions = [q for q in questions if is_good_question(q)]

len(clean_questions)


85549

## STEP 4 - Deduplicate (don’t skip this)

In [10]:
clean_questions = list(set(clean_questions))
len(clean_questions)


85309

## STEP 5 - Categorize questions (lightweight, rule-based)

In [11]:
def categorize(q):
    q_lower = q.lower()
    if q_lower.startswith(("who", "what", "when", "where")):
        return "factual"
    if q_lower.startswith(("why", "how")):
        return "explanatory"
    return "ambiguous"

categorized = {
    "factual": [],
    "explanatory": [],
    "ambiguous": []
}

for q in clean_questions:
    categorized[categorize(q)].append(q)

for k, v in categorized.items():
    print(k, len(v))


factual 53654
explanatory 9179
ambiguous 22476


## STEP 6 - Sample EXACTLY 150 questions

In [27]:
import random

random.seed(42)

squad_selected = (
    random.sample(categorized["factual"], 60) +
    random.sample(categorized["explanatory"], 60) +
    random.sample(categorized["ambiguous"], 30)
)

len(squad_selected)


150

## STEP 7 - Save SQuAD prompts

In [29]:
import pandas as pd

squad_df = pd.DataFrame({
    "prompt_id": [f"SQUAD_{i+1:03d}" for i in range(len(squad_selected))],
    "source": "squad_v1",
    "prompt_text": squad_selected,
    "category": [categorize(q) for q in squad_selected]
})

# Save updated SQuAD prompts
squad_df.to_csv("/kaggle/working/squad_prompts_150.csv", index=False)

# Verify category distribution
squad_df["category"].value_counts()


category
factual        60
explanatory    60
ambiguous      30
Name: count, dtype: int64

In [30]:
squad_df.head()

Unnamed: 0,prompt_id,source,prompt_text,category
0,SQUAD_001,squad_v1,What is the oldest Presbyterian church in Rich...,factual
1,SQUAD_002,squad_v1,Who is the CCO of Sony Music?,factual
2,SQUAD_003,squad_v1,When were late Paleolithic communities establi...,factual
3,SQUAD_004,squad_v1,Who eventually defeated the Arabs at Rajasthan?,factual
4,SQUAD_005,squad_v1,What part of a polychaete can be everted?,factual


In [31]:
len(squad_df)

150

# Natural Questions Dataset

## STEP 1 - Load Natural Questions CSV

In [15]:
import pandas as pd

NQ_PATH = "/kaggle/input/natural-questions-dataset/Natural-Questions-Filtered.csv"

nq_df = pd.read_csv(NQ_PATH)
nq_df.head()


Unnamed: 0,question,long_answers,short_answers
0,which is the most common use of opt-in e-mail ...,A common example of permission marketing is a ...,A newsletter sent to an advertising firm's cus...
1,how i.met your mother who is the mother,"Tracy McConnell, better known as `` The Mother...",Tracy McConnell
2,who had the most wins in the nfl,Active quarterback Tom Brady holds the records...,Tom Brady
3,who played mantis guardians of the galaxy 2,Pom Klementieff (born May 1986) is a French ac...,Pom Klementieff
4,the nashville sound brought a polished and cos...,"In the early 1960s, the Nashville sound began ...",The use of lush string arrangements with a rea...


In [16]:
nq_df.columns


Index(['question', 'long_answers', 'short_answers'], dtype='object')

## STEP 1.1 - Extract questions safely

In [17]:
questions = (
    nq_df["question"]
    .dropna()
    .astype(str)
    .str.strip()
    .tolist()
)

len(questions)


86212

## STEP 2 — Apply STRICT filtering

In [18]:
def is_good_nq_question(q):
    # length filter: search-style queries
    if len(q.split()) < 3 or len(q.split()) > 15:
        return False

    q_lower = q.lower()

    # remove noise / artifacts
    if any(x in q_lower for x in ["http", "<", ">", "{", "}", "[", "]"]):
        return False

    # must look like a real question
    if not (
        q_lower.startswith(("who", "what", "when", "where", "why", "how"))
        or q.endswith("?")
    ):
        return False

    return True


clean_nq = [q for q in questions if is_good_nq_question(q)]
clean_nq = list(set(clean_nq))  # deduplicate

len(clean_nq)


74527

## STEP 3 - Categorize (same logic as SQuAD)

In [23]:
def categorize_nq(q):
    q_lower = q.lower()
    if q_lower.startswith(("why", "how")):
        return "explanatory"
    return "factual"  # everything else
categorized_nq = {
    "factual": [],
    "explanatory": []
}

for q in clean_nq:
    categorized_nq[categorize_nq(q)].append(q)

for k, v in categorized_nq.items():
    print(k, len(v))


factual 70073
explanatory 4454


## Step 4 - Sample EXACTLY 150 (final)

In [24]:
import random
random.seed(42)

nq_selected = (
    random.sample(categorized_nq["factual"], 100)
    + random.sample(categorized_nq["explanatory"], 50)
)

len(nq_selected)


150

## Step 5 - Save & FREEZE

In [25]:
import pandas as pd

nq_final = pd.DataFrame({
    "prompt_id": [f"NQ_{i+1:03d}" for i in range(len(nq_selected))],
    "source": "natural_questions",
    "category": (
        ["factual"] * 100
        + ["explanatory"] * 50
    ),
    "prompt_text": nq_selected
})

nq_final.to_csv("/kaggle/working/nq_prompts_150.csv", index=False)

nq_final.head()


Unnamed: 0,prompt_id,source,category,prompt_text
0,NQ_001,natural_questions,factual,when did the nebraska state fair move to grand...
1,NQ_002,natural_questions,factual,who holds the power in an absolute monarchy
2,NQ_003,natural_questions,factual,who plays the guy in call me maybe
3,NQ_004,natural_questions,factual,where does season of migration to the north ta...
4,NQ_005,natural_questions,factual,when did blue m and ms come out


In [32]:
len(nq_final)

150

# Merge both datasets

## Step 1 - Load both CSV files

In [33]:
import pandas as pd

squad_path = "/kaggle/working/squad_prompts_150.csv"
nq_path = "/kaggle/working/nq_prompts_150.csv"

squad_df = pd.read_csv(squad_path)
nq_df = pd.read_csv(nq_path)

print("SQuAD shape:", squad_df.shape)
print("NQ shape:", nq_df.shape)


SQuAD shape: (150, 4)
NQ shape: (150, 4)


## STEP 2 - Schema check + alignment

In [35]:
# Define canonical column order
CANONICAL_COLUMNS = ["prompt_id", "source", "category", "prompt_text"]

# Check column sets (order-independent)
assert set(squad_df.columns) == set(nq_df.columns), "Column sets do not match!"

# Reorder columns consistently
squad_df = squad_df[CANONICAL_COLUMNS]
nq_df = nq_df[CANONICAL_COLUMNS]

squad_df.columns, nq_df.columns


(Index(['prompt_id', 'source', 'category', 'prompt_text'], dtype='object'),
 Index(['prompt_id', 'source', 'category', 'prompt_text'], dtype='object'))

### Step 3 - Merge (concatenate)

In [36]:
prompts_df = pd.concat([squad_df, nq_df], ignore_index=True)

prompts_df.shape


(300, 4)

## Step 4 - Final integrity checks

In [37]:
# Check counts by source
prompts_df["source"].value_counts()

source
squad_v1             150
natural_questions    150
Name: count, dtype: int64

In [38]:
# Check category distribution
prompts_df["category"].value_counts()


category
factual        160
explanatory    110
ambiguous       30
Name: count, dtype: int64

## Step 5 - Save final merged dataset (FREEZE)

In [39]:
prompts_df.to_csv("/kaggle/working/prompts.csv", index=False)

prompts_df.head()


Unnamed: 0,prompt_id,source,category,prompt_text
0,SQUAD_001,squad_v1,factual,What is the oldest Presbyterian church in Rich...
1,SQUAD_002,squad_v1,factual,Who is the CCO of Sony Music?
2,SQUAD_003,squad_v1,factual,When were late Paleolithic communities establi...
3,SQUAD_004,squad_v1,factual,Who eventually defeated the Arabs at Rajasthan?
4,SQUAD_005,squad_v1,factual,What part of a polychaete can be everted?
