<a href="https://colab.research.google.com/github/RushiBShinde/ThinkForge-IE-643-project/blob/main/Squad_Extractive_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Mount drive and install dependencies (if needed)
from google.colab import drive
drive.mount('/content/drive')

# Install/upgrade required libraries (uncomment if running in a fresh Colab)
# Note: transformers and datasets are usually preinstalled in Colab, but these commands ensure recent versions.
!pip install -q transformers datasets["torch"] tqdm

# Imports
import re
import string
from collections import Counter
from tqdm import tqdm
import torch
from transformers import pipeline
from datasets import load_dataset


Mounted at /content/drive


In [None]:
# --- Cell 2 (first attempt): parse QASPER `qas`, assuming it is a list of QA dicts ---
import json

def safe_json_parse(x):
    """Decode ``x`` as JSON when it is a string that parses; otherwise return it untouched.

    Non-string inputs and strings that fail to decode are handed back as-is.
    """
    if not isinstance(x, str):
        return x
    try:
        decoded = json.loads(x)
    except Exception:
        # Not valid JSON (or otherwise undecodable): keep the raw string.
        return x
    return decoded

# First attempt at collecting (question, context, answer) triples from `ds`.
#
# NOTE(review): this cell relies on two names that are not defined in any cell above it:
# `ds` (never defined in the visible notebook) and `concat_full_text` (defined in a later
# cell). It therefore only runs when the kernel already holds them from earlier,
# out-of-order execution.
#
# NOTE(review): the recorded run of this cell reported 0 QAs. The list check below skips
# every paper whose `qas` is not a list, and a later cell reads `qas` as a dict with
# "question"/"answers" keys -- presumably that is why nothing was collected here.
qa_items = []                # collected {"query", "context", "gt"} dicts
total_questions = 0          # running sum of len(qas) over papers that pass the list check
extractive_questions = 0     # incremented once per item appended to qa_items

for ex in ds:
    # Decode `qas` if it was stored as a JSON string
    qas = safe_json_parse(ex.get("qas", []))
    if not isinstance(qas, list):
        # Any non-list layout (including a dict) skips the whole paper.
        continue

    total_questions += len(qas)
    context = concat_full_text(safe_json_parse(ex.get("full_text", [])))

    for qa in qas:
        qa = safe_json_parse(qa)
        if not isinstance(qa, dict):
            continue

        extractive_spans = None

        # Probe several possible answer field names, first match wins
        if "extractive_spans" in qa and qa["extractive_spans"] is not None:
            extractive_spans = safe_json_parse(qa["extractive_spans"])
        elif "extractive_answers" in qa and qa["extractive_answers"] is not None:
            extractive_spans = safe_json_parse(qa["extractive_answers"])
        elif "answers" in qa and qa["answers"] is not None:
            answers_field = safe_json_parse(qa["answers"])
            cand = []
            if isinstance(answers_field, list):
                for a in answers_field:
                    a = safe_json_parse(a)
                    if isinstance(a, str):
                        cand.append(a)
                    elif isinstance(a, dict):
                        a_type = a.get("type") or a.get("answer_type")
                        if a_type and "extract" in a_type.lower():
                            text = a.get("text") or a.get("extractive_text") or a.get("extractive_span")
                            if text:
                                cand.append(text)
                        else:
                            # Answers without an "extract"-like type are still accepted
                            # when they carry text, so `cand` is not strictly extractive.
                            text = a.get("text") or a.get("answer_text")
                            if text:
                                cand.append(text)
            if cand:
                extractive_spans = cand

        # Use the first candidate span as the ground truth for this question
        if extractive_spans and isinstance(extractive_spans, list) and len(extractive_spans) > 0:
            first_span = extractive_spans[0]
            if isinstance(first_span, dict):
                gt = first_span.get("text") or first_span.get("span") or ""
            else:
                gt = str(first_span)
            # Drop yes/no, unanswerable and empty ground truths.
            if gt.strip().lower() in {"yes", "no", "unanswerable", "cannot answer", ""}:
                continue
            qa_items.append({
                "query": qa.get("question") or qa.get("query") or "",
                "context": context,
                "gt": gt
            })
            extractive_questions += 1

print(f"Loaded {len(ds)} examples (papers).")
print(f"Total QAs present in those examples (rough): {total_questions}")
print(f"Extractive QA pairs collected (to evaluate): {len(qa_items)}")


Loaded 50 examples (papers).
Total QAs present in those examples (rough): 0
Extractive QA pairs collected (to evaluate): 0


In [None]:
# Second attempt: treat `qas` as a mapping whose values are per-question dicts and
# collect one record per answer text found underneath each question.
extractive_qas = []

for paper in ds:
    paper_qas = paper.get("qas")
    if not paper_qas:
        continue

    for qa_obj in paper_qas.values():
        # Values that are not dicts cannot carry a question/answers pair.
        if not isinstance(qa_obj, dict):
            continue

        question = qa_obj.get("question", "").strip()
        if not question:
            continue

        for annotation in qa_obj.get("answers", []):
            if not isinstance(annotation, dict):
                continue

            for detail in annotation.get("answer", []):
                if not isinstance(detail, dict):
                    continue

                # Extractive spans take priority; a free-form answer is used only when
                # there are no spans (so the list is not purely extractive).
                if detail.get("extractive_spans"):
                    answer_texts = detail["extractive_spans"]
                elif detail.get("free_form_answer"):
                    answer_texts = [detail["free_form_answer"]]
                else:
                    answer_texts = []

                for answer_text in answer_texts:
                    extractive_qas.append({
                        "paper_id": paper.get("id", ""),
                        "question": question,
                        "answer": answer_text
                    })

print(f"✅ Extractive QA pairs collected: {len(extractive_qas)}")


✅ Extractive QA pairs collected: 0


In [None]:
# ✅ Fixed version for the parquet-structured QASPER dataset
from tqdm import tqdm

def concat_full_text(full_text):
    """Flatten a paper's ``full_text`` field into a single space-separated string.

    Supported layouts:
      * a dict with a ``"paragraphs"`` key holding an iterable of paragraphs, where each
        paragraph is either a string or an iterable of strings;
      * a list whose items are lists of strings (joined with spaces) or anything else
        (converted with ``str()``);
      * any other object, which is converted with ``str()``.

    Args:
        full_text: The raw ``full_text`` value of one dataset example.

    Returns:
        The concatenated text. ``None`` and empty dicts/lists yield ``""`` so that a
        missing field never leaks a repr such as ``"None"`` or ``"{}"`` into the QA
        context. Any error while flattening also yields ``""`` (best-effort).
    """
    # Missing or empty input: return an empty context rather than str(None) / str({}).
    if full_text is None or (isinstance(full_text, (dict, list)) and not full_text):
        return ""
    try:
        if isinstance(full_text, dict) and "paragraphs" in full_text:
            # A paragraph that is already a string must be kept whole; " ".join() on a
            # string would insert a space between every character.
            return " ".join(
                p if isinstance(p, str) else " ".join(p)
                for p in full_text["paragraphs"]
            )
        if isinstance(full_text, list):
            return " ".join(
                " ".join(p) if isinstance(p, list) else str(p) for p in full_text
            )
        return str(full_text)
    except Exception:
        # Deliberate best-effort: a malformed paper yields an empty context instead of
        # aborting the whole preprocessing loop.
        return ""

# Build the evaluation set: one record per question that has at least one answerable
# annotation with extractive spans, using that annotation's first span as ground truth.
examples = []
for paper in tqdm(ds, desc="Processing QASPER samples"):
    paper_context = concat_full_text(paper.get("full_text", {}))
    paper_qas = paper.get("qas", {})

    # Only the dict layout (parallel "question" / "answers" lists) is handled here.
    if not isinstance(paper_qas, dict):
        continue

    question_texts = paper_qas.get("question", [])
    answers_per_question = paper_qas.get("answers", [])

    for q_idx, question_text in enumerate(question_texts):
        # Questions without a matching entry in the answers list are ignored.
        if q_idx >= len(answers_per_question):
            continue

        answer_group = answers_per_question[q_idx]
        if not (answer_group and isinstance(answer_group, dict)):
            continue

        # answer_group["answer"] holds the individual annotations for this question.
        for annotation in answer_group.get("answer", []):
            if annotation.get("unanswerable"):
                continue
            spans = annotation.get("extractive_spans", [])
            if not spans:
                continue
            examples.append({
                "question": question_text.strip(),
                "context": paper_context.strip(),
                "ground_truth_answer": spans[0].strip()
            })
            break  # one example per question: stop at the first annotation with spans

print(f"✅ Extractive QA pairs collected: {len(examples)}")


Processing QASPER samples: 100%|██████████| 50/50 [00:00<00:00, 1477.16it/s]

✅ Extractive QA pairs collected: 136





In [None]:
# Cell 3: Define metrics - normalization, F1, EM

def normalize_for_f1(s):
    """Canonicalise an answer string for token-level F1.

    Lowercases, strips ASCII punctuation, drops the articles a/an/the, and collapses
    runs of whitespace. ``None`` is treated as the empty string.
    """
    if s is None:
        return ""
    lowered = s.lower()
    # Delete every character listed in string.punctuation.
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    # Replace whole-word articles with a space; the split/join below removes the gaps.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct)
    return " ".join(without_articles.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are normalised with ``normalize_for_f1`` and split on whitespace; the
    overlap is counted as a multiset intersection. Two empty answers score 1.0, exactly
    one empty answer scores 0.0.
    """
    predicted = normalize_for_f1(prediction).split()
    reference = normalize_for_f1(ground_truth).split()

    if not predicted and not reference:
        return 1.0
    if not predicted or not reference:
        return 0.0

    # Counter & Counter keeps each token at the minimum of its two counts.
    overlap = sum((Counter(predicted) & Counter(reference)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(predicted)
    recall = overlap / len(reference)
    return (2 * precision * recall) / (precision + recall)

def exact_match(prediction, ground_truth):
    """Strict exact match: 1.0 when the two strings are equal after ``strip()``, else 0.0.

    No lowercasing or punctuation removal is applied; ``None`` counts as "".
    """
    pred_text = "" if prediction is None else prediction
    gold_text = "" if ground_truth is None else ground_truth
    if pred_text.strip() == gold_text.strip():
        return 1.0
    return 0.0


In [None]:
# ✅ Cell 4: Evaluate both RoBERTa models on the collected extractive QA pairs

from transformers import pipeline
from tqdm import tqdm
import torch

# -----------------------------
# Metric helper functions
# -----------------------------
def normalize_text(s):
    """Lowercase ``s``, strip ASCII punctuation, drop a/an/the, and collapse whitespace.

    Args:
        s: Answer text, or ``None``.

    Returns:
        The normalised string; ``None`` maps to ``""`` instead of raising, matching the
        other normalisation helpers in this notebook.
    """
    import re
    import string

    if s is None:
        return ""

    # Build the punctuation set once rather than on every helper call.
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in s.lower() if ch not in exclude)
    # Articles are removed after punctuation so that e.g. "the," is still matched.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between two answer strings (SQuAD-style).

    Both strings are normalised with ``normalize_text`` and split on whitespace.

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        A float in [0.0, 1.0]. If either side has no tokens the score is 1.0 when both
        are empty and 0.0 otherwise.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    gt_tokens = normalize_text(ground_truth).split()
    if not pred_tokens or not gt_tokens:
        return float(pred_tokens == gt_tokens)

    # Multiset intersection: a token repeated in both answers counts once per matched
    # occurrence. A plain set intersection would undercount repeats, so that identical
    # answers containing a repeated word would score below 1.0.
    num_same = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if num_same == 0:
        return 0.0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(gt_tokens)
    return 2 * prec * rec / (prec + rec)

def exact_match(prediction, ground_truth):
    """Return True when both answers are identical after ``normalize_text``."""
    normalized_pred = normalize_text(prediction)
    normalized_gold = normalize_text(ground_truth)
    return normalized_pred == normalized_gold

# -----------------------------
# Evaluation function
# -----------------------------
def evaluate_model(model_name, examples, display_name):
    """Run an extractive-QA pipeline over ``examples`` and report mean F1 / EM.

    Args:
        model_name: Model identifier passed to ``pipeline`` as both model and tokenizer.
        examples: Sequence of dicts with "question", "context" and
            "ground_truth_answer" keys.
        display_name: Human-readable model label used in progress and result messages.

    Returns:
        ``(avg_f1, avg_em)`` averaged over the examples that were scored, or ``None``
        when ``examples`` is empty or no example could be scored. Examples whose
        inference or scoring raises are reported, skipped, and excluded from the average.
    """
    if not examples:
        print(f"\n{display_name}: No results (model not evaluated).")
        return None

    device = 0 if torch.cuda.is_available() else -1
    print(f"\nUsing device: {'cuda' if device == 0 else 'cpu'}")

    print(f"\nLoading pipeline for {display_name} ({model_name}) ...")
    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name, device=device)
    print(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")

    total_f1, total_em = 0.0, 0.0
    count = 0
    skipped = 0

    try:
        for ex in tqdm(examples, desc=f"Evaluating {display_name}"):
            query = ex["question"]
            context = ex["context"]
            gt = ex["ground_truth_answer"]

            try:
                result = qa_pipeline(question=query, context=context)
                pred = result["answer"].strip()
                # Compute both metrics before touching the totals, so an example that
                # fails part-way never contributes F1 without also being counted.
                example_f1 = f1_score(pred, gt)
                example_em = exact_match(pred, gt)
            except Exception as e:
                skipped += 1
                print(f"⚠️ Skipped one example due to error: {e}")
                continue

            total_f1 += example_f1
            total_em += example_em
            count += 1
    finally:
        # Release the model before the next one is loaded in the same session.
        del qa_pipeline
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if skipped:
        print(f"\n⚠️ {display_name}: {skipped} of {len(examples)} examples were skipped and are excluded from the averages.")

    if count > 0:
        avg_f1 = total_f1 / count
        avg_em = total_em / count
        print(f"\n{display_name}: Average F1 = {avg_f1:.4f}, Average EM = {avg_em:.4f} (evaluated on {count} extractive QAs)")
        return avg_f1, avg_em
    else:
        print(f"\n{display_name}: No valid evaluations performed.")
        return None

# -----------------------------
# Score both checkpoints on the collected examples
# -----------------------------
print(f"Evaluating on {len(examples)} examples...\n")

results_base = evaluate_model("deepset/roberta-base-squad2", examples, "RoBERTa-base SQuAD2")
results_large = evaluate_model("deepset/roberta-large-squad2", examples, "RoBERTa-large SQuAD2")

# -----------------------------
# Summary (evaluate_model returns None when nothing could be scored)
# -----------------------------
if not results_base:
    print("\nRoBERTa-base SQuAD2: No results (model not evaluated).")
else:
    print(f"\nRoBERTa-base SQuAD2: Average F1 = {results_base[0]:.4f}, Average EM = {results_base[1]:.4f}")

if not results_large:
    print("\nRoBERTa-large SQuAD2: No results (model not evaluated).")
else:
    print(f"RoBERTa-large SQuAD2: Average F1 = {results_large[0]:.4f}, Average EM = {results_large[1]:.4f}")


Evaluating on 136 examples...


Using device: cuda

Loading pipeline for RoBERTa-base SQuAD2 (deepset/roberta-base-squad2) ...


Device set to use cuda:0


Device set to use cuda:0


Evaluating RoBERTa-base SQuAD2:   7%|▋         | 10/136 [00:06<01:07,  1.87it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating RoBERTa-base SQuAD2: 100%|██████████| 136/136 [01:03<00:00,  2.13it/s]



RoBERTa-base SQuAD2: Average F1 = 0.1381, Average EM = 0.0588 (evaluated on 136 extractive QAs)

Using device: cuda

Loading pipeline for RoBERTa-large SQuAD2 (deepset/roberta-large-squad2) ...


Device set to use cuda:0


Device set to use cuda:0


Evaluating RoBERTa-large SQuAD2: 100%|██████████| 136/136 [03:22<00:00,  1.49s/it]


RoBERTa-large SQuAD2: Average F1 = 0.1640, Average EM = 0.0515 (evaluated on 136 extractive QAs)

RoBERTa-base SQuAD2: Average F1 = 0.1381, Average EM = 0.0588
RoBERTa-large SQuAD2: Average F1 = 0.1640, Average EM = 0.0515





In [None]:
# Sanity check: number of QA examples currently held in `examples`.
len(examples)

136

In [None]:
# Cell: Retrieval + Sliding-window chunking + QA evaluation for top-10 papers per query
# Requires: ds (HF dataset of 50 papers), examples (list of extractive QAs)
# Outputs Average F1 and EM for both deepset/roberta-base-squad2 and deepset/roberta-large-squad2.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoTokenizer
from tqdm import tqdm
import numpy as np
import torch
import math
import re, string
from collections import Counter

# -------------------------
# Tunable parameters
# -------------------------
USE_CUDA = torch.cuda.is_available()  # run the QA pipelines on GPU when one is present
MAX_DOCS = len(ds)        # number of loaded papers; not referenced elsewhere in this cell
TOP_K_PAPERS = 10         # how many top-ranked papers are searched for each question
CHUNK_SIZE = 400          # chunk length, counted in tokenizer tokens
CHUNK_STRIDE = 128        # tokens shared by consecutive chunks (window advances by CHUNK_SIZE - CHUNK_STRIDE)

# -------------------------
# Helpers: normalize / metrics
# -------------------------
def normalize_for_eval(s):
    """Normalise answer text before scoring.

    Lowercases, removes ASCII punctuation, removes the articles a/an/the, and squeezes
    whitespace. ``None`` becomes "".
    """
    if s is None:
        return ""
    punctuation = set(string.punctuation)
    kept_chars = [ch for ch in s.lower() if ch not in punctuation]
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', "".join(kept_chars))
    return " ".join(no_articles.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 of ``prediction`` against ``ground_truth``.

    Tokens come from ``normalize_for_eval(...).split()`` and overlap is a multiset
    intersection. Returns 1.0 if both token lists are empty and 0.0 if only one is.
    """
    pred_tokens = normalize_for_eval(prediction).split()
    gt_tokens = normalize_for_eval(ground_truth).split()

    n_pred, n_gt = len(pred_tokens), len(gt_tokens)
    if n_pred == 0 or n_gt == 0:
        # Both empty -> perfect agreement; exactly one empty -> no agreement.
        return 1.0 if n_pred == n_gt else 0.0

    shared = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(shared.values())
    if not num_same:
        return 0.0

    precision, recall = num_same / n_pred, num_same / n_gt
    return 2 * precision * recall / (precision + recall)

def exact_match(prediction, ground_truth):
    """Return 1.0 when both answers are equal after ``normalize_for_eval``, else 0.0."""
    is_match = normalize_for_eval(prediction) == normalize_for_eval(ground_truth)
    return 1.0 if is_match else 0.0

# -------------------------
# Build corpus of paper texts for retrieval
# -------------------------
def build_paper_text(ex):
    """Assemble the retrieval document for one paper: title, abstract, then body text.

    The body is flattened with ``concat_full_text``. If that call raises -- including
    the case where ``concat_full_text`` is not defined in the kernel -- an inline
    fallback flattens the common ``full_text`` layouts instead. Empty pieces are dropped
    and the rest are joined with single spaces.
    """
    pieces = [ex[field] for field in ("title", "abstract") if ex.get(field)]

    body = ex.get("full_text", {})
    try:
        flattened = concat_full_text(body)
    except Exception:
        if isinstance(body, dict) and "paragraphs" in body:
            # Keep list paragraphs (joined) and string paragraphs; ignore anything else.
            flattened = " ".join(
                " ".join(par) if isinstance(par, list) else par
                for par in body["paragraphs"]
                if isinstance(par, (list, str))
            )
        elif isinstance(body, list):
            # Lists are joined; every other item is stringified.
            flattened = " ".join(
                " ".join(seg) if isinstance(seg, list) else str(seg) for seg in body
            )
        else:
            flattened = str(body)

    pieces.append(flattened)
    return " ".join(piece for piece in pieces if piece)

# Collect one retrieval document (and its id) per paper, in dataset order.
paper_ids, paper_texts = [], []
for paper_idx, paper in enumerate(ds):
    paper_texts.append(build_paper_text(paper))
    paper_ids.append(paper.get("id", f"paper_{paper_idx}"))

# -------------------------
# TF-IDF retriever over whole papers
# -------------------------
vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
tfidf_matrix = vectorizer.fit_transform(paper_texts)  # rows: papers, columns: vocabulary

# Every example question is a query, embedded with the vocabulary fitted on the papers.
queries = [qa_example["question"] for qa_example in examples]
query_vecs = vectorizer.transform(queries)  # rows: queries, columns: vocabulary

# Rank papers for each query by descending cosine similarity and keep the best
# TOP_K_PAPERS column indices; row i belongs to examples[i].
cosine_sim = cosine_similarity(query_vecs, tfidf_matrix)  # rows: queries, columns: papers
ranked_paper_indices = np.argsort(-cosine_sim, axis=1)
topk_paper_indices_per_query = ranked_paper_indices[:, :TOP_K_PAPERS]

# -------------------------
# Chunking utility (token-based) using tokenizer
# -------------------------
def chunk_text_tokenwise(text, tokenizer, chunk_size=CHUNK_SIZE, stride=CHUNK_STRIDE):
    """Split ``text`` into overlapping windows of tokenizer tokens, returned as text.

    The text is encoded without special tokens, cut into windows of up to ``chunk_size``
    token ids that each start ``chunk_size - stride`` tokens after the previous one, and
    every window is decoded back to a string.

    Args:
        text: Document text to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)`` and
            ``decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)``.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        stride: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= stride < chunk_size``.

    Returns:
        List of chunk strings in document order; ``[]`` when the text has no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is out of range. A stride that is
            not smaller than the chunk size would stop the window from advancing, and a
            negative stride would leave gaps between chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= stride < chunk_size:
        raise ValueError(f"stride must satisfy 0 <= stride < chunk_size, got stride={stride}, chunk_size={chunk_size}")

    tok_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(tok_ids) == 0:
        return []

    step = chunk_size - stride  # how far the window start moves each iteration
    chunks = []
    for start in range(0, len(tok_ids), step):
        chunk_ids = tok_ids[start:start + chunk_size]
        chunks.append(tokenizer.decode(chunk_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))
        # Once a window reaches the end of the document, later windows would only
        # repeat its tail.
        if start + chunk_size >= len(tok_ids):
            break
    return chunks

# -------------------------
# Evaluate function (retrieval + chunking + QA)
# -------------------------
def evaluate_with_retrieval_and_chunking(model_name, examples, topk_indices_per_query, paper_texts, device):
    """Evaluate a QA model by reading token chunks of the top-ranked papers per question.

    For every example, each chunk of each retrieved paper is passed to the QA pipeline;
    the answer with the highest pipeline score over all chunks is taken as the
    prediction and scored against the example's ground truth.

    Args:
        model_name: Model identifier used for both the QA pipeline and the tokenizer.
        examples: Sequence of dicts with "question" and "ground_truth_answer" keys.
        topk_indices_per_query: For each example index, an iterable of indices into
            ``paper_texts`` giving the papers to search.
        paper_texts: List of paper texts.
        device: Truthy to run on GPU 0, falsy to run on CPU.

    Returns:
        ``(avg_f1, avg_em, n)`` where ``n`` is the number of examples scored. An example
        for which no chunk produced an answer is scored with an empty prediction.
    """
    device_idx = 0 if device else -1
    qa_pipe = pipeline("question-answering", model=model_name, tokenizer=model_name, device=device_idx)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # The same papers are retrieved for many questions, so tokenise and chunk each
    # paper only once per call.
    chunk_cache = {}

    def _chunks_for(pidx):
        """Return (and memoise) the chunk list for paper ``pidx``; blank papers give []."""
        key = int(pidx)
        if key not in chunk_cache:
            doc_text = paper_texts[key]
            if not doc_text or len(doc_text.strip()) == 0:
                chunk_cache[key] = []
            else:
                chunk_cache[key] = chunk_text_tokenwise(doc_text, tokenizer, chunk_size=CHUNK_SIZE, stride=CHUNK_STRIDE)
        return chunk_cache[key]

    total_f1 = 0.0
    total_em = 0.0
    n = 0
    failed_chunks = 0

    try:
        for qi, ex in enumerate(tqdm(examples, desc=f"Retrieval+Chunking eval for {model_name}")):
            query = ex["question"]
            gold = ex["ground_truth_answer"]
            best_pred = ""       # stays empty if no chunk yields an answer
            best_score = -1.0

            for pidx in topk_indices_per_query[qi]:
                for chunk in _chunks_for(pidx):
                    try:
                        out = qa_pipe(question=query, context=chunk)
                        # The pipeline normally returns a dict; if a list comes back,
                        # use its first entry.
                        if isinstance(out, list):
                            out = out[0] if len(out) > 0 else {"answer": "", "score": 0.0}
                        ans = out.get("answer", "").strip()
                        score = out.get("score", 0.0) or 0.0
                    except Exception:
                        # A failing chunk must not abort the run, but it is counted so
                        # the failure is visible in the final report.
                        failed_chunks += 1
                        continue
                    if score > best_score:
                        best_score = score
                        best_pred = ans

            total_f1 += f1_score(best_pred, gold)
            total_em += exact_match(best_pred, gold)
            n += 1
    finally:
        # Release the model even if the loop raised, so the next model can be loaded.
        del qa_pipe
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if failed_chunks:
        print(f"⚠️ {failed_chunks} chunk(s) raised during QA inference and were ignored.")

    avg_f1 = (total_f1 / n) if n > 0 else 0.0
    avg_em = (total_em / n) if n > 0 else 0.0
    return avg_f1, avg_em, n

# -------------------------
# Run evaluation for both models
# -------------------------
device_flag = bool(USE_CUDA)
print("Device available:", "cuda" if device_flag else "cpu")

# (label shown in the report, model identifier)
models = [
    ("RoBERTa-base SQuAD2", "deepset/roberta-base-squad2"),
    ("RoBERTa-large SQuAD2", "deepset/roberta-large-squad2")
]

# Evaluate each model in turn and keep its averages under the display label.
results = {}
for display_name, model_id in models:
    print(f"\nStarting evaluation for {display_name} ({model_id}) ...")
    mean_f1, mean_em, evaluated = evaluate_with_retrieval_and_chunking(
        model_id,
        examples,
        topk_paper_indices_per_query,
        paper_texts,
        device_flag,
    )
    results[display_name] = {"avg_f1": mean_f1, "avg_em": mean_em, "n": evaluated}

# -------------------------
# Final report, in a fixed order
# -------------------------
for display_name in ["RoBERTa-base SQuAD2", "RoBERTa-large SQuAD2"]:
    if display_name not in results:
        print(f"\n{display_name}: No results (model not evaluated).")
        continue
    r = results[display_name]
    print(f"\n{display_name}: Average F1 = {r['avg_f1']:.4f}, Average EM = {r['avg_em']:.4f} (evaluated on {r['n']} extractive QAs)")


Device available: cuda

Starting evaluation for RoBERTa-base SQuAD2 (deepset/roberta-base-squad2) ...


Device set to use cuda:0
Retrieval+Chunking eval for deepset/roberta-base-squad2:   0%|          | 0/136 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (7632 > 512). Running this sequence through the model will result in indexing errors
Retrieval+Chunking eval for deepset/roberta-base-squad2: 100%|██████████| 136/136 [15:12<00:00,  6.71s/it]



Starting evaluation for RoBERTa-large SQuAD2 (deepset/roberta-large-squad2) ...


Device set to use cuda:0
Retrieval+Chunking eval for deepset/roberta-large-squad2:   0%|          | 0/136 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (7632 > 512). Running this sequence through the model will result in indexing errors
Retrieval+Chunking eval for deepset/roberta-large-squad2: 100%|██████████| 136/136 [46:45<00:00, 20.63s/it]


RoBERTa-base SQuAD2: Average F1 = 0.0531, Average EM = 0.0221 (evaluated on 136 extractive QAs)

RoBERTa-large SQuAD2: Average F1 = 0.0822, Average EM = 0.0294 (evaluated on 136 extractive QAs)





In [None]:
# NOTE(review): leftover scratch cell; it only prints a fixed string and does not
# contribute to the analysis -- candidate for removal.
print("hi")

hi
