#load Qasper

In [22]:
from datasets import load_dataset

ds = load_dataset("allenai/qasper", revision="refs/convert/parquet")
print(ds)
print(ds["train"].features)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 888
    })
    validation: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 281
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 416
    })
})
{'id': Value('string'), 'title': Value('string'), 'abstract': Value('string'), 'full_text': {'section_name': List(Value('string')), 'paragraphs': List(List(Value('string')))}, 'qas': {'question': List(Value('string')), 'question_id': List(Value('string')), 'nlp_background': List(Value('string')), 'topic_background': List(Value('string')), 'paper_read': List(Value('string')), 'search_query': List(Value('string')), 'question_writer': List(Value('string')), 'answers': List({'answer': List({'unanswerable': Value('bool'), 'extractive_spans': List(Value('string'

##Create a  50-paper subset

In [23]:
import random

papers = ds["train"]
rng = random.Random(42)

idx = list(range(len(papers)))
rng.shuffle(idx)
idx = idx[:50]

subset = papers.select(idx)
print("Pilot papers:", len(subset))
print(subset[0].keys())

Pilot papers: 50
dict_keys(['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'])


##Flatten the paper text

In [24]:
def flatten_full_text(full_text):
    sections = []
    for sec, paras in zip(full_text["section_name"], full_text["paragraphs"]):
        sec = sec.strip() if sec else "Unknown"
        body = "\n".join(p.strip() for p in paras if p and p.strip())
        sections.append(f"{sec}\n{body}")
    return "\n\n".join(sections)

# quick sanity check
text0 = flatten_full_text(subset[0]["full_text"])
print(text0[:500])

Introduction
Through this CS224N Pre-trained Contextual Embeddings (PCE) project, we tackle the question answering problem which is one of the most popular in NLP and has been brought to the forefront by datasets such as SQUAD 2.0. This problem's success stems from both the challenge it presents and the recent successes in approaching human level function. As most, if not all, of the problems humans solve every day can be posed as a question, creating an deep learning based solution that has acc


##Build the evaluation question list

In [25]:
eval_items = []

for row in subset:
    paper_id = row["id"]
    title = row["title"]
    paper_text = flatten_full_text(row["full_text"])

    qas = row["qas"]
    n_q = len(qas["question"])

    for i in range(n_q):
        question = qas["question"][i]
        ann_answers = qas["answers"][i]["answer"]  # list of annotator answers

        gold_answer, gold_evidence = None, []
        for a in ann_answers:
            ff = (a.get("free_form_answer") or "").strip()
            ex = a.get("extractive_spans") or []
            if ff:
                gold_answer = ff
                gold_evidence = a.get("highlighted_evidence") or a.get("evidence") or []
                break
            if ex:
                gold_answer = ex[0].strip()
                gold_evidence = a.get("highlighted_evidence") or a.get("evidence") or []
                break

        eval_items.append({
            "paper_id": paper_id,
            "title": title,
            "paper_text": paper_text,
            "question": question,
            "gold_answer": gold_answer,
            "gold_evidence": [e.strip() for e in gold_evidence if e and e.strip()],
        })

print("Pilot questions:", len(eval_items))
print("Example question:", eval_items[0]["question"])
print("Gold answer:", eval_items[0]["gold_answer"])
print("Evidence count:", len(eval_items[0]["gold_evidence"]))


Pilot questions: 148
Example question: What ensemble methods are used for best model?
Gold answer: choosing the answer from the network that had the highest probability and choosing no answer if any of the networks predicted no answer
Evidence count: 1


In [26]:
def flatten_full_text_with_headers(full_text):
    blocks = []
    for sec, paras in zip(full_text["section_name"], full_text["paragraphs"]):
        sec = sec.strip() if sec else "Unknown"
        for p in paras:
            if p and p.strip():
                blocks.append(f"[SECTION: {sec}] {p.strip()}")
    return "\n".join(blocks)

In [27]:
import nltk
nltk.download("punkt_tab"),
def chunk_text(
    text,
    target_words=200,
    overlap_sentences=1
):
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current = []
    current_len = 0

    for i, sent in enumerate(sentences):
        sent_len = len(sent.split())
        current.append(sent)
        current_len += sent_len

        if current_len >= target_words:
            chunks.append(" ".join(current))

            # overlap: keep last N sentences
            current = current[-overlap_sentences:]
            current_len = sum(len(s.split()) for s in current)

    if current:
        chunks.append(" ".join(current))

    return chunks

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [28]:
chunks0 = chunk_text(eval_items[0]["paper_text"], target_words=200, overlap_sentences=1)
print("Chunks:", len(chunks0))
print(chunks0[0][:300])


Chunks: 20
Introduction
Through this CS224N Pre-trained Contextual Embeddings (PCE) project, we tackle the question answering problem which is one of the most popular in NLP and has been brought to the forefront by datasets such as SQUAD 2.0. This problem's success stems from both the challenge it presents and


In [29]:
from sentence_transformers import SentenceTransformer

retriever = SentenceTransformer("all-mpnet-base-v2")

paper_index = {}  # paper_id -> {"chunks": [...], "emb": tensor}

for row in subset:
    paper_id = row["id"]
    text = flatten_full_text_with_headers(row["full_text"])  # <-- changed line
    chunks = chunk_text(text, target_words=200, overlap_sentences=1)
    emb = retriever.encode(chunks, convert_to_tensor=True, show_progress_bar=False)
    paper_index[paper_id] = {"chunks": chunks, "emb": emb}

print("Re-indexed papers:", len(paper_index))


Re-indexed papers: 50


#Retrieval function

In [30]:
from sentence_transformers import util

def retrieve_from_paper(paper_id, query, top_k=5):
    chunks = paper_index[paper_id]["chunks"]
    emb = paper_index[paper_id]["emb"]

    q_emb = retriever.encode(query, convert_to_tensor=True, show_progress_bar=False)
    scores = util.cos_sim(q_emb, emb)[0]
    top = scores.topk(k=min(top_k, len(chunks)))

    return [chunks[i] for i in top.indices.tolist()]

#Evidence hit + Recall@K

In [31]:
def evidence_hit(retrieved_chunks, gold_evidence):
    if not gold_evidence:
        return None  # skip if no evidence provided
    rc = " ".join(retrieved_chunks).lower()
    return any(ev.lower() in rc for ev in gold_evidence)

def recall_at_k(eval_items, k):
    hits, total = 0, 0
    for ex in eval_items:
        hit = evidence_hit(
            retrieve_from_paper(ex["paper_id"], ex["question"], top_k=k),
            ex["gold_evidence"]
        )
        if hit is None:
            continue
        total += 1
        hits += int(hit)
    return hits / total if total else 0.0, total

for k in [1, 3, 5, 10]:
    r, n = recall_at_k(eval_items, k)
    print(f"Recall@{k}: {r:.3f}  (on {n} questions with evidence)")


Recall@1: 0.310  (on 113 questions with evidence)
Recall@3: 0.442  (on 113 questions with evidence)
Recall@5: 0.558  (on 113 questions with evidence)
Recall@10: 0.726  (on 113 questions with evidence)


In [32]:
def mrr_at_k(eval_items, k):
    rr_sum, total = 0.0, 0
    for ex in eval_items:
        gold = ex["gold_evidence"]
        if not gold:
            continue
        retrieved = retrieve_from_paper(ex["paper_id"], ex["question"], top_k=k)
        total += 1
        found_rank = 0
        for rank, chunk in enumerate(retrieved, start=1):
            chunk_l = chunk.lower()
            if any(ev.lower() in chunk_l for ev in gold):
                found_rank = rank
                break
        if found_rank:
            rr_sum += 1.0 / found_rank
    return rr_sum / total if total else 0.0, total

mrr, n = mrr_at_k(eval_items, 10)
print(f"MRR@10: {mrr:.3f}  (on {n} questions with evidence)")


MRR@10: 0.414  (on 113 questions with evidence)


#End-to-end evaluation

In [39]:
sample = [ex for ex in eval_items if ex["gold_answer"] and ex["gold_evidence"]][:10]
print("Sample size:", len(sample))
print(sample[2]["question"])

Sample size: 10
How much F1 was improved after adding skip connections?


In [40]:
for i, ex in enumerate(sample, 1):
    retrieved = retrieve_from_paper(ex["paper_id"], ex["question"], top_k=5)
    print("="*80)
    print(f"Q{i}: {ex['question']}")
    print("Gold evidence snippet:", ex["gold_evidence"][0][:120])
    print("\nTop-1 retrieved chunk:\n", retrieved[0][:400])

Q1: What ensemble methods are used for best model?
Gold evidence snippet: We constructed the ensembled predictions by choosing the answer from the network that had the highest probability and ch

Top-1 retrieved chunk:
 As such, we left the batch size 6 as with the base model and used a gradient accumulation of 3 so that only two examples were on the GPU at a time. Additionally, the large model is very sensitive to the learning rate, and the rate of 3e-5 which we used with the smaller model no longer worked. We ran the model on a subset of the data with various learning rates and found that 1.1e-5 to 1.5e-5 works
Q2: What hyperparameters have been tuned?
Gold evidence snippet: Finally, we performed hyperparameter tuning by adjusting the number of coattention blocks, the batch size, and the numbe

Top-1 retrieved chunk:
 [SECTION: Methods ::: Data Augmentation - SQuAD 2.Q] We call our augmented dataset SQUAD 2.Q and make 3 different versions (35%, 50%, and 100% augmentation) alongside c

In [41]:
bad_ids = [1, 4, 10]  # Q1, Q4, Q10 (1-indexed)

for qnum in bad_ids:
    ex = sample[qnum-1]
    retrieved = retrieve_from_paper(ex["paper_id"], ex["question"], top_k=5)

    print("="*80)
    print(f"Q{qnum}: {ex['question']}")
    print("Gold evidence:", ex["gold_evidence"][0][:140])

    for j, ch in enumerate(retrieved, 1):
        print(f"\n--- Top-{j} ---\n{ch[:350]}")


Q1: What ensemble methods are used for best model?
Gold evidence: We constructed the ensembled predictions by choosing the answer from the network that had the highest probability and choosing no answer if 

--- Top-1 ---
As such, we left the batch size 6 as with the base model and used a gradient accumulation of 3 so that only two examples were on the GPU at a time. Additionally, the large model is very sensitive to the learning rate, and the rate of 3e-5 which we used with the smaller model no longer worked. We ran the model on a subset of the data with various le

--- Top-2 ---
[SECTION: Results and Analysis] As seen in Figure FIGREF25, we conducted an error analysis over different question types. Note that questions that did not fit into the 7 bins were classified as "Other". An example of a question in the "Other" category would be an "Is it?" question which is a minority set in SQUAD 2.0. Over the baseline, our model p

--- Top-3 ---
[SECTION: Methods ::: Hyperparameter Tuning] N

#RAG EVAL

In [42]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

llm_name = "mistralai/Mistral-7B-Instruct-v0.2"
tok = AutoTokenizer.from_pretrained(llm_name)
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map="auto",
    torch_dtype=torch.float16
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [43]:
def generate_answer(prompt, max_new_tokens=200):
    inputs = tok(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        out = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
    return tok.decode(out[0], skip_special_tokens=True)

In [None]:
test_sample = sample[:3]  # only 3 questions

for ex in test_sample:
    contexts = retrieve_from_paper(ex["paper_id"], ex["question"], top_k=5)
    ctx = "\n\n".join(contexts)

    prompt = f"""Answer ONLY using the evidence.
If the evidence is insufficient, say: Not enough evidence.

Evidence:
{ctx}

Question: {ex["question"]}
Answer:"""

    pred = generate_answer(prompt)

    print("="*80)
    print("Q:", ex["question"])
    print("GOLD:", ex["gold_answer"])
    print("PRED:", pred[:400])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Q: What ensemble methods are used for best model?
GOLD: choosing the answer from the network that had the highest probability and choosing no answer if any of the networks predicted no answer
PRED: Answer ONLY using the evidence.
If the evidence is insufficient, say: Not enough evidence.

Evidence:
As such, we left the batch size 6 as with the base model and used a gradient accumulation of 3 so that only two examples were on the GPU at a time. Additionally, the large model is very sensitive to the learning rate, and the rate of 3e-5 which we used with the smaller model no longer worked. We r


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
