In [1]:
!pip install evaluate langchain-huggingface



In [2]:
from datasets import load_dataset
from evaluate import load
import subprocess
from tqdm.notebook import tqdm
from langchain_community.llms import Ollama
from tqdm.notebook import tqdm
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
import torch

In [2]:
# Phi-4 ("microsoft/phi-4") text-generation LLM configured with temperature 0.0
# and at most 512 new tokens per call.
llm = HuggingFaceEndpoint(
    repo_id="microsoft/phi-4",
    task="text-generation",
    temperature=0.0,
    max_new_tokens=512,
    # NOTE(review): device_map / torch_dtype look like local model-loading
    # options; presumably an endpoint-backed LLM does not use them — confirm.
    model_kwargs={"device_map": "cuda", "torch_dtype": torch.float16},
)
# Chat wrapper around `llm`; this is the object the LLMChain cells pass as `llm=`.
chat = ChatHuggingFace(
    llm=llm,
    verbose=True,
)

In [3]:
import re
from typing import Optional

def extract_answer(cot_output: str) -> Optional[str]:
    """
    Pull the final answer out of a chain-of-thought LLM output.

    The output is expected to end with a line of the form:
      Answer: <the extractive answer>

    Args:
        cot_output: Full text produced by the model. ``None`` or an empty
            string is treated as "no answer".

    Returns:
        The text following the LAST "Answer:" line, stripped of surrounding
        whitespace, or None if no "Answer:" line is found.
    """
    if not cot_output:
        return None

    # A line that starts with "Answer:" (case-sensitive), optionally preceded by
    # spaces/tabs; the capture group is the rest of that line.
    answers = re.findall(r'^[ \t]*Answer:\s*(.+)$', cot_output, flags=re.MULTILINE)
    if not answers:
        return None

    # The reasoning may itself contain an earlier "Answer:" line (e.g. a draft);
    # the model's conclusion is the last one.
    return answers[-1].strip()


In [4]:
# Quick manual check of extract_answer on a sample chain-of-thought output
# whose last line carries the answer.
output = """Reasoning:
1. I see “Santa Clara, California” mentioned as the location.
2. The question asks “Where did Super Bowl 50 take place?”
Answer: Santa Clara, California
"""

answer = extract_answer(output)
print(answer)  # "Santa Clara, California"


Santa Clara, California


In [8]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Downloading absl_py-2.2.2-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=25025 sha256=cfce015a8da8ec8bbbc043538247ffde4d78da4961ad477cc74ff3e1a9c16083
  Stored in directory: c:\users\001\appdata\local\pip\cache\wheels\1e\19\43\8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl-py-2.2.2 rouge_score-0.1.2


In [10]:
!pip install --upgrade nltk



In [3]:
# Evaluation subset: shuffle the SQuAD validation split with a fixed seed (42)
# and keep the first 500 shuffled examples. `ds_sample` is the dataset passed
# to every evaluation call in this notebook.
ds = load_dataset("squad", split="validation")
ds_shuffled = ds.shuffle(seed=42)
ds_sample = ds_shuffled.select(range(500))

In [6]:
# Metric objects read as globals by evaluate(); loaded once when this cell runs.
squad_metric = load("squad")
rouge_metric = load("rouge")
bleu_metric  = load("bleu")

def evaluate(chain, cot_used: bool=False, ds=None):
    """
    Score an LLM chain on SQuAD-style examples with EM/F1, ROUGE and BLEU.

    For every example the chain is invoked through ``chain.predict`` with the
    passage and the question; the stripped return value is the prediction.

    Args:
        chain:    Object exposing
                  ``predict(retrieved_SQuAD_passage=..., user_question=...)``
                  and returning a string.
        cot_used: If True, the output is treated as chain-of-thought text and
                  only what ``extract_answer`` returns is scored; when that is
                  None or empty the prediction becomes "Unsure about answer.".
        ds:       Iterable of examples with "id", "context", "question" and
                  "answers" ("text" and "answer_start" lists). When omitted,
                  the notebook-level ``ds_sample`` is used as it exists at
                  CALL time (not at the time this cell was executed).

    Returns:
        A dict with the SQuAD metric results plus the keys "rouge1", "rouge2",
        "rougeL" and "bleu".
    """
    # NOTE(review): a later cell in this notebook runs `import evaluate`, which
    # rebinds the global name `evaluate` to the package; calls to this function
    # made after that cell has run will fail until this cell is re-executed.

    if ds is None:
        # Resolve the default lazily so a re-sampled `ds_sample` is picked up
        # without having to re-run this definition.
        ds = ds_sample

    squad_preds, squad_refs = [], []
    texts_pred, texts_ref = [], []  # for BLEU/ROUGE we need flat lists of strings

    for ex in tqdm(ds, desc="Evaluating on SQuAD + BLEU/ROUGE"):
        raw = chain.predict(
            retrieved_SQuAD_passage=ex["context"],
            user_question=ex["question"]
        ).strip()

        # If using CoT, keep only the final answer line
        if cot_used:
            raw = extract_answer(raw) or "Unsure about answer."

        gold = ex["answers"]

        # 1) SQuAD v1 inputs: predictions and references are paired by "id"
        squad_preds.append({
            "id": ex["id"],
            "prediction_text": raw,
        })
        squad_refs.append({
            "id": ex["id"],
            "answers": {
                "text": gold["text"],
                "answer_start": gold["answer_start"]
            }
        })

        # 2) flat strings for BLEU/ROUGE; only the FIRST gold answer is used
        #    as the reference
        texts_pred.append(raw)
        texts_ref.append(gold["text"][0])

    # 3) SQuAD exact_match & F1
    squad_results = squad_metric.compute(
        predictions=squad_preds,
        references=squad_refs
    )

    # 4) ROUGE (rouge1, rouge2, rougeL are read from the result below)
    rouge_results = rouge_metric.compute(
        predictions=texts_pred,
        references=texts_ref
    )

    # 5) BLEU ('bleu' is read from the result below)
    bleu_results = bleu_metric.compute(
        predictions=texts_pred,
        references=[[r] for r in texts_ref]
        # BLEU expects list of list of references per prediction
    )

    return {
        **squad_results,
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "bleu":   bleu_results["bleu"],
    }

### Phi-4 + Zero-Shot Prompt

In [7]:
# Zero-shot prompt: the system message states the extractive-QA rules; the human
# message supplies the passage and question through the
# {retrieved_SQuAD_passage} and {user_question} template variables, which are
# the keyword names evaluate() passes to chain.predict.
# NOTE(review): the human message ends with the literal text 'Answer:'
# including the quote characters — confirm that is intended.
zero_shot_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        """
        You are an expert extractive question-answering system.
        Use only the provided context to answer the question.
        Always output the answer using the exact wording and phrasing as it appears in the context.
        If the answer is not contained in the context, reply “Unsure about answer.”
        """
    ),
    HumanMessagePromptTemplate.from_template(
        """
        Context:
        {retrieved_SQuAD_passage}

        Question:
        {user_question}

        'Answer:'
        """
    )
])

In [8]:
# Run the zero-shot chain over the sample. With cot_used=False the stripped
# model output is scored directly, without answer extraction.
chain_zero_shot_prompt = LLMChain(llm=chat, prompt=zero_shot_prompt)
results_zero_shot_prompt = evaluate(chain_zero_shot_prompt, cot_used=False, ds=ds_sample)
print(f"Zero-shot Prompting Results: {results_zero_shot_prompt}")

  chain_zero_shot_prompt = LLMChain(llm=chat, prompt=zero_shot_prompt)


Evaluating on SQuAD + BLEU/ROUGE:   0%|          | 0/500 [00:00<?, ?it/s]

Zero-shot Prompting Results: {'exact_match': 17.8, 'f1': 47.865771366810996, 'rouge1': np.float64(0.4337529517105183), 'rouge2': np.float64(0.30747240973384526), 'rougeL': np.float64(0.4323955352478021), 'bleu': 0.10809094069972884}


### Phi-4 + Chain-of-Thought (CoT) Prompt

In [9]:
# Chain-of-thought prompt: asks the model to reason step by step and to finish
# with a line of the form 'Answer: <...>', which is the line extract_answer
# looks for. Uses the same {retrieved_SQuAD_passage} / {user_question}
# template variables as the zero-shot prompt.
cot_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        """
        You are an expert extractive question-answering system.
        When given a context and a question, you will:
        1. Think through the relevant part of the context step by step.
        2. Show your reasoning clearly (chain-of-thought).
        3. Finally, output **only** the answer using the exact wording as it appears in the context.
        If the answer is not contained in the context, your final answer must be 'Unsure about answer.'
        """
    ),
    HumanMessagePromptTemplate.from_template(
        """
        Context:
        {retrieved_SQuAD_passage}

        Question:
        {user_question}

        Begin by reasoning step by step, then conclude with 'Answer: <your extractive answer>'."""
    ),
])

In [10]:
# Run the chain-of-thought chain over the same sample. cot_used=True makes
# evaluate() score only the text extracted from the final "Answer:" line.
chain_cot_prompt = LLMChain(llm=chat, prompt=cot_prompt)
results_cot_prompt = evaluate(chain_cot_prompt, cot_used=True, ds=ds_sample)
print(f"Chain-of-Thought Prompting Results: {results_cot_prompt}")

Evaluating on SQuAD + BLEU/ROUGE:   0%|          | 0/500 [00:00<?, ?it/s]

Chain-of-Thought Prompting Results: {'exact_match': 59.0, 'f1': 76.18329879976443, 'rouge1': np.float64(0.7103717761840456), 'rouge2': np.float64(0.5166486506070347), 'rougeL': np.float64(0.7098182819885526), 'bleu': 0.31843942324695707}


## DistilBERT Evaluation

In [4]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, pipeline

In [5]:
def preprocess_valid_examples(examples, tokenizer, max_length=384, stride=128):
    """Tokenize a batch of SQuAD examples into features and label the answer span.

    Each (question, context) pair is tokenized with truncation applied to the
    context only; overflowing context is emitted as additional features, so one
    example may yield several features. For every feature this records the ID
    of the originating example, masks the offset mapping so that only context
    tokens keep their character offsets, and computes the token indices of the
    first gold answer.

    Args:
        examples: A batch from the dataset: a mapping with "id", "question",
            "context" and "answers" lists (each answer has "text" and
            "answer_start" lists).
        tokenizer: The tokenizer to call. Its result must support item access,
            ``pop`` and ``sequence_ids(i)``.
        max_length: The maximum length of the input sequence. If exceeded, the
            context is truncated to fit within the limit.
        stride: The number of tokens to retain from the end of a truncated
            sequence, allowing for overlap between truncated and overflowing
            sequences.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed,
        "offset_mapping" entries set to None for non-context tokens, and the
        added columns "example_id", "start_positions" and "end_positions".
        A feature whose window does not fully contain the answer (or whose
        example has no answer) is labelled (0, 0).
    """
    # Tokenize the questions and context sequences
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        stride=stride,
        max_length=max_length,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    answers = examples["answers"]
    # Maps each produced feature back to the index of the example it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")

    example_ids = []
    start_positions = []
    end_positions = []

    for i, offset in enumerate(inputs["offset_mapping"]):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        sequence_ids = inputs.sequence_ids(i)

        # Store a masked copy (None outside the context) for post-processing;
        # `offset` keeps referring to the unmasked offsets used for labelling.
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

        answer = answers[sample_idx]
        if not answer["text"]:
            # No gold answer for this example: nothing to locate.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Locate the first and last context token of this feature.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this window, label it (0, 0).
        # Both the start and the end of the answer must be covered; otherwise
        # the searches below would land on a token outside the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids  # keep the unique ID of the example
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [6]:
import torch
import numpy as np
from tqdm.auto import tqdm

def forward_distilbert(
    model,
    tokenizer,
    ds,
    batch_size: int = 8,
    max_length: int = 384,
    stride: int = 128,
):
    """
    Tokenize `ds` and run the QA model over the resulting features in batches.

    Args:
      model:      Model called as ``model(input_ids=..., attention_mask=...)``
                  whose output exposes ``start_logits`` and ``end_logits``.
                  It is moved to the selected device and put in eval mode.
      tokenizer:  Tokenizer forwarded to ``preprocess_valid_examples``.
      ds:         Dataset providing ``map`` and ``column_names``.
      batch_size: Number of feature rows per forward pass.
      max_length: Forwarded to ``preprocess_valid_examples``.
      stride:     Forwarded to ``preprocess_valid_examples``.

    Returns:
      start_logits: np.ndarray of shape (num_feature_rows, seq_len), float32
      end_logits:   np.ndarray of shape (num_feature_rows, seq_len), float32
      features:     The dataset returned by ``ds.map`` with all preprocessing
                    columns (example_id, offset_mapping...)
    """
    def wrapper(examples):
        return preprocess_valid_examples(examples, tokenizer=tokenizer,
                                         max_length=max_length, stride=stride)

    # 1) Tokenize; the original columns are dropped because one example can
    #    produce several feature rows.
    features = ds.map(
        wrapper,
        batched=True,
        remove_columns=ds.column_names,
    )

    # 2) Torch view of the model inputs only, placed on the inference device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_features = features.remove_columns(
        ["offset_mapping", "example_id", "start_positions", "end_positions"]
    ).with_format(
        type="torch",
        columns=["input_ids", "attention_mask"],
        device=device
    )

    # 3) Allocate arrays to hold all logits. The sequence length is read from
    #    the first feature row instead of materialising the whole column; with
    #    no rows it falls back to `max_length` so the result is still 2-D.
    num_rows   = len(tensor_features)
    seq_len    = len(features[0]["input_ids"]) if num_rows else max_length
    all_start  = np.zeros((num_rows, seq_len), dtype=np.float32)
    all_end    = np.zeros((num_rows, seq_len), dtype=np.float32)

    # 4) Batched inference without gradient tracking
    model.to(device).eval()
    with torch.no_grad():
        for begin in tqdm(range(0, num_rows, batch_size), desc="DistilBERT inference"):
            stop = min(begin + batch_size, num_rows)
            # Slice ROWS so only this batch is converted to tensors; indexing
            # the column first would rebuild the full column on every step.
            batch = tensor_features[begin:stop]
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            all_start[begin:stop, :] = outputs.start_logits.detach().cpu().numpy()
            all_end[begin:stop, :]   = outputs.end_logits.detach().cpu().numpy()

    return all_start, all_end, features


In [7]:
import collections
import numpy as np
from tqdm.auto import tqdm
import evaluate

def compute_metrics(
    start_logits, end_logits, features, examples,
    n_best=20, max_answer_length=50
):
    """
    Compute Exact Match (EM), F1, BLEU and ROUGE for a span-extraction model.

    For every example, the `n_best` highest start and end logits of each of its
    features are paired; pairs that touch a non-context token, run backwards or
    span more than `max_answer_length` tokens are discarded, and the remaining
    pair with the highest start+end logit sum gives the predicted text. An
    example with no valid pair is predicted as the empty string.

    Args:
        start_logits:      Per-feature start logits, indexable as start_logits[feature_index]
        end_logits:        Per-feature end logits, indexable as end_logits[feature_index]
        features:          The tokenized features; each row has "example_id" and
                           "offset_mapping" (None for non-context tokens)
        examples:          The raw validation examples (with "id", "context", "answers")
        n_best:            How many top start/end indices to consider per feature
        max_answer_length: Maximum span length in tokens

    Returns:
        A dict with keys:
          - exact_match, f1           (SQuAD)
          - rouge1, rouge2, rougeL    (ROUGE)
          - bleu                      (BLEU)
    """
    # Import the loader locally under its own name: this notebook also defines a
    # function called `evaluate`, so the global name `evaluate` is not
    # guaranteed to be the package when this function runs.
    from evaluate import load as load_metric

    # Load metrics
    squad_metric = load_metric("squad")
    rouge_metric = load_metric("rouge")
    bleu_metric  = load_metric("bleu")

    # Map from example id to the indices of the features derived from it
    example_to_features = collections.defaultdict(list)
    for feature_index, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(feature_index)

    # Build predictions
    predicted = []
    for ex in tqdm(examples, desc="Building predictions"):
        ex_id = ex["id"]
        context = ex["context"]
        candidates = []

        for fi in example_to_features[ex_id]:
            feature_start = start_logits[fi]
            feature_end = end_logits[fi]
            offsets = features[fi]["offset_mapping"]

            # Token indices of the n_best largest logits, best first
            starts = np.argsort(feature_start)[-n_best:][::-1]
            ends   = np.argsort(feature_end)[-n_best:][::-1]

            for s in starts:
                for e in ends:
                    # Skip tokens outside the context (offset masked to None)
                    if offsets[s] is None or offsets[e] is None:
                        continue
                    # Skip reversed or over-long spans
                    if e < s or (e - s + 1) > max_answer_length:
                        continue
                    text = context[offsets[s][0] : offsets[e][1]]
                    score = feature_start[s] + feature_end[e]
                    candidates.append((text, score))

        if candidates:
            best_text = max(candidates, key=lambda x: x[1])[0]
        else:
            best_text = ""

        predicted.append({
            "id": ex_id,
            "prediction_text": best_text
        })

    # Build references for SQuAD
    references = [
        {"id": ex["id"], "answers": ex["answers"]}
        for ex in examples
    ]

    # 1) SQuAD EM/F1
    squad_res = squad_metric.compute(
        predictions=predicted,
        references=references
    )

    # Prepare flat lists for BLEU/ROUGE
    preds_texts = [p["prediction_text"] for p in predicted]
    # pick the first gold answer for each example
    refs_texts  = [ex["answers"]["text"][0] for ex in examples]

    # 2) ROUGE
    rouge_res = rouge_metric.compute(
        predictions=preds_texts,
        references=refs_texts
    )

    # 3) BLEU (expects list of lists of refs)
    bleu_res = bleu_metric.compute(
        predictions=preds_texts,
        references=[[r] for r in refs_texts],
    )

    # Merge and return
    return {
        **squad_res,
        "rouge1": rouge_res["rouge1"],
        "rouge2": rouge_res["rouge2"],
        "rougeL": rouge_res["rougeL"],
        "bleu":   bleu_res["bleu"],
    }


#### DistilBERT without fine-tuning

In [8]:
# Baseline: load the base checkpoint with a question-answering head. Per the
# load warning recorded under this cell, the head weights (qa_outputs.*) are
# newly initialized rather than taken from the checkpoint.
model_path =  "distilbert/distilbert-base-uncased"
model_not_finetuned = DistilBertForQuestionAnswering.from_pretrained(model_path)
tokenizer_not_finetuned = DistilBertTokenizerFast.from_pretrained(model_path)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Score the baseline model on ds_sample: run inference to get per-feature
# logits, then turn them into answer spans and metrics.
start_logits, end_logits, features = forward_distilbert(model_not_finetuned, ds=ds_sample, tokenizer=tokenizer_not_finetuned)
results_distilbert_not_finetuned = compute_metrics(
    start_logits, end_logits, features, ds_sample,
    n_best=20, max_answer_length=50
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DistilBERT inference:   0%|          | 0/64 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Building predictions:   0%|          | 0/500 [00:00<?, ?it/s]

In [10]:
# Display the metrics of the baseline (not fine-tuned) model.
results_distilbert_not_finetuned

{'exact_match': 0.2,
 'f1': 7.834462772607801,
 'rouge1': np.float64(0.07191534459914353),
 'rouge2': np.float64(0.03069607321407338),
 'rougeL': np.float64(0.06955680376129332),
 'bleu': 0.012541996200093062}

#### DistilBERT with fine-tuning

In [11]:
# Load the fine-tuned model and its tokenizer from paths relative to this
# notebook. Note that `model_path` is rebound here; it previously held the
# base checkpoint name.
tokenizer_path = "../backend/distilbert-squad-finetuned_tokenizer"
model_path = "../backend/distilbert-squad-finetuned_model"
model = DistilBertForQuestionAnswering.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_path)

In [12]:
# Score the fine-tuned model on the same sample with the same settings as the
# baseline. This overwrites start_logits / end_logits / features from the
# baseline run.
start_logits, end_logits, features = forward_distilbert(model, ds=ds_sample, tokenizer=tokenizer)
results_distilbert_finetuned = compute_metrics(
    start_logits, end_logits, features, ds_sample,
    n_best=20, max_answer_length=50
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DistilBERT inference:   0%|          | 0/64 [00:00<?, ?it/s]

Building predictions:   0%|          | 0/500 [00:00<?, ?it/s]

In [13]:
# Display the metrics of the fine-tuned model.
results_distilbert_finetuned

{'exact_match': 74.6,
 'f1': 83.40220257669176,
 'rouge1': np.float64(0.7596087082942087),
 'rouge2': np.float64(0.5054857359267266),
 'rougeL': np.float64(0.7587980584781542),
 'bleu': 0.45915352231519446}