In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embedding = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")
persist_directory = "../backend/chroma_db_squad"
chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
retriever = chroma_db.as_retriever()

In [2]:
def get_answer_qa(qa_pipeline, query, k: int=2):
    docs = retriever.get_relevant_documents(query)
    docs_context = "\n\n".join([d.page_content for d in docs[:k]])
    result = qa_pipeline(question=query, context=docs_context)
    return result["answer"]

In [4]:
import re
from typing import Optional

def extract_answer(cot_output: str) -> Optional[str]:
    """
    Given a chain-of-thought LLM output ending with:
      ... 
      Answer: <the extractive answer>
    this returns the <the extractive answer> string.
    If no “Answer:” line is found, returns None.
    """
    # Look for a line that starts with “Answer:” (case-sensitive),
    # optionally preceded by whitespace, and capture the rest of the line.
    match = re.search(r'^[ \t]*Answer:\s*(.+)$', cot_output, flags=re.MULTILINE)
    if not match:
        return None
    return match.group(1).strip()


In [5]:
def get_answer_text_generation(chain, query, k: int=3, cot_used=False):
    docs = retriever.get_relevant_documents(query)
    docs_context = "\n\n".join([d.page_content for d in docs[:k]])
    answer = chain.predict(
        retrieved_SQuAD_passage=docs_context,
        user_question=query
    ).strip()
    if cot_used:
        answer = extract_answer(answer) or "Unsure about answer."
    return answer

In [6]:
from datasets import load_dataset

ds = load_dataset("squad", split="validation")

In [8]:
ds_shuffled = ds.shuffle(seed=42)
ds_sample = ds_shuffled.select(range(100))

In [9]:
from evaluate import load
from tqdm.notebook import tqdm

In [10]:
squad_metric = load("squad")
rouge_metric = load("rouge")
bleu_metric  = load("bleu")

In [11]:
def evaluate(pipeline_or_chain, cot_used: bool=False, ds=ds_sample, is_distilbert: bool=False):
    squad_preds, squad_refs = [], []
    texts_pred, texts_ref = [], []  
    
    for ex in tqdm(ds, desc="Evaluating on SQuAD + BLEU/ROUGE"):
        if not is_distilbert:
            raw = get_answer_text_generation(pipeline_or_chain, ex["question"], k=3, cot_used=cot_used)
        else:
            raw = get_answer_qa(pipeline_or_chain, ex["question"], k=3)

        squad_preds.append({
            "id": ex["id"],
            "prediction_text": raw,
        })
        squad_refs.append({
            "id": ex["id"],
            "answers": {
                "text": ex["answers"]["text"], 
                "answer_start": ex["answers"]["answer_start"]
            }
        })

        texts_pred.append(raw)
        texts_ref.append(ex["answers"]["text"][0])

    squad_results = squad_metric.compute(
        predictions=squad_preds, 
        references=squad_refs
    )

    rouge_results = rouge_metric.compute(
        predictions=texts_pred, 
        references=texts_ref
    )

    bleu_results = bleu_metric.compute(
        predictions=texts_pred, 
        references=[[r] for r in texts_ref]  
    )

    return {
        **squad_results,
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "bleu":   bleu_results["bleu"],
    }

# DistilBert

### Finetuned DistilBert

In [12]:
from transformers import pipeline

qa_pipeline_distilbert_finetuned = pipeline("question-answering", 
                        model="../backend/distilbert-squad-finetuned_model", 
                        tokenizer="../backend/distilbert-squad-finetuned_tokenizer")

Device set to use cuda:0


In [13]:
query = "What is a very seldom used unit of mass in the metric system?"
get_answer_qa(qa_pipeline_distilbert_finetuned, query, k=3)

  docs = retriever.get_relevant_documents(query)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


'the metric slug'

In [14]:
results_distilbert_finetuned = evaluate(qa_pipeline_distilbert_finetuned, ds=ds_sample, is_distilbert=True)

Evaluating on SQuAD + BLEU/ROUGE:   0%|          | 0/100 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [15]:
results_distilbert_finetuned

{'exact_match': 70.0,
 'f1': 73.2813258636788,
 'rouge1': np.float64(0.687505288828818),
 'rouge2': np.float64(0.4449621723305933),
 'rougeL': np.float64(0.6872597010832304),
 'bleu': 0.5422825123791992}

### NOT Finetuned DistilBert

In [16]:
from transformers import pipeline

qa_pipeline_distilbert_not_finetuned = pipeline("question-answering", 
                        model="distilbert/distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [17]:
query = "What is a very seldom used unit of mass in the metric system?"
get_answer_qa(qa_pipeline_distilbert_not_finetuned, query, k=3)

'of particles, there are no internal forces that are unbalanced. That'

In [18]:
results_distilbert_not_finetuned = evaluate(qa_pipeline_distilbert_not_finetuned, ds=ds_sample, is_distilbert=True)

Evaluating on SQuAD + BLEU/ROUGE:   0%|          | 0/100 [00:00<?, ?it/s]

In [19]:
results_distilbert_not_finetuned

{'exact_match': 0.0,
 'f1': 1.735071301247772,
 'rouge1': np.float64(0.014065934065934066),
 'rouge2': np.float64(0.003636363636363636),
 'rougeL': np.float64(0.014139194139194141),
 'bleu': 0.0054419568831522705}

# Phi4

In [21]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_ollama import ChatOllama
import torch

# llm = HuggingFaceEndpoint(
#     repo_id="microsoft/phi-4",
#     task="text-generation",
#     temperature=0.0,
#     max_new_tokens=512,
#     model_kwargs={"device_map": "cuda", "torch_dtype": torch.float16},
# )
# chat = ChatHuggingFace(
#     llm=llm,
#     verbose=True,
# )

chat = ChatOllama(
    model="phi4",    
    temperature=0.0,        
    num_predict=512,        # max new tokens
)

#### Zero Shot Prompt

In [22]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

zero_shot_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        """
        You are an expert extractive question-answering system.
        Use only the provided context to answer the question.
        Always output the answer using the exact wording and phrasing as it appears in the context.
        If the answer is not contained in the context, reply “Unsure about answer.”
        """
    ),
    HumanMessagePromptTemplate.from_template(
        """
        Context:
        {retrieved_SQuAD_passage}

        Question:
        {user_question}

        'Answer:'
        """
    )
])

In [23]:
from langchain import LLMChain

chain_zero_shot_prompt = LLMChain(llm=chat, prompt=zero_shot_prompt)

  chain_zero_shot_prompt = LLMChain(llm=chat, prompt=zero_shot_prompt)


In [24]:
query = "What is a very seldom used unit of mass in the metric system?"
get_answer_text_generation(chain_zero_shot_prompt, query, k=3, cot_used=False)

'The metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf.'

In [25]:
results_zero_shot = evaluate(chain_zero_shot_prompt, ds=ds_sample, is_distilbert=False, cot_used=False)

Evaluating on SQuAD + BLEU/ROUGE:   0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
results_zero_shot

{'exact_match': 9.0,
 'f1': 35.40381629282565,
 'rouge1': np.float64(0.32876703530083873),
 'rouge2': np.float64(0.22962927894887022),
 'rougeL': np.float64(0.32855611646718663),
 'bleu': 0.06971672811429382}

#### COT Prompt

In [27]:
cot_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        """
        You are an expert extractive question-answering system.
        When given a context and a question, you will:
        1. Think through the relevant part of the context step by step.
        2. Show your reasoning clearly (chain-of-thought).
        3. Finally, output **only** the answer using the exact wording as it appears in the context.
        If the answer is not contained in the context, your final answer must be 'Unsure about answer.'
        """
    ),
    HumanMessagePromptTemplate.from_template(
        """
        Context:
        {retrieved_SQuAD_passage}

        Question:
        {user_question}

        Begin by reasoning step by step, then conclude with 'Answer: <your extractive answer>'."""
    ),
])

In [28]:
chain_cot_prompt = LLMChain(llm=chat, prompt=cot_prompt)

In [29]:
query = "What is a very seldom used unit of mass in the metric system?"
get_answer_text_generation(chain_cot_prompt, query, k=3, cot_used=True)

'metric slug'

In [30]:
results_cot = evaluate(chain_cot_prompt, ds=ds_sample, is_distilbert=False, cot_used=True)

Evaluating on SQuAD + BLEU/ROUGE:   0%|          | 0/100 [00:00<?, ?it/s]

In [31]:
results_cot

{'exact_match': 50.0,
 'f1': 65.75330030867113,
 'rouge1': np.float64(0.6316693562618112),
 'rouge2': np.float64(0.44573261033398315),
 'rougeL': np.float64(0.6297702009698172),
 'bleu': 0.2572570101548084}