<a href="https://colab.research.google.com/github/Rstam59/TaskDataRepoForStudents/blob/main/QuestionAnswering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Optional (disabled): list the configuration names available for the "subjqa" dataset.
# from datasets import get_dataset_config_names

# domains = get_dataset_config_names("subjqa")
# domains

In [None]:
!pip install datasets --upgrade

In [None]:
!pip install farm-haystack[elasticsearch]

In [None]:
# Disabled alternative setup: uninstalls farm-haystack / haystack-ai and installs
# haystack-ai together with the elasticsearch-haystack integration instead.
# NOTE(review): the cells below import from haystack.nodes and haystack.document_stores;
# confirm those imports still resolve before enabling this cell.
# !pip uninstall -y farm-haystack haystack-ai || true
# !pip install --upgrade haystack-ai           \
#                    elasticsearch-haystack   # ⬅ doc-store integration


In [None]:
from datasets import load_dataset

# Load the "electronics" configuration of the SubjQA dataset and display its summary.
subjqa = load_dataset(
    "subjqa",
    name="electronics",
)
subjqa

In [None]:
# Show the training example at index 1.
print(subjqa["train"][1])

In [None]:
import pandas as pd

# Flatten the dataset and convert every split into its own pandas DataFrame.
dfs = {}
for split, dset in subjqa.flatten().items():
    dfs[split] = dset.to_pandas()

# Report the number of distinct question ids per split.
for split, df in dfs.items():
    n_questions = df["id"].nunique()
    print(f"Number of questions in {split}: {n_questions}")

In [None]:
# Columns shown for the sampled question/answer rows.
qa_cols = ["title", "question", "answers.text", "answers.answer_start", "context"]

# Fixed random_state so the same two training rows are drawn on every run.
train_qa = dfs["train"][qa_cols]
sample_df = train_qa.sample(n=2, random_state=7)
sample_df

In [None]:
# Slice the context of the first sampled row with the start offset of its first answer
# and that answer's length, which recovers the answer span from the review text.
first_row = sample_df.iloc[0]
start_idx = first_row["answers.answer_start"][0]
end_idx = start_idx + len(first_row["answers.text"][0])
first_row["context"][start_idx:end_idx]

In [None]:
import matplotlib.pyplot as plt

# Count how many training questions begin with each of a few common question words.
# NOTE(review): the prefixes overlap ("Do" also matches questions starting with "Does"),
# so the "Do" bar includes the "Does" questions.
question_types = ['What', 'How', 'Is', 'Does', 'Do', 'Was', 'Where', 'Why']
train_questions = dfs['train']['question']

# Summing the boolean mask gives 0 when no question matches a prefix, whereas
# indexing value_counts() with True raises a KeyError in that case.
counts = {
    q: int(train_questions.str.startswith(q).sum())
    for q in question_types
}

pd.Series(counts).sort_values().plot.barh()
plt.title('Frequency of Question Types')
plt.show()

In [None]:
# Print three reproducibly sampled training questions for each question word.
for question_type in ['How', 'What', 'Is']:
    matches_type = dfs['train'].question.str.startswith(question_type)
    examples = dfs['train'][matches_type].sample(n=3, random_state=42)
    for question in examples['question']:
        print(question)

# Tokenizing text for QA

In [None]:
from transformers import AutoTokenizer

# Checkpoint name; the tokenizer is loaded from it here.
model_ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
)

In [None]:
# A toy question/context pair, encoded together in one tokenizer call.
question = "How much music can this hold?"
context = "An MP3 is about 1 MB/minute, so about 6000 hours depending on file size"
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
)

In [None]:
# Tabulate the encoding of the pair with one row per key returned by the tokenizer.
pair_encoding = tokenizer(question, context)
input_df = pd.DataFrame.from_dict(pair_encoding, orient="index")
input_df

In [None]:
# Decode the encoded question/context pair back into a single string.
decoded_pair = tokenizer.decode(inputs["input_ids"][0])
print(decoded_pair)

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(
    model_ckpt,
)

# Run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

In [None]:
import numpy as np

# Flatten the start/end logits to 1-D arrays and recover the token strings.
s_scores = outputs.start_logits.detach().numpy().flatten()
e_scores = outputs.end_logits.detach().numpy().flatten()
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].numpy()[0])

# Plot against integer positions and label the ticks afterwards. Passing the token
# strings directly as x makes matplotlib treat them as categories, so a token that
# occurs more than once (e.g. "about" in this context) would share one bar position.
positions = np.arange(len(tokens))

fig, (ax1, ax2) = plt.subplots(nrows = 2, sharex = True, figsize = (10, 4))

# Highlight the highest-scoring token(s) in 'C1'; every other bar is drawn in 'C0'.
s_max = np.max(s_scores)
colors = ['C0' if s != s_max else 'C1' for s in s_scores]
ax1.bar(x = positions, height = s_scores, color = colors)
ax1.set_ylabel("Start Scores")

e_max = np.max(e_scores)
colors = ['C0' if e != e_max else 'C1' for e in e_scores]
ax2.bar(x = positions, height = e_scores, color = colors)
ax2.set_ylabel("End Scores")

# The x-axis is shared, so setting the ticks once labels both panels consistently.
plt.xticks(positions, tokens, rotation = 'vertical')
plt.show()

In [None]:
import torch

start_logits, end_logits = outputs.start_logits, outputs.end_logits

# Take the highest-scoring start and end positions; the end index is shifted by one
# because the slice below excludes its upper bound.
start_idx = start_logits.argmax()
end_idx = end_logits.argmax() + 1

token_ids = inputs['input_ids'][0]
answer_span = token_ids[start_idx:end_idx]
answer = tokenizer.decode(answer_span)

print(f'Question: {question}')
print(f'Answer: {answer}')

In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a question-answering pipeline.
pipe = pipeline('question-answering', model = model, tokenizer = tokenizer)

# Ask for the three best answers. `top_k` is the current argument name; the former
# `topk` spelling is deprecated in transformers.
pipe(question = question, context = context, top_k = 3)

In [None]:
# Ask a question unrelated to the context, with handle_impossible_answer enabled.
pipe(
    question='Why is there no data?',
    context=context,
    handle_impossible_answer=True,
)

# Dealing with long passages

In [None]:
def compute_input_length(row):
    """Return the number of input ids produced by tokenizing a question/context pair.

    `row` must support lookup by the keys 'question' and 'context'; both values are
    passed together to the module-level `tokenizer`.
    """
    encoded_pair = tokenizer(row['question'], row['context'])
    token_ids = encoded_pair['input_ids']
    return len(token_ids)


# Token count of every question/context pair in the training split.
dfs['train']['n_tokens'] = dfs['train'].apply(compute_input_length, axis = 1)

# Histogram of the token counts with a dashed marker at 512 tokens.
fig, ax = plt.subplots()
dfs['train']['n_tokens'].hist(bins = 100, grid = False, ax = ax)
ax.set_xlabel('Number of tokens in question-context pair')
# Legend label spelling fixed ("Maximun" -> "Maximum").
ax.axvline(x = 512, ymin = 0, ymax = 1, linestyle = '--',
           color = 'C1', label = 'Maximum sequence length')
ax.legend()
ax.set_ylabel('Count')
plt.show()

In [None]:
# Tokenize the first training example with truncation to 100 tokens, a stride of 25,
# and the overflowing tokens returned as well, so the long pair comes back as several windows.
example = dfs['train'].iloc[0][['question', 'context']]
tokenized_example = tokenizer(
    example['question'],
    example['context'],
    max_length=100,
    stride=25,
    truncation=True,
    return_overflowing_tokens=True,
)

In [None]:
# Report how many token ids each window contains.
for idx, window in enumerate(tokenized_example['input_ids']):
    n_window_tokens = len(window)
    print(f'Window #{idx} has {n_window_tokens} tokens')

In [None]:
# Decode every window back to text.
for window in tokenized_example['input_ids']:
    window_text = tokenizer.decode(window)
    print(f'{window_text} \n')

# Using Haystack to build a QA Pipeline

Initializing a document store

In [None]:

url = """https://artifacts.elastic.co/downloads/elasticsearch/\
elasticsearch-7.9.2-linux-x86_64.tar.gz"""
!wget -nc -q {url}
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz


In [None]:
import os
from subprocess import Popen, PIPE, STDOUT

# Run Elasticsearch as a background process
!chown -R daemon:daemon elasticsearch-7.9.2
es_server = Popen(args=['elasticsearch-7.9.2/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Wait until Elasticsearch has started
!sleep 30

In [None]:
!curl -X GET "localhost:9200/?pretty"

In [None]:
# correct import for Haystack 1.26+
from haystack.document_stores import ElasticsearchDocumentStore

# Connection settings for the local Elasticsearch node (empty credentials).
es_connection = dict(host="localhost", port=9200, username="", password="")

# Documents are stored in the "document" index; embeddings are returned with documents.
document_store = ElasticsearchDocumentStore(
    **es_connection,
    index="document",
    return_embedding=True,
)


In [None]:

# Write the reviews of every split to the document store, one document per unique context.
for split, df in dfs.items():
    # Exclude duplicate reviews
    unique_reviews = df.drop_duplicates(subset="context")
    docs = []
    for _, row in unique_reviews.iterrows():
        doc_meta = {"item_id": row["title"], "question_id": row["id"], "split": split}
        docs.append({"content": row["context"], "meta": doc_meta})
    document_store.write_documents(docs, index="document")

n_docs = document_store.get_document_count()
print(f"Loaded {n_docs} documents")

# Initializing a retriever

In [None]:
from haystack.nodes import BM25Retriever

# BM25 retriever that searches the document store created above.
retriever = BM25Retriever(
    document_store=document_store,
)

In [None]:
item_id = 'B0074BW614'
query = "Is it good for reading?"

# Only consider documents of this item that belong to the training split.
retrieval_filters = {'item_id': [item_id], 'split': ['train']}
retrieved_docs = retriever.retrieve(
    query=query,
    top_k=3,
    filters=retrieval_filters,
)

In [None]:
# Show the first retrieved document.
first_retrieved_doc = retrieved_docs[0]
print(first_retrieved_doc)

# Initializing a reader

In [None]:
from haystack.nodes import TransformersReader

# Reader built from the same checkpoint name as the tokenizer/model cells above.
model_ckpt = 'deepset/minilm-uncased-squad2'
reader_settings = {"max_seq_len": 384, "doc_stride": 128}
reader = TransformersReader(model_name_or_path=model_ckpt, **reader_settings)


# Putting it all together

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine retriever and reader into one extractive QA pipeline.
# Note: this rebinds `pipe`, which earlier held the transformers QA pipeline.
pipe = ExtractiveQAPipeline(
    retriever=retriever,
    reader=reader,
)

In [None]:
n_answers = 3

# Per-node parameters: the retriever fetches 3 filtered documents and the reader
# returns n_answers answers.
retriever_params = {
    "top_k": 3,
    "filters": {"item_id": [item_id], "split": ["train"]},
}
reader_params = {"top_k": n_answers}

preds = pipe.run(
    query=query,
    params={"Retriever": retriever_params, "Reader": reader_params},
)

In [None]:
# Print the query followed by each predicted answer and the context it was found in.
print(f"Question: {preds['query']} \n")
answers = preds.get("answers", [])

if answers:
    for rank, ans in enumerate(answers, start=1):
        print(f"Answer {rank}: {ans.answer}")
        print(f"Review snippet: ...{ans.context}...\n")
else:
    print("No answers found.")


# Improving Our QA Pipeline

Evaluating the Retriever

In [None]:
from haystack.pipelines import Pipeline

# Pipeline containing only the retriever node, fed directly from the query input.
retriever_pipe = Pipeline()
retriever_pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])


In [None]:
from haystack.schema import Answer, Document, Label

# Build one Label per gold answer of every test question (or a single empty-answer
# Label for questions without answers) and write them to the "label" index.
# NOTE(review): re-running this cell presumably appends another copy of the labels to
# the index — confirm, and clear the index first if that matters.
labels = []

# item_id -> {document content: Document}. The test documents of an item are fetched
# from Elasticsearch once and reused, instead of re-querying the store for every row.
gold_docs_by_item = {}

for i, row in dfs["test"].iterrows():
    query = row["question"]
    context = row["context"]
    item_id = row["title"]
    question_id = row["id"]  # not used below; kept so the cell defines the same names
    meta = {"item_id": item_id, "split": "test"}

    # Get the correct document from Elasticsearch
    if item_id not in gold_docs_by_item:
        docs = document_store.get_all_documents(
            filters={"item_id": [item_id], "split": ["test"]}
        )
        docs_by_content = {}
        for doc in docs:
            # setdefault keeps the first document per content, matching a first-match scan
            docs_by_content.setdefault(doc.content, doc)
        gold_docs_by_item[item_id] = docs_by_content

    gold_doc = gold_docs_by_item[item_id].get(context)
    if gold_doc is None:
        continue  # skip: the item has no test documents, or none matches the context exactly

    if len(row["answers.text"]):
        # One entry per gold answer; the offsets are placeholders, not real positions.
        gold_answers = [
            (ans_text, [{"start": 0, "end": 0}]) for ans_text in row["answers.text"]
        ]
    else:
        # Handle no-answer examples
        gold_answers = [("", [])]

    for ans_text, offsets in gold_answers:
        answer_obj = Answer(
            answer=ans_text,
            type="extractive",
            context=context,
            offsets_in_document=offsets,
            document_id=gold_doc.id,
            score=1.0
        )
        label = Label(
            query=query,
            answer=answer_obj,
            document=gold_doc,
            is_correct_answer=True,
            is_correct_document=True,
            origin="gold-label",
            meta=meta
        )
        labels.append(label)

# ✅ Write labels to the store (to index="label")
document_store.write_labels(labels, index="label")

print(f"✅ Loaded {document_store.get_label_count(index='label')} question-answer pairs")


In [None]:
# Fetch the labels from the "label" index in aggregated form, grouped by the item_id meta field.
aggregation_options = {"open_domain": True, "aggregate_by_meta": ["item_id"]}
labels_agg = document_store.get_all_labels_aggregated(index="label", **aggregation_options)

print("Aggregated labels:", len(labels_agg))

# Inspect the first aggregated entry.
sample_ml = labels_agg[0]
print("Sample query :", sample_ml.query)
sample_item_id = sample_ml.labels[0].meta["item_id"]
print("Item-ID      :", sample_item_id)


In [None]:
# Summary of the aggregated labels: total count, then the query and item id of the first entry.
n_aggregated = len(labels_agg)
print("Aggregated labels:", n_aggregated)

sample_ml = labels_agg[0]
print("Sample query :", sample_ml.query)
first_label_meta = sample_ml.labels[0].meta
print("Item-ID      :", first_label_meta["item_id"])


In [None]:
# ---------- helper: evaluate one query ----------
def single_query_eval(retriever_pipeline, multi_label, top_k=3):
    """
    Run one MultiLabel (aggregated question) through the pipeline
    and return recall@k.

    Args:
        retriever_pipeline: pipeline with a node named "Retriever" that offers
            ``eval(labels=..., params=...)``.
        multi_label: aggregated label; the ``item_id`` of its first label is used
            to filter retrieval to that item's documents in the "test" split.
        top_k: number of documents the retriever is asked to return.

    Returns:
        The "recall_single_hit" metric of the "Retriever" node for this query.
    """
    item_id = multi_label.labels[0].meta["item_id"]
    eval_result = retriever_pipeline.eval(
        labels=[multi_label],
        params={
            "Retriever": {
                "top_k":   top_k,
                "filters": {"item_id": [item_id], "split": ["test"]}
            }
        }
    )
    # eval() returns an evaluation-result object rather than a metrics dictionary, so
    # the metrics have to be computed from it first; indexing the result directly
    # yields the per-node result table, which has no "recall" entry.
    metrics = eval_result.calculate_metrics()
    return metrics["Retriever"]["recall_single_hit"]


In [None]:
# install a compatible version for Haystack 1.26.4
!pip install -U "transformers==4.47.0" --quiet



In [None]:



# ---------- full evaluation over several k values ----------
import pandas as pd
from haystack.pipelines import Pipeline

def evaluate_retriever(retriever, topk_values=(1, 3, 5, 10, 20)):
    """Return a DataFrame of mean recall per top-k value for the given retriever.

    The retriever is placed in a fresh single-node pipeline; for every k in
    `topk_values`, each entry of the module-level `labels_agg` is scored with
    `single_query_eval` and the scores are averaged. The result is indexed by k
    and has one column, "recall".
    """
    eval_pipe = Pipeline()
    eval_pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])

    mean_recall_by_k = {}
    for k in topk_values:
        per_query_recall = [
            single_query_eval(eval_pipe, multi_label, top_k=k)
            for multi_label in labels_agg
        ]
        mean_recall = sum(per_query_recall) / len(per_query_recall)
        mean_recall_by_k[k] = {"recall": mean_recall}

    return pd.DataFrame.from_dict(mean_recall_by_k, orient="index")


# Evaluate the BM25 retriever over the default top-k values and print the recall table.
es_topk_df = evaluate_retriever(retriever=retriever)
print(es_topk_df)
