# Retrieval-Augmented Generation (RAG) for Domain-Specific Q&A

This notebook implements a research-quality RAG pipeline with hybrid retrieval (BM25 + dense) and a small, free LLM (Flan-T5).


## Setup & Configuration

- Install dependencies from `requirements.txt`
- Adjust hyperparameters in `config.json`


In [None]:
import json, os, random
import numpy as np
import pandas as pd
from tqdm import tqdm
from src.data_loader import load_and_prepare_corpus, load_corpus_dataframe
from src.retriever import BM25Retriever, DenseRetriever, HybridRetriever
from src.generator import T5Generator
from src.evaluate import compute_metrics

# Load the run configuration; the cells below read their paths and hyperparameters from CFG.
with open('config.json', 'r', encoding='utf-8') as f:
    CFG = json.load(f)

# Make sure the data/ and outputs/ directories exist (no-op when they already do).
os.makedirs('data', exist_ok=True)
os.makedirs('outputs', exist_ok=True)

# Seed each random number generator from the single configured seed.
SEED = CFG['seed']
random.seed(SEED)
np.random.seed(SEED)
# Seeding Python and NumPy does not seed PyTorch's own generator, so seed it as well.
# NOTE(review): assumes the models behind src.retriever / src.generator run on
# PyTorch - confirm. The import guard keeps this cell working when torch is absent.
try:
    import torch
except ImportError:
    pass
else:
    torch.manual_seed(SEED)


## 1. Dataset: Load and Chunk

We load a small split of an open dataset (default `ag_news`) and chunk documents into ~500-token segments with overlap.


In [None]:
corpus_path = CFG['corpus_chunks_path']
if os.path.exists(corpus_path):
    # A chunks file already exists at the configured path: load it instead of rebuilding.
    df_corpus = load_corpus_dataframe(corpus_path)
else:
    # No chunks file yet: build the corpus from the configured dataset and chunking settings.
    prepare_kwargs = {
        'dataset_name': CFG['dataset_name'],
        'dataset_split': CFG['dataset_split'],
        'text_fields': CFG['text_fields'],
        'title_field': CFG['title_field'],
        'chunk_size_tokens': CFG['chunk_size_tokens'],
        'chunk_overlap_tokens': CFG['chunk_overlap_tokens'],
        'embedding_model_name': CFG['embedding_model_name'],
        'output_path': corpus_path,
        'seed': CFG['seed'],
    }
    df_corpus = load_and_prepare_corpus(**prepare_kwargs)
# Preview the first rows as the cell output.
df_corpus.head()


## 2. Build Retrievers: BM25 and FAISS (dense)

We compute BM25 on raw text and build a FAISS index over sentence-transformers embeddings.


In [None]:
# Chunk texts; the same list is handed to both retrievers.
documents = df_corpus['text'].tolist()

# Sparse side: constructed from the texts and saved on every run.
bm25 = BM25Retriever(documents, bm25_index_path=CFG['bm25_index_path'])
bm25.save()

# Dense side: reuse the index file when it is already on disk, otherwise build and save it.
faiss_index_path = CFG['faiss_index_path']
dense = DenseRetriever(
    CFG['embedding_model_name'],
    index_path=faiss_index_path,
    embeddings_path=CFG['embeddings_path'],
)
if os.path.exists(faiss_index_path):
    # NOTE(review): nothing here verifies that the saved index matches `documents`.
    dense.load()
else:
    dense.build(documents)
    dense.save()

# Combine the two retrievers with the configured weights.
hybrid = HybridRetriever(bm25, dense, CFG['bm25_weight'], CFG['dense_weight'])


## 3. Question Answering

Given a user query, we retrieve the top-k chunks, concatenate them into a context, and generate an answer with Flan-T5.


In [None]:
# Generator instance shared by the cells below; all settings come from CFG.
gen = T5Generator(
    CFG['generation_model_name'],
    max_new_tokens=CFG['max_new_tokens'],
    temperature=CFG['temperature'],
)

def answer_query(question: str, top_k: int = None):
    """Answer `question` using retrieved context.

    Searches the hybrid retriever, looks up the text of every hit in `documents`,
    builds a prompt from the question plus those texts, and generates an answer.

    Args:
        question: The user query.
        top_k: Number of hits to request; ``None`` falls back to ``CFG['top_k']``.

    Returns:
        A tuple ``(answer, ctx_chunks, results)``: the generator output, the list of
        retrieved texts, and the raw search results.
    """
    k = CFG['top_k'] if top_k is None else top_k
    hits = hybrid.search(question, top_k=k)
    # Each hit unpacks into a document index and a second value (presumably its
    # score - confirm against HybridRetriever); only the index is needed here.
    contexts = []
    for doc_idx, _score in hits:
        contexts.append(documents[doc_idx])
    generated = gen.generate(T5Generator.build_prompt(question, contexts))
    return generated, contexts, hits

# Smoke-test the pipeline on one query; the last line shows the generated answer.
sample_q = 'What is the news about the economy?'
pred, ctx, scores = answer_query(question=sample_q)
pred


## 4. Evaluation

We create a small set of Q&A pairs, compare a baseline (no retrieval) against RAG, and report exact match (EM) and F1.


In [None]:
# Tiny hard-coded eval set of (question, gold answer) rows. Swap in your own pairs
# here (e.g. ones kept in data/eval_qa.jsonl); this cell does not read that file.
_qa_rows = (
    ("What topic is discussed in the first sample?", "news"),
    ("Which subject relates to sports?", "sports"),
)
eval_pairs = [{"q": question, "a": answer} for question, answer in _qa_rows]

# Baseline: the generator sees only the question, with no retrieved context.
def baseline_answer(question: str):
    """Generate an answer from the bare question, skipping retrieval.

    Returns whatever `gen.generate` produces for the prompt.
    """
    return gen.generate(f'Answer briefly: {question}')

# Run both systems over the eval set. Calls are interleaved per question
# (baseline first, then RAG).
preds_base, preds_rag, golds = [], [], []
for item in eval_pairs:
    question = item['q']
    golds.append(item['a'])
    preds_base.append(baseline_answer(question))
    # Take element 0 (the answer) by index instead of unpacking into `pred, _, _`:
    # at notebook top level that unpacking overwrites the `pred` displayed by the
    # Question Answering demo cell and rebinds `_`, which IPython uses for the
    # last cell output.
    preds_rag.append(answer_query(question)[0])

# compute_metrics is unpacked as (EM, F1) for each system.
em_base, f1_base = compute_metrics(preds_base, golds)
em_rag, f1_rag = compute_metrics(preds_rag, golds)
em_base, f1_base, em_rag, f1_rag
