In [1]:
pip install sentence-transformers faiss-cpu pandas

Collecting pandas
  Using cached pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.2 tzdata-2025.2
Note: you may need to restart the kernel to use updated packages.


In [59]:
import os
import json
import faiss
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from typing import List, Dict

In [60]:
# Input JSON files; every path is relative to the notebook's working directory.
# NOTE(review): the numbered batches jump from batch_7 to batch_9 (no batch_8 file is
# listed) — confirm whether that omission is intentional.
file_paths = [
    'Pdf/dataset (1).json',
    'Pdf/Merged Q& A data.json',
    'Pdf/public_health_batch_2_51_to_100.json',
    'Pdf/public_health_batch_3_101_to_150.json',
    'Pdf/public_health_batch_4_next_50.json',
    'Pdf/public_health_batch_5_next_50.json',
    'Pdf/public_health_batch_6_statewise_200_to_249.json',
    'Pdf/public_health_batch_7_statewise_250_to_299.json',
    'Pdf/public_health_batch_9_statewise_350_to_399.json',
    'Pdf/public_health_batch_10_countywise_400_to_449.json',
    'Pdf/public_health_batch_11_countywise_450_to_499.json',
    'Pdf/public_health_batch_12_countywise_500_to_549.json',
    'Pdf/public_health_batch_13_countywise_550_to_599.json',
    'Pdf/public_health_socioeconomic_dataset.json',
    'Pdf/testing.json'
]

In [61]:
# Artifact locations, relative to the working directory.
embedding_file = "vector_index.faiss"  # FAISS index: written with faiss.write_index, reloaded with faiss.read_index
metadata_file = "vector_metadata.json"  # JSON dump of the `documents` list
eval_query_file = "evaluation_queries.json"  # evaluation queries; its existence gates the evaluation run

In [62]:
# Sentence-embedding model shared by document encoding and query encoding; pinned to CPU.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [63]:
def _field_text(value) -> str:
    """Render one JSON field value as plain text (None -> "", lists joined by spaces)."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return ' '.join(_field_text(part) for part in value)
    return str(value)


def _join_fields(record: Dict, field_names) -> str:
    """Space-join the named fields of `record`; missing fields contribute an empty string."""
    return ' '.join(_field_text(record.get(name, "")) for name in field_names)


def extract_text_from_file(path: str) -> List[Dict]:
    """Load one JSON file and flatten it into a list of text records.

    Three layouts are recognised:

    * a dict with an ``"entries"`` key -> one ``"Q: ... A: ..."`` record per entry
      (a missing question/answer is rendered as the string ``None``, as before);
    * any other dict -> one record per value, built from
      title / abstract / summary / key_findings / conclusion;
    * a list -> one record per item, built from
      socio_economic_indicator / summary / analysis / statistical_findings.

    Field values that are not strings (lists, numbers, null) are converted to
    text instead of making ``' '.join`` raise ``TypeError``, and records that are
    not JSON objects are skipped instead of raising ``AttributeError``. For
    records whose fields are all strings the produced text is unchanged.

    Args:
        path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts, where ``source`` is the
        file's base name. Any other top-level JSON type yields an empty list.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    source = os.path.basename(path)
    texts = []

    if isinstance(data, dict) and "entries" in data:
        for entry in data["entries"]:
            if not isinstance(entry, dict):
                continue  # stray scalar inside "entries"; nothing to extract
            content = f"Q: {entry.get('question')} A: {entry.get('answer')}"
            texts.append({"text": content, "source": source})
    elif isinstance(data, dict):
        doc_fields = ("title", "abstract", "summary", "key_findings", "conclusion")
        for doc in data.values():
            if not isinstance(doc, dict):
                continue  # top-level metadata value rather than a document
            texts.append({"text": _join_fields(doc, doc_fields), "source": source})
    elif isinstance(data, list):
        item_fields = (
            "socio_economic_indicator",
            "summary",
            "analysis",
            "statistical_findings",
        )
        for item in data:
            if not isinstance(item, dict):
                continue
            texts.append({"text": _join_fields(item, item_fields), "source": source})
    return texts

In [64]:
# Build the corpus explicitly so this cell works after Restart & Run All.
# NOTE(review): the cell that originally created `documents` is not in the notebook; this
# assumes it was the concatenation of extract_text_from_file() over file_paths, in list
# order — confirm that document ids (positions in this list) still match earlier runs.
documents = [
    record
    for source_path in file_paths
    for record in extract_text_from_file(source_path)
]

texts = [doc["text"] for doc in documents]
print(f"Generating embeddings for {len(texts)} documents...")
embeddings = model.encode(texts, show_progress_bar=True)

Generating embeddings for 1441 documents...


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

In [65]:
# FAISS expects a C-contiguous float32 matrix; enforce that here instead of relying on
# whatever dtype/layout the encoder happened to return.
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
dimension = embedding_matrix.shape[1]

# Exact (brute-force) L2 index; row i of the matrix becomes vector id i.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)
assert index.ntotal == embedding_matrix.shape[0], "index size does not match number of embeddings"

# Persist the index so later cells can reload it from disk.
faiss.write_index(index, embedding_file)

In [66]:
# Persist the document records next to the index; list position doubles as the vector id.
metadata_json = json.dumps(documents, indent=2)
with open(metadata_file, "w") as f:
    f.write(metadata_json)

In [67]:
def evaluate(index, documents, model, eval_queries, k=5):
    """Score top-k retrieval against labelled queries and print a summary.

    For every query the query text is embedded with ``model.encode``, the
    ``k`` nearest vectors are fetched with ``index.search``, and the returned
    ids are compared with the query's ``relevant_doc_ids``.

    Args:
        index: Object exposing ``search(vectors, k) -> (distances, ids)``.
        documents: Not read by this function; kept so existing calls keep working.
        model: Object exposing ``encode(list_of_str)``.
        eval_queries: Sequence of dicts with keys ``"query"`` and
            ``"relevant_doc_ids"``.
        k: Number of neighbours retrieved per query; must be positive.

    Returns:
        A dict with the averaged ``"mrr"``, ``"precision"``, ``"recall"`` and the
        ``"num_queries"`` they were computed over (previously nothing was
        returned). With no queries all metrics are reported as 0.0.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    num_queries = len(eval_queries)
    if num_queries == 0:
        # Avoid dividing by zero below; there is nothing to average.
        print("No evaluation queries provided. Skipping evaluation.")
        return {"mrr": 0.0, "precision": 0.0, "recall": 0.0, "num_queries": 0}

    mrr_total, precision_total, recall_total = 0, 0, 0

    for item in tqdm(eval_queries):
        relevant_ids = set(item["relevant_doc_ids"])
        query_embedding = model.encode([item["query"]])
        _, indices = index.search(np.array(query_embedding), k)

        # The index pads with -1 when it holds fewer than k vectors; those are not hits.
        # Padding only appears at the tail, so ranks of real hits are unaffected.
        retrieved = [int(idx) for idx in indices[0] if idx >= 0]

        # MRR: reciprocal rank of the first relevant hit, 0 when none is retrieved.
        mrr_total += next(
            (1 / rank for rank, idx in enumerate(retrieved, 1) if idx in relevant_ids),
            0,
        )

        # Precision is measured against k (not the number actually returned).
        match = relevant_ids.intersection(retrieved)
        precision_total += len(match) / k
        recall_total += len(match) / len(relevant_ids) if relevant_ids else 0

    metrics = {
        "mrr": mrr_total / num_queries,
        "precision": precision_total / num_queries,
        "recall": recall_total / num_queries,
        "num_queries": num_queries,
    }

    print(f"\n📊 Evaluation Summary (Top-{k}):")
    print(f" - MRR: {metrics['mrr']:.4f}")
    print(f" - Precision@{k}: {metrics['precision']:.4f}")
    print(f" - Recall@{k}: {metrics['recall']:.4f}")
    return metrics

In [74]:
# Free-text queries handed to auto_generate_eval_queries() to seed the evaluation set.
raw_queries = [
    "What is the relationship between education and obesity?",
    "How does housing influence respiratory conditions?",
    "How does income inequality affect life expectancy?",
    "What reforms were made to Connecticut’s education system?",
    "What is the impact of curriculum change on student learning?"
]

In [78]:
def auto_generate_eval_queries(queries, model, index, documents, top_k=1,
                               output_path="evaluation_queries.json"):
    """Label each query with the index's own top-k hits and save them as JSON.

    Each query is embedded with ``model.encode`` and searched in ``index``; the
    returned ids become that query's ``relevant_doc_ids``. A preview of every
    matched document (first 200 characters of its ``"text"``) is printed.

    Note that the labels are produced by the index itself, so evaluating the
    same index against this file measures self-consistency, not retrieval
    quality against human judgements.

    Args:
        queries: Iterable of query strings.
        model: Object exposing ``encode(list_of_str)``.
        index: Object exposing ``search(vectors, k) -> (distances, ids)``.
        documents: Sequence of dicts with a ``"text"`` key, indexed by vector id.
        top_k: Number of hits stored per query.
        output_path: Destination JSON file; defaults to the previously
            hard-coded ``evaluation_queries.json``.
    """
    eval_data = []

    for query in queries:
        query_embedding = model.encode([query])
        _, indices = index.search(np.array(query_embedding), top_k)

        # The index pads with -1 when it holds fewer than top_k vectors; keeping those
        # would store bogus labels and make documents[-1] print the last document.
        relevant_doc_ids = [int(doc_id) for doc_id in indices[0] if doc_id >= 0]
        eval_data.append({
            "query": query,
            "relevant_doc_ids": relevant_doc_ids
        })

        print(f"\nQuery: {query}")
        for doc_id in relevant_doc_ids:
            print(f"Matched doc #{doc_id}: {documents[doc_id]['text'][:200]}")

    with open(output_path, "w") as f:
        json.dump(eval_data, f, indent=2)
    # The original message lacked the f-prefix and printed a literal "{top_k}".
    print(f"\n{output_path} generated with top-{top_k} results for each query.")

In [79]:
# Label every raw query with the index's current top-1 hit and write evaluation_queries.json.
# NOTE(review): these labels come from the same index that is evaluated afterwards, so a
# rank-1 hit for each query is expected by construction, not evidence of retrieval quality.
auto_generate_eval_queries(raw_queries, model, index, documents, top_k=1)


Query: What is the relationship between education and obesity?
Matched doc #202: Q: What impact does educational attainment have on obesity rates in the USA? A: Higher educational attainment is associated with lower obesity rates in the USA. Adults with a college degree have an ob

Query: How does housing influence respiratory conditions?
Matched doc #206: Q: What role does housing instability play in respiratory illness rates in the USA? A: Housing instability, such as frequent moves or overcrowding, increases respiratory illness rates by 15-20%. Poor 

Query: How does income inequality affect life expectancy?
Matched doc #201: Q: How does income inequality, as measured by the Gini coefficient, correlate with life expectancy in the United States? A: Income inequality, measured by the Gini coefficient, shows a negative correl

Query: What reforms were made to Connecticut’s education system?
Matched doc #9: Curriculum Development in Connecticut: A Strategy for Educational Improvement T

In [80]:
# Run the retrieval evaluation only when a query file is available.
if os.path.exists(eval_query_file):
    with open(eval_query_file, 'r', encoding='utf-8') as f:
        eval_queries = json.load(f)
    # Reload the persisted index so the evaluation exercises what was written to disk.
    index = faiss.read_index(embedding_file)
    # Use the queries loaded above instead of re-reading the file through an unclosed handle.
    evaluate(index, documents, model, eval_queries, k=5)
else:
    print(f"No evaluation file found at `{eval_query_file}`. Skipping evaluation.")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 54.71it/s]


📊 Evaluation Summary (Top-5):
 - MRR: 1.0000
 - Precision@5: 0.2000
 - Recall@5: 1.0000



