In [22]:
import json
import os
from typing import Any

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, OpenAI
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sentence_transformers import CrossEncoder
from tqdm.auto import tqdm

# Read a local .env file (if present) into the process environment so the
# API keys used below can be looked up through os.environ / os.getenv.
load_dotenv()

True

In [25]:
# Credentials are read from the environment; nothing is hardcoded here.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Indexing os.environ directly makes a missing OpenAI key fail right here
# with a KeyError instead of later, on the first embedding request.
EMBEDDINGS = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

### Load documents with ids

In [4]:
# Load the pre-chunked documents; each entry carries an 'id' and its
# 'page_content' text.
with open('documents_with_ids.json', 'rb') as f_in:
    documents = json.load(f_in)

In [10]:
# Spot-check a single loaded document.
documents[60]

{'page_content': 'edema; secondly, the imbalance of intravascular fluid \nexchange, the interstitial fluid generates more than reflux and \nresult in edema. Some of factors that make the liquid from a \ncapillary outflow greater than inflow would result in edema, \nsuch as congestive heart failure, acute nephritis, renal failure, \nthe amount of fluid more than renal excretion, intravascular \nfluid volume increased, or thrombosis, thrombophlebitis \ncause local venous return blocked. Various causes of hypoal-\nbuminemia, such as malnutrition, liver disease, massive pro-\nteinuria, severe diarrhea or high decomposition metabolic \nstate, can cause the plasma colloid osmotic pressure reduced, \nwhich is lead to another important reason for edema. \nCapillary endothelial damage can also lead to increased cap-\nillary permeability, such as bacterial, physical and chemical \nfactors, allergic reactions or immune damage, etc. Lymphatic \nobstruction can cause lymphedema, such as filariasis.

### Load the ground truth dataset

In [2]:
# Ground-truth evaluation set: one row per question.
data = pd.read_csv('questions.csv')

# Convert to a list of per-row dicts so records can be indexed and iterated.
ground_truth = data.to_dict('records')

In [11]:
# Spot-check a single ground-truth record.
ground_truth[60]

{'question': 'Mediastinal issues or related cardiovascular examination',
 'case_prompt': "A 65-year-old male presents to the clinic with complaints of chest pain and a 'crunching' sound when he breathes, described as intermittent over the past week.",
 'document': '70d62f9f'}

In [16]:
# Map each document id to its full record so a ground-truth 'document' id
# can be resolved to the text it refers to.
doc_idx = {doc['id']: doc for doc in documents}
# Show the text behind the id referenced by the ground-truth record above.
doc_idx['70d62f9f']['page_content']

'mediastinal crunch, 103\nmediastinum, 149, 174\nmedical history, in pregnancy, 734\nmedically unexplained symptoms (MUS), 26, \n27b, 847t–850t\nmedication, in neurological history, 497–498, \n498b\nmedulla, anatomy of, 532f\nmedullary brainstem hypercapnia test, 876\nmelaena, 224–225, 268\nmelanoma, metastatic, 821f\nmembranes, rupture of, 732\nMénière’s disease, 495\nmeningocele, 699f\nmeningococcal rash, 871f\nmeningomyelocele, 698t\nmeniscal tear, 434f\nmeniscus sign, 208f\nmenorrhagia, 744\nMENs syndromes see multiple endocrine \nneoplasias (MENs) syndromes\nmenstrual history, 15, 303, 744–750\nmenstruation, 441\nmental health\nof elderly patients, 32–33\ngastrointestinal system and, 234–235\nneurological history and, 497\nrisk, 851\nsystems review of, 9b–12b\nmental health examination, 845–863, \n855t–857t, 863b\nfamily history in, 853\nsocial history in, 853–854\nmeralgia paraesthetica, 425, 575\nmetabolic syndrome, 442, 442t\nmetacarpophalangeal (MCP) joints, 375–379, \n379f'

### Retrieval implementation with reranking using Pinecone

In [26]:
# Create the Pinecone client from the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Handle to the index named "final"; the retrieval helpers below query it.
index = pc.Index("final")

In [27]:
def get_query_embeddings(query: str) -> list[float]:
    """Embed a query string with the module-level ``EMBEDDINGS`` model.

    Args:
        query: The text to embed.

    Returns:
        The result of ``EMBEDDINGS.embed_query(query)``, annotated as a
        list of floats.
    """
    return EMBEDDINGS.embed_query(query)

In [28]:
def query_pinecone_index(
    query_embeddings: list[float], top_k: int = 20, include_metadata: bool = True
) -> dict[str, Any]:
    """Query the module-level Pinecone ``index`` with an embedding vector.

    Args:
        query_embeddings: Query vector, passed to ``index.query`` as ``vector``.
        top_k: Number of matches to request. Defaults to 20.
        include_metadata: Whether to request each match's metadata.
            Defaults to True.

    Returns:
        The response from ``index.query``, returned unchanged. It is
        annotated as a string-keyed mapping; callers read its ``"matches"``
        entry.
    """
    return index.query(
        vector=query_embeddings, top_k=top_k, include_metadata=include_metadata
    )

In [29]:
# Cross-encoder model used to score (query, passage) pairs when reranking
# the retrieved matches.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [30]:
def rerank_documents(query, retrieved_docs):
    """Re-order retrieved matches by the reranker's score for ``query``.

    Args:
        query: The search query text.
        retrieved_docs: A query response exposing a ``"matches"`` sequence.
            Each match is expected to hold its passage text under
            ``match["metadata"]["page_content"]``.

    Returns:
        The matches sorted from highest to lowest reranker score. Matches
        with no ``page_content`` are skipped with a warning. An empty list
        is returned when no match is left to score.
    """
    pairs = []
    doc_list = []

    for doc in retrieved_docs["matches"]:
        text = doc["metadata"].get("page_content")
        if text is None:
            # Skip matches with no stored text: there is nothing to pair
            # with the query, and a (query, None) pair must not reach the
            # reranker.
            print(
                f"Warning: Missing 'page_content' in metadata for document ID "
                f"{doc['metadata'].get('id', 'Unknown')}"
            )
            continue

        pairs.append((query, text))
        doc_list.append(doc)

    if not pairs:
        print("No valid documents found for reranking.")
        return []

    scores = reranker.predict(pairs)

    # Sort on the score alone so the match objects are never compared, and
    # tied matches keep their original retrieval order (the sort is stable).
    ranked = sorted(zip(scores, doc_list), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked]

### The RAG Flow