In [20]:
import json
import os
from typing import Any

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, OpenAI
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Load environment variables via python-dotenv so the API keys read from
# os.environ in the configuration cell are available.
load_dotenv()

True

In [15]:
# API credentials come from the environment; nothing is hard-coded here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Shared client used to embed queries sent to Pinecone. The model is pinned
# explicitly (rather than relying on the library default) because query
# vectors are only comparable with document vectors produced by the same
# model, and the documents are embedded with "text-embedding-ada-002".
# os.environ[...] is deliberate: a missing key raises KeyError right here.
EMBEDDINGS = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=os.environ["OPENAI_API_KEY"],
)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [3]:
# Load the document records (dicts with "id", "page_content" and "metadata"
# keys, as read by the cells below). The encoding is stated explicitly so the
# non-ASCII characters in the text decode identically on every platform
# instead of depending on the locale's default encoding.
with open('documents_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [None]:
def _sentence_transformer_embeddings(st_model):
    """Adapt a SentenceTransformer model to LangChain's ``Embeddings`` interface."""
    # Imported locally so this helper is self-contained within the cell.
    from langchain_core.embeddings import Embeddings

    class _SentenceTransformerEmbeddings(Embeddings):
        def embed_documents(self, texts):
            return st_model.encode(list(texts)).tolist()

        def embed_query(self, text):
            return st_model.encode(text).tolist()

    return _SentenceTransformerEmbeddings()


def create_chunk_embedding(documents, output_dir="../embeddings"):
    """Build and save two FAISS indexes over ``documents``.

    One index uses OpenAI ``text-embedding-ada-002`` vectors, the other uses
    SentenceTransformers ``all-mpnet-base-v2`` vectors.

    Args:
        documents: Non-empty list of dicts with "id", "page_content" and
            "metadata" keys.
        output_dir: Directory prefix under which both indexes are saved.

    Returns:
        Dict with the saved index paths under the keys "openai_index" and
        "sentence_transformer_index".

    Raises:
        ValueError: If ``documents`` is empty.
    """
    if not documents:
        raise ValueError("documents must contain at least one record")

    processed_docs = [
        Document(page_content=doc["page_content"], metadata=doc["metadata"])
        for doc in documents
    ]

    # OpenAI Embeddings
    openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    faiss_index_openai = FAISS.from_documents(processed_docs, openai_embeddings)
    openai_index_path = f"{output_dir}/faiss_index_openai"
    faiss_index_openai.save_local(openai_index_path)

    # SentenceTransformers Embeddings
    st_model = SentenceTransformer("all-mpnet-base-v2")
    texts = [doc["page_content"] for doc in documents]
    # A single batched encode call replaces one model call per document.
    st_embeddings = np.asarray(st_model.encode(texts))
    # The document id is stored in the metadata, which is where the retrieval
    # evaluation looks it up on each returned document.
    metadatas = [{**doc["metadata"], "id": doc["id"]} for doc in documents]

    # from_embeddings expects (text, vector) pairs plus an Embeddings object.
    # Pairing the vectors with the ids instead would store the id as the
    # document text and leave the metadata empty.
    faiss_index_st = FAISS.from_embeddings(
        list(zip(texts, st_embeddings)),
        _sentence_transformer_embeddings(st_model),
        metadatas=metadatas,
    )
    st_index_path = f"{output_dir}/faiss_index_st"
    faiss_index_st.save_local(st_index_path)

    return {"openai_index": openai_index_path, "sentence_transformer_index": st_index_path}


In [None]:
# Build both FAISS indexes and keep the dict of paths they were saved to.
# NOTE(review): this embeds every document with both models, so it is slow and
# calls the OpenAI API; consider skipping it when the indexes already exist.
index_path_one = create_chunk_embedding(documents)

### Evaluate FAISS retrieval

In [70]:
def load_faiss_index(index_path, embeddings=None):
    """Load a FAISS vector store previously written with ``save_local``.

    Args:
        index_path: Directory the index was saved to.
        embeddings: Embeddings object used to embed queries against the
            loaded index. It must be the model the index was built with.
            Defaults to OpenAI ``text-embedding-ada-002``.

    Returns:
        The loaded LangChain ``FAISS`` vector store.
    """
    if embeddings is None:
        embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    # SECURITY: loading requires opting in to pickle deserialization, which
    # can execute arbitrary code. Only load index directories that this
    # project produced itself.
    return FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )

In [71]:
def query_faiss_index(faiss_index, query, k=5, text_key="question"):
    """Run a similarity search against a FAISS vector store.

    Args:
        faiss_index: Object exposing ``similarity_search(text, k=...)``.
        query: Either the query text itself, or a record (dict) holding the
            query text under ``text_key``.
        k: Number of results to request.
        text_key: Key of the query text when ``query`` is a record.

    Returns:
        Whatever ``faiss_index.similarity_search`` returns for the query.
    """
    query_text = query if isinstance(query, str) else query[text_key]
    return faiss_index.similarity_search(query_text, k=k)

In [73]:
# Load the OpenAI-embedded index from the path create_chunk_embedding() saves
# it to. The previous path, "../embeddings/faiss_index", is never written by
# this notebook, so it only worked when a leftover directory existed on disk.
first_index = load_faiss_index("../embeddings/faiss_index_openai")

In [74]:
# Ground-truth queries used by the retrieval evaluation below.
df_ground_truth = pd.read_csv('questions.csv')

In [75]:
# One dict per CSV row; the evaluation loops iterate over these records.
ground_truth = df_ground_truth.to_dict(orient='records')

In [76]:
# Inspect one record: it carries the question, a case prompt, and the id of
# the document the retriever is expected to return.
ground_truth[0]

{'question': 'What are the possible causes and examination findings for a patient with bradycardia?',
 'case_prompt': 'A 71-year-old man presents to the clinic with complaints of dizziness and fatigue for the past week. He reports that these symptoms have worsened over the last few days, and he denies any chest pain or palpitations.',
 'document': '3d02524a'}

### Retrieval metrics: hit rate and MRR

In [148]:
def hit_rate(relevance_total):
    """Fraction of queries with at least one relevant result.

    Args:
        relevance_total: One list of booleans per query, where each flag says
            whether the result at that rank is the expected document.

    Returns:
        A float in [0, 1]; 0.0 when there are no queries (instead of raising
        ZeroDivisionError).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total

In [149]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query, where each flag says
            whether the result at that rank is the expected document.

    Returns:
        A float in [0, 1]; 0.0 when there are no queries (instead of raising
        ZeroDivisionError).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant rank counts. Without this break,
                # several relevant results for one query would be summed and
                # the score could exceed 1.
                break

    return total_score / total

In [150]:
def evaluate(ground_truth, faiss_index=None, k=5):
    """Score FAISS retrieval against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each one provides the expected
            document id under 'document' and the query text read by
            ``query_faiss_index``.
        faiss_index: Vector store to query. Defaults to the notebook-level
            ``first_index``.
        k: Number of results retrieved per query.

    Returns:
        Dict with the 'hit_rate' and 'mrr' of the retrieved results.
    """
    if faiss_index is None:
        faiss_index = first_index

    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = query_faiss_index(faiss_index, q, k=k)
        # .get() keeps one result without an "id" in its metadata from
        # aborting the whole evaluation; it simply counts as a miss.
        relevance_total.append([d.metadata.get("id") == doc_id for d in results])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [83]:
# Score the FAISS index on the ground-truth set (top 5 results per query).
evaluate(ground_truth)

100%|██████████| 250/250 [01:57<00:00,  2.13it/s]


{'hit_rate': 0.288, 'mrr': 0.18553333333333333}

## Evaluate Pinecone Retrieval

In [None]:
# Embedding client used to embed the documents that are uploaded to Pinecone.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

def generate_embeddings(documents):
    """Embed the ``page_content`` of every document with ``embeddings_model``.

    Returns the list produced by ``embed_documents`` for those texts.
    """
    return embeddings_model.embed_documents(
        [record["page_content"] for record in documents]
    )

# Embed every document once up front; the vectors are reused when the records
# for Pinecone are assembled.
chunked_document_embeddings = generate_embeddings(documents)

In [92]:
# Sanity check: this count is expected to equal len(documents).
print(f"Generated {len(chunked_document_embeddings)} embeddings.")

Generated 5406 embeddings.


In [115]:
def combine_vector_and_text(documents: list[dict], doc_embeddings: list[list[float]]) -> list[dict]:
    """Pair each document with its embedding as an upsert-ready record.

    Args:
        documents: Document dicts; "id" and "page_content" are read with
            fallbacks of "unknown_id" and "" respectively.
        doc_embeddings: One embedding per document, in the same order.

    Returns:
        A list of dicts with "id", "values" (the embedding) and "metadata"
        (the page content and the id again).

    Raises:
        ValueError: If the two inputs differ in length. zip() would otherwise
            silently drop the unmatched tail.
    """
    if len(documents) != len(doc_embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(doc_embeddings)} embeddings; "
            "they must be paired one-to-one."
        )

    data_with_metadata = []

    for doc, embedding in zip(documents, doc_embeddings):
        # Computed once so the record id and the metadata id cannot diverge.
        doc_id = str(doc.get("id", "unknown_id"))
        data_with_metadata.append(
            {
                "id": doc_id,
                "values": embedding,
                "metadata": {"page_content": doc.get("page_content", ""), "id": doc_id},
            }
        )

    return data_with_metadata


In [116]:
# Records ready for upsert: each one pairs a document with its embedding.
data = combine_vector_and_text(
    documents=documents,
    doc_embeddings=chunked_document_embeddings,
)

In [101]:
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet. create_index() fails for
# a name that is already taken, which would break re-running this cell.
# dimension=1536 matches the vectors produced by text-embedding-ada-002.
# The result is intentionally not bound to `index`: the handle used for
# reads and writes is obtained separately with pc.Index("final").
if "final" not in pc.list_indexes().names():
    pc.create_index(
        name="final",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [102]:
# Handle to the "final" index; the upsert and query helpers below use this
# module-level name.
index = pc.Index("final")

In [117]:
def upsert_data_to_pinecone(
    data_with_metadata: list[dict[str, Any]],
    chunk_size: int = 100,
    pinecone_index=None,
) -> None:
    """Upsert records to a Pinecone index in fixed-size batches.

    Args:
        data_with_metadata: Records to upload, in the order they are sent.
        chunk_size: Maximum number of records per upsert call.
        pinecone_index: Object exposing ``upsert(vectors=...)``. Defaults to
            the notebook-level ``index``.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative value would
            otherwise upload nothing without any error.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    target = index if pinecone_index is None else pinecone_index

    for start in range(0, len(data_with_metadata), chunk_size):
        target.upsert(vectors=data_with_metadata[start:start + chunk_size])


# Push every prepared record to the Pinecone index using the default batch size.
upsert_data_to_pinecone(data_with_metadata=data)

In [118]:
def get_query_embeddings(query: str) -> list[float]:
    """Embed a query string with the shared ``EMBEDDINGS`` client."""
    return EMBEDDINGS.embed_query(query)

In [119]:
# Embed a sample question to smoke-test the Pinecone query path.
query_embeddings = get_query_embeddings(query="How do i take the history for Breathlessness?")

In [120]:
def query_pinecone_index(
    query_embeddings: list,
    top_k: int = 4,
    include_metadata: bool = True,
    pinecone_index=None,
) -> dict[str, Any]:
    """Query a Pinecone index with an already-embedded query.

    Args:
        query_embeddings: Query vector.
        top_k: Number of matches to request.
        include_metadata: Whether the stored metadata is returned per match.
        pinecone_index: Object exposing ``query(vector=..., top_k=...,
            include_metadata=...)``. Defaults to the notebook-level ``index``.

    Returns:
        The response returned by the index's ``query`` call.
    """
    target = index if pinecone_index is None else pinecone_index
    return target.query(
        vector=query_embeddings,
        top_k=top_k,
        include_metadata=include_metadata,
    )

In [121]:
# Retrieve the closest matches for the sample query (default top_k=4) and
# display the raw response.
answers = query_pinecone_index(query_embeddings=query_embeddings)

answers

{'matches': [{'id': '894e1647',
              'metadata': {'id': '894e1647',
                           'page_content': 'CHAPTER 9   THE RESPIRATORy '
                                           'HISTORy 173\n'
                                           'References\n'
                                           '1. Schmitt BP , Kushner MS, Wiener '
                                           'SL. The diagnostic usefulness of '
                                           'the history \n'
                                           'of the patient with dyspnea. J Gen '
                                           'Intern Med 1986; 1:386–393. '
                                           'History alone \n'
                                           'was correct three out of four '
                                           'times when deciding the cause of '
                                           'dyspnoea \n'
                                           'in defined circumstances.\n'
          

In [None]:
from tqdm import tqdm

def evaluate_pinecone(ground_truth, query_field="case_prompt", top_k=5):
    """Score Pinecone retrieval against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each one provides the expected
            document id under 'document' and the query text under
            ``query_field``.
        query_field: Record key holding the text to embed and search with.
        top_k: Number of matches retrieved per query.

    Returns:
        Dict with the 'hit_rate' and 'mrr' of the retrieved matches.
    """
    # NOTE(review): the FAISS evaluation queries with the "question" field,
    # while this default queries with "case_prompt". The two scores are only
    # comparable when both use the same field (pass query_field="question").
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        query_vector = get_query_embeddings(q[query_field])
        results = query_pinecone_index(query_vector, top_k=top_k)

        relevance_total.append(
            [match["metadata"]["id"] == doc_id for match in results["matches"]]
        )

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [154]:
# Score Pinecone retrieval on the same ground-truth set.
# NOTE(review): this run queries with 'case_prompt' while the FAISS run queries
# with 'question', so the two reported scores are not directly comparable.
evaluate_pinecone(ground_truth)

100%|██████████| 250/250 [03:58<00:00,  1.05it/s]


{'hit_rate': 0.056, 'mrr': 0.0246}