In [20]:
import os
import json
import numpy as np
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, OpenAI

# Load variables from a local .env file (if one is found) into the process
# environment so the configuration cell can read the API keys via os.environ.
load_dotenv()

True

In [15]:
# API credentials come from the environment (populated by load_dotenv()).
# The OpenAI key is mandatory: indexing it directly raises KeyError right here
# if it is missing, instead of leaving OPENAI_API_KEY set to None.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Shared OpenAI embedding client; reuses the key read above rather than
# looking it up a second time.
EMBEDDINGS = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
# The Pinecone key is optional in this notebook: None when the variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [3]:
# Read the pre-chunked documents. The encoding is pinned to UTF-8 so that
# non-ASCII characters decode the same way regardless of the platform's
# default locale encoding.
with open('documents_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [None]:
def _sentence_transformer_embedder(model):
    """Wrap a SentenceTransformer model in LangChain's ``Embeddings`` interface.

    FAISS needs an embeddings object so that queries issued against the index
    are embedded with the same model that produced the stored vectors.
    """
    # Local import keeps this helper self-contained within the cell.
    from langchain_core.embeddings import Embeddings

    class _SentenceTransformerEmbeddings(Embeddings):
        def embed_documents(self, texts):
            return model.encode(list(texts)).tolist()

        def embed_query(self, text):
            return model.encode(text).tolist()

    return _SentenceTransformerEmbeddings()


def create_chunk_embedding(
    documents,
    openai_index_path="../embeddings/faiss_index_openai",
    st_index_path="../embeddings/faiss_index_st",
):
    """Build and save two FAISS indexes over the same documents.

    One index uses OpenAI ``text-embedding-ada-002`` vectors, the other uses
    SentenceTransformers ``all-mpnet-base-v2`` vectors.

    Args:
        documents: Sequence of dicts, each providing ``"page_content"``,
            ``"metadata"`` and ``"id"`` keys.
        openai_index_path: Directory where the OpenAI-backed index is saved.
        st_index_path: Directory where the SentenceTransformers-backed index
            is saved.

    Returns:
        dict with keys ``"openai_index"`` and ``"sentence_transformer_index"``
        mapping to the two directories the indexes were written to.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    if not documents:
        # An index cannot be built from zero vectors; fail with a clear message.
        raise ValueError("documents must contain at least one entry")

    processed_docs = [
        Document(page_content=doc["page_content"], metadata=doc["metadata"])
        for doc in documents
    ]

    # OpenAI Embeddings
    openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    faiss_index_openai = FAISS.from_documents(processed_docs, openai_embeddings)
    faiss_index_openai.save_local(openai_index_path)

    # SentenceTransformers Embeddings
    st_model = SentenceTransformer("all-mpnet-base-v2")
    texts = [doc["page_content"] for doc in documents]
    # Encode all texts in one call so the model can batch them.
    vectors = st_model.encode(texts).tolist()

    # from_embeddings expects (text, vector) pairs plus an Embeddings object;
    # document ids are supplied through the ``ids`` keyword.
    faiss_index_st = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, vectors)),
        embedding=_sentence_transformer_embedder(st_model),
        metadatas=[doc["metadata"] for doc in documents],
        ids=[doc["id"] for doc in documents],
    )
    faiss_index_st.save_local(st_index_path)

    return {"openai_index": openai_index_path, "sentence_transformer_index": st_index_path}



In [None]:
# Build the FAISS indexes for the loaded documents and keep the returned
# mapping of index names to their on-disk locations.
index_path_one = create_chunk_embedding(documents=documents)

### Evaluate OpenAI embeddings

In [None]:
# Reload the OpenAI-backed FAISS index from disk and run a sample query against it.
openai_index_path = "../embeddings/faiss_index_openai"

# Queries must be embedded with the same model that was used to build the index.
openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
# load_local unpickles the stored docstore, so recent LangChain releases
# refuse to load without an explicit opt-in. NOTE(review): only keep this
# flag for index files you created yourself; never enable it for index
# files obtained from an untrusted source.
faiss_index_openai = FAISS.load_local(
    openai_index_path,
    openai_embeddings,
    allow_dangerous_deserialization=True,
)

query = "How do you diagnose atrial fibrillation?"
results = faiss_index_openai.similarity_search(query, k=5)

# Print the text of each of the top-k retrieved chunks.
for res in results:
    print(res.page_content)
