In [14]:
## lets take the embedding model and vectore store and try to look in deep how we are going to convert the data from raw to vectors

import os 
from huggingface_hub import login
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [15]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [16]:
# lets see how many dimension embeddings does this model gives 
embd = model.embed_query("Hey How are you?")
print(f"Embedding dimension : {len(embd)}")

Embedding dimension : 768


In [53]:
import faiss 
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

index = faiss.IndexFlatL2(768)

vectorstore = FAISS(
    embedding_function=model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

#### **Load the Data**

In [25]:
# as data we used medical book 
from langchain_community.document_loaders import PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader('/Users/umesh/Desktop/Insurance-RAG/data', glob="*.pdf")

docs = loader.load()


In [26]:
len(docs)

759

In [28]:
docs[10]

Document(metadata={'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '/Users/umesh/Desktop/Insurance-RAG/data/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'total_pages': 759, 'page': 10, 'page_label': '11'}, page_content='Ann M. Haren\nScience Writer\nMadison, CT\nJudy C. Hawkins, M.S.\nGenetic Counselor\nThe University of Texas Medical\nBranch\nGalveston, TX\nCaroline Helwick\nMedical Writer\nNew Orleans, LA\nDavid Helwig\nMedical Writer\nLondon, Ontario\nLisette Hilton\nMedical Writer\nBoca Raton, FL\nKatherine S. Hunt, M.S.\nGenetic Counselor\nUniversity of New Mexico Health\nSciences Center\nAlbuquerque, NM\nKevin Hwang, M.D.\nMedical Writer\nMorristown, NJ\nHolly Ann Ishmael, M.S., C.G.C.\nGenetic Counselor\nThe Children’s Mercy Hospital\nKansas City, MO\nDawn A. Jacob, M.S.\nGenetic Counselor\nObstetrix Medical Group of Texas\

In [47]:
## using text splitters 
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(docs)

print(f"Total chunks we got : {len(texts)}")

Total chunks we got : 7080


In [48]:
print(type(texts[0]))

<class 'langchain_core.documents.base.Document'>


In [52]:
print(texts[200])

page_content='Chemotherapy usually causes destruction of normal
cells, and cancer cells can become immune to chemical
destruction. Side effects and patient tolerance issues are
typically anticipated and dosages may have to be specifi-
cally altered. Very few chemotherapeutic agents offer
curative responses.
Biological therapies may cause patient toxicity
resulting in extensive side effects. This can occur since
the optimal dose may be exceedingly elevated above
patient tolerance.
Description
Surgery' metadata={'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '/Users/umesh/Desktop/Insurance-RAG/data/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'total_pages': 759, 'page': 27, 'page_label': '28'}


#### **Lets add some of these documents to vectorstore**

In [44]:
from uuid import uuid4
import pdb 

uuids = [str(uuid4()) for _ in range(len(texts))]
vectorstore.add_documents(documents=texts, ids=uuids)

['dad45dc9-a89a-4b6a-8fdf-e2354a5d3249',
 'bdbfe316-53c6-49dc-9032-ff547cb2132b',
 'c8fea537-4651-49c6-b7b2-2612f2295d93',
 '870d8ceb-5202-49a9-b79d-40e0f161769f',
 'ad50cbd5-d9c2-4135-b16f-912ec116805b',
 'd0ce20dc-6451-4ea4-9bbd-433aaf1c0211',
 'c4d9451f-fe1b-465f-a4a5-4d4f86c1fd61',
 '30786990-e93c-43ac-b68c-6d1df36bd974',
 '5e38b925-26b6-47f5-b457-f8221bad25ce',
 'fe598774-264f-47d8-8f74-87c3c3c0f2db',
 'ffd69ca5-abae-4efe-8b1b-7de423a4bdac',
 'd0b9172f-d7a6-4887-a831-ba1538e48127',
 '8c173d08-8897-4992-9062-cbd8ccb6a24a',
 '70a85d21-2660-405a-86b6-f5d4e8e2aff9',
 '12fafb3a-c913-4122-aa27-7333c13a0c03',
 '40b98a5a-4a00-44d6-9baa-f9fbb016c0f2',
 'c88ffe92-935f-4820-b3af-8e67f86bf294',
 '957f8393-34e1-4959-8314-5ed1eb50cb0d',
 '553399b9-7f1b-42b0-9ec5-5ae422272365',
 '2140689e-1626-42d1-980a-02e22f792ecf',
 '767c2187-630e-499c-ace2-8dfe8dd3d2ca',
 '874f0d2a-0d79-4ae1-8e63-c874b3a15c57',
 'b8e63bc4-e717-4d69-9b41-a3a5e7a52f53',
 'beedacdd-efca-4dbd-a1a2-1c7dd5c234ce',
 'ff32e3c7-0c62-

In [46]:
# making a similarity search to find relevant documents
results = vectorstore.similarity_search(
    "Cancer treatment?",
    k=2,
    # filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Cancer treatment can take many different forms, and [{'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '/Users/umesh/Desktop/Insurance-RAG/data/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'total_pages': 759, 'page': 23, 'page_label': '24'}]
* patient’s response to cancer. These treatments are mostly [{'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': '/Users/umesh/Desktop/Insurance-RAG/data/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'total_pages': 759, 'page': 28, 'page_label': '29'}]
