In [1]:
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


In [2]:
# Filesystem locations used by the rest of the notebook.
# Each one can be overridden with an environment variable so the notebook can
# run on another machine without editing this cell; when the variable is unset
# (or empty) the original local path is used unchanged.

# Directory that is scanned for the source PDF files.
DATA_PATH = os.environ.get('MEDICAL_BOT_DATA_PATH') or (
    '/media/arindam-shukla/Linux Storage/medical_chatbot/medical_bot/data/'
)

# Directory the FAISS vector store is written to.
DB_FAISS_PATH = os.environ.get('MEDICAL_BOT_DB_FAISS_PATH') or (
    '/media/arindam-shukla/Linux Storage/medical_chatbot/medical_bot/vectorstores/db_faiss'
)

In [3]:
# Read every file in DATA_PATH that matches '*.pdf', parsing each with PyPDFLoader.
loader = DirectoryLoader(DATA_PATH, glob='*.pdf', loader_cls=PyPDFLoader)
documents = loader.load()

# Fail fast with a clear message: an empty load would otherwise go unnoticed
# here and only surface later as an IndexError when the first chunk is read.
if not documents:
    raise ValueError(
        f"No PDF content could be loaded from {DATA_PATH!r}; "
        "check the path and that it contains .pdf files."
    )

# NOTE(review): this counts Document objects, which PyPDFLoader presumably
# emits per page rather than per file — confirm before reading it as a file count.
print(f" Loaded {len(documents)} documents from {DATA_PATH}")

 Loaded 637 documents from /media/arindam-shukla/Linux Storage/medical_chatbot/medical_bot/data/


In [4]:
# Split the loaded documents into overlapping chunks ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

# Report how many chunks were produced, then preview the start of the first one.
chunk_total = len(texts)
print(f" Created {chunk_total} text chunks.")
first_chunk_preview = texts[0].page_content[:200]  # cap the preview at 200 characters
print("First chunk:", first_chunk_preview)

 Created 7151 text chunks.
First chunk: TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION


In [5]:
# Create the embedding model, pinned to the CPU device.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Sanity check: embed one throwaway sentence and inspect the resulting vector.
sample_text = "this is a text sentence."
vector = embeddings.embed_query(sample_text)

vector_size = len(vector)
leading_values = vector[:5]
print(f" embedding vector size: {vector_size}")
print(f" first 5 values: {leading_values}")

  from .autonotebook import tqdm as notebook_tqdm


 embedding vector size: 384
 first 5 values: [0.06502410769462585, 0.11340221762657166, 0.021272560581564903, 0.04130653291940689, 0.022471055388450623]


In [6]:
# Embed every chunk and build the FAISS vector store from them.
db = FAISS.from_documents(texts, embeddings)

# Smoke-test retrieval: ask for the 3 chunks most similar to a sample query.
query = " heart disease symptoms"
results = db.similarity_search(query, k=3)

hit_count = len(results)
print(f" found {hit_count} similar documents. ")
top_hit_preview = results[0].page_content[:200]  # cap the preview at 200 characters
print(' first results: ', top_hit_preview)

 found 3 similar documents. 
 first results:  symptoms through their twenties, but by age 40, mostpeople with this condition have symptoms that caninclude shortness of breath , rapid abnormal beating of
the atria (atrial fibrillation), and eventu


In [7]:
# Persist the vector store to DB_FAISS_PATH so it can be reused without rebuilding.
db.save_local(folder_path=DB_FAISS_PATH)