In [7]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings


In [3]:
def load_documents_from_directory(data):
    loader = DirectoryLoader(data, 
                            glob = "*.pdf",
                            loader_cls=PyPDFLoader)
    
    documents = loader.load()
    
    return documents

                            

In [4]:
extracted__data_of_file = load_documents_from_directory(data=r"C:\Users\VAIBHAVRAI\OneDrive\Desktop\LangchainProjects\INTIALIZER\MedicalChatbot\data")

In [5]:
def text_spliter(extracted__data_of_file):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 700,
        chunk_overlap = 50,
    )
    
    text_chunks = text_splitter.split_documents(extracted__data_of_file)
    return text_chunks


In [6]:
text_chunks = text_spliter(extracted__data_of_file)
print(f"Length of text chunks: {len(text_chunks)}")

Length of text chunks: 4250


In [8]:
def download_embeddings():
    embedding = HuggingFaceEmbeddings(model_name= 'sentence-transformers/all-MiniLM-L6-v2')
    return embedding

In [None]:
embeddings = download_embeddings()

In [10]:
query = embeddings.embed_query("What is the purpose of the study?")
print(f"Query embedding: {query}")

Query embedding: [0.008776651695370674, 0.15643653273582458, -0.04509524255990982, 0.051768913865089417, 0.009445113129913807, 0.034862786531448364, -0.010551833547651768, 0.02474001608788967, 0.04771703481674194, 0.035446859896183014, 0.0049771894700825214, 0.0061756581999361515, -0.06988959014415741, -0.001901131821796298, -0.0674164742231369, -0.051991336047649384, -0.07194741815328598, 0.0008498877286911011, 0.04470723494887352, -0.02794799394905567, -0.03563689813017845, -0.012978333048522472, 0.0912344753742218, 0.004037539940327406, -0.08054336905479431, 0.04109899327158928, -0.015233662910759449, -0.038993075489997864, 0.04187706112861633, 0.030341822654008865, 0.058921173214912415, 0.10268291085958481, 0.08704260736703873, 0.04398878291249275, -0.058621715754270554, 0.029144512489438057, 0.03288446366786957, 0.06653779000043869, 0.024259064346551895, 0.04004432260990143, -0.047169968485832214, -0.0002665957435965538, 0.0442504957318306, -0.003937646746635437, 0.012668251059949

In [38]:
from dotenv import load_dotenv
load_dotenv()

import os
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [12]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key= PINECONE_API_KEY)

index_name = "medical-chatbot"
pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec = ServerlessSpec(
        cloud="aws",
        region="us-east-1",
        )
)

{
    "name": "medical-chatbot",
    "metric": "cosine",
    "host": "medical-chatbot-etxujuw.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [13]:
from langchain_pinecone import PineconeVectorStore

search = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name = index_name
)



In [14]:
from langchain_pinecone import PineconeVectorStore
document_search = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding= embeddings)

In [15]:
document_search

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2817d253f50>

In [None]:
retriver = document_search.as_retriever(search_type = "similarity",search_kwargs={"k": 3})



In [36]:
retrived_doc = retriver.invoke("What is acne")

In [32]:
retrived_doc

[Document(id='9c81a5aa-34e7-4525-b22c-0445826de3dd', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'C:\\Users\\VAIBHAVRAI\\OneDrive\\Desktop\\LangchainProjects\\INTIALIZER\\MedicalChatbot\\data\\Gale Encyclopedia of Medicine. Vol. 1. 2nd ed.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 224\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 24'),
 Document(id='d6fc3db7-c699-4f98-a686-6b758b4d9b42', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 239.0, 'page_label': '240', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'C:\\Users\\VAIBHAVRAI\\OneDrive\\Desktop\\LangchainProjects\\INTIALIZER\\MedicalChatbot\\data\\Gale Encyclopedia of Medicine. Vol. 1. 2nd ed.pdf', 'total_pages': 637.0}, page_content='acne.\nPurpose\nDifferent type

In [39]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(
    temperature=0.5,
    model="models/gemini-1.5-flash",
    max_tokens=500
)