In [1]:
from dotenv import load_dotenv
import os
import warnings
warnings.filterwarnings("ignore")

from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI , GoogleGenerativeAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone , ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from tqdm import tqdm
import time


In [2]:
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

#### Load LLM Gemini

In [3]:
def load_model(name_model):
    
    llm = ChatGoogleGenerativeAI(
        model=name_model,
        temperature=0.01,
        google_api_key=GOOGLE_API_KEY
    )
    return llm

model = load_model("gemini-1.5-flash")

In [4]:
model.invoke("Welcome Gemini !")

AIMessage(content='Thank you!  Is there anything I can help you with today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []}, id='run--42fe74e2-4169-4d89-898f-8f7b7022e901-0', usage_metadata={'input_tokens': 3, 'output_tokens': 15, 'total_tokens': 18, 'input_token_details': {'cache_read': 0}})

#### Load Data

In [5]:
def data_extraction(path_data) :

    if not os.path.exists(path=path_data) :
        raise FileNotFoundError(f"The directory {path_data} does not exist.")
    
    loader = DirectoryLoader(path=path_data , 
                             glob="*.pdf" ,
                               loader_cls=PyPDFLoader ,
                                show_progress=True )
    docs = loader.load()
    print(f"Number of documents loaded: {len(docs)}")
    if not docs :
        print(f"No PDF files found in {path_data}")

    return docs


extracted_data = data_extraction("../DataSets/")

100%|██████████| 1/1 [10:35<00:00, 635.51s/it]

Number of documents loaded: 4505





#### Text Chunking 

In [6]:
def text_split(extracted_data) :
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000 , chunk_overlap = 200)
    chunks = text_splitter.split_documents(extracted_data)
    return chunks

text_chunks = text_split(extracted_data)

In [7]:
print(f"Number of chunks: {len(text_chunks)}")

Number of chunks: 12018


#### Embedding model

In [8]:
def embedding() :
    embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001" ,
                                              google_api_key=GOOGLE_API_KEY)
    
    return embedding

embedding = embedding()


In [9]:
# Test for embedding model
emb_qu = embedding.embed_query("Sayed Ali")
print(emb_qu)
print(f"Length {len(emb_qu)}")

[0.033262744545936584, -0.04583624377846718, -0.007227430120110512, 0.012480302713811398, 0.055423010140657425, 0.02036462351679802, 0.014924985356628895, -0.004126895219087601, 0.031131969764828682, -0.009737880900502205, 0.01622362621128559, 0.014348899014294147, -0.012359149754047394, 0.0032185816671699286, 0.014089487493038177, -0.0028034152928739786, 0.023563353344798088, 0.021167682483792305, 0.008936342783272266, -0.012969622388482094, 0.05718334764242172, -0.0037716804072260857, -0.03718428686261177, -0.03831556811928749, -0.03446822240948677, 0.0007000736659392715, 0.051768798381090164, -0.07508740574121475, -0.043272387236356735, 0.044699061661958694, -0.0335586741566658, 0.06126679852604866, -0.0791306346654892, 0.020705237984657288, -0.014834999106824398, -0.03327192738652229, 0.03088236227631569, 0.028418999165296555, 0.0010401898762211204, 0.03421711549162865, 0.010614023543894291, -0.018871579319238663, -0.009773609228432178, -0.00730155361816287, 0.02814314141869545, -0

In [None]:
index_name = "medicalbot"

pc = Pinecone(api_key=PINECONE_API_KEY)

existing_indexes = pc.list_indexes().names() 

if index_name in existing_indexes:
    print(f"Index '{index_name}' already exists. Using the existing index.")
else:
    print(f"Creating new index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"Index '{index_name}' created successfully.")

index = pc.Index(index_name)

Index 'medicalbot' already exists. Using the existing index.


In [None]:
def initialize_pinecone_vector_store(documents , index_name , embedding , retries = 3 , delay = 5) :

    for i in range(retries) :
        try :
            docsearch = PineconeVectorStore.from_documents([] ,
                                                            index_name=index_name , 
                                                            embedding=embedding)
            
            for j in tqdm(range(0 , len(documents)) , desc="Uploading documents" , unit="doc") :
                docsearch.add_documents([documents[i]])

            return docsearch

        except Exception as e:
            print(f"\n Attempt {i + 1} failed: {e}")
            if i < retries - 1:
                time.sleep(delay)
            else:
                raise e
            

docsearch = initialize_pinecone_vector_store(
    documents=text_chunks,
    index_name=index_name,
    embedding=embedding
)


Uploading documents:  45%|████▍     | 5364/12018 [1:41:26<5:13:33,  2.83s/doc] 

In [None]:
# Load the existing index
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name,
                                                    embedding=embedding)

docsearch