In [15]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from dotenv import load_dotenv
import os
from pinecone import ServerlessSpec
import pandas as pd
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

In [2]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [3]:
extracted_data = load_pdf("../data/")

In [4]:
# extracted_data

In [5]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [6]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [7]:
type(text_chunks)

list

In [8]:
print(len(text_chunks[8].page_content))

487


In [9]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
# query_result

In [24]:
# Load environment variables from the .env file
load_dotenv()

PINECONE_API = os.getenv("PINECONE_API_KEY")

In [25]:
#Initializing the Pinecone
pc = Pinecone(api_key=PINECONE_API)

In [26]:
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [27]:
index_name = 'medical-chatbot'

In [33]:
# Create index or create a new index manually using pinecone website
# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384, # according to embedding model
        metric='cosine',
        spec=spec
    )

In [34]:
# connect to index
index = pc.Index(index_name)

In [35]:
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [36]:
#Creating Embeddings for Each of The Text Chunks & storing in pinecone DB , see in pinecone website
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

vectorstore.add_texts([t.page_content for t in text_chunks])

['ed75dfd6-5863-4988-ade5-fb5a07d7e849',
 '4e8f291f-4505-43e6-b5a5-6d11474fd513',
 'e9957216-2cd0-4973-814c-5a86499e14a4',
 '0bbb49af-dd2a-489a-8031-612cb67c8ae7',
 '75014dd0-0e39-4ca8-8b9d-7f722d3fdd8a',
 '9e152295-9da0-4aa0-821d-e137b01eaa05',
 '2eb225dc-0f58-432c-bfe4-76c30f14d2c6',
 'afff71b4-b545-402c-b5ad-809270279d9e',
 '09a06345-54a4-4417-970a-3f8a0962fb6c',
 '9c738e81-3684-4ef8-b018-cabea7214143',
 'a0ed1102-0e71-49d9-9035-b369a1eb4d2c',
 '6891ec53-f22e-44ce-8048-42b12e0c0967',
 '6f24a9ed-cc85-4af7-8a87-5c3d3c143870',
 'ea671fc6-9e18-48cb-a721-42cd48d18f3e',
 'b1be86a6-7da6-42a9-9386-ed75c76836b8',
 '473328e7-6837-4af5-9204-fdd0e59f786a',
 '963f6c2d-edd9-47ed-9d8f-8807347eef75',
 '0aea9680-fa61-4ace-80b1-444f7798d82c',
 '685710e6-7b69-457d-8d5e-7463a180ff49',
 'd2c1ef6a-b8fb-4cf2-94b0-854825de3c63',
 'a1982046-735e-412a-b3f1-3e75534ff873',
 'd38a9935-877f-478a-a9f9-f7b355d7aa07',
 'edd5a43c-1ae3-4a64-9c19-db51eb493560',
 '6af135fb-75ca-4acc-a063-a3048e842e06',
 '0139fd32-0ccb-

In [37]:
query = "What is depression"

In [38]:
# Perform a similarity search in the vector store using the provided query
# Retrieve the top 'k' most similar results
context = vectorstore.similarity_search(query,k=5)

In [39]:
print(context[0].page_content)

Definition
Antidepressant drugs are medicines that relieve
symptoms of depressive disorders .
Purpose
Depressive disorders may be either unipolar (depres-
sion alone) or bipolar (depression alternating with peri-ods of extreme excitation). The formal diagnosis requiresa cluster of symptoms, lasting at least two weeks. Thesesymptoms include, but are not limited to mood changes,insomnia or hypersomnia, and diminished interest in


In [65]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [66]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [67]:
# Load the llm 
llm=CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':200,
                          'temperature':0.8})

In [68]:
qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff",
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True)

In [69]:
response = qa_chain(query)  

In [70]:
response

{'query': 'What is depression',
 'result': "Depression is a mental condition in which a person feels extremely sad and loses interest in life. People with depression may also have sleep problems, loss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss ofloss of anore loss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of which can' loss of appet aloss of appet aloss of appet aloss of appet aloss of trouble concentrating, and loss of appet aloss of a loss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of their loss of appet aloss of appet aloss of appet aloss of which are commonly have loss of appet aloss of appet al",
 'source_documents': [Document(page_content='Definition\nAntidepressant drugs are medicines that relieve\nsymptoms of depressive disorders .\nPurpose\nDepressive disorders may be either unipolar (depres-\nsion alone) or bipolar (depression alternati

In [71]:
print(response["result"])

Depression is a mental condition in which a person feels extremely sad and loses interest in life. People with depression may also have sleep problems, loss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss ofloss of anore loss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of which can' loss of appet aloss of appet aloss of appet aloss of appet aloss of trouble concentrating, and loss of appet aloss of a loss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of appet aloss of their loss of appet aloss of appet aloss of appet aloss of which are commonly have loss of appet aloss of appet al


In [73]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa_chain({"query": user_input})
    print("Response : ", result["result"])

KeyboardInterrupt: 

In [51]:
# Another way to store content in index and do similarity search out of it
# # Step 1: Generate unique IDs
# ids = [f"doc{i+1}" for i in range(len(text_chunks))]

# # Step 2: Get embeddings for each text chunk
# values=[embeddings.embed_query(chunk.page_content) for chunk in text_chunks]

# metadata = [{'text': chunk.page_content} for chunk in text_chunks]

# # Step 4: Create the dataframe
# data = {'id': ids, 'values': values, 'metadata': metadata}
# df = pd.DataFrame(data)

# df.head(5)


# # Insert data into Pinecone index
# # to_upsert = [{"id" :row['id'], "values":row['values'], "metadata":row['metadata']} for _, row in df.iterrows()]
# # index.upsert(vectors=to_upsert)
# # since there is limit of 1000 vector can be put in index and we have text chunks 7060 so we made batch of it

# batch_size = 10
# to_upsert = []

# for i, row in df.iterrows():
#     # Prepare the vector dictionary
#     vector = {"id": row['id'], "values": row['values'], "metadata": row['metadata']}
    
#     # Add the vector to the to_upsert list
#     to_upsert.append(vector)
    
#     # If the batch is full, perform the upsert
#     if len(to_upsert) == batch_size:
    
#         index.upsert(vectors=to_upsert)
        
#         to_upsert = []  # Reset the batch list

# # Upsert any remaining vectors that didn't fill a full batch
# if to_upsert:
#     index.upsert(vectors=to_upsert)


# query = "What is depression"
# query_vector = embeddings.embed_query(query)
# # print(query_vector)


# # To do similarity search 
# similar =index.query(
#     vector=query_vector,
#     top_k=5,
#     include_values=True,
#     include_metadata=True
# )

# print(similar.matches[0].metadata["text"])