In [None]:
import chardet 
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone 
import pinecone 
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [4]:
#By setting these environment variables, the script or application using Pinecone can authenticate with the service using the provided API key
# and specify the environment where the Pinecone service is deployed. 
#This allows the script to interact with the Pinecone service for vector indexing and search operations.
PINECONE_API_KEY = "46dbbaa9-27a9-404b-b7ae-9a13538fdbdf"
PINECONE_API_ENV = "gcp-starter"

In [2]:
#Extract data from the PDF 
#this function takes a directory path containing PDF files as input, initializes a DirectoryLoader object to load PDF files using PyPDFLoader,
# loads the documents from the directory, and returns the loaded documents. 
This function is likely part of a larger data processing pipeline for working with PDF data.
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [None]:
#alling the load_pdf function with a directory path 
extracted_data = load_pdf(r"C:\Users\sreek\Desktop\Temp\1044191010_last part\Langchain Chatbot\data")

In [5]:
# extracted_data

this function takes the extracted data from PDF files as input, initializes a RecursiveCharacterTextSplitter object to split the text data into smaller chunks, splits the documents into text chunks, and returns the split text chunks. This function could be useful for breaking down large text documents into manageable segments for further analysis or processing. If you have any further questions or need clarification, feel free to ask!

In [11]:
#Create text chunks
def text_split(extracted_data):#This line defines a function named text_split that takes one parameter extracted_data
    #responsible for splitting text data into smaller chunks. Each chunk will contain up to 500 characters.
    #This argument specifies the overlap between consecutive chunks in terms of characters
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [12]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 1170


In [8]:
# text_chunks

In [13]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [14]:
embeddings = download_hugging_face_embeddings()

In [15]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [16]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
# query_result

In [21]:

#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="medchatgpt"

#Creating Embeddings for Each of The Text Chunks & storing
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

In [22]:
#If we already have an index we can load it like this
docsearch=Pinecone.from_existing_index(index_name, embeddings)

query = "I've been feeling kind of down lately"

docs=docsearch.similarity_search(query, k=3)

print("Result", docs)

Result [Document(page_content='a) Microangiopathic hemolytic anemia  \nb) Sickle cell disease  \nc) Thalassemia  \nd) G6PD deficiency.  \n  \n \n3) A 73 -year-old man comes to the office with fatigue that has become progressively \nworse over the last several months. He is also short of breath when he walks up one \nflight of stairs. He drinks 4 vodka martinis a day. He complains of numbness and \ntingling in his feet. On physical examination he has decreased sensation of his feet. His', metadata={}), Document(page_content='a) Microangiopathic hemolytic anemia  \nb) Sickle cell disease  \nc) Thalassemia  \nd) G6PD deficiency.  \n  \n \n3) A 73 -year-old man comes to the office with fatigue that has become progressively \nworse over the last several months. He is also short of breath when he walks up one \nflight of stairs. He drinks 4 vodka martinis a day. He complains of numbness and \ntingling in his feet. On physical examination he has decreased sensation of his feet. His', metadata