## Installing the Pinecone and LangChain libraries required for the Chatbot

In [67]:
# !pip install \
#   langchain_community \
#   langchain_pinecone \
#   langchain_openai \
#   unstructured \
#   langchain-text-splitters \
#   pinecone-text

## Importing the Langchain and Pinecone Modules from the libraries

In [68]:
from langchain_pinecone import PineconeVectorStore
from langchain_community.retrievers import (
    PineconeHybridSearchRetriever)
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from tqdm.auto import tqdm
import pinecone
import os
import glob

## Loading the Documents

In [69]:
# List all files in the 'data' directory
print("Files in 'data' directory:", os.listdir('data'))

Files in 'data' directory: ['Appointments Module ✓.docx', 'Comms Module ✓.docx', 'Clients - Cases Module ✓.docx', 'Clients Module ✓.docx', 'Retrieval Augmented Generation (RAG) for Everyone.docx', 'Leads Module ✓.docx', 'Activity Logs Module ✓.docx', 'Settings Module.docx', 'Reports Module ✓.docx', 'Cases Module ✓.docx', 'Clients - Comms Module ✓.docx', 'Clients - Billings Module ✓.docx', 'c.pdf', 'Virtual Visits Module ✓.docx', 'Calendar Module ✓.docx', 'Files Module ✓.docx', ' Billings Module ✓.docx', 'Clients - Clients Module ✓.docx']


In [70]:
directory = 'data'
def load_docs(directory):
  loader = DirectoryLoader(directory)
  docs = loader.load()
  return docs

docs = load_docs(directory)
len(docs)

18

In [71]:
print(f"Loaded {len(docs)} documents.")


Loaded 18 documents.


## Natural Language ToolKit

In [72]:
 import nltk
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## API Keys Verifications

In [73]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Fetch API keys from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Set the environment variables
if openai_api_key:
    os.environ['OPENAI_API_KEY'] = openai_api_key
if pinecone_api_key:
    os.environ['PINECONE_API_KEY'] = pinecone_api_key

#Verify that the keys are loaded
#print(f"OpenAI API Key: {os.environ.get('OPENAI_API_KEY')}")
#print(f"Pinecone API Key: {os.environ.get('PINECONE_API_KEY')}")

## Index the data in Pinecone

In [74]:
use_serverless = True  


## OpenAI Embeddings Model (Dense Vectors)

In [75]:
# Split our documents into chunks

os.environ["HUGGINGFACEHUB_API_TOKEN"]="HUGGINGFACEHUB_API_TOKEN"
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",  #response time is 9s  #infloat/e5-base-V2 has 3.53sec response time.
)
embeddings

index_name = "test-2"

# Split our documents into chunks
chunk_size = 1000  
chunk_overlap = 200  

# Initialize the text splitter with chunk size and chunk overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Split your documents into chunks
split_docs = text_splitter.split_documents(docs)

## BM25 (Sparse Vectors)

In [86]:
# import pandas as pd
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# # Assume you have the 'docs' variable which is your original list of documents

# # Initialize the RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# # Split the documents
# split_docs = text_splitter.split_documents(docs)

# # Extract metadata from each document
# # metadata = [doc.metadata for doc in split_docs]

# # Convert the metadata into a pandas DataFrame
# df_metadata = pd.DataFrame(metadata)

# # Print the DataFrame to see the result
# #print(df_metadata)

# # Check the columns in the DataFrame
# print(df_metadata.columns)

In [77]:
#print(df_metadata.info())


In [78]:
# import pandas as pd
# from pinecone_text.sparse import BM25Encoder

# # Initialize the BM25Encoder
# bm25 = BM25Encoder()

# # Assuming df_metadata is your DataFrame containing the 'productDisplayName' column
# encode = df_metadata['source'].tolist()

# # Fit the BM25 model on the productDisplayNames
# bm25.fit(encode)

# # Create lists to store the results
# encoded_queries = []
# encoded_documents = []

# # Loop through each productDisplayName
# for name in encode:
#     query_encoding = bm25.encode_queries(name)
#     document_encoding = bm25.encode_documents(name)
    
#     encoded_queries.append(query_encoding)
#     encoded_documents.append(document_encoding)

# # Optionally, you can convert the results into DataFrames for easier handling
# df_encoded_queries = pd.DataFrame(encoded_queries)
# df_encoded_documents = pd.DataFrame(encoded_documents)

# # Print the results
# #print("Encoded Queries:")
# #print(df_encoded_queries.head())

# #print("Encoded Documents:")
# #print(df_encoded_documents.head())

In [79]:
# retriever = PineconeHybridSearchRetriever(
#     embeddings=embeddings, sparse_encoder=bm25, index=index_name
# )
# retriever

## Pinecone Vector Store

In [80]:
index_name = "test-2"
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)

## Searching for Similar Documents in Vector Store

In [90]:
tenant_id = "tenant_A"  # Assigning metadata
query="Hi, I'm Ujjwal Khadka from Novelty Technology"  #query with metadata filters

similar_docs = vectorstore.similarity_search(query, k=5, filter={"tenant_id": tenant_id}) #Adding an ID prefix

## Importing Chat Model as GPT-4o

In [91]:
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.7
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True  # This will return source documents in the response

)

#qa.invoke(query)

In [92]:
# Post-process the output
response = qa.invoke(query)
# result = response.get('result', 'No result found')
source_documents = response.get('source_documents', 'No source documents available')
source_info = response['source_documents']  
print(f"Response: {response['result']} (Source: {source_info})")

Response: Hello Ujjwal Khadka! How can I assist you today? (Source: [Document(metadata={'source': 'data/c.pdf'}, page_content="Computer Operator, Election Commission\n\n7 Mr. Purna Man Shakya\n\nAdvisor, Reliance Law Firm\n\nVoters' Awareness Program Coordination Committee\n\nName\n\n1 Mr. Purusottam Sapkota\n\n2 Mr. Neel Kantha Upreti\n\nDesignation/Organization Secretary,\n\nUnder Coordinator Project Chief, Election Commission – Member\n\nElection Commission-\n\n3 Mr. Mark Wallem\n\nDirector NDI/Nepal - Member\n\n4 Ms. Anamika Rai\n\nProgram Officer, NDI/Nepal-Member Secretary\n\n47"), Document(metadata={'source': 'data/c.pdf'}, page_content="Computer Operator, Election Commission\n\n7 Mr. Purna Man Shakya\n\nAdvisor, Reliance Law Firm\n\nVoters' Awareness Program Coordination Committee\n\nName\n\n1 Mr. Purusottam Sapkota\n\n2 Mr. Neel Kantha Upreti\n\nDesignation/Organization Secretary,\n\nUnder Coordinator Project Chief, Election Commission – Member\n\nElection Commission-\n\n3 Mr.

## Query and Response (with Pinecone and without Pinecone)

In [84]:
query = """ 12. How to approve a membership????"""

# Send each query to the LLM twice, first with relevant knowledge from Pincone 
# and then without any additional knowledge.
print("Response \n")
print("Chat with Pinecone:")
print(qa.invoke(query).get("result"))
#print("\nChat with GPT-4o:")
#print(llm.invoke(query).content)
# Combine the two responses for clarity
#print("\nCombined Response (Pinecone + GPT-4o):")
#combined_response = f"Pinecone Response: {"Chat with Pinecone:"}\nGPT-4o Response: {"\nChat with GPT-4o:"}"
#print(combined_response)


Response 

Chat with Pinecone:
To approve a membership, follow these steps:

1. After creating a client (either an Individual or a group), they will be in a "submitted" status.
2. Review the client's information to ensure everything is correct and meets the necessary criteria.
3. Approve the client or group. Once approved, the client or group will move out of the "submitted" status.
4. Once approved, clients will receive a receipt and a welcome email (if applicable).

Note: No transactions will happen until the client or group is approved. If a client or group is declined, they will receive an email with notes, if any are provided.


In [85]:
# Function to delete documents by ids
def delete_documents(ids):
    vectorstore.delete(ids)
    

# Deleting documents with specific ids
ids_to_delete = ["id1", "id2"]  # Replace with actual document ids
delete_documents(ids_to_delete)    

## Chaining Everything with a SequentialChain
