## Installing the Pinecone and LangChain libraries required for the Chatbot

In [120]:
!pip install \
  langchain_community \
  langchain_pinecone \
  langchain_openai \
  unstructured \
  langchain-text-splitters \
  pinecone-text



## Importing the Langchain and Pinecone Modules from the libraries

In [121]:
from langchain_pinecone import PineconeVectorStore
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from tqdm.auto import tqdm
import pinecone
import os
import glob

## Loading the Documents

In [122]:
# List all files in the 'data' directory
print("Files in 'data' directory:", os.listdir('data'))

Files in 'data' directory: ['Appointments Module ✓.docx', 'Comms Module ✓.docx', 'Clients - Cases Module ✓.docx', 'Clients Module ✓.docx', 'Leads Module ✓.docx', 'Activity Logs Module ✓.docx', 'Settings Module.docx', 'Reports Module ✓.docx', 'Cases Module ✓.docx', 'Clients - Comms Module ✓.docx', 'Clients - Billings Module ✓.docx', 'c.pdf', 'Virtual Visits Module ✓.docx', 'Calendar Module ✓.docx', 'Files Module ✓.docx', ' Billings Module ✓.docx', 'Clients - Clients Module ✓.docx']


In [123]:
directory = 'data'
def load_docs(directory):
  loader = DirectoryLoader(directory)
  docs = loader.load()
  return docs

docs = load_docs(directory)
len(docs)

17

In [124]:
print(f"Loaded {len(docs)} documents.")


Loaded 17 documents.


## Natural Language ToolKit

In [125]:
 import nltk
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## API Keys Verifications

In [126]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Fetch API keys from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Set the environment variables
if openai_api_key:
    os.environ['OPENAI_API_KEY'] = openai_api_key
if pinecone_api_key:
    os.environ['PINECONE_API_KEY'] = pinecone_api_key

#Verify that the keys are loaded
#print(f"OpenAI API Key: {os.environ.get('OPENAI_API_KEY')}")
#print(f"Pinecone API Key: {os.environ.get('PINECONE_API_KEY')}")

## OpenAI Embeddings Model (Dense Vectors)

In [127]:
# Split our documents into chunks

os.environ["HUGGINGFACEHUB_API_TOKEN"]="HUGGINGFACEHUB_API_TOKEN"
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",  #response time is 9s  #infloat/e5-base-V2 has 3.53sec response time.
)
embeddings

index_name = "test-2"

# Split our documents into chunks
chunk_size = 1000  
chunk_overlap = 200  

# Initialize the text splitter with chunk size and chunk overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Split your documents into chunks
split_docs = text_splitter.split_documents(docs)

import pandas as pd
df_dense = pd.DataFrame(split_docs)
#df_dense.shape



## BM25 (Sparse Vectors)

In [128]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Assume you have the 'docs' variable which is your original list of documents

# Initialize the RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Split the documents
split_docs = text_splitter.split_documents(docs)

# Extract metadata from each document
metadata_list = [doc.metadata for doc in split_docs]

# Convert the metadata into a pandas DataFrame
df_metadata = pd.DataFrame(metadata_list)

# Print the DataFrame to see the result
#print(df_metadata)

# Check the columns in the DataFrame
print(df_metadata.columns)

Index(['source'], dtype='object')


In [129]:
import pandas as pd
from pinecone_text.sparse import BM25Encoder

# Initialize the BM25Encoder
bm25 = BM25Encoder()

# Assuming df_metadata is your DataFrame containing the 'productDisplayName' column
encode = df_metadata['source'].tolist()

# Fit the BM25 model on the productDisplayNames
bm25.fit(encode)

# Create lists to store the results
encoded_queries = []
encoded_documents = []

# Loop through each productDisplayName
for name in encode:
    query_encoding = bm25.encode_queries(name)
    document_encoding = bm25.encode_documents(name)
    
    encoded_queries.append(query_encoding)
    encoded_documents.append(document_encoding)

# Optionally, you can convert the results into DataFrames for easier handling
df_encoded_queries = pd.DataFrame(encoded_queries)
df_encoded_documents = pd.DataFrame(encoded_documents)

# Print the results
#print("Encoded Queries:")
#print(df_encoded_queries.head())

#print("Encoded Documents:")
#print(df_encoded_documents.head())

100%|██████████| 404/404 [00:00<00:00, 7002.96it/s]


## Pinecone Vector Store

In [132]:
index_name = "test-2"
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)
vectorstore

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x3277c23c0>

In [134]:
upserts = [
    {'id': 'id1', 'values': [0.1, 0.2, 0.3], 'metadata': {'key': 'value'}},
    {'id': 'id2', 'values': [0.4, 0.5, 0.6], 'metadata': {'key': 'value'}}
]

In [135]:
vectorstore.upsert(vectors=upserts)

AttributeError: 'PineconeVectorStore' object has no attribute 'upsert'

## Searching for Similar Documents in Vector Store

In [90]:
tenant_id = "tenant_A"  # This can be dynamically assigned based on your application logic

query="Hi"

similar_docs = vectorstore.similarity_search(query, k=5, filter={"tenant_id": tenant_id})

## Upsert

## Importing Chat Model as GPT-4o

In [49]:
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.7
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True  # This will return source documents in the response

)

#qa.invoke(query)

In [50]:
# Post-process the output
response = qa.invoke(query)
result = response.get('result', 'No result found')
source_documents = response.get('source_documents', 'No source documents available')
source_info = response['source_documents']  
print(f"Response: {response['result']} (Source: {source_info})")

Response: Hello! How can I assist you today? (Source: [Document(metadata={'blobType': '', 'flowise_chatId': '46225d20-7c7b-4719-a6ba-7ed3e67fc794', 'loc.lines.from': 1.0, 'loc.lines.to': 5.0, 'loc.pageNumber': 96.0, 'pdf.info.CreationDate': "D:20240524094432+02'00'", 'pdf.info.Creator': 'Adobe InDesign 17.4 (Macintosh)', 'pdf.info.IsAcroFormPresent': False, 'pdf.info.IsXFAPresent': False, 'pdf.info.ModDate': "D:20240524085908+01'00'", 'pdf.info.PDFFormatVersion': '1.7', 'pdf.info.Producer': 'Adobe PDF Library 16.0.7', 'pdf.info.Trapped.name': 'False', 'pdf.metadata._metadata.dc:format': 'application/pdf', 'pdf.metadata._metadata.pdf:producer': 'Adobe PDF Library 16.0.7', 'pdf.metadata._metadata.pdf:trapped': 'False', 'pdf.metadata._metadata.xmp:createdate': '2024-05-24T09:44:32+02:00', 'pdf.metadata._metadata.xmp:creatortool': 'Adobe InDesign 17.4 (Macintosh)', 'pdf.metadata._metadata.xmp:metadatadate': '2024-05-24T08:59:08+01:00', 'pdf.metadata._metadata.xmp:modifydate': '2024-05-24T0

## Query and Response (with Pinecone and without Pinecone)

In [51]:
query = """What are specific election crimes in Nepal?"""

# Send each query to the LLM twice, first with relevant knowledge from Pincone 
# and then without any additional knowledge.
print("Response \n")
print("Chat with Pinecone:")
print(qa.invoke(query).get("result"))
print("\nChat with GPT-4o:")
print(llm.invoke(query).content)

Response 

Chat with Pinecone:
The specific election crimes in Nepal according to Chapter 2 of the Election Crimes and Punishment Act of 1990 are:

a. To obtain a ballot and cast a vote in another’s name.

b. To influence others through fear, terror, or intimidation.

c. To indulge in character assassination of a candidate or his family members with an intention to influence the election or to influence voters.

d. To campaign in a manner affecting the independence, sovereignty, and integrity of the nation, or to engage in publicity based on religion, caste, creed, language, or regionalism leading to communal disharmony.

e. To offer any cash, goods, or gifts in exchange for votes.

Chat with GPT-4o:
In Nepal, election crimes are actions that violate the electoral laws and regulations designed to ensure free, fair, and transparent elections. Some specific election crimes in Nepal include:

1. **Voter Fraud**: This includes activities like double voting, voting in someone else’s name, a

In [52]:
# Function to delete documents by ids
def delete_documents(ids):
    vectorstore.delete(ids)
    

# Deleting documents with specific ids
ids_to_delete = ["id1", "id2"]  # Replace with actual document ids
delete_documents(ids_to_delete)    

## Chaining Everything with a SequentialChain
