## Installing the Pinecone and LangChain libraries required for the Chatbot

In [190]:
# !pip install \
#   langchain_community \
#   langchain_pinecone \
#   langchain_openai \
#   unstructured \
#   langchain-text-splitters \
#   pinecone-text

## Importing the LangChain and Pinecone Modules from the libraries

In [191]:
from langchain_pinecone import PineconeVectorStore
from langchain_community.retrievers import (
    PineconeHybridSearchRetriever)
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import Runnable
from tqdm.auto import tqdm
import pinecone
import os
import glob

## Loading the Documents

In [192]:
# Show what is available in the 'data' directory before anything is loaded from it.
print(f"Files in 'data' directory: {os.listdir('data')}")

Files in 'data' directory: ['Appointments Module ✓.docx', 'Comms Module ✓.docx', 'Clients - Cases Module ✓.docx', 'Clients Module ✓.docx', 'Retrieval Augmented Generation (RAG) for Everyone.docx', 'Leads Module ✓.docx', 'Activity Logs Module ✓.docx', 'Settings Module.docx', 'Reports Module ✓.docx', 'Cases Module ✓.docx', 'Clients - Comms Module ✓.docx', 'Clients - Billings Module ✓.docx', 'c.pdf', 'Virtual Visits Module ✓.docx', 'Calendar Module ✓.docx', 'Files Module ✓.docx', ' Billings Module ✓.docx', 'Clients - Clients Module ✓.docx']


In [193]:
directory = 'data'
def load_docs(directory):
    """Load every document found under `directory`.

    A `DirectoryLoader` is created for the given path and whatever its
    `load()` call returns is handed back unchanged.
    """
    return DirectoryLoader(directory).load()

# Load everything under `directory`; the trailing expression displays how many documents came back.
docs = load_docs(directory)
len(docs)

18

In [194]:
# Report the number of loaded documents.
print("Loaded", len(docs), "documents.")


Loaded 18 documents.


## Natural Language ToolKit

In [195]:
#  import nltk
#  nltk.download('punkt')

## API Key Verification

In [196]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Fetch API keys from environment variables. `openai_api_key` is passed
# explicitly to the OpenAI client further down in the notebook.
openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# The values above were just read from os.environ, so writing them back there
# would change nothing; that step is intentionally not performed.

# Verify that the keys are loaded. Only a loaded/MISSING status is shown so the
# secret values never end up in the saved cell output.
print("\n".join(
    f"{name}: {'loaded' if value else 'MISSING'}"
    for name, value in (
        ("OPENAI_API_KEY", openai_api_key),
        ("PINECONE_API_KEY", pinecone_api_key),
    )
))

## Index the data in Pinecone

In [197]:
# NOTE(review): no visible cell reads this flag -- confirm it is still needed.
use_serverless = True  


## OpenAI Embeddings Model (Dense Vectors)

In [198]:
# Dense embeddings come from OpenAI. No Hugging Face token is set here:
# assigning the literal placeholder "HUGGINGFACEHUB_API_TOKEN" to that
# environment variable would overwrite a real token with a useless value, and
# nothing in this cell uses Hugging Face.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",  # observed response time ~9 s; intfloat/e5-base-v2 was ~3.53 s
)

index_name = "test-2"

# Chunking parameters handed to the text splitter.
chunk_size = 1000
chunk_overlap = 200

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Split the loaded documents into the chunks that get embedded and indexed.
split_docs = text_splitter.split_documents(docs)

## BM25 (Sparse Vectors)

In [199]:
# import pandas as pd
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# # Assume you have the 'docs' variable which is your original list of documents

# # Initialize the RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# # Split the documents
# split_docs = text_splitter.split_documents(docs)

# # Extract metadata from each document
# # metadata = [doc.metadata for doc in split_docs]

# # Convert the metadata into a pandas DataFrame
# df_metadata = pd.DataFrame(metadata)

# # Print the DataFrame to see the result
# #print(df_metadata)

# # Check the columns in the DataFrame
# print(df_metadata.columns)

In [200]:
#print(df_metadata.info())


In [201]:
# import pandas as pd
# from pinecone_text.sparse import BM25Encoder

# # Initialize the BM25Encoder
# bm25 = BM25Encoder()

# # Assuming df_metadata is your DataFrame containing the 'source' column
# encode = df_metadata['source'].tolist()

# # Fit the BM25 model on the document sources
# bm25.fit(encode)

# # Create lists to store the results
# encoded_queries = []
# encoded_documents = []

# # Loop through each document source
# for name in encode:
#     query_encoding = bm25.encode_queries(name)
#     document_encoding = bm25.encode_documents(name)
    
#     encoded_queries.append(query_encoding)
#     encoded_documents.append(document_encoding)

# # Optionally, you can convert the results into DataFrames for easier handling
# df_encoded_queries = pd.DataFrame(encoded_queries)
# df_encoded_documents = pd.DataFrame(encoded_documents)

# # Print the results
# #print("Encoded Queries:")
# #print(df_encoded_queries.head())

# #print("Encoded Documents:")
# #print(df_encoded_documents.head())

In [202]:
# retriever = PineconeHybridSearchRetriever(
#     embeddings=embeddings, sparse_encoder=bm25, index=index_name
# )
# retriever

## Pinecone Vector Store

In [203]:
# Build the Pinecone-backed vector store from the split chunks.
# NOTE(review): no ids are supplied, so re-running this cell presumably
# upserts the same chunks again -- confirm before running it repeatedly.
index_name = "test-2"
vectorstore = PineconeVectorStore.from_documents(
    documents=split_docs,
    embedding=embeddings,
    index_name=index_name,
)

## Query

In [204]:
# Greeting used to try out the retriever in the next cell (the literal starts with a space).
query = """ Hello"""

## Retriever

In [205]:
# Retriever over the Pinecone store, configured with k=5 results per lookup.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# Smoke-test the retriever through the Runnable API (`invoke`), the same call
# the prompt-template cell uses; `get_relevant_documents` is deprecated.
retriever.invoke(query)
retriever

VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x3353912b0>, search_kwargs={'k': 5})

## Importing Chat Model as GPT-4o

In [206]:
# Chat model used by the RetrievalQA chain below.
llm = ChatOpenAI(temperature=0.7, model="gpt-4o")

# NOTE(review): this chain gets a fresh `as_retriever()` with default search
# settings, not the k=5 `retriever` built earlier -- confirm that is intended.
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
    return_source_documents=True,  # This will return source documents in the response
)

#qa.invoke(query)

In [207]:
# # Post-process the output
# response = qa.invoke(query)
# # result = response.get('result', 'No result found')
# source_documents = response.get('source_documents', 'No source documents available')
# source_info = response['source_documents']  
# print(f"Response: {response['result']} (Source: {source_info})")

## Query and Response (with Pinecone and without Pinecone)

In [208]:
# query = """ According to the Constitution of the Kingdom of Nepal 1990, there is a
# provision for electing how members to the House of
# Representatives."""

# # Send each query to the LLM twice, first with relevant knowledge from Pinecone 
# # and then without any additional knowledge.
# print("Response \n")
# print("Chat with Pinecone:")
# print(qa.invoke(query).get("result"))
# #print("\nChat with GPT-4o:")
# #print(llm.invoke(query).content)
# # Combine the two responses for clarity
# #print("\nCombined Response (Pinecone + GPT-4o):")
# #combined_response = f"Pinecone Response: {"Chat with Pinecone:"}\nGPT-4o Response: {"\nChat with GPT-4o:"}"
# #print(combined_response)


Response 

Chat with Pinecone:
According to the Constitution of the Kingdom of Nepal 1990, there is a provision for electing 205 members to the House of Representatives.


## Prompt Template

In [209]:
def format_docs(docs):
    """Concatenate the `page_content` of each item in `docs`, separated by a blank line."""
    separator = "\n\n"
    contents = [entry.page_content for entry in docs]
    return separator.join(contents)

# Fetch the documents the retriever returns for the current `query`.
retrieved_docs = retriever.invoke(query)
#print(format_docs(retrieved_docs))

In [210]:
# Prompt text for the RAG chain; `{question}` and `{context}` are the two
# placeholders that get filled in.
template = """You are an expert LLM assistant specialized in answering questions based solely on the information provided in the uploaded documents (PDF, DOCX, or TXT formats). Use only the information from the documents to respond accurately and clearly to each question.

Guidelines:
1. Provide concise and informative answers.
2. If the answer is not found in the uploaded documents, state, "The answer is not available in the provided documents."
3. Avoid using outside knowledge or assumptions. Stick strictly to the content in the documents.
4. Maintain a professional and helpful tone.
5. Answer for normal conversation question like "Hi", "Hey", "Hello", "How are you", and many others questions with answer "Hello, How can I assist you?".

Question: {question}

Context: {context}

Answer:
"""
# Render the template once with the current query and the retrieved documents.
# NOTE(review): `prompt` is not read by any visible cell; the chain cell builds
# its own prompt object from `template`.
prompt = template.format(question = query, context =  format_docs(retrieved_docs))


## Chains

In [211]:
# NOTE(review): this rebinds `llm`, which an earlier cell set to the ChatOpenAI
# "gpt-4o" model, to a plain `OpenAI` client -- confirm the RAG chain is meant
# to run on this model.
llm = OpenAI(api_key=openai_api_key)

custom_rag_template = PromptTemplate.from_template(template)

# Fan the incoming question out into the two prompt inputs ("context" from the
# retriever, "question" passed through untouched), then pipe the result through
# the prompt, the model and the string output parser.
My_rag_chain = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
).pipe(custom_rag_template, llm, StrOutputParser())


## My chain : Retriever(Pinecone) | custom_rag_template(prompt) | llm | StrOutputParser()

## Query and Response (with Pinecone and without Pinecone)

In [216]:
# Sample question sent through the RAG chain; the answer is printed under the banner line.
query = """ According to the Constitution of the Kingdom of Nepal 1990, there is a
provision for electing how members to the House of
Representatives."""

print("Chat with your Documents:")
print(My_rag_chain.invoke(query))
#print("\nChat with GPT-4o:")
#print(llm.invoke(query).content)
# Combine the two responses for clarity
#print("\nCombined Response (Pinecone + GPT-4o):")
#combined_response = f"Pinecone Response: {"Chat with Pinecone:"}\nGPT-4o Response: {"\nChat with GPT-4o:"}"
#print(combined_response)

Chat with your Documents:
General elections are nationwide elections held for the 205 members of the House of Representatives in Nepal. These elections can take place in more than one phase, and the Election Commission has the authority to decide whether to hold them all at once or in phases.


## Gradio

In [184]:
import gradio as gr

def llm_response(query, memory = None):
    """Answer `query` by running it through `My_rag_chain`.

    `memory` is accepted so the function fits a chat-callback signature, but
    it is not used when producing the answer.
    """
    answer = My_rag_chain.invoke(query)
    return answer

# Chat UI around `llm_response`, with one example prompt and explicit
# Retry / Clear / Undo / Submit buttons.
# NOTE(review): the `retry_btn` / `clear_btn` / `undo_btn` / `submit_btn`
# arguments may depend on the installed Gradio version -- confirm.
rag_demo = gr.ChatInterface(
    llm_response, 
    title="RAG demo",
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Enter query here", scale=5),
    examples=["Hello"],
    retry_btn=gr.Button("Retry"),
    clear_btn=gr.Button("Clear"),
    undo_btn=gr.Button("Undo"),
    submit_btn=gr.Button("Submit")
)

In [185]:
# Start the demo without requesting a public share link.
rag_demo.launch(share=False)

Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




## Tenants


In [186]:
# Define 25 tenants, each with 50 users
tenants = {f"tenant_{i}": [f"user_{i}_{j}" for j in range(50)] for i in range(1, 26)}

# Print out the tenants and their users
for tenant_id, users in tenants.items():
    print(f"Tenant ID: {tenant_id}")
    for user in users:
        print(f"  User: {user}")


Tenant ID: tenant_1
  User: user_1_0
  User: user_1_1
  User: user_1_2
  User: user_1_3
  User: user_1_4
  User: user_1_5
  User: user_1_6
  User: user_1_7
  User: user_1_8
  User: user_1_9
  User: user_1_10
  User: user_1_11
  User: user_1_12
  User: user_1_13
  User: user_1_14
  User: user_1_15
  User: user_1_16
  User: user_1_17
  User: user_1_18
  User: user_1_19
  User: user_1_20
  User: user_1_21
  User: user_1_22
  User: user_1_23
  User: user_1_24
  User: user_1_25
  User: user_1_26
  User: user_1_27
  User: user_1_28
  User: user_1_29
  User: user_1_30
  User: user_1_31
  User: user_1_32
  User: user_1_33
  User: user_1_34
  User: user_1_35
  User: user_1_36
  User: user_1_37
  User: user_1_38
  User: user_1_39
  User: user_1_40
  User: user_1_41
  User: user_1_42
  User: user_1_43
  User: user_1_44
  User: user_1_45
  User: user_1_46
  User: user_1_47
  User: user_1_48
  User: user_1_49
Tenant ID: tenant_2
  User: user_2_0
  User: user_2_1
  User: user_2_2
  User: user_2_3
  

## Folder, Upload Docs, Delete Docs, Isolation Query

In [187]:
# User Folders
def create_user_folders(base_path, tenants):
    """Create `<base_path>/<tenant>/<user>` directories for every tenant and user.

    `tenants` maps a tenant id to the list of its user ids. Directories that
    already exist are left alone. A confirmation line is printed at the end.
    """
    for tenant_name in tenants:
        tenant_dir = os.path.join(base_path, tenant_name)
        # Created explicitly so a tenant with no users still gets its folder.
        os.makedirs(tenant_dir, exist_ok=True)
        for member in tenants[tenant_name]:
            os.makedirs(os.path.join(tenant_dir, member), exist_ok=True)
    print("User folders created successfully.")

#Uploading File with metadata 
def upload_file(base_path, tenant_id, user_id, file_path):
    """Copy a file into a user's folder and index its content for that tenant.

    The file is copied to ``<base_path>/<tenant_id>/<user_id>/``. The stored
    copy is then loaded, split with the notebook-level ``text_splitter`` and
    added to the notebook-level ``vectorstore``. Every chunk is tagged with
    ``tenant_id`` and ``user_id`` metadata so searches can be filtered per
    tenant.

    Args:
        base_path: Root directory that holds the tenant folders.
        tenant_id: Name of the tenant folder.
        user_id: Name of the user folder inside the tenant folder.
        file_path: Path of the file to upload.

    Raises:
        ValueError: If the user folder does not exist.
        FileExistsError: If a file with the same name is already stored there.

    Any error raised while copying or indexing is reported with ``print``
    instead of being propagated. In that case the copy made by this call is
    removed again, so a retry is not blocked by ``FileExistsError``.
    """
    tenant_folder = os.path.join(base_path, tenant_id)
    user_folder = os.path.join(tenant_folder, user_id)

    if not os.path.exists(user_folder):
        raise ValueError(f"User folder {user_folder} does not exist")

    file_name = os.path.basename(file_path)
    destination = os.path.join(user_folder, file_name)

    if os.path.exists(destination):
        raise FileExistsError(f"File {destination} already exists")

    try:
        # Function-scope imports keep the notebook's import cell untouched.
        import hashlib
        import shutil
        from langchain_community.document_loaders import UnstructuredFileLoader

        # Stream the copy instead of reading the whole file into memory.
        shutil.copyfile(file_path, destination)

        # Load and split the stored copy, then tag every chunk for tenant filtering.
        chunks = text_splitter.split_documents(UnstructuredFileLoader(destination).load())
        for chunk in chunks:
            chunk.metadata.update({"tenant_id": tenant_id, "user_id": user_id})

        # Ids are derived from tenant/user/file name, so the same file name
        # under two tenants cannot collide. Hashing keeps the ids ASCII even
        # when the file name is not (assumed to be required by Pinecone --
        # TODO confirm).
        id_prefix = hashlib.sha1(
            f"{tenant_id}/{user_id}/{file_name}".encode("utf-8")
        ).hexdigest()
        chunk_ids = [f"{id_prefix}-{position}" for position in range(len(chunks))]

        vectorstore.add_documents(chunks, ids=chunk_ids)
        print(f"File {file_path} uploaded to {destination}")
    except Exception as e:
        # `destination` did not exist before this call (checked above), so
        # anything found there now is this call's own partial result.
        if os.path.exists(destination):
            try:
                os.remove(destination)
            except OSError:
                pass
        print(f"An error occurred while uploading the file: {e}")

# Delete File
def delete_file(base_path, tenant_id, user_id, file_name):
    """Remove `<base_path>/<tenant_id>/<user_id>/<file_name>` from disk.

    The outcome (deleted, failed, or not found) is reported with `print`;
    nothing is raised and nothing is returned. Only the file on disk is
    touched -- this function makes no call to the vector store.
    """
    file_path = os.path.join(base_path, tenant_id, user_id, file_name)

    # Guard clause: nothing to do when the path is absent.
    if not os.path.exists(file_path):
        print(f"File {file_path} not found.")
        return

    try:
        os.remove(file_path)
        print(f"File {file_path} deleted successfully.")
    except Exception as e:
        print(f"An error occurred while deleting the file: {e}")

# Query File for Isolation
def query_files(tenant_id, query_text):
    """Search the vector store for `query_text`, restricted to one tenant.

    The search goes through the notebook-level `vectorstore` and asks for up
    to 5 matches whose `tenant_id` metadata equals the given tenant.

    Args:
        tenant_id: Tenant whose documents may be returned.
        query_text: Natural-language query string.

    Returns:
        The matches returned by `vectorstore.similarity_search`; each one is
        also printed.
    """
    # `similarity_search` takes the raw query text, so no separate embedding
    # step is needed here.
    results = vectorstore.similarity_search(
        query_text,
        k=5,
        filter={"tenant_id": tenant_id},
    )

    print(f"Query results for tenant {tenant_id}:")
    for result in results:
        print(result)
    return results


##### Example for tenant and user data ######
#####                                  ######
# Five tenants (tenant_1 .. tenant_5) with five users each (user_<tenant>_0 .. _4).
# This rebinds `tenants`; widen the ranges to add more tenants and users as needed.
tenants = {
    f"tenant_{t}": [f"user_{t}_{u}" for u in range(5)]
    for t in range(1, 6)
}

# Base directory where user folders will be created
base_path = 'file_storage'

# Create folders
create_user_folders(base_path, tenants)

# Example of uploading a file
#upload_file(base_path, 'tenant_1', 'user_1_1', 'Retrieval Augmented Generation (RAG) for Everyone.docx')

# Example of deleting a file
#delete_file(base_path, 'tenant_1', 'user_1_1', 'Retrieval Augmented Generation (RAG) for Everyone.docx')


User folders created successfully.


In [188]:
#Get an index endpoint = https://ujjwaln-rn229jx.svc.aped-4627-b74a.pinecone.io
