In [1]:
# ! pip install langchain
# ! pip install pypdf

In [52]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [3]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("pdfs/Namami-Gange-Programme.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [4]:
# Split PDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [5]:
splits = text_splitter.split_documents(docs)

In [6]:
len(splits)

18

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

## VectorStore (Chroma)

In [8]:
! pip install chromadb



In [9]:
from langchain.vectorstores import Chroma

In [10]:
persist_directory = 'vector-store/chroma/'

In [11]:
! rm -rf vector-store/chroma

In [12]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [13]:
question = "What is Namami Gange?"

In [14]:
docs = vectordb.similarity_search(question,k=3)

In [15]:
len(docs)

3

In [16]:
docs[2].page_content

'7 \n \nA five tier structure has been created through the Order  No S.O. 3187(E) \ndated  7th October 2016  invoking the provision under Section 3 of Environment \n(Protection) Act, 1986 at the national, state and district level to take measures for \nprevention, control and abatement of environmental pollution in river Ganga and \nto ensure continuous adequate flow of water so as to rejuvenate the river Ganga  as \nbelow:8 \n1. National Ganga Council under chairmanship of Hon’ble Prime Minister of \nIndia,   \n2. Empowered Task Force (ETF) on river Ganga under chairmanship of \nHon’ble Union Minister of Water Resources, River Development and Ganga \nRejuvenation,   \n3. National Mission for Clean Ganga (NMCG),   \n4. State Ganga Co mmittees, and   \n5. District Ganga Committees in every specified district abutting river Ganga \nand its tributaries in the States.   \nNamami Gange Programme  \n The Namami Gange  programme was introduced in 2014 as an umbrella \nprogramme, with the aim 

In [17]:
vectordb.persist()

In [18]:
print(vectordb._collection.count())

18


In [19]:
docs[0].page_content[:100], docs[1].page_content[:100], docs[2].page_content

('unit, apart from contemplating on a legislation that aims to check pollution and \nprotect the river.',
 '9 \n \nrehabilitation and augmentation of existing STPs and immediate short term \nmeasures for arresti',
 '7 \n \nA five tier structure has been created through the Order  No S.O. 3187(E) \ndated  7th October 2016  invoking the provision under Section 3 of Environment \n(Protection) Act, 1986 at the national, state and district level to take measures for \nprevention, control and abatement of environmental pollution in river Ganga and \nto ensure continuous adequate flow of water so as to rejuvenate the river Ganga  as \nbelow:8 \n1. National Ganga Council under chairmanship of Hon’ble Prime Minister of \nIndia,   \n2. Empowered Task Force (ETF) on river Ganga under chairmanship of \nHon’ble Union Minister of Water Resources, River Development and Ganga \nRejuvenation,   \n3. National Mission for Clean Ganga (NMCG),   \n4. State Ganga Co mmittees, and   \n5. District Ganga Commi

In [20]:
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)

Number of requested results 20 is greater than number of elements in index 18, updating n_results = 18


In [21]:
docs_mmr[0].page_content[:100], docs_mmr[1].page_content[:100], docs_mmr[2].page_content[:100]

('unit, apart from contemplating on a legislation that aims to check pollution and \nprotect the river.',
 'duties, AND NAMAMI GANGE PROGRAMME  \nIntroduction  \n Ganga is  considered as  the most sacred river ',
 'ranging from .05 -1.1 mg/l, with potential to raise the \nnutrient level to a considerable degree in ')

In [22]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI

In [23]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


In [24]:
# Wrap our vectorstore
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [25]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [26]:
question = "What is the namami gange program about?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

The Namami Gange will focus on pollution abatement interventions namely interception, diversion & treatment of wastewater flowing through the open drains through bio-remediation/appropriate in-situ treatment/use of innovative technologies/sewage treatment plants (STPs)/effluent treatment plant (ETPs).
----------------------------------------------------------------------------------------------------
Document 2:

"The Namami Gange Programme covers  short term, medium term a nd long term activities. Under s hort term activities certain entry level activities which covers development of Ghat crematoria & river surface cleaning activities , etc. have been taken up. Under m edium term activities existing sewage treatment plants (STPs) and effluent treatment p lants (ETPs) are being upgraded and new STPs  and ETPs  are being established. Beside, rural sanitation has been taken up in the villages on the banks of river Ganga. The long term action plan involves res toration of who

In [27]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

In [28]:
question = "What is the namami gange program about?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Number of requested results 20 is greater than number of elements in index 18, updating n_results = 18


Document 1:

The Namami Gange will focus on pollution abatement interventions namely interception, diversion & treatment of wastewater flowing through the open drains through bio-remediation/appropriate in-situ treatment/use of innovative technologies/sewage treatment plants (STPs)/effluent treatment plant (ETPs).
----------------------------------------------------------------------------------------------------
Document 2:

"The Namami Gange Programme covers  short term, medium term a nd long term activities. Under s hort term activities certain entry level activities which covers development of Ghat crematoria & river surface cleaning activities , etc. have been taken up. Under m edium term activities existing sewage treatment plants (STPs) and effluent treatment p lants (ETPs) are being upgraded and new STPs  and ETPs  are being established. Beside, rural sanitation has been taken up in the villages on the banks of river Ganga. The long term action plan involves res toration of who

## LLM

In [29]:
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    LLM_NAME = os.environ['TURBO-PREVIEW']
else:
    LLM_NAME = os.envron["TURBO"]

gpt-3.5-turbo


In [30]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'vector-store/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [31]:
print(vectordb._collection.count())

18


In [32]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=LLM_NAME, temperature=0)

### RetrievalQA Chain

In [33]:
from langchain.chains import RetrievalQA

In [35]:
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())

In [37]:
question = "What is the namami gange program about?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [38]:
result = qa_chain({"query": question})

In [39]:
result["result"]

'The Namami Gange program is an initiative introduced in 2014 by the Indian government to clean and rejuvenate the River Ganga. It aims to integrate previous and ongoing initiatives by enhancing efficiency, coordination, and comprehensive interventions. The program focuses on pollution abatement interventions such as wastewater treatment, interception, diversion, and treatment of open drains. It also includes the rehabilitation and augmentation of existing sewage treatment plants, as well as measures to prevent the inflow of sewage at exit points on the riverfront. The program has a five-tier structure, including the National Ganga Council, Empowered Task Force, National Mission for Clean Ganga, State Ganga Committees, and District Ganga Committees. The program is expected to deliver socio-economic benefits such as job creation, improved livelihoods, and health benefits to the population dependent on the river.'

### Prompt

In [40]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [41]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [44]:
question = "What is the importance of Ganga river?"
result = qa_chain({"query": question})

In [45]:
result["result"]

'The Ganga river is considered the most sacred river in the country and holds cultural and spiritual significance for Indians. It provides water for the livelihood of millions of people and is highly revered. Thanks for asking!'

In [46]:
question = "What is Namami Gange's aim?"
result = qa_chain({"query": question})

In [47]:
result["result"]

'The aim of the Namami Gange program is to abate pollution, protect the river, and maintain water quality in the Ganga basin states. Thanks for asking!'

### Chat System