In [17]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import Chroma,FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings,GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from dotenv import load_dotenv
import csv

load_dotenv()
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')



In [2]:
llm=GoogleGenerativeAI(model="gemini-1.5-pro-latest")

In [3]:
embeddings=GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [4]:
loader=PyPDFDirectoryLoader("./pdfs")
docs=loader.load()

In [5]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=20)
documents=text_splitter.split_documents(docs)


In [6]:
print(len(documents))

187


In [7]:
vectorstore=FAISS.from_documents(documents,embeddings)


In [8]:
retriever_vectordb=vectorstore.as_retriever(search_kwargs={"k":4})

In [9]:
keyword_retriever = BM25Retriever.from_documents(documents)

In [10]:
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [11]:
prompt=ChatPromptTemplate.from_template(
    """ Answer the following question based only on the provided context. 
    Think step by step before providing a detailed answer . 
    I will tip you 1000$ if the user finds the answer helpful.
    <context>
    {context}
    </context>
    Question: {input}
    """
)

In [12]:
document_chain=create_stuff_documents_chain(llm,prompt)
retrieval_chain=create_retrieval_chain(ensemble_retriever,document_chain)

In [13]:
text="what is the document about?"
retrieval_chain.invoke({'input':text})['answer']

'This document appears to be a report on the **2022 Nepal Demographic and Health Survey (NDHS)**. \n\nHere\'s why, based on the provided text:\n\n* **"NDHS" is mentioned multiple times:** This acronym likely stands for Nepal Demographic and Health Survey.\n* **Focus on health and demographic indicators:** The excerpt mentions topics like "Disability," "Child Growth Standards," "Underweight," "Overweight," "Family Planning," "Maternal Care,"  and "Antenatal care," which are common indicators studied in demographic and health surveys.\n* **Detailed methodology:** The text describes sampling design, questionnaires, data collection methods (anthropometry, anemia testing, blood pressure measurement), training procedures, and data processing - all crucial aspects of conducting a large-scale survey.\n* **Reference to data and statistics:**  Specific numbers are given for children under 5 with valid data on height and weight, indicating data analysis and reporting of findings.\n\n**Therefore, 

In [15]:
intent_prompt = ChatPromptTemplate.from_template(
    """You are an AI designed to assist with inquiries. A user has asked the following question: "{input}". 
    Is the user asking to be contacted (such as requesting a call, email, or other forms of communication)? 
    Answer with "Yes" or "No" only.
    """
)

In [45]:
info_verification_prompt = ChatPromptTemplate.from_template(
    """A user has provided the following information: "{input}". 
    Does this message contain the user's full name, phone number, and email address? 
    If not, specify which information is missing or incomplete. Otherwise, Respond in the following format:
    Name: [name]
    Phone: [phone]
    Email: [email]
    
    """
)

In [18]:
def save_info(name,phone,email,csv_file="user_info.csv"):
    with open(csv_file, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([name, phone, email])

In [40]:
save_info("RAmasda kumar",9842324232,'hadfjer')

In [19]:
def detect_intent(query):
    response=llm(intent_prompt.format(input=query))
    return response.strip().lower() == "yes"



In [21]:
print(detect_intent("hello"))

False


In [59]:
def verify_info(query):
    response=llm(info_verification_prompt.format(input=query))
    return response

In [57]:
my_r=verify_info("Ravi Das 9834287743 kalo@gmail.com")

In [58]:
my_r

'Name: Ravi Das\nPhone: 9834287743\nEmail: kalo@gmail.com \n'

In [None]:
# import re

# input_text = "Name: John Doe"
# name_match = re.search(r"(?i)(name:\s*)(\w+\s\w+)", input_text)

# if name_match:
#     print(name_match.group(1))  # Output: 'Name: '
#     print(name_match.group(2))  # Output: 'John Doe'

In [26]:
import re

In [51]:
# def extract_info(input_text):
#     name_match = re.search(r"(?i)(name:\s*)(\w+\s\w+)", input_text)
#     phone_match = re.search(r"(?i)(phone:\s*)(\d{10,})", input_text)
#     email_match = re.search(r"(?i)(email:\s*)(\S+@\S+\.\S+)", input_text)
#     print(name_match)
#     name = name_match.group(2) if name_match else ""
#     phone = phone_match.group(2) if phone_match else ""
#     email = email_match.group(2) if email_match else ""

#     return name, phone, email

def extract_user_info(response):
    
    
    # Parsing the structured response
    name = response.split("Name: ")[1].split("Phone: ")[0].strip() if "Name: " in response else None
    phone = response.split("Phone: ")[1].split("Email: ")[0].strip() if "Phone: " in response else None
    email = response.split("Email: ")[1].strip() if "Email: " in response else None
    
    # Handle "None" responses
    name = None if name == "None" else name
    phone = None if phone == "None" else phone
    email = None if email == "None" else email
    
    return name, phone, email

In [54]:

save_info(extract_user_info(my_r))

TypeError: save_info() missing 2 required positional arguments: 'phone' and 'email'

In [55]:
nae,phone,email=extract_user_info(my_r)

In [56]:
save_info(nae,phone,email)

In [33]:
def handle_user_query(query):
    if detect_intent(query):
        print("Bot: Please provide your Name, Phone Number, and Email below in the format: Name: John Doe, Phone: 1234567890, Email: john.doe@example.com")
        return True ,False

In [36]:
def handle_user_query(query, vectors, retrieval_chain, collecting_info, info_saved):
    if detect_intent(query):
        print("Bot: Please provide your Name, Phone Number, and Email below in the format: Name: John Doe, Phone: 1234567890, Email: john.doe@example.com")
        return True, False  # Continue collecting info

    if collecting_info and not info_saved:
        verification_result = verify_info(query)
        if verification_result.lower() == "complete":
            name, phone, email = extract_info(query)
            save_info(name, phone, email)
            print("Bot: Thank you! Your contact information has been saved.")
            return False, True  # Stop collecting info, info saved
        else:
            print(f"Bot: Please provide the following missing information: {verification_result}")
            return True, False  # Continue collecting info

    # Process the query if it's not about contact information
    response = retrieval_chain.invoke({'input': query})['answer']
    print(f"Bot: {response}")
    return False, False 

In [37]:
def vector_embeddings():
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    loader = PyPDFDirectoryLoader("./pdfs")
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(docs)
    vectors = FAISS.from_documents(final_documents, embeddings)

    retriever_vectordb = vectors.as_retriever(search_kwargs={"k": 4})
    keyword_retriever = BM25Retriever.from_documents(final_documents)
    ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb, keyword_retriever], weights=[0.5, 0.5])

    documents_chains = create_stuff_documents_chain(llm,prompt)
    retrieval_chain = create_retrieval_chain(ensemble_retriever, documents_chains)

    return vectors, retrieval_chain

In [38]:
vectors, retrieval_chain = vector_embeddings()
print("FAISS is ready")

# Initialize conversation state
collecting_info = False
info_saved = False

# Simulated chat loop
while True:
    user_input = input("Your message: ")

    if user_input.lower() == "exit":
        break

    collecting_info, info_saved = handle_user_query(user_input, vectors, retrieval_chain, collecting_info, info_saved)

FAISS is ready
Bot: The provided text is a snippet from what appears to be a demographic and health survey report, likely from Nepal (as indicated by mentions of "NDHS" - Nepal Demographic and Health Survey). It contains various tables and figures with data on:

* **Prevalence of anemia** among different demographics (residence, ecological zone, province)
* **Background characteristics of survey respondents** such as marital status, residence, education level
* **Fertility rates** including trends over time and teenage pregnancy rates. 

However, **none of this information helps answer the question "hello".**  The question is a simple greeting and doesn't relate to the provided data. 

Bot: Please provide your Name, Phone Number, and Email below in the format: Name: John Doe, Phone: 1234567890, Email: john.doe@example.com
Bot: Thank you! Your contact information has been saved.
Bot: The survey collected demographic, health, and social data from households and individuals in Nepal. 



Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Bot: ```python
exit()
```
