In [1]:
# Load variables from the .env file located by find_dotenv() into the process
# environment, so the os.getenv("OPENAI_API_KEY") lookup further down can see the key.
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

True

In [12]:
import xmltodict
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
import os
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.chains import RetrievalQAWithSourcesChain,VectorDBQAWithSourcesChain,ConversationalRetrievalChain

In [3]:
def extract_text_from(url, timeout=30):
    """Download ``url`` and return its text content with blank lines removed.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze a crawl over many pages.

    Returns:
        The text extracted by BeautifulSoup, with every line stripped of
        surrounding whitespace, empty lines dropped, joined with ``"\\n"``.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    # NOTE(review): the HTTP status is not checked, so an error page is parsed
    # like any other page — confirm whether such pages should be skipped.
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()

    stripped_lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in stripped_lines if line)

In [4]:
# Fetch the sitemap index and collect the URL of every child sitemap it lists.
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://www.bluenectar.co.in/sitemap.xml", timeout=30)
# Fail here on an HTTP error instead of handing an error page to the XML parser.
r.raise_for_status()
xml = r.text
# force_list: xmltodict returns a dict (not a list) when an element has a single
# child of that name, which would break the item['loc'] lookup below.
raw = xmltodict.parse(xml, force_list=("sitemap",))

site_urls = [item['loc'] for item in raw['sitemapindex']['sitemap']]

# Drop the last sitemap listed in the index.
# NOTE(review): presumably that entry should not be crawled — confirm which one it is.
site_urls = site_urls[:-1]

site_urls

['https://www.bluenectar.co.in/sitemap_products_1.xml?from=6197444870316&to=8054428336373',
 'https://www.bluenectar.co.in/sitemap_pages_1.xml',
 'https://www.bluenectar.co.in/sitemap_collections_1.xml']

In [None]:
# Crawl every URL listed in each child sitemap and keep its text with its source URL.
pages = []

for site_url in site_urls:
    r = requests.get(site_url, timeout=30)
    # Fail on an HTTP error instead of handing an error page to the XML parser.
    r.raise_for_status()
    xml = r.text
    # force_list: a sitemap with a single <url> would otherwise parse to a dict,
    # and iterating it would yield its keys instead of URL entries.
    raw = xmltodict.parse(xml, force_list=("url",))
    # A sitemap without any <url> child has no 'url' key; treat it as empty.
    for info in (raw.get('urlset') or {}).get('url', []):
        url = info['loc']
        try:
            text = extract_text_from(url)
        except requests.RequestException as exc:
            # One unreachable page should not discard everything crawled so far.
            print(f"Skipping {url}: {exc}")
            continue
        pages.append({'text': text, 'source': url})

In [None]:
# Split every crawled page into overlapping chunks and wrap each chunk in a
# Document that remembers the URL it came from.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
doc_chunks = []
for page in pages:
    chunks = text_splitter.split_text(page['text'])
    doc_chunks.extend(
        Document(page_content=chunk, metadata={"source": page['source']})
        for chunk in chunks
    )
    print(f"Split {page['source']} into {len(chunks)} chunks")

Split https://www.bluenectar.co.in/ into 14 chunks
Split https://www.bluenectar.co.in/products/triphala-ayurvedic-anti-cellulite-oil-slimming-oil-fat-burning-oil-for-weight-loss into 33 chunks
Split https://www.bluenectar.co.in/products/anti-aging-brightening-face-cream-with-sandalwood-saffron-women into 11 chunks
Split https://www.bluenectar.co.in/products/tea-tree-undiluted-essential-oil-for-hair-dandruff-face-acne-care-blemish-free-skin into 18 chunks
Split https://www.bluenectar.co.in/products/eucalyptus-essential-oil-skin-face-hair-cough-mosquito-repellant-aroma-diffuser into 15 chunks
Split https://www.bluenectar.co.in/products/peppermint-essential-oil-hair-body-skin-congestion-aroma-diffuse into 19 chunks
Split https://www.bluenectar.co.in/products/french-lavender-undiluted-essential-oil into 17 chunks
Split https://www.bluenectar.co.in/products/niraa-udupi-jasmine-body-mist-for-long-lasting-freshness-and-relaxation into 13 chunks
Split https://www.bluenectar.co.in/products/jade

In [5]:
# Embedding model handed to Chroma by the (currently disabled) index build below.
embeddings = OpenAIEmbeddings()

# One-time index build, left commented out. When enabled it embeds every entry of
# `doc_chunks` into the "blue_nectar" collection and persists it under
# src/data/chroma — the same collection name and directory that the app cell
# passes to get_vectorstore.
# NOTE(review): presumably disabled to avoid re-embedding on every run — confirm.
# vector_store = Chroma.from_documents(
#     doc_chunks,
#     embeddings,
#     collection_name="blue_nectar",
#     persist_directory="src/data/chroma",
# )

# # Save DB locally
# vector_store.persist()

In [6]:
# vector_store = Chroma(
#         collection_name="blue_nectar",
#         embedding_function=embeddings,
#         persist_directory="src/data/chroma",
#     )

# chain = RetrievalQAWithSourcesChain.from_chain_type(OpenAI(temperature=0,verbose=True),chain_type="map_reduce", retriever=vector_store.as_retriever(),verbose=True)

In [13]:
def get_open_ai_model(model_name,temperature = 0, openai_api_key = None,chat_model = True, verbose = False):
    """Build an OpenAI chat or completion model wrapper.

    Args:
        model_name: Name of the OpenAI model, forwarded as ``model_name``.
        temperature: Sampling temperature forwarded to the model.
        openai_api_key: API key to use. When None, the ``OPENAI_API_KEY``
            environment variable is read at call time. (A default of
            ``os.getenv(...)`` in the signature is evaluated only once, when
            the function is defined, and would miss a key loaded afterwards.)
        chat_model: True returns a ``ChatOpenAI``; False returns an ``OpenAI``.
        verbose: Forwarded to the model's ``verbose`` flag.

    Returns:
        The constructed ``ChatOpenAI`` or ``OpenAI`` instance.
    """
    if openai_api_key is None:
        openai_api_key = os.getenv("OPENAI_API_KEY")

    model_cls = ChatOpenAI if chat_model else OpenAI
    return model_cls(model_name=model_name,temperature=temperature,openai_api_key=openai_api_key,verbose=verbose)
    
def get_vectorstore(collection_name,persist_dir,embedding = None):
    """Open the Chroma collection ``collection_name`` stored under ``persist_dir``.

    Args:
        collection_name: Name of the Chroma collection.
        persist_dir: Directory passed to Chroma as ``persist_directory``.
        embedding: Embedding function to use; a fresh ``OpenAIEmbeddings()``
            is created when None.

    Returns:
        The constructed ``Chroma`` vector store.
    """
    embedding_fn = OpenAIEmbeddings() if embedding is None else embedding
    return Chroma(
        collection_name=collection_name,
        embedding_function=embedding_fn,
        persist_directory=persist_dir,
    )

def make_conversational_chain(model, vector_store ,memory = None , return_source_documents = True , verbose = False, system_prompt = None, question_prompt = None):
    """Build a ConversationalRetrievalChain over ``vector_store``.

    Args:
        model: LLM used by the chain; also used by the default memory.
        vector_store: Store whose ``as_retriever()`` supplies the documents.
        memory: Conversation memory. When None, a
            ``ConversationSummaryBufferMemory`` keyed on ``chat_history`` /
            ``question`` / ``answer`` is created.
        return_source_documents: Forwarded to the chain.
        verbose: Forwarded to the chain.
        system_prompt: Prompt for the answer step, passed through
            ``combine_docs_chain_kwargs``. When None, the chain's own default
            is used.
        question_prompt: Prompt that condenses history and follow-up into a
            standalone question. It is honoured whenever it is supplied; a
            supplied prompt used to be silently ignored when ``system_prompt``
            was None.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

        Chat History:
        {chat_history}
        Follow Up Input: {question}
        Standalone question:"""

    if memory is None:
        memory = ConversationSummaryBufferMemory(llm=model , memory_key="chat_history", return_messages=True,input_key='question', output_key='answer')

    chain_kwargs = dict(
        memory=memory,
        return_source_documents=return_source_documents,
        verbose=verbose,
    )

    if system_prompt is not None:
        chain_kwargs["combine_docs_chain_kwargs"] = dict(prompt=system_prompt)
        # With a custom answer prompt, fall back to the local condense template.
        if question_prompt is None:
            question_prompt = PromptTemplate.from_template(_template)

    # Pass the condense prompt whenever there is one; with neither prompt given,
    # the chain keeps its built-in defaults.
    if question_prompt is not None:
        chain_kwargs["condense_question_prompt"] = question_prompt

    return ConversationalRetrievalChain.from_llm(
        model,
        vector_store.as_retriever(),
        **chain_kwargs
    )

In [14]:
# Front end web app
import gradio as gr
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("clear")
    model = get_open_ai_model("gpt-3.5-turbo",verbose= True,temperature=0)
    vector_store = get_vectorstore("blue_nectar","src/data/chroma")

    # Prompt that folds the chat history and the new message into one question.
    question_prompt = PromptTemplate.from_template('''
    Given the following conversation and a user question, condense the conversation and the user question into a single question in its original language.

    Chat History:
    {chat_history}
    User Question: {question}
    Condensed Question:'''
    )


    # Prompt for the answer step; {context} receives the retrieved documents.
    system_prompt = PromptTemplate.from_template('''
    You are an expert salesperson representing Blue Nectar, a reputable online platform known for its wide range of high-quality contemprory ayurvedic products. 
    Your role is to assist customers by providing accurate information, offering helpful recommendations, and guiding them towards the best product choices. 
    Feel free to ask clarifying questions to better understand the customer's needs and preferences. Leverage the provided context to answer the question effectively 
    without generating false or fictional information. Double check your response for accuracy. Your responses should be short and friendly.
    Respond only to the following question using only the context and if you don't know the answer simply respond accordingly, don't make up things:

    Context: {context}
    Question: {question}

    As a salesperson, you can also ask questions to gain more insights and lead the customer more accurately. 
    Remember, your expertise and helpfulness are key in assisting customers in making informed choices.
    Once you have decided on the best product recommendation, please include the name(s) of the recommended product(s) in your response.'''
    )

    # NOTE(review): a single chain, and therefore a single conversation memory,
    # is created for the whole app — confirm this is acceptable if several
    # browser sessions use it at once.
    chain = make_conversational_chain(model,vector_store,system_prompt=system_prompt,question_prompt= question_prompt)

    def user(user_message, history):
        """Answer ``user_message`` and append the exchange to ``history``.

        Returns an update that empties the textbox, and the updated history
        whose new entry is the answer followed by the source URLs.
        """
        history = history or []
        # Nothing to answer: skip the LLM call for blank submissions.
        if not user_message or not user_message.strip():
            return gr.update(value=""), history

        # Get response from QA chain
        response = chain({"question": user_message})
        source = response["source_documents"]
        print("\n\nSources:\n")
        for document in source:
            print(f"Url: {document.metadata['source']}")
        # Several retrieved chunks can come from the same page; list each URL
        # once, in retrieval order.
        unique_urls = dict.fromkeys(document.metadata['source'] for document in source)
        urls = "".join(f"{url}\n" for url in unique_urls)
        # Append user message and response to chat history
        history.append((user_message, response["answer"] +"\n"+ urls))
        return gr.update(value=""), history

    def reset_conversation():
        """Clear the chain's memory along with the chat widget.

        Emptying only the widget would leave the previous conversation in the
        chain's memory, so it would still influence later answers.
        """
        chain.memory.clear()
        return None

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(reset_conversation, None, chatbot, queue=False)

In [None]:
# Serve the `demo` Blocks app built in the previous cell.
demo.launch(debug=True)