In [1]:
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

True

In [2]:
import xmltodict
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
import os
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.chains import RetrievalQAWithSourcesChain,VectorDBQAWithSourcesChain,ConversationalRetrievalChain

In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

In [4]:
def extract_text_from(url):
    html = requests.get(url,headers= headers).text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in lines if line)

In [5]:
r = requests.get("https://vetic.in/sitemap.xml",headers= headers)
xml = r.text
raw = xmltodict.parse(xml)
print(raw)
site_urls = []

for item in raw['sitemapindex']['sitemap']:
    site_urls.append(item['loc'])

site_urls

{'sitemapindex': {'@xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9', 'sitemap': [{'loc': 'https://vetic.in/sitemap_website.xml'}, {'loc': 'https://vetic.in/sitemap_dynamic.xml'}, {'loc': 'https://vetic.in/sitemap_blogs.xml'}]}}


['https://vetic.in/sitemap_website.xml',
 'https://vetic.in/sitemap_dynamic.xml',
 'https://vetic.in/sitemap_blogs.xml']

In [6]:
pages = []

for site_url in site_urls:
    try:
        r = requests.get(site_url,headers= headers)
        xml = r.text
        raw = xmltodict.parse(xml)
        print(raw)
        for info in raw['urlset']['url']:
            url = info['loc']
            pages.append({'text': extract_text_from(url), 'source': url})
    except:
        print("Error")

{'urlset': {'@xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9', '@xmlns:image': 'http://www.google.com/schemas/sitemap-image/1.1', 'url': [{'loc': 'https://vetic.in/', 'lastmod': '2022-08-06T17:15:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/gurgaon/sector-57', 'lastmod': '2022-11-10T17:15:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/gurgaon', 'lastmod': '2022-11-10T17:15:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/gurgaon/golf-course-road', 'lastmod': '2022-11-10T11:50:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/gurgaon/sector-45', 'lastmod': '2023-01-10T11:50:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/gurgaon/sohna-road', 'lastmod': '2023-02-10T11:50:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/noida/sector-49', 'lastmod': '2023-01-10T11:50:49+05:30', 'changefreq': 'weekly'}, {'loc': 'https://vetic.in/clinics/delhi/sector-11', 

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],chunk_overlap=200,)
doc_chunks = []
for page in pages:
    chunks = text_splitter.split_text(page['text'])
    for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk,
                metadata={"source": page['source']},
            )
            doc_chunks.append(doc)
    print(f"Split {page['source']} into {len(chunks)} chunks")

Split https://vetic.in/ into 10 chunks
Split https://vetic.in/clinics/gurgaon/sector-57 into 5 chunks
Split https://vetic.in/clinics/gurgaon into 9 chunks
Split https://vetic.in/clinics/gurgaon/golf-course-road into 5 chunks
Split https://vetic.in/clinics/gurgaon/sector-45 into 5 chunks
Split https://vetic.in/clinics/gurgaon/sohna-road into 5 chunks
Split https://vetic.in/clinics/noida/sector-49 into 5 chunks
Split https://vetic.in/clinics/delhi/sector-11 into 5 chunks
Split https://vetic.in/our-services into 8 chunks
Split https://vetic.in/urgent-care into 8 chunks
Split https://vetic.in/preventive-care into 8 chunks
Split https://vetic.in/veterinary-consultation into 7 chunks
Split https://vetic.in/about-us into 7 chunks
Split https://vetic.in/page-not-found into 5 chunks
Split https://vetic.in/terms-of-service into 52 chunks
Split https://vetic.in/privacy-policy into 29 chunks
Split https://vetic.in/grooming into 6 chunks
Split https://vetic.in/vaccination into 10 chunks
Split https

In [10]:
print(len(doc_chunks))

2830


In [9]:
embeddings = OpenAIEmbeddings()

vector_store = Chroma.from_documents(
    doc_chunks,
    embeddings,
    collection_name="vetic",
    persist_directory="src/data/chroma",
)

# Save DB locally
vector_store.persist()

IOException: IO Error: Could not set lock on file "src/data/chroma/chroma-embeddings.parquet.tmp": Resource temporarily unavailable

In [6]:
def get_open_ai_model(model_name,temperature = 0, openai_api_key = os.getenv("OPENAI_API_KEY"),chat_model = True, verbose = False):
    if chat_model:
        return ChatOpenAI(model_name=model_name,temperature=temperature,openai_api_key=openai_api_key,verbose=verbose)
    else:
        return OpenAI(model_name=model_name,temperature=temperature,openai_api_key=openai_api_key,verbose=verbose)
    
def get_vectorstore(collection_name,persist_dir,embedding = None):
    if embedding is None:
        embedding = OpenAIEmbeddings()

    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embedding,
        persist_directory=persist_dir,
    )

    return vector_store

def make_conversational_chain(model, vector_store ,memory = None , return_source_documents = True , verbose = False, system_prompt = None):
    if memory is None:    
        memory = ConversationSummaryBufferMemory(llm=model , memory_key="chat_history", return_messages=True,input_key='question', output_key='answer')

    if system_prompt is None:
        chain = ConversationalRetrievalChain.from_llm(
        model, 
        vector_store.as_retriever(), 
        memory=memory,
        return_source_documents=return_source_documents,
        verbose=verbose
        )
        return chain

    chain = ConversationalRetrievalChain.from_llm(
    model, 
    vector_store.as_retriever(), 
    memory=memory,
    return_source_documents=return_source_documents,
    combine_docs_chain_kwargs=dict(prompt=system_prompt),
    verbose=verbose
    )

    return chain

In [7]:
import gradio as gr
import random
import time

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    model = get_open_ai_model("gpt-3.5-turbo",verbose= True,temperature=0.3)
    vector_store = get_vectorstore("vetic","src/data/chroma")

    system_prompt = PromptTemplate.from_template('''
    You are a customer support guide representing vetic, a reputable online platform known for offering pet healthcare services in India. 
    Your role is to assist customers by providing accurate information, offering helpful recommendations, and guiding them towards the solutions of their issues. 
    Feel free to ask clarifying questions to better understand the customer's needs and preferences. Leverage the provided context to answer the question effectively 
    without generating false or fictional information. Double check your response for accuracy. Your responses should be short and friendly.
    Respond only to the following question using only the context and if you don't know the answer respond with "May I connect you with an expert in this topic to discuss this in detail?":

    Context: {context}
    Question: {question}

    You can also ask questions to gain more insights and lead the customer more accurately. 
    Remember, your expertise and helpfulness are key in assisting customers in making informed choices.'''
    )


    # strict_system_prompt = PromptTemplate.from_template('''
    # Respond to the following question using only the context, and if you don't know the answer, 
    # simply respond accordingly without making up information:

    # Context: {context}
    # Question: {question}

    # You can also ask questions to gather more insights and guide the user more effectively. Remember, your expertise and helpfulness are crucial in assisting users 
    # with their technical concerns and providing them with the best solutions.'''
    # )


    chain = make_conversational_chain(model,vector_store,system_prompt=system_prompt)
    
    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):
        response = chain({"question": history[-1][0]})
        source = response["source_documents"]
        urls = ""
        print("\n\nSources:\n")
        for document in source:
            print(f"Url: {document.metadata['source']}")
            urls += document.metadata['source']
            urls += "\n"
        bot_message = response["answer"] + "\n" + urls
        history[-1][1] = ""
        for character in bot_message:
            history[-1][1] += character
            time.sleep(0.01)
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    # make it so that on clear button new conversation chain is made

    clear.click(lambda: chain.memory.clear() ,  None, chatbot, queue=False)


In [8]:
demo.queue()
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://ef54bcbc0e7d403825.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces






Sources:

Url: https://vetic.in/cat-surgeries-delhi
Url: https://vetic.in/Dr.-Danish-Bhutyal
Url: https://vetic.in/veterinary-clinic-near-me
Url: https://vetic.in/pet-emergency-care-near-me


Sources:

Url: https://vetic.in/privacy-policy
Url: https://vetic.in/privacy-policy
Url: https://vetic.in/privacy-policy
Url: https://vetic.in/terms-of-service
