In [None]:
# https://github.com/tomasonjo/blogs/blob/master/neo4jdocs/neo4j_support_bot.ipynb
# %pip install langchain==0.0.142 openai==0.27.4 beautifulsoup4==4.12.2 chromadb==0.3.21

In [1]:
import os
import dotenv

import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd

from langchain.document_loaders import (
    # GitLoader,
    # YoutubeLoader,
    # DataFrameLoader,
    UnstructuredURLLoader,
)
from langchain.text_splitter import CharacterTextSplitter

import tiktoken

# from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, Tool

from crawl import crawl, clean, strip_content

# Load variables from a local .env file into the process environment; the
# OPENAI_API_KEY is read back from the environment later via os.getenv.
dotenv.load_dotenv()

True

In [2]:
# Regex pattern to match an absolute http:// or https:// URL.
# `https?` accepts exactly one optional "s", so malformed schemes such as
# "httpsss://" are rejected.
HTTP_URL_PATTERN = r"^https?://.+"

# Root domain to crawl; the start URL is derived from it so the two stay in sync.
DOMAIN = "accountingservices.sg"
FULL_URL = f"https://{DOMAIN}/"

### Embedding documents for the first time

In [3]:
# Crawl the site starting from the root URL, using the URL pattern defined
# above, then report how many entries came back.
seen = crawl(FULL_URL, HTTP_URL_PATTERN)
seen_count = len(seen)
print(seen_count)

https://accountingservices.sg/
https://accountingservices.sg/xbrl-filing/
https://accountingservices.sg/corporate-secretarial/
https://accountingservices.sg/tax/
https://accountingservices.sg/2019/05/07/
HTTP Error 500: Internal Server Error
https://accountingservices.sg/accounting/
https://accountingservices.sg/home/
https://accountingservices.sg/technological-disruption-in-tax-administration-an-interview-with-mr-peter-green-oecd/
https://accountingservices.sg/acras-new-free-online-interactive-programme/
https://accountingservices.sg/iras-recovers-10m-from-tax-avoiding-high-earners/
https://accountingservices.sg/revision-in-gst-import-relief-for-travellers-with-effect-from-19-february-2019/
https://accountingservices.sg/financial-reporting-practice-guidance-1-of-2019/
https://accountingservices.sg/wage-credit-scheme-payouts-for-2019/
https://accountingservices.sg/more-e-payment-options-for-customers/
https://accountingservices.sg/paying-taxes-2019_1/
https://accountingservices.sg/cate

In [4]:
# Import the helper under an alias: the result below is stored in a variable
# that is also called `clean`, which shadows the function imported at the top
# of the notebook. Calling through the alias keeps this cell re-runnable
# instead of raising a "not callable" TypeError on a second execution.
from crawl import clean as clean_crawled_urls

# `clean` stays bound to the cleaned result because later cells read it.
clean = clean_crawled_urls(seen)
print(len(clean))

69


In [5]:
# Materialise the cleaned crawl results as a list for the loader.
urls = [*clean]

# A User-Agent header is required: without it the requests come back
# as 403 Forbidden.
request_headers = {"User-Agent": "Mozilla/5.0"}
loader = UnstructuredURLLoader(headers=request_headers, urls=urls)
data = loader.load()
print(len(data))

The MIME type is 'application/x-empty'. This file type is not currently supported in unstructured.
Error fetching or processing https://accountingservices.sg/2019/09/15/, exeption: Invalid file. The FileType.UNK file type is not supported in partition.
The MIME type is 'application/x-empty'. This file type is not currently supported in unstructured.
Error fetching or processing https://accountingservices.sg/2019/10/17/, exeption: Invalid file. The FileType.UNK file type is not supported in partition.
The MIME type is 'application/x-empty'. This file type is not currently supported in unstructured.
Error fetching or processing https://accountingservices.sg/2019/12/11/, exeption: Invalid file. The FileType.UNK file type is not supported in partition.
The MIME type is 'application/x-empty'. This file type is not currently supported in unstructured.
Error fetching or processing https://accountingservices.sg/2019/05/29/, exeption: Invalid file. The FileType.UNK file type is not supported in

42


In [6]:
# Overwrite each loaded document's text with the output of strip_content()
# (this mutates the documents held in `data` in place).
for doc in data:
    doc.page_content = strip_content(doc.page_content)

In [7]:
# Chunking strategy: split on single spaces with chunk_size=1000 and
# chunk_overlap=200.
splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=200)

# Break every loaded document into chunks and report how many were produced.
data_split = splitter.split_documents(data)
chunk_count = len(data_split)
print(chunk_count)

134


In [8]:
# Preview only the first few chunks; displaying the whole list floods the
# cell output with every chunk's full text.
data_split[:3]

[Document(page_content='GST Tourist Refund Fraud Six employees of a local jeweller Abiraame Jewellers – Arumugam Chelladurai (aged 48), Kulamani Ganesan (aged 31), Manickavasagam Saravanan (aged 40), Pang Wei Koon (aged 46), Murugesan Saravanan (aged 41) and Shanmugam Sampathkumar (aged 33) – have been convicted in Court for assisting claimants in making fraudulent Goods and Services Tax (GST) tourist refund claims. The six employees faced a total of 80 charges for their involvement in the conspiracy, in which GST tourist refunds amounting to approximately $15,282 were fraudulently claimed. ‘ The Case The case is linked to the five Indian nationals (“claimants”) who were previously arrested and convicted for conspiring to make fraudulent GST refund claims at Changi Airport. As part of their illegal scheme, the claimants would loiter in and around a number of jewellery shops along Serangoon Road, including Abiraame Jewellers’ two shops at 85 and 123 Serangoon Road, for the purpose of ob

In [9]:
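# One-time setup, left commented out so that re-running the notebook does not
# embed every chunk again: uncomment to build and persist the "acs" Chroma collection.
# # Define embedding model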
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# store = Chroma.from_documents(
#     data_split, embeddings, collection_name="acs", persist_directory="db"
# )

# store.persist()
# store = None

Using embedded DuckDB with persistence: data will be stored in: db


### Chroma DB persisted

In [10]:
# Embedding model, authenticated with the API key taken from the environment.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Open the "acs" collection backed by the on-disk "db" directory
# (presumably the one written by the one-time embedding step - confirm).
store = Chroma(
    persist_directory="db",
    collection_name="acs",
    embedding_function=embeddings,
)

# Show the underlying collection object as the cell output.
store._collection

Using embedded DuckDB with persistence: data will be stored in: db


Collection(name=acs)

In [None]:
# Chat model used by the question-answering chains in this notebook:
# temperature 0, with completions capped at 512 tokens.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    max_tokens=512,
    temperature=0,
)

In [None]:
# Instruction text for the sales assistant. `{context}` and `{question}` are
# the two placeholders filled in by the prompt template below.
sales_template = """
As a customer marketing bot, your goal is to provide accurate and helpful information about Elliot & Co.
You should answer user inquiries based on the context provided and avoid making up answers.
If you don't know the answer, simply state that you don't know.
Remember to provide relevant information about how Elliot & Co can assist the user through its services, strengths and benefits.

{context}
=========
Question: {question}
"""

# Wrap the raw text in a PromptTemplate that declares both placeholders.
SALES_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=sales_template,
)

# RetrievalQA chain wired to the Chroma store's retriever and the custom
# sales prompt.
sales_retriever = store.as_retriever()
sales_qa = RetrievalQA.from_chain_type(
    chain_type_kwargs={"prompt": SALES_PROMPT},
    retriever=sales_retriever,
    chain_type="stuff",
    llm=llm,
)

In [None]:
# Ask the sales chain a general question and print its plain-text answer.
company_answer = sales_qa.run("What does the company do?")
print(company_answer)

### `RetrievalQAWithSourcesChain` (The answers aren't that good compared to the above method)

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)


# System instructions for the chat-style prompt; `{context}` and `{question}`
# are its placeholders.
system_template = """
As a customer marketing bot, your goal is to provide accurate and helpful information about Elliot & Co.
You should answer user inquiries based on the context provided and avoid making up answers.
If you don't know the answer, simply state that you don't know.
Remember to provide relevant information about how Elliot & Co can assist the user through its services, strengths and benefits.

{context}
=========
Question: {question}
"""

# One system message carrying the instructions, followed by a human message
# that carries the user's question.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]

# NOTE(review): in the visible part of this notebook `prompt` is never passed
# to a chain - confirm whether it is meant to be used.
prompt = ChatPromptTemplate.from_messages(messages)

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

# Sources-aware QA chain over the same Chroma retriever; it is also asked to
# return the retrieved source documents alongside the answer.
sources_retriever = store.as_retriever()
chain = RetrievalQAWithSourcesChain.from_chain_type(
    return_source_documents=True,
    retriever=sources_retriever,
    chain_type="stuff",
    llm=llm,
)

In [None]:
from IPython.display import display, Markdown


def print_result(query, result):
    """Render a question with its answer and sources as Markdown.

    Args:
        query: The question that was asked.
        result: Mapping that provides the "answer" and "sources" entries.
    """
    sections = (
        f"### Question:\n{query}\n",
        f"### Answer:\n{result['answer']}\n",
        f"### Sources:\n{result['sources']}\n",
        # Trailing indentation kept so the rendered text stays unchanged.
        "    ",
    )
    display(Markdown("".join(sections)))

In [None]:
# First question for the sources-aware chain; the result is rendered in the
# following cell.
query = "What is this company?"
result = chain(query)

In [None]:
# Render the current question, its answer and the sources as Markdown.
print_result(query, result)

In [None]:
# Ask about a specific term ("retainer options for 6 months") and render the
# sourced answer.
query = "What does retainer options for 6 months mean?"
result = chain(query)
print_result(query, result)

In [None]:
# Ask from the point of view of a prospective client and render the sourced
# answer.
query = "I am a company that provides a chatbot as a service. How can you help?"
result = chain(query)
print_result(query, result)