In [1]:
# https://github.com/tomasonjo/blogs/blob/master/neo4jdocs/neo4j_support_bot.ipynb
# %pip install langchain==0.0.142 openai==0.27.4 beautifulsoup4==4.12.2 chromadb==0.3.21

In [2]:
import os
import dotenv

import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd

from langchain.document_loaders import (
    # GitLoader,
    # YoutubeLoader,
    # DataFrameLoader,
    UnstructuredURLLoader,
)
from langchain.text_splitter import CharacterTextSplitter

import tiktoken

# from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, Tool

from crawl import crawl, clean, strip_content

# Load variables from a .env file into the process environment; a later cell
# reads OPENAI_API_KEY from the environment via os.getenv().
dotenv.load_dotenv()

True

In [3]:
# Regex pattern to match an absolute http:// or https:// URL.
# `https?` allows exactly one optional "s"; the earlier `http[s]*` form also
# accepted malformed schemes such as "httpss://".
HTTP_URL_PATTERN = r"^https?://.+"

# Root domain to crawl, and the start URL derived from it so the two cannot drift apart
DOMAIN = "mushroomworldacademy.com"
FULL_URL = f"https://{DOMAIN}/"

### Embedding documents for the first time

In [4]:
# Crawl starting from FULL_URL, handing HTTP_URL_PATTERN to the crawler, and keep
# whatever it returns in `seen`; then report how many entries were collected.
# NOTE(review): the URLs listed in this cell's output presumably come from
# print calls inside crawl() — confirm against crawl.py.
seen = crawl(FULL_URL, HTTP_URL_PATTERN)
print(len(seen))

https://mushroomworldacademy.com/
https://mushroomworldacademy.com/courses
https://mushroomworldacademy.com/courses-tr/diverting-food-waste-to-growing-edible-mushroom
https://mushroomworldacademy.com/product/diverting-waste-streams-to-grow-sustainable-edible-mushroom
https://mushroomworldacademy.com/product/learn-to-grow-medicinal-mushroom-lingzhi
https://mushroomworldacademy.com/wp-content/uploads/2023/01/Lingzhi.mp4
https://mushroomworldacademy.com/?add-to-cart=1987
https://mushroomworldacademy.com/wp-content/uploads/2023/01/learn-to-grow-medicinal-mushroom-lingzhi.png
https://mushroomworldacademy.com/?add-to-cart=1990
https://mushroomworldacademy.com/wp-content/uploads/2023/01/2-DAYS-PROGRAM-VIDEO.mp4
https://mushroomworldacademy.com/product-category/courses
https://mushroomworldacademy.com/product/learn-to-grow-edible-mushroom-home
https://mushroomworldacademy.com/wp-content/uploads/2023/01/learn-to-grow-edible-mushroom-home.png
https://mushroomworldacademy.com/wp-content/uploads/2

In [5]:
# The result is stored in `clean` because the next cell reads that name, but this
# rebinding shadows the `clean` function imported at the top of the notebook.
# Calling `clean(seen)` directly therefore raised TypeError whenever this cell was
# re-run. Importing the helper under a cell-local alias keeps the cell re-runnable.
from crawl import clean as clean_urls

clean = clean_urls(seen)
print(len(clean))

30


In [6]:
# Materialise the cleaned URL collection as a plain list for the loader
urls = [*clean]

# Build the URL loader and load the pages behind those URLs into documents
loader = UnstructuredURLLoader(urls=urls)
data = loader.load()
print(len(data))

30


In [7]:
# Overwrite each document's text, in place, with the result of strip_content()
for d in data:
    d.page_content = strip_content(d.page_content)

In [8]:
# Define text chunk strategy
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separator=" ")
# Split documents into chunks
data_split = splitter.split_documents(data)
print(len(data_split))

91


In [9]:
# Preview only the first few chunks; echoing the whole of data_split floods the
# notebook output with the full text of every chunk.
data_split[:3]

[Document(page_content="Learn Courses & Experiences Mushroom Guide Private Bookings The Team Impact FAQs Contact Us Shop MUSHROOM WORLD ACADEMY Learn From Us MUSHROOM WORLD ACADEMY Learn From Us MUSHROOM WORLD ACADEMY Learn From Us Mushroom World Academy In Action! 1st To Grow Lion's Mane Fresh in Singapore! What Our Participants Are Saying Chua Kheng Wee Louis, Member of Parliament Sengkang GRC (Rivervale) “William and his team conducted a ‘Mushroom Growing in The Community’ workshop for seniors and disadvantaged families in my constituency in January 2023. Who could have known that the world of mushrooms can be so fascinating! While the length of the workshop was close to four hours, it was a thoroughly engaging session and his passion for mushrooms truly shone through. William and his team were particularly patient in guiding our senior residents through each step in preparing their very own mushroom growing kit, and it was clear that every participant truly enjoyed the very meaning

In [None]:
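# One-time setup, intentionally left commented out: it embeds data_split into the
# "mwa" Chroma collection and persists it under the "db" directory. Uncomment to
# (re)build that collection.
# # Define embedding model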
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# store = Chroma.from_documents(
#     data_split, embeddings, collection_name="mwa", persist_directory="db"
# )

# store.persist()
# store = None

### Chroma DB persisted

In [None]:
# Define embedding model
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
store = Chroma(
    collection_name="mwa", persist_directory="db", embedding_function=embeddings
)
store._collection