In [None]:
# https://github.com/tomasonjo/blogs/blob/master/neo4jdocs/neo4j_support_bot.ipynb
# %pip install langchain==0.0.142 openai==0.27.4 beautifulsoup4==4.12.2 chromadb==0.3.21

In [1]:
import os
import dotenv

import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd

from langchain.document_loaders import (
    # GitLoader,
    # YoutubeLoader,
    # DataFrameLoader,
    UnstructuredURLLoader,
)
from langchain.text_splitter import CharacterTextSplitter

import tiktoken

# from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, Tool

from crawl import crawl, clean, strip_content

# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read back later with os.getenv().
dotenv.load_dotenv()

True

In [2]:
# Regex pattern to match an absolute http:// or https:// URL.
# Uses "s?" rather than "[s]*" so that malformed schemes such as
# "httpss://" are rejected instead of being treated as valid links.
HTTP_URL_PATTERN = r"^https?://.+"

# Define root domain to crawl
DOMAIN = "contactout.com"
# Start URL for the crawl, derived from DOMAIN so the two cannot drift apart
FULL_URL = f"https://{DOMAIN}/"

### Embedding documents for the first time

In [3]:
# Crawl starting from FULL_URL. HTTP_URL_PATTERN is handed to crawl(),
# presumably to recognise absolute links — crawl() lives in crawl.py, verify there.
seen = crawl(FULL_URL, HTTP_URL_PATTERN)
# Report how many entries crawl() returned
print(len(seen))

https://contactout.com/
https://contactout.com/chrome-extension
https://contactout.com/integrations
https://contactout.com/clients/bdex/
https://contactout.com/recruiters
https://contactout.com/clients/spreetail/
https://contactout.com/contact
https://contactout.com/optout
https://contactout.com/privacy
https://contactout.com/privacy_policy
https://contactout.com/privacy_data_subjects
https://contactout.com
https://contactout.com/terms
https://contactout.com/search-portal
https://contactout.com/pricing
https://contactout.com/meeting?utm_source=pricing
https://contactout.com/contact_us
https://contactout.com/dashboard/search
https://contactout.com/integrations-feature
https://contactout.com/integrations/
https://contactout.com/sales-users
https://contactout.com/meeting?utm_source=talk_to_sales_use_case
https://contactout.com/clients
https://contactout.com/clients/spreetail
https://contactout.com/clients/worth-search
https://contactout.com/clients/txt-international
https://contactout.com

In [4]:
# The result is stored under the name `clean`, which shadows the function
# imported from crawl.py. Importing the function under an alias right here
# keeps this cell re-runnable: without it, a second run would try to call
# the previous result instead of the function.
from crawl import clean as clean_urls

clean = clean_urls(seen)
print(len(clean))

39


In [5]:
# Materialise the cleaned URL collection as a list for the loader
urls = [*clean]

# A User-Agent header is required; without it the requests come back 403 Forbidden
loader = UnstructuredURLLoader(
    headers={"User-Agent": "Mozilla/5.0"},
    urls=urls,
)
data = loader.load()
print(len(data))

39


In [6]:
# Overwrite each loaded document's text with the output of strip_content()
for d in data:
    d.page_content = strip_content(d.page_content)

In [7]:
# Chunking strategy: split on single spaces into pieces of up to 1000
# characters, with 200 characters of overlap between neighbouring chunks
splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=1000,
    chunk_overlap=200,
)
# Apply the splitter to every loaded document
data_split = splitter.split_documents(data)
print(len(data_split))

282


In [8]:
# Preview only the first few chunks; echoing the whole list floods the notebook output
data_split[:3]

[Document(page_content='Use Cases Sales Unlimited sales users for $299 Recruiters Speed up hiring of top talents Features Search Portal Find the right prospects quickly with 20+ search filters Email Campaigns Automate & accelerate your outreach Chrome Extension Supercharge your prospecting on LinkedIn and across the the web Integrations Power up your existing tools with ContactOut data Data Enrichment Know your leads in minutes with ContactOut data API Integrate ContactOut data into your workflow Use Cases Sales Recruiters Features Search Portal Email Campaigns Chrome Extension Integrations Data Enrichment API Our Data Clients Pricing Support Guides Get a demo Login Sign up for free Login The Power Hive: Improving Recruitment with ContactOut ContactOut is a massive advantage. My advice is do not go out and buy ‘shiny new toys’. You’ve got to go try something as simple and accurate as ContactOut. Keith Southern Co-Founder and Partner at The Power Hive The Power Hive The Power Hive has b

In [9]:
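# One-time setup, kept commented out: embeds the chunks and persists them to the
# "db" directory. Uncomment to (re)build the store; the next section reopens it.
# # Define embedding model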
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# store = Chroma.from_documents(
#     data_split, embeddings, collection_name="cto", persist_directory="db"
# )

# store.persist()
# store = None

Using embedded DuckDB with persistence: data will be stored in: db


### Chroma DB persisted

In [10]:
# Embedding model, authenticated with the key taken from the environment
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Open the "cto" collection from the "db" persistence directory
store = Chroma(
    embedding_function=embeddings,
    persist_directory="db",
    collection_name="cto",
)
# Display the underlying collection object as the cell output
store._collection

Using embedded DuckDB with persistence: data will be stored in: db


Collection(name=cto)