In [1]:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import openai
from langchain.embeddings import AzureOpenAIEmbeddings
import spacy
import warnings
import faiss
import numpy as np
import json
import pandas as pd
import matplotlib.pyplot as plt
from langchain.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import dotenv, os

_ = dotenv.load_dotenv(dotenv.find_dotenv())

embeddings = AzureOpenAIEmbeddings(
    deployment = os.getenv("Embeddings_GPT_4_DEPLOYMENT_NAME"),
    model = os.getenv("Embeddings_GPT_4_MODEL"),
    openai_api_base = os.getenv("Embeddings_GPT_4_API_BASE"),
    openai_api_key = os.getenv("Embeddings_GPT_4_API_KEY"),
    openai_api_version = os.getenv("Embeddings_GPT_4_API_VERSION"),
    openai_api_type = os.getenv("Embeddings_GPT_4_API_TYPE"),
    chunk_size = 1
)

def crawl_website(start_url, max_pages=1000):
    visited = set()
    frontier = [start_url]
    pages = []
    
    while frontier and len(visited) < max_pages:
        url = frontier.pop(0)
        if url in visited:
            continue
        print(f"Crawling: {url}")
        try:
            response = requests.get(url, timeout=10)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            visited.add(url)
            continue
        
        if response.status_code != 200:
            print(f"Non-200 status code for {url}")
            visited.add(url)
            continue
        
        content_type = response.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            visited.add(url)
            continue
        
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text(separator="\n")
        pages.append({"url": url, "content": text})
        visited.add(url)
        
        # Extract all links and add those within the aptos.dev domain
        for link in soup.find_all("a", href=True):
            href = link.get("href")
            full_url = urljoin(url, href)
            parsed = urlparse(full_url)
            # Only follow links that belong to the aptos.dev domain
            if "aptos.dev" in parsed.netloc:
                # Remove any URL fragment for consistency
                full_url = full_url.split("#")[0]
                if full_url not in visited and full_url not in frontier:
                    frontier.append(full_url)
        
        # Be polite and wait a short period between requests
        time.sleep(0.5)
                    
    return pages

# Starting URL for the Aptos documentation
start_url = "https://aptos.dev/en/build/get-started"
pages = crawl_website(start_url, max_pages=1000)
print(f"Total pages scraped: {len(pages)}")

# Convert scraped pages to LangChain Document objects with metadata.
documents = [
    Document(page_content=page["content"], metadata={"source": page["url"]})
    for page in pages
]

index = faiss.IndexFlatIP(len(embeddings.embed_query("Hello World")))
index.reset()

vectorstore = FAISS( 
    embedding_function=embeddings, 
    index = index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id = {}
)

# Split documents into smaller chunks to improve the quality of vector embeddings.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(documents)

# Initialize embeddings using a HuggingFace model and build a FAISS vector store.

vectorstore.add_texts(split_docs)

print("Vectorstore created.")
print("Number of vectors:", len(vectorstore.index_to_docstore_id))

Crawling: https://aptos.dev/en/build/get-started
Crawling: https://aptos.dev/
Crawling: https://aptos.dev/en/build/get-started/developer-setup
Crawling: https://aptos.dev/en/build/get-started/ethereum-cheatsheet
Crawling: https://aptos.dev/en/build/get-started/solana-cheatsheet
Crawling: https://aptos.dev/en/build/smart-contracts
Crawling: https://aptos.dev/en/build/smart-contracts/why-move
Crawling: https://aptos.dev/en/build/smart-contracts/create-package
Crawling: https://aptos.dev/en/build/smart-contracts/compiling
Crawling: https://aptos.dev/en/build/smart-contracts/book/unit-testing
Crawling: https://aptos.dev/en/build/smart-contracts/deployment
Crawling: https://aptos.dev/en/build/smart-contracts/debugging
Crawling: https://aptos.dev/en/build/smart-contracts/bcs
Crawling: https://aptos.dev/en/build/smart-contracts/object/creating-objects
Crawling: https://aptos.dev/en/build/smart-contracts/object/using-objects
Crawling: https://aptos.dev/en/build/smart-contracts/digital-asset
Cr

Created a chunk of size 5195, which is longer than the specified 1000
Created a chunk of size 4880, which is longer than the specified 1000
Created a chunk of size 5128, which is longer than the specified 1000
Created a chunk of size 5017, which is longer than the specified 1000
Created a chunk of size 5111, which is longer than the specified 1000
Created a chunk of size 5039, which is longer than the specified 1000
Created a chunk of size 5073, which is longer than the specified 1000
Created a chunk of size 5061, which is longer than the specified 1000
Created a chunk of size 5211, which is longer than the specified 1000
Created a chunk of size 5117, which is longer than the specified 1000
Created a chunk of size 1188, which is longer than the specified 1000
Created a chunk of size 5132, which is longer than the specified 1000
Created a chunk of size 5441, which is longer than the specified 1000
Created a chunk of size 5309, which is longer than the specified 1000
Created a chunk of s

Non-200 status code for https://aptos.dev/en/build/fetch-price-updates
Total pages scraped: 324


Created a chunk of size 5117, which is longer than the specified 1000
Created a chunk of size 5319, which is longer than the specified 1000
Created a chunk of size 5131, which is longer than the specified 1000
Created a chunk of size 4998, which is longer than the specified 1000
Created a chunk of size 4992, which is longer than the specified 1000
Created a chunk of size 5130, which is longer than the specified 1000
Created a chunk of size 5138, which is longer than the specified 1000
Created a chunk of size 5052, which is longer than the specified 1000
Created a chunk of size 5135, which is longer than the specified 1000
Created a chunk of size 5065, which is longer than the specified 1000
Created a chunk of size 5164, which is longer than the specified 1000
Created a chunk of size 5183, which is longer than the specified 1000
Created a chunk of size 4987, which is longer than the specified 1000
Created a chunk of size 4985, which is longer than the specified 1000
Created a chunk of s

ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.