In [1]:
# NOTE(review): stray leftover expression — the name `om` is never assigned in the
# visible notebook, so this cell raises the NameError shown below. Consider deleting the cell.
om

NameError: name 'om' is not defined

In [1]:
from bs4 import BeautifulSoup
from rich import print as pprint
import aiohttp

In [None]:
# One HTTP session shared by every request in this notebook.
# raise_for_status=True makes error responses raise instead of returning an error page body.
# NOTE(review): the session is never closed in the visible cells — confirm whether that matters here.
session = aiohttp.ClientSession(
    raise_for_status=True,
    headers={'user-agent': 'om'},
)

async def fetch_html(url: str) -> str:
    """Fetch ``url`` through the shared ``session`` and return the response body as text.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The body of the response, decoded to ``str``.
    """
    async with session.get(url) as resp:
        html = await resp.text()
    return html

base_url = "https://plato.stanford.edu/"
toc = BeautifulSoup(await fetch_html(base_url + "projected-contents.html"))

print("Retrieved Table of Contents")

In [None]:
# Read each anchor's href exactly once (anchors without an href yield None).
hrefs = (a.attrs.get('href') for a in toc.find_all('a'))

# Keep only relative links that start with 'entries' and turn them into absolute URLs.
# dict.fromkeys drops repeated links while preserving first-seen order, so an article that is
# linked more than once in the table of contents is fetched and embedded only once.
to_scrape = list(dict.fromkeys(
    base_url + href
    for href in hrefs
    if href and href.startswith('entries')
))
# Uncomment to limit the run to a handful of articles while iterating:
# to_scrape = to_scrape[:5]

print(f"Found {len(to_scrape)} articles to scrape")

In [None]:
from typing import List
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_core.documents import Document


all_documents: List[Document] = []
for url in to_scrape:
    article = BeautifulSoup(await fetch_html(url))

    title = article.title.text.split('(')[0].strip()
    body = article.find('div', {'id': 'main-text'})

    # docs: https://python.langchain.com/docs/how_to/HTML_header_metadata_splitter/#usage-examples
    splitter = HTMLHeaderTextSplitter([('h1', 'section'), ('h2', 'subsection')], return_each_element=True)  
    documents = splitter.split_text(str(body))

    for document in documents:
        document.metadata['title'] = title
        document.metadata['url'] = url

    all_documents += documents

    print(f"Retrieved {title} as {len(documents)} documents")

print(f"Completed fetching, created {len(all_documents)} documents")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model; per the original note its input limit is 512 tokens.
# NOTE(review): chunks longer than that are presumably truncated by the model — confirm.
model = HuggingFaceEmbeddings(
    model_name="thenlper/gte-small",
)

# One embedding vector per document, in the same order as all_documents.
embeddings = model.embed_documents(
    [doc.page_content for doc in all_documents]
)

print("Embedded documents")

In [None]:
# docs: https://github.com/cmudig/emblaze#examples

import emblaze
from emblaze.utils import Field, ProjectionTechnique

# Wrap the vectors for emblaze: positions are the embedding vectors, and each point's
# colour value is the title of the article it came from.
emb = emblaze.Embedding(
    {
        Field.POSITION: embeddings,
        Field.COLOR: [d.metadata['title'] for d in all_documents],
    }
)
# Neighbours are computed with cosine distance.
emb.compute_neighbors(metric='cosine')


In [None]:
# Dimensionality reduction: produce one projection of the embedding per technique, in this order.
#   * TSNE — t-distributed Stochastic Neighbor Embedding.
#     NOTE(review): t-SNE is stochastic and no seed is set here, so the layout may differ between runs — confirm.
#   * PCA  — Principal Component Analysis: the 384-dimensional embeddings are projected onto the
#     2 directions of greatest variance, giving a 2D layout that preserves as much of the
#     overall spread of the points as possible.
variants = emblaze.EmbeddingSet(
    [
        emb.project(method=technique)
        for technique in (ProjectionTechnique.TSNE, ProjectionTechnique.PCA)
    ]
)

print("Reduced dimensions")

In [None]:
# Text shown for each point in the viewer: the subsection heading is passed as the name
# (None when a chunk has no 'subsection' metadata) and the article title as the description.
thumbnails = emblaze.TextThumbnails(
    names=[d.metadata.get("subsection") for d in all_documents],
    descriptions=[d.metadata['title'] for d in all_documents],
)
w = emblaze.Viewer(thumbnails=thumbnails, embeddings=variants)
# Bare last expression so the notebook renders the interactive widget.
w


Viewer(colorScheme='tableau', data={'data': [{'_format': 'compressed', '_idtype': 'u2', '_length': 386, 'ids':…