In [5]:
# set up environment and api key

import os
import glob
from bs4 import BeautifulSoup
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

import shutil
import getpass




In [6]:
# Prompt for OpenAI API key with password masking
print("Please enter your OpenAI API key:")
openai_api_key = getpass.getpass("API Key: ")

# Set as environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key

# Verify it was set (show only first/last few characters for security)
if openai_api_key:
    masked_key = f"{openai_api_key[:7]}...{openai_api_key[-4:]}"
    print(f"✓ API key set successfully: {masked_key}")
else:
    print("✗ No API key entered")

Please enter your OpenAI API key:
✓ API key set successfully: sk-proj...Tr8A


### Only run this when necessary! as it will delete all existing data!  And will cost $

In [None]:
# Initialize embeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Initialize vector store
vector_store = Chroma(
    collection_name="HTML_samples",
    embedding_function=embeddings,
    persist_directory='./chroma-db'
)

# Configure text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=300,
    length_function=len,
    is_separator_regex=False
)

def extract_metadata(soup):
    """Extract metadata from the HTML record section."""
    metadata = {
        'title': '',
        'author': '',
        'pub_info': '',
        'citation': ''
    }
    
    # Find the record dl element
    record_dl = soup.find('dl', class_='record')
    if not record_dl:
        print("No record dl found.")
        return metadata
    
    # Extract title
    title_div = record_dl.find('div', {'data-key': 'title'})
    if title_div:
        title_dd = title_div.find('dd')
        if title_dd:
            metadata['title'] = title_dd.get_text(strip=True)
    
    # Extract author
    author_div = record_dl.find('div', {'data-key': 'author'})
    if author_div:
        author_dd = author_div.find('dd')
        if author_dd:
            metadata['author'] = author_dd.get_text(strip=True)
    
    # Extract publication info
    pubinfo_div = record_dl.find('div', {'data-key': 'pubinfo'})
    if pubinfo_div:
        pubinfo_dds = pubinfo_div.find_all('dd')
        pub_parts = [dd.get_text(strip=True) for dd in pubinfo_dds]
        metadata['pub_info'] = ' '.join(pub_parts)
    
    # Extract citation - look specifically in the citation section
    citation_dt = record_dl.find('dt', string='Cite this Item')
    if citation_dt:
        citation_dd = citation_dt.find_next('dd')
        if citation_dd:
            citation_span = citation_dd.find('span')
            if citation_span:
                metadata['citation'] = citation_span.get_text(strip=True)
    
    return metadata

def get_document_title(soup, metadata):
    """Extract the overall document title from HTML or metadata."""
    # Use metadata title if available
    if metadata.get('title'):
        return metadata['title']
    
    # Fallback to HTML title tag
    title_tag = soup.find('title')
    if title_tag and title_tag.text.strip():
        return title_tag.text.strip()
    
    # Try h1 or h2 for document title
    h1 = soup.find('h1')
    if h1:
        return h1.text.strip()
    
    h2 = soup.find('h2')
    if h2:
        return h2.text.strip()
    
    return "Untitled Document"

def extract_pages(soup):
    """Extract individual pages with their metadata from the HTML."""
    pages = []
    
    # Find all article elements that represent pages
    articles = soup.find_all('article', class_='fullview-page')
    
    for article in articles:
        # Extract page metadata from the h3 heading
        page_heading = article.find('h3', class_='js-toc-ignore')
        
        if page_heading:
            page_num = page_heading.get('data-p-num', 'Unknown')
            page_label = page_heading.get('data-heading-label', f'Page {page_num}')
            base = page_heading.get('data-base', '')
        else:
            page_num = 'Unknown'
            page_label = 'Unknown Page'
            base = ''
        
        # Extract text content from this page, removing script/style
        for script in article(["script", "style"]):
            script.decompose()
        
        # Get clean text from the page
        text = article.get_text(separator=' ', strip=True)
        text = ' '.join(text.split())
        
        if text.strip():
            pages.append({
                'text': text,
                'page_number': page_num,
                'page_label': page_label,
                'base': base
            })
    
    return pages

def process_html_files(html_dir='html_source'):
    """Process all HTML files in the specified directory."""
    html_files = glob.glob(os.path.join(html_dir, '*.html'))
    
    if not html_files:
        print(f"No HTML files found in {html_dir}")
        return
    
    total_chunks = 0
    total_pages = 0
    
    for filename in html_files:
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                html_content = f.read()
            
            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # Extract metadata from the record section
            doc_metadata = extract_metadata(soup)
            
            # Get document-level title
            doc_title = get_document_title(soup, doc_metadata)
            
            # Extract pages
            pages = extract_pages(soup)
            
            if not pages:
                print(f"Warning: No pages found in {filename}")
                continue
            
            file_chunks = 0
            
            # Process each page separately
            for page in pages:
                # Create metadata for this page, combining document and page metadata
                page_metadata = {
                    "document_title": doc_title,
                    "title": doc_metadata['title'],
                    "author": doc_metadata['author'],
                    "pub_info": doc_metadata['pub_info'],
                    "citation": doc_metadata['citation'],
                    "page_number": page['page_number'],
                    "page_label": page['page_label'],
                    "base": page['base'],
                    "source_file": os.path.basename(filename)
                }
                
                # Split page text into chunks
                chunks = text_splitter.create_documents(
                    texts=[page['text']],
                    metadatas=[page_metadata]
                )
                
                # Add chunks to vector store
                vector_store.add_documents(documents=chunks)
                
                file_chunks += len(chunks)
                total_chunks += len(chunks)
            
            total_pages += len(pages)
            print(f'✓ {os.path.basename(filename)}')
            print(f'  Title: {doc_metadata["title"][:80]}...' if len(doc_metadata["title"]) > 80 else f'  Title: {doc_metadata["title"]}')
            print(f'  Author: {doc_metadata["author"]}')
            print(f'  Citation: {doc_metadata["citation"]}')
            print(f'  Pages: {len(pages)} | Chunks: {file_chunks}')
            
        except Exception as e:
            print(f"✗ Error processing {filename}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue
    
    print(f'\n{"="*50}')
    print(f'ChromaDB Processing Complete')
    print(f'{"="*50}')
    print(f'Files processed: {len(html_files)}')
    print(f'Total pages: {total_pages}')
    print(f'Total chunks: {total_chunks}')

def query_example(query_text, n_results=5):
    """Example function to query the vector store."""
    results = vector_store.similarity_search(query_text, k=n_results)
    
    print(f"\nQuery: '{query_text}'")
    print(f"Found {len(results)} results:\n")
    
    for i, doc in enumerate(results, 1):
        print(f"Result {i}:")
        print(f"  Title: {doc.metadata.get('title', 'N/A')[:80]}...")
        print(f"  Author: {doc.metadata.get('author', 'N/A')}")
        print(f"  Page: {doc.metadata.get('page_label', 'N/A')}")
        print(f"  Citation: {doc.metadata.get('citation', 'N/A')}...")
        print(f"  Source: {doc.metadata.get('source_file', 'N/A')}")
        print(f"  Content preview: {doc.page_content[:200]}...")
        print()

if __name__ == "__main__":
    # Ensure the HTML directory exists
    html_dir = 'html_source'
    if not os.path.exists(html_dir):
        print(f"Error: Directory '{html_dir}' not found")
        print(f"Please create the '{html_dir}' directory and add your HTML files")
        exit(1)
    
    # Process files
    process_html_files(html_dir)
    

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


✓ bevin_1631.html
  Title: A briefe and short instruction of the art of musicke to teach how to make discan...
  Author: Bevin, Elway, ca. 1554-1638.
  Citation: 
  Pages: 64 | Chunks: 64
✓ playford_1654.html
  Title: A breefe introduction to the skill of musick for song & violl / by J.P.
  Author: Playford, John, 1623-1686?
  Citation: 
  Pages: 38 | Chunks: 39


KeyboardInterrupt: 

In [32]:
def extract_metadata(soup):
    """Extract metadata from the HTML record section."""
    metadata = {
        'title': '',
        'author': '',
        'pub_info': '',
        'citation': ''
    }
    
    # Find the record dl element
    record_dl = soup.find('dl', class_='record')
    if not record_dl:
        print("No record dl found.")
        return metadata
    
    # Extract title
    title_div = record_dl.find('div', {'data-key': 'title'})
    if title_div:
        title_dd = title_div.find('dd')
        if title_dd:
            metadata['title'] = title_dd.get_text(strip=True)
    
    # Extract author
    author_div = record_dl.find('div', {'data-key': 'author'})
    if author_div:
        author_dd = author_div.find('dd')
        if author_dd:
            metadata['author'] = author_dd.get_text(strip=True)
    
    # Extract publication info
    pubinfo_div = record_dl.find('div', {'data-key': 'pubinfo'})
    if pubinfo_div:
        pubinfo_dds = pubinfo_div.find_all('dd')
        pub_parts = [dd.get_text(strip=True) for dd in pubinfo_dds]
        metadata['pub_info'] = ' '.join(pub_parts)
    
    # Extract citation - look specifically in the citation section
    citation_dt = record_dl.find('dt', string='Cite this Item')
    if citation_dt:
        citation_dd = citation_dt.find_next('dd')
        if citation_dd:
            citation_span = citation_dd.find('span')
            if citation_span:
                metadata['citation'] = citation_span.get_text(strip=True)
    
    return metadata

In [33]:
def process_single_html_file(file_path):
    # Load the HTML file
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract metadata
    metadata = extract_metadata(soup)

    # Print the relevant metadata
    print(f"File: {os.path.basename(file_path)}")
    print(f"Title: {metadata['title'][:80]}..." if len(metadata['title']) > 80 else f"Title: {metadata['title']}")
    print(f"Author: {metadata['author']}")
    print(f"PUB INFO: {metadata['pub_info']}")
    print(f"Citation: {metadata['citation']}")

    # Print the first few lines of the extracted text
    pages = []
    page_sections = soup.find_all('section', {'name': 'DIV1'})
    for section in page_sections:
        text = section.get_text(strip=True)
        pages.append({'text': text})
    
    print(f"\nPages: {len(pages)}")
    for i, page in enumerate(pages[:5], 1):  # Print first 5 pages
        print(f"Page {i}: {page['text'][:100]}...")  # Print first 100 characters

    print("-" * 50)

In [34]:
process_single_html_file('/Users/rfreedma/Documents/CRIM_Python/theory_llm/html_source/morley_1596.html')

File: morley_1596.html
Title: A plaine and easie introduction to practicall musicke set downe in forme of a di...
Author: Morley, Thomas, 1557-1603?
PUB INFO: Imprinted at London :: By Peter Short, dwelling on Breedstreet hill at the signe of the Starre, 1597.
Citation: "A plaine and easie introduction to practicall musicke set downe in forme of a dialogue: deuided into three partes, the first teacheth to sing with all things necessary for the knowledge of pricktsong. The second treateth of descante and to sing two parts in one vpon a plainsong or ground, with other things necessary for a descanter. The third and last part entreateth of composition of three, foure, fiue or more parts with many profitable rules to that effect. With new songs of 2. 3. 4. and .5 [sic] parts. By Thomas Morley, Batcheler of musick, & of the gent. of hir Maiesties Royall Chapell." In the digital collectionEarly English Books Online.https://name.umdl.umich.edu/A07753.0001.001. University of Michigan Library D