In [41]:

import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
import glob
import os

import re
from urllib.parse import urlparse

import shutil
import getpass

In [42]:
# Prompt for OpenAI API key with password masking
print("Please enter your OpenAI API key:")
openai_api_key = getpass.getpass("API Key: ")

# Set as environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key

# Verify it was set (show only first/last few characters for security)
if openai_api_key:
    masked_key = f"{openai_api_key[:7]}...{openai_api_key[-4:]}"
    print(f"✓ API key set successfully: {masked_key}")
else:
    print("✗ No API key entered")

Please enter your OpenAI API key:
✓ API key set successfully: sk-proj...Tr8A


In [44]:
# Initialize embeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

# Initialize vector store
vector_store = Chroma(
    collection_name="HTML_samples_italian",
    embedding_function=embeddings,
    persist_directory='./chroma-db_italian'
)

# Configure text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=300,
    length_function=len,
    is_separator_regex=False
)


  vector_store = Chroma(
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [51]:
from pathlib import Path
from bs4 import BeautifulSoup

# metadata extraction function


# metadata extraction function
def extract_metadata(soup, html_path):
    # Convert to Path object if it's a string
    if isinstance(html_path, str):
        html_path = Path(html_path)
    
    content_main = soup.find("main", class_="content")
    if not content_main:
        return None

    metadata_div = content_main.find("div")
    if not metadata_div:
        return None

    paragraphs = metadata_div.find_all("p")
    title = author = None

    for p in paragraphs:
        text = p.get_text(strip=True)
        if text.startswith("Title:"):
            title = text.split(":", 1)[-1].strip()
        elif text.startswith("Author:"):
            author = text.split(":", 1)[-1].strip()

    source_div = metadata_div.find("div")
    source_text = source_div.get_text(separator="\n", strip=True) if source_div else ""

    return {
        "Filename": html_path.name,
        "Title": title or "",
        "Author": author or "",
        "Source": source_text,
        "Citation": f"https://tmiweb.science.uu.nl/text/reading-edition/{html_path.name}"
    }

# # main page content extraction
# main page content extraction
def extract_pages_and_text(html_content, html_path):
    """
    Extract page numbers and their associated text from TEI HTML.
    
    Page numbers are in <span class="tei pb"><span class="tei pbSpan">page X</span></span>
    Page content follows each page break as sibling elements until the next page break.
    
    Args:
        html_content: Either a string containing TEI HTML or a BeautifulSoup object
        html_path: Path to the HTML file    
        
    Returns:
        List of dictionaries with 'PageNumber', 'PageText', and metadata
    """
    # Check if html_content is already a BeautifulSoup object
    if isinstance(html_content, BeautifulSoup):
        soup = html_content
    else:
        soup = BeautifulSoup(html_content, 'html.parser')

    metadata = extract_metadata(soup, html_path)
    
    # Find all page break tags - these are <span class="tei pb"> elements
    page_breaks = soup.find_all('span', class_='tei pb')
    
    results = []
    
    for i, pb in enumerate(page_breaks):
        # Extract page number from the nested pbSpan
        # Structure: <span class="tei pb"><span class="tei pbSpan">page X</span></span>
        page_span = pb.find('span', class_='tei pbSpan')
        page_number = page_span.get_text(strip=True) if page_span else f"page {i+1}"
        
        # Collect ALL text content that follows this page break until the next one
        # Strategy: Get all text nodes between the two page breaks in document order
        
        # Get the next page break to know where to stop
        next_pb = page_breaks[i + 1] if i + 1 < len(page_breaks) else None
        
        # Collect all NavigableStrings (text nodes) between page breaks
        page_text_parts = []
        
        # Start after current page break
        for element in pb.next_elements:
            # Stop if we hit the next page break
            if next_pb and element == next_pb:
                break
            
            # Only collect NavigableString objects (actual text nodes)
            # This avoids double-counting when we have nested tags
            from bs4 import NavigableString
            if isinstance(element, NavigableString):
                text = str(element).strip()
                if text:
                    page_text_parts.append(text)
        
        page_text = ' '.join(page_text_parts)
        
        result_entry = {
            'PageNumber': page_number,
            'PageText': page_text
        }
        
        # Attach metadata if available
        if metadata:
            result_entry.update(metadata)
        
        results.append(result_entry)
    
    return results



In [None]:
# run this on all files in the source directory:  this is just to get the text, not the vector db
tei_dir = Path("tei_source")
for html_path in tei_dir.glob("*.html"):
    with html_path.open("r", encoding="utf-8") as handle:
        html_content = handle.read()    

        results = extract_pages_and_text(html_content, html_path)

In [None]:
# just to check one result

results[0]

{'PageNumber': 'page i',
 'PageText': "page i DIALOGO DEL R. M. DON PIETRO PONTIO PARMIGIANO, Oue si tratta della Theorica, e Prattica di Musica. \n                        Et anco si mostra la diuersità de' Contraponti, & Canoni. [Figure] In Parma , appresso Erasmo Viothi . 1595. Con licenza de' Superiori.",
 'Filename': 'pondia.html',
 'Title': 'Dialogo di musica',
 'Author': 'Pietro Pontio',
 'Source': 'Copyright © 2001, Utrecht University, Netherlands',
 'Citation': 'https://tmiweb.science.uu.nl/text/reading-edition/pondia.html'}

In [54]:
def process_html_files(html_dir='tei_source'):
    """Process all HTML files in the specified directory."""
    html_files = glob.glob(os.path.join(html_dir, '*.html'))
    
    if not html_files:
        print(f"No HTML files found in {html_dir}")
        return
    
    total_chunks = 0
    total_pages = 0
    
    for filepath in html_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                html_content = f.read()
            
            # Parse HTML
            pages = extract_pages_and_text(html_content, Path(filepath))
            
            # # Extract metadata from the record section
            # doc_metadata = extract_metadata(soup)
            
            # # Get document-level title
            # doc_title = get_document_title(soup, doc_metadata)
            
            # # Extract pages
            # pages = extract_pages(soup)
            
            if not pages:
                print(f"Warning: No pages found in {filepath}")
                continue
            
            file_chunks = 0
            
            # Process each page separately
            for page in pages:
                # Create metadata for this page, combining document and page metadata
                page_metadata = {
                    "title": page['Title'],
                    "author": page['Author'],
                    "source": page['Source'],
                    "citation": page['Citation'],
                    "page_number": page['PageNumber']
                }
                
                # Split page text into chunks
                chunks = text_splitter.create_documents(
                    texts=[page['PageText']],
                    metadatas=[page_metadata]
                )
                
                # Add chunks to vector store
                vector_store.add_documents(documents=chunks)
                
                file_chunks += len(chunks)
                total_chunks += len(chunks)
            
            total_pages += len(pages)
            print(f'✓ {os.path.basename(filepath)}')
            print(f'  Title: {page["Title"][:80]}...' if len(page["Title"]) > 80 else f'  Title: {page["Title"]}')
            print(f'  Author: {page["Author"]}')
            print(f'  Citation: {page["Citation"]}')
            print(f'  Pages: {len(pages)} | Chunks: {file_chunks}')
            
        except Exception as e:
            print(f"✗ Error processing {filepath}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue
    
    print(f'\n{"="*50}')
    print(f'ChromaDB Processing Complete')
    print(f'{"="*50}')
    print(f'Files processed: {len(html_files)}')
    print(f'Total pages: {total_pages}')
    print(f'Total chunks: {total_chunks}')

In [55]:
process_html_files(html_dir='tei_source')

✓ artart.html
  Title: L'Artusi
  Author: Giovanni Maria Artusi
  Citation: https://tmiweb.science.uu.nl/text/reading-edition/artart.html
  Pages: 155 | Chunks: 314
✓ zarins58.html
  Title: Le istitutioni harmoniche
  Author: Gioseffo Zarlino
  Citation: https://tmiweb.science.uu.nl/text/reading-edition/zarins58.html
  Pages: 353 | Chunks: 1027
✓ tigcom.html
  Title: Il compendio della musica
  Author: Orazio Tigrini
  Citation: https://tmiweb.science.uu.nl/text/reading-edition/tigcom.html
  Pages: 146 | Chunks: 254
✓ vicant.html
  Title: L'antica musica ridotta alla moderna prattica
  Author: Nicola Vicentino
  Citation: https://tmiweb.science.uu.nl/text/reading-edition/vicant.html
  Pages: 276 | Chunks: 574
✓ pondia.html
  Title: Dialogo di musica
  Author: Pietro Pontio
  Citation: https://tmiweb.science.uu.nl/text/reading-edition/pondia.html
  Pages: 160 | Chunks: 224

ChromaDB Processing Complete
Files processed: 5
Total pages: 1090
Total chunks: 2393


In [None]:
# for checking page and character counts


pages = results
        # Calculate lengths of all PageText entries
page_lengths = [len(page['PageText']) for page in pages]
avg_length = sum(page_lengths) / len(page_lengths)
    
# print(f"\n{filename}:")
print(f"  Total pages: {len(pages)}")
print(f"  Average PageText length: {avg_length:.2f} characters")
print(f"  Min length: {min(page_lengths)} characters")
print(f"  Max length: {max(page_lengths)} characters")


  Total pages: 155
  Average PageText length: 3025.00 characters
  Min length: 24 characters
  Max length: 4215 characters


In [None]:
# another check

for page in pages:
    if len(page['PageText']) > 3000:
        print(f"long page found: {page['PageNumber']} with length {len(page['PageText'])}")