In [None]:
# Standard library imports
import glob
import os
import re
import shutil
import getpass
from pathlib import Path
from urllib.parse import urlparse

# Third-party imports
import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString

# LangChain imports
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTe

In [12]:
# Prompt for OpenAI API key with password masking
print("Please enter your OpenAI API key:")
openai_api_key = getpass.getpass("API Key: ")

# Set as environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key

# Verify it was set (show only first/last few characters for security)
if openai_api_key:
    masked_key = f"{openai_api_key[:7]}...{openai_api_key[-4:]}"
    print(f"✓ API key set successfully: {masked_key}")
else:
    print("✗ No API key entered")

Please enter your OpenAI API key:
✓ API key set successfully: sk-proj...jXgA


In [63]:
# Delete the existing database
db_path = Path('./chroma-db_latin2')
if db_path.exists():
    shutil.rmtree(db_path)
    print(f"✓ Deleted existing database at {db_path}")


# # Initialize embeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')


# Initialize Chroma vector store

vector_store = Chroma(
    collection_name="HTML_samples_latin",
    embedding_function=embeddings,
    persist_directory='./chroma-db_latin3'
)

# vector_store, embeddings = initialize_vector_store()

# Configure text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=300,
    length_function=len,
    is_separator_regex=False
)





Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [None]:
# Standard library imports
import glob
import os
import re
import shutil
import getpass
from pathlib import Path
from urllib.parse import urlparse

# Third-party imports
import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString

# LangChain imports
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTe

In [65]:
# run this on all files in the source directory:  this is just to get the text, not the vector db
tei_dir = Path("latin_sources")
for html_path in tei_dir.glob("*.xml"):
    with html_path.open("r", encoding="utf-8") as handle:
        html_content = handle.read()    

        results = extract_pages_and_text(html_content, html_path)

        

In [66]:
# just to check one result

results[0]

{'PageNumber': 'page 1',
 'PageText': 'Incipit tractatus IX. Caput I. De coniunctionibus vocum, quid sint, et quot. Coniunctio in musica est dispositio sive ordinatio sonorum sive vocum ad invicem in syllabis et dictionibus. Coniunctiones autem in musica 16. esse dicuntur, scilicet semitonium, tonus, semiditonus, ditonus, diatessaron, diapente imperfectum, tritonus, diapente, hexade minus, hexade maius, heptade minus, (supplevi heptade maius, diapason imperfectum) diapason perfectum, diapason diapente, et bis diapason. Harum autem omnium coniunctionum aliae dicuntur coniunctiones et syllabae, aliae coniunctiones et species, et aliae coniunctiones tantum. Quae autem dicuntur coniunctiones et syllabae, sunt quatuor, scilicet semitonium, tonus, semiditonus et ditonus, quarum exempla sunt haec:',
 'Filename': 'MARLU9.xml',
 'Title': 'Lucidarium, tractatus nonus',
 'Author': 'Marchetus de Padua',
 'Date': '14th',
 'Source': 'chtmlMARLU9.xml'}

In [67]:
# get metaddata df
tei_dir = Path("latin_sources")
all_metadata = []
for html_path in tei_dir.glob("*.xml"):
    with html_path.open("r", encoding="utf-8") as handle:
        html_content = handle.read()    
        metadata = extract_metadata(BeautifulSoup(html_content, 'xml'), html_path)  
        all_metadata.append(metadata)   
metadata_df = pd.DataFrame(all_metadata)
metadata_df.to_csv("latin_tei_metadata.csv", index=False)

In [68]:
def process_xml_files(xml_dir='latin_sources'):
    """Process all TEI XML files in the specified directory."""
    xml_files = glob.glob(os.path.join(xml_dir, '*.xml'))
    
    if not xml_files:
        print(f"No TEI XML files found in {xml_dir}")
        return
    
    total_chunks = 0
    total_pages = 0
    
    for filepath in xml_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                xml_content = f.read()

            metadata = extract_metadata(BeautifulSoup(xml_content, 'xml'), Path(filepath)) 
            print(f"Metadata for {filepath}: {metadata}")
            
            # Parse XML
            pages = extract_pages_and_text(xml_content, Path(filepath))
            
            
            if not pages:
                print(f"Warning: No pages found in {filepath}")
                continue
            
            file_chunks = 0
            
            # Process each page separately
            for page in pages:
                # Create metadata for this page, combining document and page metadata
                page_metadata = {
                    "title": page['Title'],
                    "author": page['Author'],
                    "date": page['Date'],
                    "source": page['Source'],
                    "page_number": page['PageNumber']
                }
                
                # Split page text into chunks
                chunks = text_splitter.create_documents(
                    texts=[page['PageText']],
                    metadatas=[page_metadata]
                )
                
                # Add chunks to vector store
                vector_store.add_documents(documents=chunks)
                
                file_chunks += len(chunks)
                total_chunks += len(chunks)
            
            total_pages += len(pages)
            print(f'✓ {os.path.basename(filepath)}')
            print(f'  Title: {page["Title"][:80]}...' if len(page["Title"]) > 80 else f'  Title: {page["Title"]}')
            print(f'  Author: {page["Author"]}')
            print(f'  Date: {page["Date"]}')
            print(f'  Source: {page["Source"]}')
            
            print(f'  Pages: {len(pages)} | Chunks: {file_chunks}')
            
        except Exception as e:
            print(f"✗ Error processing {filepath}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue
    
    print(f'\n{"="*50}')
    print(f'ChromaDB Processing Complete')
    print(f'{"="*50}')
    print(f'Files processed: {len(xml_files)}')
    print(f'Total pages: {total_pages}')
    print(f'Total chunks: {total_chunks}')

In [69]:
process_xml_files(xml_dir='latin_sources')

Metadata for latin_sources/MARLU4.xml: {'Filename': 'MARLU4.xml', 'Title': 'Lucidarium, tractatus quartus', 'Author': 'Marchetus de Padua', 'Date': '14th', 'Source': 'chtmlMARLU4.xml'}
✓ MARLU4.xml
  Title: Lucidarium, tractatus quartus
  Author: Marchetus de Padua
  Date: 14th
  Source: chtmlMARLU4.xml
  Pages: 3 | Chunks: 4
Metadata for latin_sources/MARLU5.xml: {'Filename': 'MARLU5.xml', 'Title': 'Lucidarium, tractatus quintus', 'Author': 'Marchetus de Padua', 'Date': '14th', 'Source': 'chtmlMARLU5.xml'}
✓ MARLU5.xml
  Title: Lucidarium, tractatus quintus
  Author: Marchetus de Padua
  Date: 14th
  Source: chtmlMARLU5.xml
  Pages: 4 | Chunks: 5
Metadata for latin_sources/MARLU7.xml: {'Filename': 'MARLU7.xml', 'Title': 'Lucidarium, tractatus septimus', 'Author': 'Marchetus de Padua', 'Date': '14th', 'Source': 'chtmlMARLU7.xml'}
✓ MARLU7.xml
  Title: Lucidarium, tractatus septimus
  Author: Marchetus de Padua
  Date: 14th
  Source: chtmlMARLU7.xml
  Pages: 2 | Chunks: 2
Metadata for l

In [57]:
# for checking page and character counts


pages = results
        # Calculate lengths of all PageText entries
page_lengths = [len(page['PageText']) for page in pages]
avg_length = sum(page_lengths) / len(page_lengths)
    
# print(f"\n{filename}:")
print(f"  Total pages: {len(pages)}")
print(f"  Average PageText length: {avg_length:.2f} characters")
print(f"  Min length: {min(page_lengths)} characters")
print(f"  Max length: {max(page_lengths)} characters")


  Total pages: 32
  Average PageText length: 1337.62 characters
  Min length: 215 characters
  Max length: 1782 characters


In [None]:
# another check

for page in pages:
    if len(page['PageText']) > 3000:
        print(f"long page found: {page['PageNumber']} with length {len(page['PageText'])}")

In [51]:
all_docs = vector_store.get()
authors = set()
if 'Metadata' in all_docs:
    for metadata in all_docs['Metadata']:
        if metadata and 'Author' in metadata:
            authors.add(metadata['Author'])

In [52]:
authors

set()

In [70]:
all_docs

{'ids': ['575b5998-f2f0-43d0-846b-76f41b94af70',
  'c10aa9b0-9d5e-4508-87c0-fcbe11646244',
  '10ee31fe-7134-439c-83dc-152dbb537dc4',
  '6c40befb-aec0-4fc6-8169-db1f33780916',
  '9ba4896a-b9d3-43dd-8262-5735d0cf70a6',
  'b379e73d-c9be-463f-a53d-99c60699f4cf',
  '9edf8ec4-cc72-4cfc-8e38-b555a86aa825',
  'ba9383b0-f23c-4d65-bd44-5c8194959530',
  'e8dcca28-e5d1-4321-8eab-b65ef99226d7',
  '8f580275-a2de-4a3c-9611-1dcebd3753e6',
  '9b814c3b-d86f-4f85-8c97-68aeac9ba79d',
  '500a5d96-fd3d-47f6-b809-9c9265762cff',
  '7659cc8b-7855-4ce1-af07-01b6e1d04405',
  'cb3ba85d-6609-49a9-bdea-cc52b57e3e00',
  'cd9a6c32-4d7f-47cf-9ece-98af7291da78',
  '76632a59-b947-4643-9e5b-0fdffcfa7c36',
  'd54f75eb-b019-4434-ab90-412200fe090e',
  '3e8054f1-068f-4139-bb07-32339f86fef2',
  '3e32c668-e8b2-4355-ad8a-ce00181ffbdf',
  '7283ed6e-8d53-4dca-a494-21af2a4a98b7',
  '453d32ba-2c95-481e-8939-fd1627fcbe6e',
  '1b185540-7b4a-47f6-840f-662204ec0ed3',
  'bb9fffad-549d-424c-af1a-dcbbf5d65f93',
  '8d7d7de2-c770-4b79-9673-