In [32]:
import httpx
from lxml import etree
import json
# Placeholder paths
# XSD_PATH and JSON_SCHEMA_PATH are not referenced by any code visible in this notebook.
XSD_PATH = "schemas/pbcore-2.1.xsd"
# XSLT stylesheet parsed by convert_xml_to_json to turn PBCore XML into JSON text.
XSL_PATH = "stylesheets/pbcore-xml-to-json.xsl"
JSON_SCHEMA_PATH = "schemas/pbcore-schema.json"
# Directory holding *.pbcore XML files (default input of convert_local_files,
# and where search_and_convert writes the XML it downloads).
PBCORE_DIR = 'pbcore-xml'
# Directory that receives the converted *.json files.
JSON_DIR = 'pbcore-json'


In [33]:
# Load the GUID list: the whole file is read and split into one entry per line.
with open('guids2.txt') as guid_file:
    guid_text = guid_file.read()
guids = guid_text.splitlines()
    

In [34]:
# Base URL of the site the PBCore records are downloaded from.
aapb = 'https://americanarchive.org'


def pbcore_url(guid: str):
    """Build the URL of the ``.pbcore`` document for ``guid`` under ``aapb``.

    Args:
        guid: Record identifier; inserted into the path as-is (no escaping).

    Returns:
        The string ``<aapb>/catalog/<guid>.pbcore``.
    """
    record_url = f"{aapb}/catalog/{guid}.pbcore"
    return record_url


# Not referenced by the code visible in this notebook
# (search_and_convert uses its own local ``url``).
url = 'http://localhost:8000/convert'

In [35]:
async def convert_xml_to_json(xml: bytes):
    """Run the XSL_PATH stylesheet over ``xml`` and parse the result as JSON.

    Declared ``async`` so callers can ``await`` it, although the body itself
    contains no ``await``.

    Args:
        xml: Serialized XML document to transform.

    Returns:
        Whatever ``json.loads`` produces from the stylesheet's text output.

    Raises:
        Parsing/transform errors from lxml, or ``json.JSONDecodeError`` when
        the stylesheet output is not valid JSON.
    """
    # Parse the input first, then the stylesheet (the stylesheet is re-read on every call).
    source_tree = etree.fromstring(xml)
    stylesheet = etree.XSLT(etree.parse(XSL_PATH))

    rendered = stylesheet(source_tree)
    return json.loads(str(rendered))
        

In [36]:
async def convert_local_files(path: str = PBCORE_DIR):
    """Convert every ``*.pbcore`` file directly inside ``path`` to JSON.

    Each file is passed to ``convert_xml_to_json`` and the result is written
    to ``JSON_DIR/<stem>.json`` with a two-space indent.

    Args:
        path: Directory to scan (non-recursive glob); defaults to PBCORE_DIR.
    """
    from pathlib import Path
    import json

    for source in Path(path).glob('*.pbcore'):
        raw_xml = source.read_bytes()
        converted = await convert_xml_to_json(raw_xml)
        target = f'{JSON_DIR}/{source.stem}.json'
        with open(target, 'w') as out:
            json.dump(converted, out, indent=2)
    
# await convert_local_files('pbcore-xml')

In [37]:

# Prefix-to-namespace mapping for PBCore XML; not referenced by the code visible in this notebook.
ns = {'pb': 'http://www.pbcore.org/PBCore/PBCoreNamespace.html'}
async def convert_xml_to_json_from_url(guid: str):
    """Download the PBCore XML for ``guid`` and convert it to JSON data.

    Args:
        guid: Record identifier, turned into a URL by ``pbcore_url``.

    Returns:
        The parsed JSON produced by ``convert_xml_to_json``.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        Any exception raised by the request itself or by the conversion.
    """
    async with httpx.AsyncClient(timeout=10) as client:
        # Was ``pbcore(guid)``, a name that is never defined; ``pbcore_url`` builds the address.
        response = await client.get(pbcore_url(guid))
        response.raise_for_status()
        xml = response.content  # raw bytes, as convert_xml_to_json expects
        return await convert_xml_to_json(xml)

In [38]:
def save_converted_json(json_data, file_path):
    """Write ``json_data`` to ``file_path`` as indented, UTF-8 encoded JSON.

    Args:
        json_data: Any JSON-serializable object.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if ``json_data`` is not JSON-serializable.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned instead of depending on the platform default.
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, ensure_ascii=False, indent=2)

In [39]:
async def convert_local_guids(guids):
    """Fetch, convert and save the PBCore record of every GUID in ``guids``.

    Each record is obtained through ``convert_xml_to_json_from_url`` and
    written to ``<guid>.json`` relative to the current working directory.
    A failure on one GUID is reported with ``print`` and does not stop the
    remaining ones (deliberate best-effort batch behaviour).

    NOTE(review): search_and_convert writes under JSON_DIR while this writes
    to the working directory -- confirm that difference is intended.

    Args:
        guids: Iterable of record identifiers.

    Returns:
        A ``(success, errors)`` tuple of GUID lists. Previously both lists
        were built and then discarded (the function returned None).
    """
    success, errors = [], []
    for guid in guids:
        try:
            pbcore_json = await convert_xml_to_json_from_url(guid)
            if not pbcore_json:
                # An empty/falsy conversion result counts as a failure.
                print(f"Failed to convert {guid}")
                errors.append(guid)
                continue

            save_converted_json(pbcore_json, f'{guid}.json')
            success.append(guid)
        except Exception as e:
            print(f"Error processing {guid}: {e}")
            errors.append(guid)

    return success, errors


In [40]:
import httpx

async def fetch(url: str):
    """GET ``url`` with a 10-second timeout and return the decoded JSON body.

    Raises:
        httpx.HTTPStatusError: if the response carries an error status code.
    """
    async with httpx.AsyncClient(timeout=10) as http:
        reply = await http.get(url)
        reply.raise_for_status()
        payload = reply.json()
    return payload


In [41]:

async def search_and_convert(query, rows=100):
    """Search the AAPB API and convert every returned record to JSON.

    For each document in the search response, its ``xml`` field is written to
    ``PBCORE_DIR/<id>.pbcore``, converted with ``convert_xml_to_json`` and the
    result saved to ``JSON_DIR/<id>.json``. Per-document failures are printed
    and counted instead of aborting the batch.

    Args:
        query: Search text; it is URL-encoded before being sent.
        rows: Number of rows requested from the API.

    Returns:
        A ``(successes, errors)`` tuple of counts.

    Raises:
        Whatever ``fetch`` raises when the search request itself fails, and
        ``KeyError`` if the response lacks the expected ``response`` keys.
    """
    import os
    from urllib.parse import urlencode

    # Encode the parameters so characters such as '&', '#' or '+' in the
    # query cannot corrupt the request URL.
    params = urlencode({'q': query, 'rows': rows})
    url = f'{aapb}/api.json?{params}'

    results = await fetch(url)
    success, errors = [], []

    # Make sure both output directories exist; otherwise every document
    # would fail with FileNotFoundError.
    os.makedirs(PBCORE_DIR, exist_ok=True)
    os.makedirs(JSON_DIR, exist_ok=True)

    print(f"{results['response']['numFound']} documents found.")
    for doc in results['response']['docs']:
        guid = doc['id']
        try:
            # .get() routes a missing key to the "No XML" branch instead of a KeyError.
            xml = doc.get('xml')
            if not xml:
                print(f"No XML found for {guid}")
                errors.append(guid)
                continue
            # Written as UTF-8 so the file on disk matches the bytes given to the converter.
            with open(f'{PBCORE_DIR}/{guid}.pbcore', 'w', encoding='utf-8') as f:
                f.write(xml)
            pbcore_json = await convert_xml_to_json(xml.encode('utf-8'))
            if not pbcore_json:
                print(f"Failed to convert {guid}")
                errors.append(guid)
                continue

            save_converted_json(pbcore_json, f'{JSON_DIR}/{guid}.json')
            success.append(guid)
        except Exception as e:
            print(f"Error processing {guid}: {e}")
            errors.append(guid)
    s = len(success)
    e = len(errors)

    # Skip the summary when no documents came back (avoids dividing by zero).
    if (e + s) > 0:
        print(f'{s/(e+s):.2%} of {s+e} documents were processed successfully with {e} errors.')
    return s, e
    

In [42]:
# await search_and_convert('river')

In [43]:
from faker import Faker
from random import randrange


# Shared Faker instance used to generate random search terms.
# NOTE(review): no seed is set in the code shown, so the generated queries differ between runs.
fake = Faker()

In [44]:
for i in range(10):
    q = ' '.join(fake.words(randrange(1, 5)))
    print(q)
    await search_and_convert(q)

push opportunity before professional
43378 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
them career how show
118156 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
officer claim final second
61865 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
final shake always
76749 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
safe wide with late
42432 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
building
103022 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
see
162350 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
matter
95633 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
church race
16163 documents found.
100.00% of 100 documents were processed successfully with 0 errors.
option memory play
445