In [1]:

# Placeholder paths (relative to the notebook's working directory); adjust
# them to match the local checkout before running.
# NOTE(review): XSD_PATH and JSON_SCHEMA_PATH are not referenced in the cells
# shown here -- presumably intended for schema validation; confirm or remove.
XSD_PATH = "schemas/pbcore-2.1.xsd"
# Stylesheet loaded by convert_xml_to_json to transform PBCore XML into JSON text.
XSL_PATH = "stylesheets/pbcore-xml-to-json.xsl"
JSON_SCHEMA_PATH = "schemas/pbcore-schema.json"

In [2]:
# Load the GUIDs to process: one entry per line of guids2.txt, with the line
# endings stripped by splitlines(). Blank lines, if any, are kept as ''.
with open('guids2.txt') as f:
    guids = f.read().splitlines()
    

In [3]:
# Base URL of the American Archive of Public Broadcasting site.
aapb = 'https://americanarchive.org'


def pbcore_url(guid: str) -> str:
    """Return the catalog URL that serves the PBCore XML for ``guid``."""
    return "{}/catalog/{}.pbcore".format(aapb, guid)


# Endpoint of a locally running conversion service.
url = 'http://localhost:8000/convert'

In [4]:
import json
import os
from urllib.parse import urlencode

import httpx
from lxml import etree
# Prefix-to-URI map binding 'pb' to the PBCore XML namespace.
# NOTE(review): not referenced in the cells shown here -- presumably meant for
# namespace-aware XPath lookups on parsed PBCore documents; confirm.
ns = {'pb': 'http://www.pbcore.org/PBCore/PBCoreNamespace.html'}
async def convert_xml_to_json_from_url(guid: str):
    """Download the PBCore XML for ``guid`` and convert it to JSON data.

    Args:
        guid: Catalog identifier; the record is requested from
            ``pbcore_url(guid)``.

    Returns:
        Whatever ``convert_xml_to_json`` returns for the downloaded bytes.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``raise_for_status``).
        httpx.HTTPError: For other request failures, e.g. exceeding the
            10-second timeout.
    """
    async with httpx.AsyncClient(timeout=10) as client:
        # The original called `pbcore(guid)`, a name that is never defined, so
        # every call failed with NameError. `pbcore_url` is the URL builder.
        response = await client.get(pbcore_url(guid))
        response.raise_for_status()
        xml = response.content
    # The conversion needs no network access, so the client is closed first.
    return await convert_xml_to_json(xml)

In [5]:
async def convert_xml_to_json(xml: bytes):
    """Apply the XSL_PATH stylesheet to PBCore XML and parse the result as JSON.

    Args:
        xml: Serialized PBCore XML document.

    Returns:
        The Python object decoded from the stylesheet's JSON text output.
    """
    # Parse the input document before touching the stylesheet, so malformed
    # XML is reported ahead of any stylesheet problem.
    source_tree = etree.fromstring(xml)

    # The stylesheet is read from disk and compiled on every call.
    to_json = etree.XSLT(etree.parse(XSL_PATH))

    return json.loads(str(to_json(source_tree)))
        

In [6]:
def save_converted_json(json_data, file_path):
    """Write ``json_data`` to ``file_path`` as indented, UTF-8 encoded JSON.

    Args:
        json_data: Any JSON-serializable object.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``json_data`` is not JSON-serializable.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly; the platform default encoding (e.g.
    # cp1252) could otherwise raise UnicodeEncodeError or write non-UTF-8 JSON.
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, ensure_ascii=False, indent=2)

In [7]:
async def convert_local_guids(guids):
    """Fetch and convert each GUID's PBCore record, saving it as ``{guid}.json``.

    GUIDs are processed one at a time. A failure on one GUID is printed and
    recorded, and processing continues with the next one.

    Args:
        guids: Iterable of GUID strings.

    Returns:
        A ``(success, errors)`` tuple of lists: the GUIDs that were converted
        and saved, and the GUIDs that failed, each in processing order.
    """
    success, errors = [], []
    for guid in guids:
        try:
            pbcore_json = await convert_xml_to_json_from_url(guid)
            if not pbcore_json:
                print(f"Failed to convert {guid}")
                errors.append(guid)
                continue

            save_converted_json(pbcore_json, f'{guid}.json')
            success.append(guid)
        except Exception as e:
            # Deliberately best-effort: one bad record must not stop the batch.
            print(f"Error processing {guid}: {e}")
            errors.append(guid)

    # The original built these lists and then discarded them, leaving the
    # caller unable to tell which GUIDs needed a retry.
    return success, errors


In [8]:
import httpx

async def fetch(url: str):
    """GET ``url`` and return the response body decoded as JSON.

    Uses a short-lived client with a 10-second timeout; an error status is
    raised as an exception via ``raise_for_status`` before the body is decoded.
    """
    async with httpx.AsyncClient(timeout=10) as http:
        reply = await http.get(url)
        reply.raise_for_status()
        payload = reply.json()
    return payload


In [9]:
# Output directories used by search_and_convert (relative to the working
# directory): raw PBCore XML goes to PBCORE_DIR, converted JSON to JSON_DIR.
PBCORE_DIR = 'pbcore-xml'
JSON_DIR = 'pbcore-json'

async def search_and_convert(query, rows=100):
    """Search the AAPB API and convert every returned record to JSON.

    For each document in the search response, the raw PBCore XML is written to
    ``{PBCORE_DIR}/{guid}.pbcore`` and its JSON conversion to
    ``{JSON_DIR}/{guid}.json``. A failure on one document is printed and
    counted, and processing continues with the next one.

    Args:
        query: Raw (not yet URL-encoded) search text; it is encoded here.
        rows: Number of documents to request from the API.

    Returns:
        A ``(successes, errors)`` tuple of counts.

    Raises:
        Any exception from ``fetch`` for the search request itself, and
        ``KeyError`` if the response lacks the ``response``/``docs``/``id``
        keys. Per-document failures are caught and counted instead.
    """
    # Encode the parameters so characters such as '&', '#' or '+' in the query
    # cannot corrupt the request URL.
    params = urlencode({'q': query, 'rows': rows})
    url = f'{aapb}/api.json?{params}'

    results = await fetch(url)
    success, errors = [], []

    # Without these, every write below fails on a fresh checkout and each
    # document is silently counted as an error.
    os.makedirs(PBCORE_DIR, exist_ok=True)
    os.makedirs(JSON_DIR, exist_ok=True)

    print(f"{results['response']['numFound']} documents found.")
    for doc in results['response']['docs']:
        guid = doc['id']
        try:
            # .get() lets a missing key report "No XML found" like an empty value.
            xml = doc.get('xml')
            if not xml:
                print(f"No XML found for {guid}")
                errors.append(guid)
                continue
            # Written as UTF-8 to match the bytes handed to the converter below.
            with open(f'{PBCORE_DIR}/{guid}.pbcore', 'w', encoding='utf-8') as xml_file:
                xml_file.write(xml)
            pbcore_json = await convert_xml_to_json(xml.encode('utf-8'))
            if not pbcore_json:
                print(f"Failed to convert {guid}")
                errors.append(guid)
                continue

            save_converted_json(pbcore_json, f'{JSON_DIR}/{guid}.json')
            success.append(guid)
        except Exception as exc:
            # Deliberately best-effort: one bad record must not stop the batch.
            print(f"Error processing {guid}: {exc}")
            errors.append(guid)
    s = len(success)
    e = len(errors)

    # Guard against division by zero when the search returned no documents.
    if (e + s) > 0:
        print(f'{s/(e+s):.2%} of {s+e} documents were processed successfully with {e} errors.')
    return s, e
    

In [51]:
# Example run: search for 'river' with the default rows=100 and convert each
# returned document; the cell displays the (successes, errors) counts.
await search_and_convert('river')

49740 documents found.
100.00% of 100 documents were processed successfully with 0 errors.


(100, 0)

In [53]:
from faker import Faker

# Faker instance for generating random sample data.
# NOTE(review): this rebinds the notebook-global `f` left over from the
# guids2.txt `with open(...) as f` cell.
f = Faker()

In [140]:
# Draw one random word. No seed is set in the visible cells, so the displayed
# value is presumably different on each run -- TODO confirm or seed Faker.
f.word()

'all'