In [6]:
import os
import gzip
import shutil

def ungzip_metdata(dir_path, file_type):
    """Decompress every matching gzip file found under ``dir_path``.

    Walks ``dir_path`` recursively. Each file whose path ends with
    ``file_type`` is opened with ``gzip`` and its decompressed bytes are
    written next to it as ``<original path>.xml``. The compressed file is
    left in place.

    Args:
        dir_path: Root directory to search.
        file_type: Suffix selecting the files to decompress (e.g. ``".gz"``).
            It is converted with ``str()`` before matching.

    Returns:
        list[str]: Paths of the compressed files that were processed, in
        ``os.walk`` order.
    """
    suffix = str(file_type)
    metadata_list = []

    for subdir, _dirs, files in os.walk(dir_path, topdown=True):
        for file in files:
            # os.path.join avoids the doubled separator that plain string
            # concatenation produced when `dir_path` ends with a separator.
            filepath = os.path.join(subdir, file)
            if not filepath.endswith(suffix):
                continue
            metadata_list.append(filepath)
            # Stream in 64 KiB chunks so large archives are never fully in memory.
            with gzip.open(filepath, "rb") as src, open(filepath + ".xml", "wb") as dst:
                shutil.copyfileobj(src, dst, 65536)
    return metadata_list

In [7]:
# Decompress all ".gz" files under the 1960s folder and keep their paths.
# NOTE(review): the name `list` rebinds the builtin `list` for the rest of the
# kernel session; a descriptive name (e.g. `gz_paths`) would be safer.
list = ungzip_metdata("/Users/leonardovida/dev/hist-aware/data/raw/delpher/1960s", '.gz')

In [18]:
# Peek at one processed path; the slice [1:2] yields a one-element list.
list[1:2]

['/Users/leonardovida/dev/hist-aware/data/raw/delpher/1960s/1967/04-27/DDD_110590220/DDD:ddd:110590220:mpeg21.didl.xml.gz']

In [263]:
import re 
from datetime import datetime
import pathlib
import pprint
import xmltodict
from functools import reduce

def _as_list(value):
    """Return ``value`` unchanged if it is a list, otherwise wrap it in one.

    xmltodict maps a lone child element to a dict and repeated children to a
    list, so any "children" lookup has to be normalised before iterating or
    indexing it by position.
    """
    return value if isinstance(value, list) else [value]


def parse_XML_metadata(path, met_dir, title, index):
    """Parse a DIDL metadata XML file into one record per article.

    Args:
        path: Path (str) of the decompressed DIDL XML file. It must contain a
            ``YYYY/MM-DD`` component, from which the issue date is taken.
        met_dir: Value stored under the ``"dir"`` key of every record.
        title: Value stored under the ``"metadata_title"`` key.
        index: Value stored under the ``"index"`` key.

    Returns:
        list[dict]: One dict per article with the keys ``subject``, ``title``,
        ``access_rights``, ``recordIdentifier`` and ``identifier``, followed
        by the file-level keys (``metadata_title``, ``date``, ``index``,
        ``filepath``, ``dir``) and the ``newspaper_*`` fields. The result is
        suitable for ``pd.DataFrame(...)``.

    Raises:
        ValueError: If ``path`` contains no ``YYYY/MM-DD`` component.
        KeyError: If the top-level DIDL structure is missing expected
            elements.
    """
    # The issue date is encoded in the directory layout (".../1969/01-02/...").
    match = re.search(r"\d{4}[/]\d{2}[-]\d{2}", path)
    if match is None:
        raise ValueError(f"no 'YYYY/MM-DD' date component found in path: {path!r}")
    date = datetime.strptime(match.group(), "%Y/%m-%d").date()

    # Parse DIDL XML.
    # NOTE(review): UTF-8 is assumed; the titles contain non-ASCII characters,
    # so relying on the platform default encoding is fragile. Confirm against
    # the XML declaration of the source files.
    with open(path, "r", encoding="utf-8") as f:
        doc = xmltodict.parse(f.read())

    issue = doc["didl:DIDL"]["didl:Item"]
    temp_data = _as_list(issue["didl:Component"])[0]["didl:Resource"]["srw_dc:dcx"]

    metadata = {
        "metadata_title": title,
        "date": date,
        "index": index,
        "filepath": path,
        "dir": met_dir,
        # Newspaper-level fields; deep_get yields None when a field is absent.
        "newspaper_title": deep_get(temp_data, "dc:title"),
        "newspaper_date": deep_get(temp_data, "dc:date"),
        "newspaper_publisher": deep_get(temp_data, "dc:publisher"),
        "newspaper_source": deep_get(temp_data, "dc:source"),
        "newspaper_volume": deep_get(temp_data, "dcx:volume"),
        "newspaper_issuenumber": deep_get(temp_data, "dcx:issuenumber"),
        "newspaper_recordIdentifier": deep_get(temp_data, "dcx:recordIdentifier"),
    }

    list_articles = []
    # Second-level items are treated as pages and their nested items as
    # articles (naming follows the original code — TODO confirm against the
    # DIDL schema).
    for page in _as_list(issue["didl:Item"]):
        try:
            articles = _as_list(page["didl:Item"])
        except KeyError:
            # This item has no nested items, so there is nothing to extract.
            continue

        for art in articles:
            try:
                dcx = _as_list(art["didl:Component"])[0]["didl:Resource"]["srw_dc:dcx"]
                article_dict = {
                    "subject": dcx["dc:subject"],
                    "title": dcx["dc:title"],
                    "access_rights": dcx["dcterms:accessRights"],
                    "recordIdentifier": dcx["dcx:recordIdentifier"],
                    "identifier": dcx["dc:identifier"],
                }
            except KeyError:
                # Skip only the incomplete item; its siblings are still kept.
                continue
            # Append newspaper- and file-level data.
            article_dict.update(metadata)
            list_articles.append(article_dict)

    return list_articles

def deep_get(dictionary, keys, default=None):
    """Look up a dot-separated key path in nested dicts.

    ``keys`` is split on ``"."`` and each part is looked up in turn. Whenever
    the current value is not a dict, or the key is missing, ``default`` takes
    its place and the walk continues with the remaining parts.
    """
    current = dictionary
    for part in keys.split("."):
        if isinstance(current, dict):
            current = current.get(part, default)
        else:
            current = default
    return current

In [265]:
# pandas is imported here because no earlier visible cell brings `pd` into
# scope; without it this cell raises NameError on Restart & Run All.
import pandas as pd

# Smoke-test the metadata parser on one issue ("dir", "name" and 1 are
# placeholder values) and show the records as a DataFrame.
test = pd.DataFrame(
    parse_XML_metadata(
        "/Users/leonardovida/dev/hist-aware/data/raw/delpher/1960s/1969/01-02/DDD_010690741/DDD:ddd:010690741:mpeg21.didl.xml.gz.xml",
        "dir",
        "name",
        1,
    )
)
test

Unnamed: 0,subject,title,access_rights,recordIdentifier,identifier,metadata_title,date,index,filepath,dir,newspaper_title,newspaper_date,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_recordIdentifier
0,artikel,Na 2 uren onderhandelen Vietcong laat drie Ame...,accessible,ddd:010690741:mpeg21:a0001,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
1,illustratie met onderschrift,ONTSNAPT,accessible,ddd:010690741:mpeg21:a0002,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
2,artikel,HET WEER,accessible,ddd:010690741:mpeg21:a0003,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
3,artikel,Zacht,accessible,ddd:010690741:mpeg21:a0004,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
4,artikel,Geen sancties,accessible,ddd:010690741:mpeg21:a0005,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,advertentie,Advertentie,accessible,ddd:010690741:mpeg21:a0061,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
192,advertentie,Advertentie,accessible,ddd:010690741:mpeg21:a0062,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
193,advertentie,Advertentie,accessible,ddd:010690741:mpeg21:a0063,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21
194,advertentie,Advertentie,accessible,ddd:010690741:mpeg21:a0064,http://resolver.kb.nl/resolve?urn=ddd:01069074...,name,1969-01-02,1,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,Friese koerier : onafhankelĳk dagblad voor Fri...,1969-01-02,Stichting Je Maintiendrai-Friesland,KB C 199,24,91,ddd:010690741:mpeg21


In [46]:
from datetime import datetime
from functools import reduce
import re
import pathlib
import xmltodict
import xml.etree.ElementTree as et


def parse_XML_article(path, art_dir, title, index):
    """Parse one article XML file into a flat dict.

    Args:
        path: Path (str) of the article XML file. It must contain a
            ``YYYY/MM-DD`` component, from which the date is taken.
        art_dir: Value stored under the ``"dir"`` key.
        title: Stored, converted with ``str()``, under ``"article_name"``.
        index: Value stored under the ``"index"`` key.

    Returns:
        dict: Keys ``article_name``, ``date`` (ISO string), ``index``,
        ``filepath`` and ``dir``; ``title`` with the text of the last
        ``<title>`` child of the root (absent if there is none); and ``p``
        with the text of every other direct child, in document order.

    Raises:
        ValueError: If ``path`` contains no ``YYYY/MM-DD`` component.
    """
    # The date is encoded in the directory layout (".../1960/01-01/...").
    # It is checked before any file I/O so a bad path fails fast.
    match = re.search(r"\d{4}[/]\d{2}[-]\d{2}", path)
    if match is None:
        raise ValueError(f"no 'YYYY/MM-DD' date component found in path: {path!r}")
    date = datetime.strptime(match.group(), "%Y/%m-%d").date()

    xroot = et.parse(path).getroot()

    # File-level fields are set up front so they are present even when the
    # root element has no children.
    article = {
        "article_name": str(title),
        "date": str(date),
        "index": index,
        "filepath": path,
        "dir": art_dir,
    }

    list_p = []
    for node in xroot:
        if node.tag == "title":
            article["title"] = node.text
        else:
            # Every non-title child is collected as body text.
            list_p.append(node.text)
    article["p"] = list_p

    return article

In [47]:
# Try the article parser on a single file; "dir", "title" and "index" are
# placeholder strings, not real values.
a = parse_XML_article("/Users/leonardovida/dev/hist-aware/data/raw/delpher/1960s/1960/01-01/DDD_010872554/DDD_010872554_0001_articletext.xml", "dir", "title", "index")

In [48]:
import pandas as pd
# A list of record dicts goes straight to the DataFrame constructor;
# DataFrame.from_dict is meant for dict input.
main = pd.DataFrame([a])

In [45]:
# Display the one-row frame built from the parsed article.
main

Unnamed: 0,article_name,date,index,filepath,dir,title,p
0,title,1960-01-01,index,/Users/leonardovida/dev/hist-aware/data/raw/de...,dir,De zes en dertig,"[~Le dernier des justes"", de laatste der Tsadd..."
