In [41]:
import pandas as pd
import os, re, sys
from glob import glob
from pathlib import Path

In [19]:
# Gzipped hathifile listing the volumes used in the ACS project.
# It is read as tab-separated text by search_hathifile; the path is relative
# to the notebook's working directory.
HATHIFILE = "google_ids_1800-1850.txt.gz"

# Corrected field-names file: search_hathifile reads only its first line and
# splits it on tabs to get the column names. See also:
# https://www.hathitrust.org/hathifiles_description
HATHICOLS = "hathifiles/hathi_field_list.txt"

In [3]:
def search_hathifile(ht_file, col_file,
                     pattern=r"\bMunroe(?:,| and| &) Francis\b"):
    """
    Return the hathifile rows whose ``imprint`` field matches ``pattern``.

    The hathifile is streamed in fixed-size chunks so the same code scales to
    a full hathifile; only the matching rows are kept in memory.

    :param ht_file: Path to a tab-separated hathifile without a header row.
        It is handed to ``pandas.read_csv`` as-is.
    :param col_file: Path to a text file whose first line holds the
        tab-separated column names to apply to ``ht_file``. The names must
        include ``imprint``.
    :param pattern: Regular expression searched, case-insensitively, in the
        ``imprint`` column. The default finds "Munroe, Francis",
        "Munroe and Francis" and "Munroe & Francis".
    :return: A DataFrame of the matching rows with a fresh 0..n-1 index. When
        nothing matches, an empty DataFrame that still carries the column
        names.
    """
    # Only the first line of the column file is used.
    with open(col_file, "r") as fp:
        col_names = fp.readline().strip('\n').split('\t')

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    matched_chunks = []
    with pd.read_csv(
        ht_file,
        sep='\t',
        header=None,
        names=col_names,
        engine='c',
        # quicker if we can assert some types for the fields
        dtype={
            'htid': 'str',
            'rights_date_used': 'object',
            'pub_place': 'str',  # sadly, this is just the partner lib
            'imprint': 'str'
        },
        chunksize=5000,
        # `error_bad_lines=False` was removed in pandas 2.0; 'warn' keeps the
        # old behaviour of skipping malformed rows while still reporting them.
        on_bad_lines='warn',
    ) as reader:
        for chunk in reader:
            # Rows with a missing imprint count as non-matching (na=False).
            hits = chunk['imprint'].str.contains(
                pattern,
                na=False,
                flags=re.IGNORECASE,
            )
            if hits.any():
                matched_chunks.append(chunk[hits])

    if not matched_chunks:
        return pd.DataFrame(columns=col_names)

    # Concatenate once at the end instead of re-copying the accumulated frame
    # for every chunk; the original row index does not matter.
    return pd.concat(matched_chunks, ignore_index=True)

In [4]:
# Scan the project hathifile and keep only the rows returned by the imprint query.
df = search_hathifile(HATHIFILE, HATHICOLS)

In [5]:
# (rows, columns) of the matched subset
df.shape

(360, 26)

In [14]:
# Utility functions from Hathi's feature datasets
# https://github.com/htrc/htrc-feature-reader/blob/39010fd41c049f4f86b9c8ff4a44e000217093c2/htrc_features/utils.py

def _id_encode(id):
    '''
    :param id: A Pairtree ID. If it's a Hathitrust ID, this is the part after the library
        code; e.g. the part after the first period for vol.123/456.
    :return: A sanitized id. e.g., 123/456 will return as 123=456 to avoid filesystem issues.
    '''
    return id.replace(":", "+").replace("/", "=").replace(".", ",")

def _id_decode(id):
    '''
    :param id: A sanitized Pairtree ID.
    :return: An original Pairtree ID.
    '''
    return id.replace("+", ":").replace("=", "/").replace(",", ".")

def clean_htid(htid):
    '''
    Sanitize the volume part of a HathiTrust ID while leaving the library code as-is.

    :param htid: A HathiTrust ID of form lib.vol; e.g. mdp.1234. An ID without a
        period cannot be unpacked and raises ValueError.
    :return: A sanitized version of the HathiTrust ID, appropriate for filename use.
    '''
    # Split on the first period only: the volume part may contain periods itself.
    library, volume = htid.split('.', 1)
    return f"{library}.{_id_encode(volume)}"

def id_to_stubbytree(htid, format = None, suffix = None, compression = None):
    '''
    Take an HTRC id and convert it to a 'stubbytree' location.

    The result is a relative path of the form
    <library>/<every third character of the sanitized volume id>/<file name>,
    joined with the host OS path separator.

    :param htid: A HathiTrust ID of form lib.vol; e.g. mdp.1234.
    :param format: Optional extension appended to the file name.
    :param suffix: Accepted but not used by this function.
        NOTE(review): confirm whether it was meant to be part of the file name.
    :param compression: Optional extension appended after ``format``.
    :return: The relative stubbytree path as a string.
    '''
    library, volume = htid.split('.', 1)
    # Directory stub: characters 0, 3, 6, ... of the sanitized volume id.
    stub = _id_encode(volume)[::3]

    name_parts = [clean_htid(htid)]
    for extension in (format, compression):
        if extension is not None:
            name_parts.append(extension)

    return os.path.join(library, stub, ".".join(name_parts))

In [42]:
# Translate every matched HathiTrust id into its relative stubbytree path.
stubby_ids = list(map(id_to_stubbytree, df.htid.values))

In [59]:
# one stubbytree path per matched row
len(stubby_ids)

360

In [43]:
# Peek at the first few paths; they are built with os.path.join, so the
# separator follows the host OS.
stubby_ids[:5]

['mdp\\31331\\mdp.39015038731918',
 'mdp\\31197\\mdp.39015010791476',
 'mdp\\31198\\mdp.39015010791484',
 'mdp\\31199\\mdp.39015010791492',
 'mdp\\31190\\mdp.39015010791500']

In [44]:
# Root directory searched for the per-volume .npy vector files. The relative
# path is made absolute against the current working directory.
rois_dir = Path(os.path.abspath('../_app-files/roi-vectors/vectors'))

In [54]:
# Map each stubbytree path to the .npy vector files found for it under rois_dir.
# Volumes with no vector files are left out of the mapping.
# NOTE(review): the glob is a prefix match on the path, so an id that is a
# prefix of another id in the same directory would also pick up that
# volume's files -- confirm against the vector file naming scheme.
munroe_francis = {}

for stubby_id in stubby_ids:
    vol_vectors = glob(os.path.join(rois_dir, f"{stubby_id}*.npy"))
    if vol_vectors:
        munroe_francis[stubby_id] = vol_vectors

In [57]:
# Count the vector files found across all volumes that have any.
total = sum(len(vec_list) for vec_list in munroe_francis.values())

In [60]:
# Recorded run: 1477 vectors from 118 illustrated volumes (out of 360 matched volumes in total)
total

1477

In [65]:
# Write the stubbytree path of every volume that has vectors, one per line
# and with no header row, to munroe.csv in the current working directory.
with open('munroe.csv', 'w') as fp:
    for stubby_id in munroe_francis:
        fp.write(f"{os.path.normpath(stubby_id)}\n")