# Create Training Data

This notebook is modified from Programming Historian lesson. The high-level purpose is to take a pickled array of htids and discover all the pages marked with IMAGE_ON_PAGE, returning a dict.

In [9]:
import json
import os
import pickle
import time

In [3]:
# Import the HT Data API wrapper and non-versioned keys
from hathitrust_api import DataAPI
from keys import ht_keys

# Replace placeholder strings with your HT credentials (leaving the quote marks)
ht_access_key = ht_keys['access']
ht_secret_key = ht_keys['secret']

# instantiate the Data API connection object
data_api = DataAPI(ht_access_key, ht_secret_key)

In [15]:
# file generated by sample_from_ids.py
sample_path = "sample_ids_1800_1850.pkl"

# we want to get all the image page candidates for the volumes in vol_ids
with open(sample_path, "rb") as fp:
    vol_ids = pickle.load(fp)

In [16]:
def ht_picture_download(item_id, out_dir=None):
    """
    :param item_id: unique HathiTrust volume identifier
    :param out_dir: destination for images; if None, no download
    
    Note: if supplied, out_dir must be an existing directory and
    the caller must have write permissions in that directory
    
    :rtype list of pages with IMAGE_ON_PAGE feature
    """
    
    print("[{}] Starting processing".format(item_id))
    
    # metadata from API in json format (different than HT collection metadata)
    meta = json.loads(data_api.getmeta(item_id, json=True))

    # sequence gets us each page of the PDF in order, with any
    # additional information that might be available for it
    sequence = meta['htd:seqmap'][0]['htd:seq']

    # list of pages with pictures (empty to start)
    img_pages = []

    # try/except block handles situation where no "pfeats" exist OR
    # the sequence numbers are not numeric
    for page in sequence:
        try:
            if 'IMAGE_ON_PAGE' in page['htd:pfeat']:
                img_pages.append(int(page['pseq']))
        except (KeyError, TypeError) as e:
            continue
        
    # track for download progress report
    total_pages = len(img_pages)

    # if out_dir is not None, then also download page images
    if out_dir:
        
         # return if folder already exists (reasonable inference that volume already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages

        # otherwise, create folder to put the images
        print("[{}] Making directory {}".format(item_id, out_dir))
        os.makedirs(out_dir)
        
        for i, page in enumerate(img_pages):
            try:
                # simple status message
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages))
                
                img = data_api.getpageimage(item_id, page)
            
                img_out = os.path.join(out_dir, str(page) + ".jpg")
    
                # write out the image
                with open(img_out, 'wb') as fp:
                    fp.write(img)

                # to avoid exceeding the allowed API usage, we take a quick
                # two-second break before requesting the next image
                time.sleep(2)

            except Exception as e:
                print("[{}] Error downloading page {}: {}".format(item_id, page,e))
                
    # return the list of image pages
    return img_pages

In [17]:
def ht_picture_locate(item_id):
    """
    :param item_id: unique HathiTrust volume identifier
    
    :rtype list of all IMAGE_ON_PAGE candidates for item_id
    """
    
    # metadata from API in json format (different than HT collection metadata)
    meta = json.loads(data_api.getmeta(item_id, json=True))

    # sequence gets us each page of the PDF in order, with any
    # additional information that might be available for it
    sequence = meta['htd:seqmap'][0]['htd:seq']

    # list of pages with pictures (empty to start)
    img_pages = []

    # try/except block handles situation where no "pfeats" exist OR
    # the sequence numbers are not numeric
    for page in sequence:
        try:
            if 'IMAGE_ON_PAGE' in page['htd:pfeat']:
                img_pages.append(int(page['pseq']))
        except (KeyError, TypeError) as e:
            continue
        
    # track for download progress report
    total_pages = len(img_pages)
                
    # return the list of image pages
    return img_pages

In [23]:
# Dictionary that maps ids to list of page candidates
pdict = {}
count = 0
total = len(vol_ids)

for item_id in vol_ids:
    pdict[item_id] = ht_picture_locate(item_id)
    
    # don't max out the API
    count += 1
    time.sleep(2)
    
    # occasional update
    if count % 20 == 0:
        print("{}/{}".format(count,total))    

20/2401
40/2401
60/2401
80/2401
100/2401
120/2401
140/2401
160/2401
180/2401
200/2401
220/2401
240/2401
260/2401
280/2401
300/2401
320/2401
340/2401
360/2401
380/2401
400/2401
420/2401
440/2401
460/2401
480/2401
500/2401
520/2401
540/2401
560/2401
580/2401
600/2401
620/2401
640/2401
660/2401
680/2401
700/2401
720/2401
740/2401
760/2401
780/2401
800/2401
820/2401
840/2401
860/2401
880/2401
900/2401
920/2401
940/2401
960/2401
980/2401
1000/2401
1020/2401
1040/2401
1060/2401
1080/2401
1100/2401
1120/2401
1140/2401
1160/2401
1180/2401
1200/2401
1220/2401
1240/2401
1260/2401
1280/2401
1300/2401
1320/2401
1340/2401
1360/2401
1380/2401
1400/2401
1420/2401
1440/2401
1460/2401
1480/2401
1500/2401
1520/2401
1540/2401
1560/2401
1580/2401
1600/2401
1620/2401
1640/2401
1660/2401
1680/2401
1700/2401
1720/2401
1740/2401
1760/2401
1780/2401
1800/2401
1820/2401
1840/2401
1860/2401
1880/2401
1900/2401
1920/2401
1940/2401
1960/2401
1980/2401
2000/2401
2020/2401
2040/2401
2060/2401
2080/2401
2100/2401
212

In [25]:
# test by writing to a file
with open("sample_pages.json", "w") as fp:
    json.dump(pdict, fp, sort_keys=True, indent=4)

In [26]:
print(pdict.keys())

dict_keys(['hvd.32044083479782', 'mdp.39015011275263', 'njp.32101047989213', 'nyp.33433075852107', 'hvd.hwhfwu', 'uc2.ark:/13960/t5r789k6p', 'mdp.39015020063239', 'gri.ark:/13960/t79s71v92', 'hvd.32044037714938', 'umn.31951001992232t', 'mdp.39015063635828', 'mdp.39015064563649', 'mdp.39015063629664', 'chi.084974265', 'nyp.33433069332793', 'uc1.a0010563658', 'uc1.b4185160', 'hvd.32044004527859', 'osu.32435055146864', 'hvd.hw27xk', 'nyp.33433068252877', 'uc1.c2725120', 'njp.32101061256523', 'nyp.33433081644191', 'uc2.ark:/13960/t3hx1d295', 'hvd.32044023792187', 'hvd.hn2u95', 'njp.32101060054309', 'uc1.b2792426', 'nyp.33433061810994', 'hvd.32044060209616', 'mdp.39015065478961', 'mdp.39015064337887', 'uva.x000982745', 'mdp.39015077807769', 'hvd.hw2ph4', 'hvd.ah3ywy', 'uiuo.ark:/13960/t9z03s81p', 'mdp.39015069885864', 'keio.10811057801', 'uiuc.6850122', 'uc1.$b556037', 'uc1.a0003851664', 'mdp.39015005068526', 'uiug.30112084313839', 'njp.32101076424041', 'hvd.32044102788544', 'nnc1.cu5417417

In [27]:
!dir

 Volume in drive C is Windows
 Volume Serial Number is C2C5-01EE

 Directory of C:\Users\stephen-krewson\Documents\hathi-images

07/23/2019  04:01 PM    <DIR>          .
07/23/2019  04:01 PM    <DIR>          ..
07/23/2019  12:39 PM             1,317 .gitignore
07/23/2019  12:35 PM    <DIR>          .ipynb_checkpoints
06/25/2019  03:12 PM        39,025,962 all_ids_1800_1850.csv.gz
06/25/2019  10:19 AM    <DIR>          artifacts
07/23/2019  11:07 AM    <DIR>          DEPRECATED
06/25/2019  12:54 PM    <DIR>          hathifiles
07/23/2019  04:01 PM            10,221 hathitrust.ipynb
07/23/2019  12:15 PM             2,624 ids_from_range.py
04/25/2019  08:25 PM               734 keys.py
07/23/2019  03:13 PM             3,616 README.md
07/23/2019  12:19 PM             2,472 sample_from_ids.py
07/23/2019  11:59 AM            62,876 sample_ids_1800_1850.pkl
07/23/2019  04:01 PM           804,147 sample_pages.json
07/23/2019  12:36 PM    <DIR>          __pycache__
               9 File(s)    

In [29]:
total = 0
for k,v in pdict.items():
    total += len(v)
    
print(total)

52895
