# Hathitrust Bibliography API Test

In [2]:
import requests as re
import pickle
import pandas as pd
import numpy as np
import xml.dom.minidom
from bs4 import BeautifulSoup
import lxml
import time
import os
import random

In [7]:
# sample call for bibliography of Journey to the West https://catalog.hathitrust.org/Record/002289079
record_num = "002289079"
r = re.get("https://catalog.hathitrust.org/api/volumes/full/recordnumber/{}.json".format(record_num))
bib = r.json()

# Desired Features
# 1. Title
# 2. Language
# 3. record_num

def get_title(record_num):
    r = re.get("https://catalog.hathitrust.org/api/volumes/full/recordnumber/{}.json".format(record_num))
    bib = r.json()
    return bib["records"][record_num]["titles"]

# Not inside json, use dublin core instead
# def get_language(record_num):
#     r = re.get("https://catalog.hathitrust.org/api/volumes/full/recordnumber/{}.json".format(record_num))
#     bib = r.json()
#     return bib["records"][record_num]["language"]

<b>What we want:</b>
![](img/journey_to_the_west_bib.png)

In [4]:
# No non-roman script returned due to utf-8 encoding (my best guess).
print(get_title(record_num))

# From Hathitrust:
# titles: The list of titles associated with this record, for sanity checking.
# This list includes the standard (MARC field 245) title with and without leading articles, and any vernacular (foreign language) titles provided in the record (MARC field 880).
#

# Many sources only have "romaji/pinyin" and no native script. Hard to derive meaning due to multiplicity.

['[Xi you ji].']


In [5]:
# MARC XML

def xml_pretty_print(xml_str : str):
    dom = xml.dom.minidom.parseString(xml_str)
    pretty_xml_as_string = dom.toprettyxml()
    print(pretty_xml_as_string)

marc_xml = bib["records"][record_num]['marc-xml']

xml_pretty_print(marc_xml)

<?xml version="1.0" ?>
<collection>
	<record>
		<leader>01106cam a2200277M  4500</leader>
		<controlfield tag="001">002289079</controlfield>
		<controlfield tag="003">MiAaHDL</controlfield>
		<controlfield tag="005">20210817000000.0</controlfield>
		<controlfield tag="006">m        d        </controlfield>
		<controlfield tag="007">cr bn ---auaua</controlfield>
		<controlfield tag="008">991021s1696    xx a          000 0 chi d</controlfield>
		<datafield tag="035" ind1=" " ind2=" ">
			<subfield code="a">(MiU)990022890790106381</subfield>
		</datafield>
		<datafield tag="035" ind1=" " ind2=" ">
			<subfield code="a">sdr-miu.990022890790106381</subfield>
		</datafield>
		<datafield tag="035" ind1=" " ind2=" ">
			<subfield code="a">(OCoLC)42944948</subfield>
		</datafield>
		<datafield tag="035" ind1=" " ind2=" ">
			<subfield code="a">(RLIN)MIUOAKW0976-B</subfield>
		</datafield>
		<datafield tag="035" ind1=" " ind2=" ">
			<subfield code="z">(MiU)Aleph002289079</subfield>
		</datafiel

In [13]:
# Dublin Core Output: https://en.wikipedia.org/wiki/Dublin_Core

r = re.get("https://quod.lib.umich.edu/cgi/o/oai/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:quod.lib.umich.edu:MIU01-{}".format(record_num))
# xml_pretty_print(r.text)

# string manipulation wins

start = time.time()
soup = BeautifulSoup(r.text, "html.parser")
soup.find("dc:language").getText()
end = time.time()
print("Soup Elapsed: ", end - start)

start = time.time()
r.text.split("<dc:language>")[1].partition('<')[0]
end = time.time()
print("String Manipulation: ", end - start)


Soup Elapsed:  0.00099945068359375
String Manipulation:  0.0


In [14]:
def get_language(record_num):
    r = re.get("https://quod.lib.umich.edu/cgi/o/oai/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:quod.lib.umich.edu:MIU01-{}".format(record_num))
    return r.text.split("<dc:language>")[1].partition('<')[0]

In [119]:
get_language("002304742")

'jpn'

## For the purposes of the toy model, let's limit ourselves to languages that utf-8 supports

In [15]:
langs = set(pd.read_csv("language-codes-3b2.csv")["alpha3-b"])
non_roman = {"ara", "chi", "bel", "rus", "bul", "jpn"}
roman_langs = langs - non_roman


# create dataset
num_records = 0
record_nums = []
titles = []
languages = []

def get_random_record_num():
    return "".join([str(random.randint(0,9)) for i in range(7)])

In [16]:
# generate records - doesn't work because too many requests - get flat file from hathitrust instead
# this is terrible, just run a query on the full dataset in the future after approval
# download tsv next time

import threading

class my_thread(threading.Thread):
    def __init__(self, threadID, name, num_times):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.count = num_times
        self.total = num_times
    def run(self):
        while self.count > 0:
            record_num = '00' + get_random_record_num()
            try:
                title = get_title(record_num)
                language = get_language(record_num)
                if language in non_roman:
                    raise Exception
            except:
                continue
            else:
                record_nums.append(record_num)
                titles.append(title)
                languages.append(language)
                self.count -= 1
                #print(self.name + ' ' + str(self.total - self.count) + '/' + str(self.total))

thread1 = my_thread(1, "Thread-1", 100)
thread2 = my_thread(2, "Thread-2", 100)
thread3 = my_thread(3, "Thread-3", 100)
thread4 = my_thread(4, "Thread-4", 100)
thread5 = my_thread(5, "Thread-5", 100)
thread6 = my_thread(6, "Thread-6", 100)
thread7 = my_thread(7, "Thread-7", 100)
thread8 = my_thread(8, "Thread-8", 100)
thread9 = my_thread(9, "Thread-9", 100)
thread10 = my_thread(10, "Thread-10", 100)

thread1.start()
thread2.start()
thread3.start()
thread4.start()
thread5.start()
thread6.start()
thread7.start()
thread8.start()
thread9.start()
thread10.start()



Thread-1 1/100
Thread-6 1/100
Thread-1 2/100
Thread-5 1/100
Thread-9 1/100
Thread-3 1/100
Thread-7 1/100
Thread-2 1/100
Thread-10 1/100
Thread-2 2/100
Thread-5 2/100
Thread-7 2/100
Thread-8 1/100
Thread-10 2/100
Thread-5 3/100
Thread-9 2/100
Thread-2 3/100
Thread-1 3/100
Thread-5 4/100
Thread-3 2/100
Thread-4 1/100
Thread-5 5/100
Thread-10 3/100
Thread-5 6/100
Thread-2 4/100
Thread-3 3/100
Thread-6 2/100
Thread-6 3/100
Thread-3 4/100
Thread-2 5/100
Thread-5 7/100
Thread-6 4/100
Thread-4 2/100
Thread-6 5/100
Thread-3 5/100
Thread-5 8/100
Thread-2 6/100
Thread-5 9/100
Thread-7 3/100
Thread-4 3/100
Thread-3 6/100
Thread-8 2/100
Thread-1 4/100
Thread-7 4/100
Thread-2 7/100
Thread-5 10/100
Thread-2 8/100
Thread-10 4/100
Thread-9 3/100
Thread-6 6/100
Thread-2 9/100
Thread-3 7/100
Thread-1 5/100
Thread-8 3/100
Thread-6 7/100
Thread-5 11/100
Thread-1 6/100
Thread-7 5/100
Thread-4 4/100
Thread-6 8/100
Thread-5 12/100
Thread-8 4/100
Thread-1 7/100
Thread-10 5/100
Thread-3 8/100
Thread-9 4/100
Th

In [42]:
titles = [i[0] for i in titles]

In [43]:
data = pd.DataFrame({"record number" : record_nums, "title" : titles, "language" : languages})
data.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=True)
data = data.astype(str)
data.to_csv("random_data.csv")