In [1]:
import multiprocessing.popen_spawn_posix
from dask.distributed import Client, progress
import dask.bag as db
import json
import pandas as pd
from pathlib import Path

In [2]:
# Start a local Dask cluster and connect to it. The worker count is kept in a
# named constant so it can be tuned in one place for the machine at hand.
N_WORKERS = 4
client = Client(n_workers=N_WORKERS)
# Bare last expression: renders the scheduler/dashboard summary as cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:61691  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB


In [3]:
def exception_handler(orig_func):
    """Wrap ``orig_func`` so a failing call is reported instead of raised.

    Args:
        orig_func: Any callable.

    Returns:
        A wrapper with the same call signature. It returns whatever
        ``orig_func`` returns; if ``orig_func`` raises an ``Exception`` the
        positional arguments are printed and ``None`` is returned.

    Note:
        Only ``Exception`` subclasses are handled. ``KeyboardInterrupt`` and
        ``SystemExit`` propagate so a long-running job can still be stopped.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(orig_func)
    def wrapper(*args, **kwargs):
        try:
            return orig_func(*args, **kwargs)
        except Exception:
            # Best effort: show the offending input and signal failure with None.
            print(*args)
            return None
    return wrapper


# Tolerant JSON parser: returns None (after printing the input) on bad input.
json_except = exception_handler(json.loads)

In [4]:
# NOTE(review): machine-specific absolute path; adjust to wherever the
# metadata files live locally before re-running.
nmah_dir = '/Users/triznam/Downloads/OpenAccess-GH/metadata/objects/NMAH/'
# Path.glob yields entries in filesystem-dependent order. Sorting makes the
# partition order (and therefore row order and `take(1)`) reproducible.
# 'index.txt.bz2' is excluded from the list of files to read.
nmah_file_list = sorted(
    path
    for path in Path(nmah_dir).glob('*.txt.bz2')
    if path.name != 'index.txt.bz2'
)
len(nmah_file_list)

256

In [5]:
# Build a bag over the bz2-compressed text files and parse each text element
# with json.loads.
b = (
    db.read_text(nmah_file_list, compression='bz2')
    .map(json.loads)
)

In [6]:
# Take the first parsed record from the bag and display it, to inspect the
# structure of a record before flattening.
nmah_example = b.take(1)[0]
nmah_example

{'id': 'edanmdm-nmah_325469',
 'version': '',
 'unitCode': 'NMAH',
 'linkedId': '0',
 'type': 'edanmdm',
 'content': {'descriptiveNonRepeating': {'record_ID': 'nmah_325469',
   'unit_code': 'NMAH',
   'title_sort': 'COL. ELMER E. ELLSWORTH, OF THE FIRE ZOUAVES.',
   'guid': 'http://n2t.net/ark:/65665/ng49ca746b4-fa45-704b-e053-15f76fa0b4fa',
   'title': {'label': 'Title',
    'content': 'Col. Elmer E. Ellsworth, of the Fire Zouaves.'},
   'metadata_usage': {'access': 'CC0'},
   'data_source': 'National Museum of American History'},
  'indexedStructured': {'date': ['1860s'],
   'geoLocation': [{'Other': {'type': 'Place', 'content': 'United States'}},
    {'Other': {'type': 'Place', 'content': 'Pennsylvania'}},
    {'Other': {'type': 'Place', 'content': 'Philadelphia'}},
    {'points': {'point': {'latitude': {'type': 'decimal', 'content': '39.95'},
       'longitude': {'type': 'decimal', 'content': '-75.15'}}}}],
   'object_type': ['Lithographs'],
   'name': ['Magee, John L.', 'Ellsworth

In [7]:
# Free-text date labels whose content is copied into the 'Date' field.
# Matching is exact and case-sensitive.
_DATE_LABELS = (
    'Date',
    'Date made',
    'date made',
    'associated dates',
    'date on object',
    'plate date',
    'BEP certification date',
    'series date',
    'alternate calendar date',
    'patent date',
)


def flatten(record, date_labels=_DATE_LABELS):
    """Reduce one nested metadata record to a flat dict of selected fields.

    Args:
        record: Parsed record (dict). A falsy value (``None``, ``{}``) is
            accepted and produces an empty result.
        date_labels: Container of free-text date labels to accept. Defaults
            to ``_DATE_LABELS``.

    Returns:
        dict with the keys ``id``, ``unitCode``, ``title``, ``record_ID``,
        ``metadata_usage``, ``data_source`` and ``object_type``, in that
        order. ``Date`` is added only when ``content['freetext']['date']``
        holds an entry whose label is in ``date_labels``; its content is
        stored as ``str``. When several entries match, the last one wins.
        Optional fields missing from the record default to ``{}``.

    Raises:
        KeyError: if a non-empty record lacks ``id``, ``unitCode``, ``title``
            or ``content``, or a date entry lacks ``label`` (or ``content``
            when its label matches).
    """
    if not record:
        return {}

    flattened_record = {
        'id': record['id'],
        'unitCode': record['unitCode'],
        'title': record['title'],
    }

    content = record['content']

    # Optional sections: a missing section or field falls back to {}.
    descriptive = content.get('descriptiveNonRepeating', {})
    flattened_record['record_ID'] = descriptive.get('record_ID', {})
    flattened_record['metadata_usage'] = descriptive.get('metadata_usage', {})
    flattened_record['data_source'] = descriptive.get('data_source', {})

    indexed = content.get('indexedStructured', {})
    flattened_record['object_type'] = indexed.get('object_type', {})

    if 'freetext' in content and 'date' in content['freetext']:
        for entry in content['freetext']['date']:
            # Later matching entries overwrite earlier ones.
            if entry['label'] in date_labels:
                flattened_record['Date'] = str(entry['content'])

    return flattened_record

In [8]:
# Flatten every record in the bag, materialise the results locally, and load
# them into a DataFrame. Records without a 'Date' key get a missing value.
nmah_json = (
    b.map(flatten)
    .compute()
)
nmah_df = pd.DataFrame(data=nmah_json)
nmah_df.head(5)

Unnamed: 0,id,unitCode,title,record_ID,metadata_usage,data_source,object_type,Date
0,edanmdm-nmah_325469,NMAH,"Col. Elmer E. Ellsworth, of the Fire Zouaves.",nmah_325469,{'access': 'CC0'},National Museum of American History,[Lithographs],1861.0
1,edanmdm-nmah_1419775,NMAH,Sacramento Dixieland Jubilee Button,nmah_1419775,{'access': 'CC0'},National Museum of American History,[button],1986.0
2,edanmdm-nmah_1421414,NMAH,Los Angeles Classic Jazz Festival Badge,nmah_1421414,{'access': 'CC0'},National Museum of American History,[Badges],1993.0
3,edanmdm-nmah_554334,NMAH,photograph,nmah_554334,{'access': 'CC0'},National Museum of American History,"[Photographs, Photograph; Photograph; Daguerre...",
4,edanmdm-nmah_1994382,NMAH,photograph,nmah_1994382,{'access': 'CC0'},National Museum of American History,[Photographs],


In [9]:
# Summarise the flattened frame: row count, per-column non-null counts, dtypes.
nmah_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326357 entries, 0 to 1326356
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   id              1326357 non-null  object
 1   unitCode        1326357 non-null  object
 2   title           1326357 non-null  object
 3   record_ID       1326357 non-null  object
 4   metadata_usage  1326357 non-null  object
 5   data_source     1326357 non-null  object
 6   object_type     1326357 non-null  object
 7   Date            433101 non-null   object
dtypes: object(8)
memory usage: 81.0+ MB


In [10]:
# Persist the flattened table as a gzip-compressed, tab-separated file
# without the index column.
nmah_df.to_csv(
    'nmah_github_df.tsv.gz',
    sep='\t',
    index=False,
    compression='gzip',
)