In [1]:
from dask.distributed import Client
import dask.bag as db
import json
from collections import Counter
import time
import numpy as np
import pandas as pd
import s3fs
from PIL import Image
import humanize

In [2]:
# Open the Smithsonian Open Access bucket over S3. anon=True asks s3fs for an
# anonymous connection, so no AWS credentials are supplied.
fs = s3fs.S3FileSystem(anon=True)
# Show the top-level entries of the bucket as the cell output.
fs.ls('smithsonian-open-access')

['smithsonian-open-access/media', 'smithsonian-open-access/metadata']

In [3]:
# List everything stored under metadata/edan and display it as the cell output.
metadata = fs.ls('smithsonian-open-access/metadata/edan')
metadata

['smithsonian-open-access/metadata/edan/aaa',
 'smithsonian-open-access/metadata/edan/acah',
 'smithsonian-open-access/metadata/edan/acm',
 'smithsonian-open-access/metadata/edan/cfchfolklife',
 'smithsonian-open-access/metadata/edan/chndm',
 'smithsonian-open-access/metadata/edan/eepa',
 'smithsonian-open-access/metadata/edan/fbr',
 'smithsonian-open-access/metadata/edan/fs',
 'smithsonian-open-access/metadata/edan/fsa',
 'smithsonian-open-access/metadata/edan/fsg',
 'smithsonian-open-access/metadata/edan/hac',
 'smithsonian-open-access/metadata/edan/hmsg',
 'smithsonian-open-access/metadata/edan/hsfa',
 'smithsonian-open-access/metadata/edan/index.txt',
 'smithsonian-open-access/metadata/edan/naa',
 'smithsonian-open-access/metadata/edan/nasm',
 'smithsonian-open-access/metadata/edan/nasmac',
 'smithsonian-open-access/metadata/edan/nmaahc',
 'smithsonian-open-access/metadata/edan/nmafa',
 'smithsonian-open-access/metadata/edan/nmah',
 'smithsonian-open-access/metadata/edan/nmai',
 's

In [4]:
# List the NMAH metadata files, print how many there are, then print the
# path and a human-readable size for the first five of them.
nmah_metadata = fs.ls('smithsonian-open-access/metadata/edan/nmah')
print(len(nmah_metadata))
for metadata_file in nmah_metadata[:5]:
    print(metadata_file)
    # fs.du reports the file's size; humanize formats it (e.g. "9.5 MB").
    print(humanize.naturalsize(fs.du(metadata_file)))

257
smithsonian-open-access/metadata/edan/nmah/00.txt
9.5 MB
smithsonian-open-access/metadata/edan/nmah/01.txt
9.5 MB
smithsonian-open-access/metadata/edan/nmah/02.txt
9.5 MB
smithsonian-open-access/metadata/edan/nmah/03.txt
9.4 MB
smithsonian-open-access/metadata/edan/nmah/04.txt
9.6 MB


In [5]:
# Start a Dask distributed client configured with one worker and four threads
# per worker; the trailing expression displays the client summary.
client = Client(threads_per_worker=4, n_workers=1)
client

0,1
Client  Scheduler: tcp://127.0.0.1:60371  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 17.02 GB


In [6]:
# Build a Dask bag over every NMAH metadata .txt file (read anonymously from
# S3) and parse each element with json.loads.
# NOTE(review): presumably each line of the files is one JSON record — confirm.
b = db.read_text('s3://smithsonian-open-access/metadata/edan/nmah/*.txt',
                storage_options={'anon': True}).map(json.loads)

In [7]:
# Pull the first parsed record out of the bag and save it, indented, to a
# local JSON file so the record structure can be inspected outside the notebook.
nmah_example = b.take(1)[0]
with open('nmah_metadata_example.json', 'w') as json_out:
    json.dump(nmah_example, json_out, indent=2)

In [8]:
# Show the example record as indented JSON to reveal its nested structure.
print(json.dumps(nmah_example, indent=2))

{
  "id": "edanmdm-nmah_1452166",
  "version": "",
  "unitCode": "NMAH",
  "linkedId": "0",
  "type": "edanmdm",
  "content": {
    "descriptiveNonRepeating": {
      "record_ID": "nmah_1452166",
      "unit_code": "NMAH",
      "title_sort": "PART OF MOLD",
      "guid": "http://n2t.net/ark:/65665/ng49ca746b2-afe0-704b-e053-15f76fa0b4fa",
      "title": {
        "label": "Title",
        "content": "Part of Mold"
      },
      "metadata_usage": {
        "access": "CC0"
      },
      "data_source": "National Museum of American History"
    },
    "indexedStructured": {
      "date": [
        "1900s"
      ],
      "geoLocation": [
        {
          "Other": {
            "type": "Place",
            "content": "United States"
          }
        },
        {
          "points": {
            "point": {
              "latitude": {
                "type": "decimal",
                "content": "38"
              },
              "longitude": {
                "type": "decimal",
   

In [9]:
def flatten(record):
    """Collapse one nested metadata record into a single-level dict.

    Parameters
    ----------
    record : dict
        A parsed record. It must provide the top-level keys ``'id'``,
        ``'unitCode'``, ``'title'`` and ``'content'`` (a missing one raises
        ``KeyError``).

    Returns
    -------
    dict
        Keys, in insertion order: ``id``, ``unitCode``, ``title``,
        ``record_ID``, ``metadata_usage``, ``data_source``, ``object_type``
        and, only when a recognised date entry exists, ``Date``.
        The four fields looked up inside ``content`` fall back to an empty
        dict when absent.
    """
    # Labels of content.freetext.date entries that are accepted as the
    # record's date. A tuple keeps the check a plain equality comparison.
    date_labels = (
        'Date',
        'Date made',
        'date made',
        'associated dates',
        'date on object',
        'plate date',
        'BEP certification date',
        'series date',
        'alternate calendar date',
        'patent date',
    )

    # Top-level fields are copied verbatim.
    row = {key: record[key] for key in ('id', 'unitCode', 'title')}

    content = record['content']
    non_repeating = content.get('descriptiveNonRepeating', {})
    for field in ('record_ID', 'metadata_usage', 'data_source'):
        row[field] = non_repeating.get(field, {})
    row['object_type'] = content.get('indexedStructured', {}).get('object_type', {})

    # Without a freetext date list there is nothing more to add.
    if 'freetext' not in content or 'date' not in content['freetext']:
        return row

    # Every matching entry overwrites the previous one, so the last
    # recognised entry in the list is the one that is kept.
    for entry in content['freetext']['date']:
        if entry['label'] in date_labels:
            row['Date'] = str(entry['content'])
    return row

In [10]:
# Count the records in the bag; .compute() runs the computation and returns the number.
b.count().compute()

5199

In [11]:
# Try flatten on the single example record and display the result before
# applying it to the whole bag.
flattened_example = flatten(nmah_example)
flattened_example

{'id': 'edanmdm-nmah_1452166',
 'unitCode': 'NMAH',
 'title': 'Part of Mold',
 'record_ID': 'nmah_1452166',
 'metadata_usage': {'access': 'CC0'},
 'data_source': 'National Museum of American History',
 'object_type': ['mold, butter/cheese, part of'],
 'Date': '1800 - 1900'}

In [12]:
# Flatten every record in the bag and collect the results locally. Despite its
# name, nmah_json holds the flattened dicts, not JSON text.
nmah_json = b.map(flatten).compute()
# One row per flattened record; columns come from the dict keys.
nmah_df = pd.DataFrame(nmah_json)
# Preview the first rows.
nmah_df.head()

Unnamed: 0,id,unitCode,title,record_ID,metadata_usage,data_source,object_type,Date
0,edanmdm-nmah_1452166,NMAH,Part of Mold,nmah_1452166,{'access': 'CC0'},National Museum of American History,"[mold, butter/cheese, part of]",1800 - 1900
1,edanmdm-nmah_1314014,NMAH,Model Truck from the “Futurama” Exhibition at ...,nmah_1314014,{'access': 'CC0'},National Museum of American History,[Model Truck],1939-1940
2,edanmdm-nmah_209276,NMAH,Dr. Drake's Glessco Cough & Croup Remedy,nmah_209276,{'access': 'CC0'},National Museum of American History,"[Medicine, Drugs]",after 1948
3,edanmdm-nmah_648464,NMAH,Probably parts for Spinning Frame T02338.000,nmah_648464,{'access': 'CC0'},National Museum of American History,"[Spinning Machine Parts, Batch of]",
4,edanmdm-nmah_1377036,NMAH,button,nmah_1377036,{'access': 'CC0'},National Museum of American History,[button],


In [13]:
# Summarise the columns, their dtypes and non-null counts (Date is only set
# for records where flatten found a recognised date label).
nmah_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5199 entries, 0 to 5198
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              5199 non-null   object
 1   unitCode        5199 non-null   object
 2   title           5199 non-null   object
 3   record_ID       5199 non-null   object
 4   metadata_usage  5199 non-null   object
 5   data_source     5199 non-null   object
 6   object_type     5199 non-null   object
 7   Date            1691 non-null   object
dtypes: object(8)
memory usage: 325.1+ KB


In [15]:
# Write the flattened table to a tab-separated file, omitting the index.
# NOTE(review): metadata_usage and object_type hold dicts/lists, so they will
# presumably be written in their Python string form — confirm this is intended.
nmah_df.to_csv('nmah_df.tsv', index=False, sep='\t')