In [1]:
import pandas as pd
import re

import requests
import json
from pprint import pprint
import time

In [46]:
# require object id for cat and dog, unable to avoid overlap with other words

cat_url = "https://collectionapi.metmuseum.org/public/collection/v1/search?q=cat"
cat_id = requests.get(cat_url).json()

dog_url = "https://collectionapi.metmuseum.org/public/collection/v1/search?q=dog"
dog_id = requests.get(dog_url).json()

In [47]:
# combine object id list
id_list = cat_id['objectIDs'] + dog_id['objectIDs']

cat_n = len(cat_id['objectIDs'])
dog_n = len(dog_id['objectIDs'])

print(f'cat objects {cat_n}')
print(f'dog objects {dog_n}')
print(f'total objects {len(id_list)}')

cat objects 45903
dog objects 5463
total objects 51366


In [23]:
# require each object's data, due to api shut down in middle, there are multiple file for api request
object_json = {}
id_n = len(id_list)

for i in range(i,id_n):
    temp_url = f'https://collectionapi.metmuseum.org/public/collection/v1/objects/{id_list[i]}'
    temp_response = requests.get(temp_url).json()
    object_json[f'id_{id_list[i]}'] = temp_response
    
    time.sleep(0.01) # less than 70 calls/second
    
    if (i+1) % 1000 == 0 :
        print( f'{i+1} / {id_n}' )
        
print("----------complete-----")

47000 / 51366
48000 / 51366
49000 / 51366
50000 / 51366
51000 / 51366
----------complete-----


In [25]:
# save json file. due to api shut down for several times, we have multiple files
with open("../data/original_data/original_api_output_4.json", "w") as outfile:
    json.dump(object_json, outfile)

In [4]:
# help file to assinge geolocation
map_df = pd.read_csv( "../data/original_data/map_point.csv" )

# function to clean data
def CleanData(object_json) :
    clean_json = {}
    for o_id in object_json :
        try: # SKIP - "not valid object"

            temp_object = {}
            object_one = object_json[o_id]
            # object_one['testline'] = "cat's"

            # test if the object fit word 'cat|cats' or 'dog|dogs' , there are lots overlapping in search, ie. category
            test_cat = bool(re.search( '[^a-z](cat|cats)[^a-z]' , str(object_one) , re.IGNORECASE )) # team cat
            test_dog = bool(re.search( '[^a-z](dog|dogs)[^a-z]' , str(object_one) , re.IGNORECASE )) # team dog

            if any( [test_cat , test_dog] ) : # SKIP - objects with overlapping words

                # team
                if all( [test_cat , test_dog] ):
                    temp_object['team'] = 'both' # team both
                elif test_cat :
                    temp_object['team'] = 'cat'
                elif test_dog :
                    temp_object['team'] = 'dog'
                else :
                    temp_object['team'] = 'NA' # based on loop design, should not have team NA

                # basic object info
                temp_object['objectID'] = object_one['objectID']
                temp_object['displayGallery'] = object_one['GalleryNumber']
                temp_object['objectName'] = object_one['objectName']
                temp_object['objectURL'] = object_one['objectURL']
                temp_object['objectImage'] = object_one['primaryImageSmall']
                if len(temp_object['objectImage']) > 0 :
                    temp_object['isImage'] = True
                else:
                    temp_object['isImage'] = False
                temp_object['title'] = object_one['title']

                temp_object['artist'] = object_one['artistDisplayName']
                temp_object['department'] = object_one['department']
                temp_object['isHighlight'] = object_one['isHighlight']
                temp_object['classification'] = object_one['classification']
                temp_object['medium'] = object_one['medium']
                temp_object['culture'] = object_one['culture']
                temp_object['period'] = object_one['period']
                temp_object['region'] = object_one['region']

                # Object Year and dateing 
                temp_year = int(object_one['objectBeginDate'])
                temp_object['year'] = temp_year
                
                # YearDecade
                temp_object['yearDecade'] = f'{(temp_year // 10)*10}s'

                # YearCentury. BC, AD
                temp_century = (temp_year - 1) // 100 + 1
                try:
                    if temp_century < 0 :
                        temp_object['yearCentury'] = f'{ abs(temp_century) } BC'
                    else :
                        temp_object['yearCentury'] = f'{ temp_century } AD'
                    temp_object['yearCenturyInt'] = temp_century*100
                except:
                    temp_object['yearCentury'] = 'NA'
                
                # YearMultipleCentury
                temp_mcentry = temp_century // 5
                try:
                    if temp_mcentry < 0 :
                        temp_object['yearCenturyMultiple'] = "B.C."
                    elif temp_mcentry == 1 :
                        temp_object['yearCenturyMultiple'] = "1st to 5th Century"
                    elif temp_mcentry == 2 :
                        temp_object['yearCenturyMultiple'] = "6th to 10th Century"
                    elif temp_mcentry == 3 :
                        temp_object['yearCenturyMultiple'] = "11th to 15th Century"
                    elif temp_mcentry == 4 :
                        temp_object['yearCenturyMultiple'] = "16th to 20th Century"
                    elif temp_mcentry == 5 :
                        temp_object['yearCenturyMultiple'] = "21th Century"
                except:
                    temp_object['yearCenturyMultiple'] = 'NA'                        

                # Object Country, use object country if not blank, otherwise use artistNationality or culture
                try:
                    if len( object_one['country'] ) > 0:
                        temp_country = re.findall( '[\w\s\.]+' , object_one['country'] )[0].strip()
                    elif len( object_one['artistNationality'] ) > 0:
                        temp_country = re.findall( '[\w\s\.]+' , object_one['artistNationality'] )[0].strip()
                    else:
                        temp_country = re.findall( '[\w\s\.]+' , object_one['culture'] )[0].strip()
                except:
                    temp_country = 'NA'

                temp_object['country'] = temp_country
                
                # TBC - countryGeo, convert country to formal country with Geocode. Geocode file is prepared based on met_data's country list.
                try:
                    temp_object['geoCode'] = map_df.loc[ map_df['country'] == temp_country]["geoCode"].item()
                except:
                    temp_object['geoCode'] = "NA"
                
                # TBC - Continent
                try:
                    temp_object['continent'] = 'NA'
                except:
                    pass
                
                # Object Tags. combine multiple tags to one string
                try:
                    object_tags = object_one['tags']
                    tmep_tags = []
                    for one_tag in object_tags:
                        tmep_tags.append(one_tag['term'])
                    temp_object['tags'] = ', '.join(tmep_tags)
                except:
                    temp_object['tags'] = 'NA'

                # add object pass test, with selected content to clean database
                clean_json[o_id] = temp_object
                # print( o_id, temp_object['team'] )
        except:
            pass
                
    return clean_json

In [2]:
# read all files and select object with proper key words

def ReadJson( path ):
    print( path )
    path = open(path)
    object_json = json.load( path )
    print(f'original count {len(object_json)}')
    clean_json = CleanData(object_json) # use function to clean and select data
    print(f'clean count {len(clean_json)}' )
    return clean_json

clean_1 = ReadJson( '../data/original_data/original_api_output_1.json' )
clean_2 = ReadJson( '../data/original_data/original_api_output_2.json' )
clean_3 = ReadJson( '../data/original_data/original_api_output_3.json' )
clean_4 = ReadJson( '../data/original_data/original_api_output_4.json' )

../data/original_data/original_api_output_1.json
original count 28172


NameError: name 'CleanData' is not defined

In [6]:
# combine clean data to json
clean_json = {}

for o_id in clean_1 :
    clean_json[ o_id ] = clean_1[ o_id ]
    
for o_id in clean_2 :
    clean_json[ o_id ] = clean_2[ o_id ]

for o_id in clean_3 :
    clean_json[ o_id ] = clean_3[ o_id ]

for o_id in clean_4 :
    clean_json[ o_id ] = clean_4[ o_id ]
    
print(f'total clean count {len(clean_json)}' )

with open("../data/met_data.json", "w") as outfile:
    json.dump(clean_json, outfile)

total clean count 4843
