In [None]:
import requests
import pandas as pd
import json
import csv
import sys
import os
import time

sys.path.append(os.path.abspath('queries'))
from queries import *

# Met
Source: a CSV file. We read it as a pandas DataFrame and count how many object ids have each of the desired characteristics.

In [None]:
# Load the Met open-access export; dtype=str keeps every column as text.
met_path = "downloaded-data/MetObjects.txt"
met_df = pd.read_csv(
    met_path,
    sep=',',
    dtype=str,
)
met_df.head()

In [None]:
# Columns of the Met export that are used by the counts below.
columns_to_keep = [
    "Object Number",
    "Culture",
    "Period",
    "Dynasty",
    "Reign",
    "Object Date",
    "Object Begin Date",
    "Object End Date",
    "Artist Alpha Sort",
    "Artist Nationality",
    "Artist Begin Date",
    "Artist End Date",
    "Object Wikidata URL",
    "Artist ULAN URL",
    "Artist Wikidata URL",
]

In [None]:
# Keep only the columns of interest.
filtered_df = met_df.loc[:, columns_to_keep]

# keep=False flags every row whose "Object Number" occurs more than once,
# so all copies of a repeated number are dropped, not just the later ones.
duplicated_mask = filtered_df["Object Number"].duplicated(keep=False)

# Rows with a unique object number; cells equal to the literal string "NaN"
# are replaced by a single space.
unique_values_df = filtered_df.loc[~duplicated_mask].replace("NaN", " ")
unique_values_df.head(10)

In [None]:
# Total number of artworks: object numbers that are present, minus those
# that are present but are an empty string.
object_numbers = unique_values_df["Object Number"]
met_art_tot = object_numbers.notna().sum() - (object_numbers == '').sum()
print(met_art_tot)

In [None]:
# Column groups for each field of interest:
# artwork date, artist, artist dates, aligned artworks, aligned artists.
date_columns = [
    "Object Date",
    "Object Begin Date",
    "Object End Date",
]
artist_columns = [
    "Artist Alpha Sort",
]
artist_date_columns = [
    "Artist Begin Date",
    "Artist End Date",
]
art_aligned = [
    "Object Wikidata URL",
]
artist_aligned_columns = [
    "Artist ULAN URL",
    "Artist Wikidata URL",
]

# Function to count rows that have at least one filled cell
def count_non_empty(df, columns):
    """Count the rows of ``df`` holding a value in at least one of ``columns``.

    A cell counts as filled only when it is not missing (NaN/None) and is not
    an empty or whitespace-only string, so blank placeholders such as " " are
    not mistaken for data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : list of str
        Column labels to look at; a row is counted once even if several of
        these columns are filled.

    Returns
    -------
    Integer number of rows with at least one filled cell.
    """
    # Turn blank strings into missing values first, so that notna() treats
    # them exactly like real NaN cells.
    cells = df[columns].replace(r"^\s*$", pd.NA, regex=True)
    return cells.notna().any(axis=1).sum()

# Number of rows with at least one filled cell, computed per column group.
date_tot, artist_tot, artist_date_tot, art_aligned_tot, artist_aligned_tot = (
    count_non_empty(unique_values_df, group)
    for group in (
        date_columns,
        artist_columns,
        artist_date_columns,
        art_aligned,
        artist_aligned_columns,
    )
)

# One row of counts; the literal zeros are the fields that are not computed
# for this dataset.
res_list = [
    "Metropolitan",
    int(met_art_tot),
    int(date_tot),
    0,
    0,
    int(artist_tot),
    int(artist_date_tot),
    0,
    int(art_aligned_tot),
    int(artist_aligned_tot),
]
# make_prop_list comes from the local `queries` module.
prop_list = make_prop_list(res_list)
print(res_list)
print(prop_list)


In [None]:
# Header row for the counts file.
headers = [
    'Dataset',
    'Artworks',
    'With date',
    'With place',
    'With date and place',
    'With author',
    'With author with date',
    'With author with place',
    'With artworks aligned to Wikidata',
    'With author aligned to Wikidata or Ulan',
]
# Header row for the proportions file: same columns without 'Artworks'.
headers_prop = [column for column in headers if column != 'Artworks']

In [None]:
# Save the Met results: one CSV with the counts, one with the proportions.
# Each file holds a header row followed by a single row of values.
with open('met_res.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([headers, res_list])

with open('met_res_prop.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([headers_prop, prop_list])

# Tate
The Tate API limits the number of results. For this reason, we downloaded the dataset available at [https://github.com/tategallery/collection/tree/master](https://github.com/tategallery/collection/tree/master). Note, however, that this dataset is no longer maintained.

In [None]:
# String forms that mark an entry as unknown or missing.
null_values = [
    "date not known",
    "None",
    "null",
    "Null",
]

Extraction of the artists having a date, a place or both

In [None]:
# Query the artist JSON files organized in folders
def _has_known_value(record, key, null_entries):
    """Return True when ``record`` is a dict whose ``key`` holds a usable value.

    A value is usable when its string form is not listed in ``null_entries``.
    JSON ``null`` is loaded as ``None``, whose string form is "None", so it is
    rejected whenever "None" is one of the markers.
    """
    return (
        isinstance(record, dict)
        and key in record
        and str(record[key]) not in null_entries
    )


def _artist_has_place(data, null_entries):
    """Return True when the artist has a named birth, death or active place."""
    for event in ('birth', 'death'):
        details = data.get(event)
        if isinstance(details, dict) and _has_known_value(details.get('place'), 'name', null_entries):
            return True
    # A missing or null 'activePlaces' entry is treated as an empty list.
    active_places = data.get('activePlaces') or []
    return any(_has_known_value(place, 'name', null_entries) for place in active_places)


def _artist_has_date(data, null_entries):
    """Return True when the artist has a birth start year or a ``date`` entry."""
    birth = data.get('birth')
    if isinstance(birth, dict) and _has_known_value(birth.get('time'), 'startYear', null_entries):
        return True
    return _has_known_value(data, 'date', null_entries)


def artist_date_place(base_path, null_entries=None):
    """Collect the ids of the artists that have place and/or date information.

    Walks ``base_path`` recursively and parses every ``.json`` file. A record
    with a usable ``id`` is added to the place set when it has a named birth
    place, death place or active place, and to the date set when it has a
    birth ``startYear`` or a ``date`` entry.

    Parameters
    ----------
    base_path : str
        Root folder containing the artist JSON files.
    null_entries : collection of str, optional
        String forms that mark a value as missing. Defaults to the
        module-level ``null_values`` list.

    Returns
    -------
    (with_place, with_date) : tuple of set
        Artist ids with place information and artist ids with date
        information, in that order.

    Notes
    -----
    A file that cannot be read or parsed is reported, counted as crashed and
    skipped; it never aborts the scan. Progress and totals are printed.
    """
    if null_entries is None:
        null_entries = null_values

    with_date = set()
    with_place = set()
    count_files = 0
    files_crashed = 0

    start = time.time()
    # Traverse the directory structure
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if not file.endswith('.json'):
                continue
            count_files += 1
            try:
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Only records carrying a usable id can be attributed to an artist.
                if _has_known_value(data, 'id', null_entries):
                    artist_id = data['id']
                    if _artist_has_place(data, null_entries):
                        with_place.add(artist_id)
                    if _artist_has_date(data, null_entries):
                        with_date.add(artist_id)
            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                print(f"Error reading {file}: {e}")
                files_crashed += 1

            print("files processed: ", count_files)
    end = time.time()
    print("tot files processed: ", count_files, "tot files crashed: ", files_crashed)
    print("tot artists with date: ", len(with_date), "tot artists with place: ", len(with_place))

    t = end - start
    print("Total time: ", t)
    return with_place, with_date

# Folder with all artist JSON files, and a one-letter subfolder for quick tests.
artists_path = "downloaded-data/collection-master/artists"
artist_test_path = "downloaded-data/collection-master/artists/b"

# Collect the ids of the artists that have place / date information.
artists_with_place, artists_with_date = artist_date_place(artists_path)

print(artists_with_place)


We query the artwork files. We check whether they have further information about date and place, and whether the author is part of the previously created sets. We count how many artworks have one or more of these aspects.

In [None]:

def _artwork_flags(data, place_artist_ids, date_artist_ids, null_entries):
    """Summarise one parsed JSON record as a tuple of booleans.

    Returns ``(is_artwork, has_date, has_artist, artist_has_date,
    artist_has_place)``. A record that is not an artwork (no ``id`` or no
    ``title``) yields all-False so it cannot contribute to any counter.
    """
    if not (isinstance(data, dict) and 'id' in data and 'title' in data):
        return False, False, False, False, False

    has_date = 'dateText' in data and str(data['dateText']) not in null_entries
    has_artist = 'all_artists' in data and str(data['all_artists']) not in null_entries

    artist_has_date = False
    artist_has_place = False
    if has_artist:
        # A missing or null 'contributors' entry is treated as an empty list.
        for contributor in data.get('contributors') or []:
            if not isinstance(contributor, dict):
                continue
            if 'id' not in contributor or str(contributor['id']) in null_entries:
                continue
            artist_id = contributor['id']
            artist_has_place = artist_has_place or artist_id in place_artist_ids
            artist_has_date = artist_has_date or artist_id in date_artist_ids
            if artist_has_place and artist_has_date:
                # Nothing more to learn from the remaining contributors.
                break
    return True, has_date, has_artist, artist_has_date, artist_has_place


def count_artworks(base_path, place_artist_ids=None, date_artist_ids=None, null_entries=None):
    """Count the artworks under ``base_path`` and how many carry each detail.

    Walks ``base_path`` recursively and parses every ``.json`` file. Records
    holding both ``id`` and ``title`` are counted as artworks; for those, the
    function also counts the ones with a usable ``dateText``, with a usable
    ``all_artists`` entry, and with at least one contributor whose id belongs
    to ``date_artist_ids`` / ``place_artist_ids``.

    Parameters
    ----------
    base_path : str
        Root folder containing the artwork JSON files.
    place_artist_ids : collection, optional
        Ids of artists with place information. Defaults to the module-level
        ``artists_with_place``.
    date_artist_ids : collection, optional
        Ids of artists with date information. Defaults to the module-level
        ``artists_with_date``.
    null_entries : collection of str, optional
        String forms that mark a value as missing. Defaults to the
        module-level ``null_values``.

    Returns
    -------
    (count_list, prop_list) : tuple of list
        The two lists handed to ``pprint_prop`` for every result value.

    Notes
    -----
    A file that cannot be read or parsed is reported, counted as crashed and
    leaves every counter untouched. Progress and totals are printed.
    """
    if place_artist_ids is None:
        place_artist_ids = artists_with_place
    if date_artist_ids is None:
        date_artist_ids = artists_with_date
    if null_entries is None:
        null_entries = null_values

    artwork_count = 0
    tate_art_date_count = 0
    # NOTE(review): no artwork-level place field is inspected anywhere in this
    # function, so this counter always stays at 0 - confirm whether intended.
    tate_art_place_count = 0
    tate_artist_count = 0
    tate_artist_date_count = 0
    tate_artist_place_count = 0

    start = time.time()
    count_files = 0
    files_crashed = 0
    # Traverse the directory structure
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if not file.endswith('.json'):
                continue
            count_files += 1
            try:
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    data = json.load(f)
                flags = _artwork_flags(data, place_artist_ids, date_artist_ids, null_entries)
            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                print(f"Error reading {file}: {e}")
                files_crashed += 1
            else:
                # Counters are updated only after the whole record was
                # inspected, so a failing file never leaves a partial update.
                is_artwork, has_date, has_artist, artist_has_date, artist_has_place = flags
                if is_artwork:
                    artwork_count += 1
                if has_date:
                    tate_art_date_count += 1
                if has_artist:
                    tate_artist_count += 1
                if artist_has_date:
                    tate_artist_date_count += 1
                if artist_has_place:
                    tate_artist_place_count += 1
            print("files processed: ", count_files)

    end = time.time()
    t = end - start
    print("total time: ", t)
    print("files processed: ", count_files, " File crashed: ", files_crashed)
    result_list = ["Tate", artwork_count, tate_art_date_count, tate_art_place_count, 0, tate_artist_count, tate_artist_date_count, tate_artist_place_count, 0, 0]
    print("Result: list with artwork_count, tate_art_date_count, tate_art_place_count, tate_artist_count, tate_artist_date_count, tate_artist_place_count, artist wd or ulan")

    count_list = ["Tate", artwork_count]
    prop_list = ["Tate"]
    # pprint_prop comes from the local `queries` module; presumably it appends
    # each count to count_list and its share of artwork_count to prop_list -
    # TODO confirm against queries.
    for number in result_list[2:]:
        pprint_prop(number, artwork_count, count_list, prop_list)
    print("count_list: ", count_list)
    print("prop_list: ", prop_list)
    return count_list, prop_list

# Folder holding the Tate artwork JSON files.
tate_artworks_path = "downloaded-data/collection-master/artworks"

# Scan the artworks; the call returns the counts row and the proportions row.
tate_count, tate_prop = count_artworks(tate_artworks_path)


In [None]:
# Header row for the counts file.
headers = [
    'Dataset',
    'Artworks',
    'With date',
    'With place',
    'With date and place',
    'With author',
    'With author with date',
    'With author with place',
    'With artworks aligned to Wikidata',
    'With author aligned to Wikidata or ULAN',
]
# Header row for the proportions file: same columns without 'Artworks'.
headers_prop = [column for column in headers if column != 'Artworks']

In [None]:
# Pair each header row with its Tate values row (counts, then proportions).
res, res_prop = [headers, tate_count], [headers_prop, tate_prop]

In [None]:
# Save the Tate results: one CSV with the counts, one with the proportions.
# Each file holds a header row followed by a single row of values.
with open('tate_res.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([headers, tate_count])

with open('tate_res_prop.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([headers_prop, tate_prop])