In [69]:
import requests
from bs4 import BeautifulSoup
import time
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [78]:
all_items = []
commitcounter = 0
commits = 0
commitsize = 100
# Exclusive upper bound of the object numbers to probe.
MAX_ITEM_NUMBER = 6010
# Seconds to wait for the server before giving up on a single request, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30
# Checkpoint / result file, read back by the analysis cell.
OUTPUT_FILE = 'frick_data.json'


def _save_items(items):
    """Overwrite OUTPUT_FILE with the full list of items collected so far."""
    with open(OUTPUT_FILE, 'w') as frickfile:
        json.dump(items, frickfile, indent=2)


def _field_value(soup, field_class, value_class='detailFieldValue'):
    """Return the stripped text of the value span inside a detail-field div.

    Returns None when either the div with class ``field_class`` or the span
    with class ``value_class`` inside it is missing.
    """
    field_div = soup.find('div', {'class': field_class})
    if field_div is None:
        return None
    value_span = field_div.find('span', {'class': value_class})
    if value_span is None:
        return None
    return value_span.text.strip()


def _parse_item(html, url):
    """Extract one collection item from an object page.

    Parameters:
        html: the HTML text of the object page.
        url: the URL the page was fetched from (stored in the result).

    Returns a dict with the keys url, title, date_created, acs_num,
    creator_url, creator, creator_nationality, medium, locations and tags.
    Fields that cannot be found are None (tags is an empty list).
    """
    # Use Beautiful Soup to parse the HTML
    soup = BeautifulSoup(html, features="html.parser")

    # Title - only accepted when exactly one title div is present
    title = None
    title_divs = soup.find_all("div", {'class': 'detailField titleField'})
    if len(title_divs) == 1:
        title = title_divs[0].text.strip()
    else:
        print("Title not found")

    # Date created
    date_created = _field_value(soup, 'detailField displayDateField')

    # Accession number - only accepted when exactly one such div is present
    acs_num = None
    acs_num_divs = soup.find_all("div", {'class': 'detailField invnoField'})
    if len(acs_num_divs) == 1:
        acs_num_span = acs_num_divs[0].find("span", {'class': 'detailFieldValue'})
        # Guard: a div without a value span used to raise AttributeError
        if acs_num_span is not None:
            acs_num = acs_num_span.text.strip()

    # Creator - link, name and nationality
    creator_url = None
    creator = None
    creator_nationality = None
    creator_div = soup.find('div', {'class': 'detailField peopleField'})
    if creator_div is not None:
        creator_link = creator_div.find('a')
        # Guard: a people field without a link used to raise TypeError
        if creator_link is not None:
            creator_url = creator_link.get('href')
        creator_spans = creator_div.find_all('span')
        # The position of the name depends on how many spans are present
        if len(creator_spans) == 3:
            creator = creator_spans[1].text.strip()
            creator_nationality = creator_spans[2].text.strip()
        elif len(creator_spans) == 2:
            creator = creator_spans[1].text.strip()
        elif len(creator_spans) == 1:
            creator = creator_spans[0].text.strip()

    # Medium
    medium = _field_value(soup, 'detailField mediumField')

    # Locations. The class name 'loctionFieldLabel' is spelled exactly as the
    # selector that produced populated values in the recorded run - do not
    # "correct" it without checking the page markup.
    locations = _field_value(soup, 'detailField locationsField',
                             value_class='loctionFieldLabel')

    # Tags - every span inside the collections field is one tag
    tags = []
    tags_div = soup.find('div', {'class': 'detailField collectionsField'})
    if tags_div is not None:
        tags = [span.text.strip() for span in tags_div.find_all('span')]

    # Put everything collected into a dictionary
    return {
        'url': url
        , 'title': title
        , 'date_created': date_created
        , 'acs_num': acs_num
        , 'creator_url': creator_url
        , 'creator': creator
        , 'creator_nationality': creator_nationality
        , 'medium': medium
        , 'locations': locations
        , 'tags': tags
    }


# I observed that each item has a unique number in the URL but some numbers are
# missing. So, I can loop through the numbers and if the link works I have
# found an item.
for item_number in range(MAX_ITEM_NUMBER):
    url = f"https://collections.frick.org/objects/{item_number}"
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # Report and skip this number instead of aborting the whole crawl
        print(f"Request failed for {url}: {err}")
        continue
    # Anything other than 200 means this number has no item
    if r.status_code != 200:
        continue
    # In case I get locked out for pulling too many URLs too fast
    #time.sleep(1)
    all_items.append(_parse_item(r.text, url))
    # Checkpoint after every `commitsize` items so a crash part-way through
    # still leaves the items collected so far on disk. Counting before the
    # comparison makes this exactly every `commitsize` items (it used to be
    # every commitsize + 1).
    commitcounter += 1
    if commitcounter >= commitsize:
        commits += 1
        print(f"Commit {commits}")
        _save_items(all_items)
        commitcounter = 0
# Write it as JSON so I do not have to keep rerunning during analysis
_save_items(all_items)
print("Done")


Commit 1
Title not found
Commit 2
Commit 3
Commit 4
Commit 5
Commit 6
Commit 7
Commit 8
Commit 9
Commit 10
Commit 11
Commit 12
Commit 13
Commit 14
Commit 15
Title not found
Commit 16
Commit 17
Done


In [81]:
# Change the pandas settings so we can see all the data
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Load the scraped items written by the scraping cell
with open('frick_data.json', 'r') as f:
    data = json.load(f)
df = pd.DataFrame(data)
df.info()

# How many items are there by location?
dfSummary = df.groupby("locations").agg(
   items=("url", "count"))
print(dfSummary)

# How many items are there by medium?
dfSummary = df.groupby("medium").agg(
   items=("url", "count"))
print(dfSummary)

# For each year, how many items did they acquire and how many were paintings,
# sculpture or furniture?
# Pull the year from the accession number (assumes the first four characters
# are the year - TODO confirm for every accession-number format).
# Use the .str accessor directly: astype(str) would turn a missing accession
# number into the literal string "None" and create a bogus "None" year group.
# With .str the missing values stay NaN and groupby leaves them out.
df['acs_year'] = df['acs_num'].str[:4]

# Encode the tags as columns with a 1 or 0 (one column per distinct tag)
mlb = MultiLabelBinarizer()
tag_flags = pd.DataFrame(mlb.fit_transform(df.pop('tags')),
                         columns=mlb.classes_,
                         index=df.index)
df = df.join(tag_flags)
print(df.shape)
print(df.columns)

# Summing a 0/1 tag column counts the items carrying that tag
dfSummary = df.groupby("acs_year").agg(
   items_acquired=("url", "count")
   , paintings=("Painting", "sum")
   , sculpture=("Sculpture", "sum")
   , furniture=("Furniture", "sum"))
print(dfSummary)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1768 entries, 0 to 1767
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   url                  1768 non-null   object
 1   title                1766 non-null   object
 2   date_created         1742 non-null   object
 3   acs_num              1766 non-null   object
 4   creator_url          1735 non-null   object
 5   creator              1519 non-null   object
 6   creator_nationality  998 non-null    object
 7   medium               1746 non-null   object
 8   locations            1766 non-null   object
 9   tags                 1768 non-null   object
dtypes: object(10)
memory usage: 138.3+ KB
             items
locations         
Not On View   1766
                                                    items
medium                                                   
Alabaster                                               1
Basswood                       