In [107]:
import pandas as pd
import requests as rq
import json
import spacy
import nltk
# nltk.download('stopwords')
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
import re, string
from deep_translator import GoogleTranslator

In [108]:
# Load Savvas's key to Rijks API
key = "3AzvvBJ0"

# Set language
culture = "en"

# Read Object codes list
object_names = pd.read_csv('object_names.txt', header = None)
object_names.columns = ["object_id"]

object_names.head()

Unnamed: 0,object_id
0,SK-A-4830
1,SK-A-4821
2,SK-A-3059
3,SK-A-1627
4,SK-A-1451


In [109]:
# Load paintings in list from Rijks API
database = {}

for id in object_names["object_id"]:
    print("Downloading item: "+id+"...")
    database[id] = rq.get("https://www.rijksmuseum.nl/api/"+culture+"/collection/"+id+"?key="+key).json()
    print("Done.")

Downloading item: SK-A-4830...
Done.
Downloading item: SK-A-4821...
Done.
Downloading item: SK-A-3059...
Done.
Downloading item: SK-A-1627...
Done.
Downloading item: SK-A-1451...
Done.
Downloading item: SK-A-4646...
Done.
Downloading item: SK-A-137...
Done.
Downloading item: SK-A-4820...
Done.
Downloading item: SK-C-1458...
Done.
Downloading item: SK-A-335...
Done.
Downloading item: SK-A-3988...
Done.
Downloading item: RP-P-H-1086...
Done.
Downloading item: SK-A-671...
Done.
Downloading item: SK-A-1751...
Done.
Downloading item: SK-A-4867...
Done.
Downloading item: SK-A-4052...
Done.
Downloading item: SK-A-718...
Done.
Downloading item: SK-C-206...
Done.
Downloading item: SK-A-385...
Done.
Downloading item: SK-A-4717...
Done.
Downloading item: SK-A-390...
Done.
Downloading item: SK-A-3467...
Done.
Downloading item: SK-C-187...
Done.
Downloading item: SK-C-301...
Done.
Downloading item: SK-A-2180...
Done.
Downloading item: SK-C-149...
Done.
Downloading item: SK-C-152...
Done.
Downloadin

In [110]:
# Save paintings metadata into file
json.dump( database, open( "paintings_metadata.json", 'w' ) )

In [111]:
database = json.load(open("paintings_metadata.json"))

In [112]:
# For descriptions missing in english, look for Dutch descriptions and translate them.
# For completely missing descriptions, replace None english description with en empty String

translator = GoogleTranslator(source = 'nl', target='en')

for id in object_names["object_id"]:
    #print(id)
    desc = database[id]["artObject"]["label"]["description"]
    if desc == None:
        #print(id)
        desc_nl = database[id]["artObject"]["description"]
        #print(desc_nl)
        if desc_nl:
            desc_en = translator.translate(desc_nl)
        else:
            desc_en = ""
        database[id]["artObject"]["label"]["description"] = desc_en

In [114]:
# NLP
# For each painting, run three different nlp models (spacy, nltk, textblob) 
# and save the "dirty" findings along with IconClasses values extracted from metadata.
# Exceptions will occur on objects that have no text description 

nlp_spacy = spacy.load('en_core_web_sm')

uncleaned_objects = {}

for id in object_names["object_id"]:
    #print(id)
    
    desc = database[id]["artObject"]["label"]["description"]
    #print(desc)
    
    # get objects on painting from iconClass
    objects_icon = database[id]["artObject"]["classification"]["iconClassDescription"]
    
    #print(objects)
    nouns_spacy = []
    try:
        doc = nlp_spacy(desc.lower())
    
        for token in doc:
            if token.pos_=='NOUN' and (token.dep_ in ['nsubjpass','conj']):
                nouns_spacy.append(token.text)
    except:
        print("Error, spacy nlp failed on item: "+id)
    
    nouns_textblob = []
    try:
        blob = TextBlob(desc.lower())
        
        nouns_textblob = [word for word, tag in blob.tags if tag in ('NN')]
    except:
        print("Error, textBlob nlp failed on item: "+id)
    
    nouns_nltk = []
    try:
        tokenized = nltk.word_tokenize(desc.lower())
        tagged = nltk.pos_tag(tokenized)
        
        for token in tagged:
            if token[1]=='NN':
                nouns_nltk.append(token[0])
    except:
        print("Error, NLTK failed on item: "+id)
        
    uncleaned_objects[id] = list(set(objects_icon + nouns_spacy + nouns_nltk + nouns_textblob))
    
    #print(uncleaned_objects[id])

In [125]:
# Cleaning of "objects" found in paintings. Various cleaning techniques are applied.

sw = set(nltk.corpus.stopwords.words('english'))
puncs = list(set(string.punctuation)); puncs.append("‘"); puncs.append('–'); puncs.append('’')

objects = {}

# clean set of objects
for id in uncleaned_objects:
    #print(id)
    for word in uncleaned_objects[id]:
        # print(word)
        
        # remove if word is larger than 3 words
        if len(word.split()) > 3:
            continue
    
        # remove any punctuation
        word = re.sub(r'[^\w\s]','',word)
        tokenized = word.split()
        
        for token in tokenized:
            # skip letters, numbers, puncs
            if (len(token) < 3) or token.isdigit() or (token in sw) or (token in puncs):
                continue
            
            # add object in dict
            if (id in objects) and (token not in objects[id]):
                objects[id].append(token)
            else:
                objects[id] = list([token])

In [126]:
for id in objects:
    print(id)
    print(objects[id])
#print(objects)

SK-A-4830
['glass', 'wine', 'palette', 'silver']
SK-A-4821
['edge', 'cheese', 'bread', 'fruit', 'table', 'reality', 'extending', 'painter', 'floris', 'van', 'dijck', 'haarlem', 'illusion', 'plate', 'painting', 'pewter', 'type']
SK-A-3059
['painting']
SK-A-1627
['influence', 'lay', 'pair', 'work', 'workshop', 'rendering', 'delilah', 'lievens', 'hair', 'source', 'lap', 'asleep', 'rembrandt', 'secret', 'samson', 'strength', 'man']
SK-A-1451
['vegetables', 'plates', 'kitchenpersonnel', 'poultry', 'game', 'display', 'mary', 'fireplace', 'left', 'food', 'background', 'kitchen', 'pitchers', 'chicken', 'martha', 'spit', 'foreground', 'fruits', 'jesus']
SK-A-4646
['fruit', 'century', 'cloves', 'beginning', 'cinnamon', 'painter', 'instance', 'mace', 'banquet', 'piece', 'claesz', 'shell', 'molluscs', 'oyster', 'plate', 'presence', 'pieter', 'asia', 'savoury', 'ginger', 'porcelain']
SK-A-137
['wine', 'fruits', 'lemon', 'silver']
SK-A-4820
['scene', 'vermeyen', 'effect', 'candlelight', 'leaf', 'noc

In [127]:
# Save objects into file
json.dump( objects, open( "objects_nlp.json", 'w' ) )