In [16]:
import pandas as pd
import requests as rq
import json
import spacy
import nltk
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
import re, string

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\scham\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\scham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [24]:
# Load Savvas's key to Rijks API
key = "3AzvvBJ0"

# Set language
culture = "en"

# Read Object codes list
object_names = pd.read_csv('object_names.txt', header = None)
object_names.columns = ["object_id"]

object_names.head()

Unnamed: 0,object_id
0,SK-A-4830
1,SK-A-4821
2,SK-A-3059
3,SK-A-1627
4,SK-A-1451


In [26]:
# Load paintings in list from Rijks API
database = {}

for id in object_names["object_id"]:
    print("Downloading item: "+id+"...")
    database[id] = rq.get("https://www.rijksmuseum.nl/api/"+culture+"/collection/"+id+"?key="+key).json()
    print("Done.")

Downloading item: SK-A-4830...
Done.
Downloading item: SK-A-4821...
Done.
Downloading item: SK-A-3059...
Done.
Downloading item: SK-A-1627...
Done.
Downloading item: SK-A-1451...
Done.
Downloading item: SK-A-4646...
Done.
Downloading item: SK-A-137...
Done.
Downloading item: SK-A-4820...
Done.
Downloading item: SK-C-1458...
Done.
Downloading item: SK-A-335...
Done.
Downloading item: SK-A-3988...
Done.
Downloading item: RP-P-H-1086...
Done.
Downloading item: SK-A-671...
Done.
Downloading item: SK-A-1751...
Done.
Downloading item: SK-A-4867...
Done.
Downloading item: SK-A-4052...
Done.
Downloading item: SK-A-718...
Done.
Downloading item: SK-C-206...
Done.
Downloading item: SK-A-385...
Done.
Downloading item: SK-A-4717...
Done.
Downloading item: SK-A-390...
Done.
Downloading item: SK-A-3467...
Done.
Downloading item: SK-C-187...
Done.
Downloading item: SK-C-301...
Done.
Downloading item: SK-A-2180...
Done.
Downloading item: SK-C-149...
Done.
Downloading item: SK-C-152...
Done.
Downloadin

In [35]:
# Save paintings metadata into file
json.dump( database, open( "paintings_metadata.json", 'w' ) )

In [27]:
# NLP
# For each painting, run three different nlp models (spacy, nltk, textblob) 
# and save the "dirty" findings along with IconClasses values extracted from metadata.
# Exceptions will occur on objects that have no text description 

nlp_spacy = spacy.load('en_core_web_sm')

objects = {}

for id in object_names["object_id"]:
    print(id)
    
    desc = database[id]["artObject"]["label"]["description"]
    #print(desc)
    
    # get objects on painting from iconClass
    objects_icon = database[id]["artObject"]["classification"]["iconClassDescription"]
    
    #print(objects)
    nouns_spacy = []
    try:
        doc = nlp_spacy(desc.lower())
    
        for token in doc:
            if token.pos_=='NOUN' and (token.dep_ in ['nsubjpass','conj']):
                nouns_spacy.append(token.text)
    except:
        print("Error, spacy nlp failed on item: "+id)
    
    nouns_textblob = []
    try:
        blob = TextBlob(desc.lower())
        
        nouns_textblob = [word for word, tag in blob.tags if tag in ('NN')]
    except:
        print("Error, textBlob nlp failed on item: "+id)
    
    nouns_nltk = []
    try:
        tokenized = nltk.word_tokenize(desc.lower())
        tagged = nltk.pos_tag(tokenized)
        
        for token in tagged:
            if token[1]=='NN':
                nouns_nltk.append(token[0])
    except:
        print("Error, NLTK failed on item: "+id)
        
    objects[id] = list(set(objects_icon + nouns_spacy + nouns_nltk + nouns_textblob))
    
    print(objects[id])

SK-A-4830
['laid table as still life', 'ochre', 'range', 'damask', 'pewter', 'silver', 'table', 'yellow', 'bread, loaf', 'grey', 'glass', 'compliment', '–', 'glass of wine', 'banquet', 'mother', 'molluscs: oyster', 'interplay', 'palette', 'heda']
SK-A-4821
['pewter', 'haarlem', 'extending', 'cheese', 'laid table as still life', 'bread', 'table', 'fruit', 'painter', 'edge', 'van', 'plate', 'type', 'floris', 'painting', 'reality', 'illusion', 'dijck']
SK-A-3059
['scene', 'body', 'portrayal', 'patient, sick person', 'amsterdam', 'virgin', 'plague', 'back', 'representation', 'background', 'pieta', 'time', 's', 'wall', 'ten', 'grey', 'child', 'son', 'lap', 'mother and son(s) (family group)', 'painting', 'crucified Christ, with particular persons under the cross', 'suffering', 'poignant', 'crucifixion', 'picture, painting']
SK-A-1627
['lievens', 'workshop', 'strength', 'delilah', 's', 'influence', 'rembrandt', 'rendering', 'hair', 'lap', 'man', "Samson asleep in Delilah's lap; she is usually

['manuscript of musical score', 'viola, violoncello; double bass', 'music', 'room', 'art', 'drawing, sketch', 'amsterdam', 'armorial bearing, heraldry', 'face', 'century', 'powder', 'shoulder', 'viewer', 'sitter', 'visit', 'man', 'member', 'portrait', 'family', 'likeness', 'gentleman', 'distance', 'lack', 'wig', 'globe']
SK-C-1672
['painter', 'garden vase', 'buckle', 'society', 'fashion', 'assink', 'skirt', 'setting', 'belt', 'book - MM - book open', 'puff', 'dog', 'kruseman', 'silhouette', 'waist', 'hunting', 'hoop', 'vase', 'portraiture', 'clothing']
SK-A-4941
['kind', 'tale', 'painter', "'fahrende Leute', vagrants", 'dutch', 'scene', 'fishes', 'meaning', 'fraai', "minstrel, 'jongleur'", 'child and art', 'mieris', 'interior of the house', 'bird in a cage', 'willem', 'romance', 'manner', 'curieus', 'peepshow', 'painting', 'performer', 'van', 'title', '’', 'triptych']
SK-A-2205
Error, spacy nlp failed on item: SK-A-2205
Error, textBlob nlp failed on item: SK-A-2205
Error, NLTK failed o

['pewter', 'haarlem', 'extending', 'cheese', 'laid table as still life', 'bread', 'table', 'fruit', 'painter', 'edge', 'van', 'plate', 'type', 'floris', 'painting', 'reality', 'illusion', 'dijck']
SK-A-2213
Error, spacy nlp failed on item: SK-A-2213
Error, textBlob nlp failed on item: SK-A-2213
Error, NLTK failed on item: SK-A-2213
['rodents: rabbit', 'still life of plants, flowers and fruit', 'carpet, rug', 'monkeys, apes', 'garden fountain']
SK-A-4163
Error, spacy nlp failed on item: SK-A-4163
Error, textBlob nlp failed on item: SK-A-4163
Error, NLTK failed on item: SK-A-4163
['cow', 'portrait, self-portrait of painter (+ out of doors)', 'sheep']
SK-A-613
['picture', 'son', 'image', 'insubordination', 'command', 'admiralty', 'Torquatus has his son beheaded because he has fought an enemy in single combat against the strict orders of his father', 'amsterdam', 'headquarters', 'consul', 'story']
SK-A-2152
['butterflies', 'layer', 'impression', 'transparent', 'hermit', 'molluscs (+ shell,

RP-F-2014-7-1-3
Error, spacy nlp failed on item: RP-F-2014-7-1-3
Error, textBlob nlp failed on item: RP-F-2014-7-1-3
Error, NLTK failed on item: RP-F-2014-7-1-3
['foodstuffs; still life of foodstuffs', 'vegetables']
SK-A-2366
Error, spacy nlp failed on item: SK-A-2366
Error, textBlob nlp failed on item: SK-A-2366
Error, NLTK failed on item: SK-A-2366
['fruits', 'foodstuffs; still life of foodstuffs', 'still life of related objects']
SK-A-4779
['infidelity', 'emperor', 'group', 'carousal, drinking-bout, orgy', 'woman', 'palace', 'sillius', 'greed', 'wife', 's', 'downfall', 'end', 'centre', 'wedding', 'claudius', 'something', 'Messalina in the brothel', 'banquet', 'vine', 'drinking', 'merrymaking', 'messalina', 'adultery', 'window']
SK-A-180
['convincing', 'violin, fiddle', 'laughing', 'tapestry', 'order', 'man', 'surprise', 'frame', 'fiddler', 'window', 's', 'painting', 'viewer', 'glass of wine', 'clothing']
SK-A-2452
Error, spacy nlp failed on item: SK-A-2452
Error, textBlob nlp failed

In [30]:
# Cleaning of "objects" found in paintings. Various cleaning techniques are applied.

puncs = list(set(string.punctuation)); puncs.append("‘"); puncs.append('–'); puncs.append('’')

# clean set of objects
for id in objects:
    #print(item)
    for word in objects[id]:
        #print(word)
        
        # remove if word is larger than 3 words
        if len(word.split()) > 3:
            objects[id].remove(word)
            continue
    
        #remove unecessary parts of words
        if "(s)" in word:
            rep_word = re.sub("\(s\)","",word)
            objects[id].remove(word)
            objects[id].append(rep_word)
            continue
    
        if "'s" in word:
            rep_word = re.sub("'s","",word)
            objects[id].remove(word)
            objects[id].append(rep_word)
            continue
        
        # remove "word" if it's not a word
        if word in puncs:
            objects[id].remove(word)
            continue
    
        if word.isdigit():
            objects[id].remove(word)
            continue
    
        #remove EOW punctuation
        if word[-1] in puncs:
            rep_word = word[:-1]
            objects[id].remove(word)
            objects[id].append(rep_word)
            continue
            
#print(objects)

In [34]:
# Save objects into file
json.dump( objects, open( "objects_nlp.json", 'w' ) )