In [41]:
import pandas as pd
import requests as rq
import json
import spacy
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('stopwords')
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
import re, string
from deep_translator import GoogleTranslator

In [42]:
# Load Savvas's key to Rijks API
key = "3AzvvBJ0"

# Set language
culture = "en"

# Read Object codes list
object_names = pd.read_csv('object_names.txt', header = None)
object_names.columns = ["object_id"]

object_names.head()

Unnamed: 0,object_id
0,SK-A-4830
1,SK-A-4821
2,SK-A-3059
3,SK-A-1627
4,SK-A-1451


In [43]:
# Load paintings in list from Rijks API
database = {}

for id in object_names["object_id"]:
    print("Downloading item: "+id+"...")
    database[id] = rq.get("https://www.rijksmuseum.nl/api/"+culture+"/collection/"+id+"?key="+key).json()
    print("Done.")

Downloading item: SK-A-4830...
Done.
Downloading item: SK-A-4821...
Done.
Downloading item: SK-A-3059...
Done.
Downloading item: SK-A-1627...
Done.
Downloading item: SK-A-1451...
Done.
Downloading item: SK-A-4646...
Done.
Downloading item: SK-A-137...
Done.
Downloading item: SK-A-4820...
Done.
Downloading item: SK-C-1458...
Done.
Downloading item: SK-A-335...
Done.
Downloading item: SK-A-3988...
Done.
Downloading item: RP-P-H-1086...
Done.
Downloading item: SK-A-671...
Done.
Downloading item: SK-A-1751...
Done.
Downloading item: SK-A-4867...
Done.
Downloading item: SK-A-4052...
Done.
Downloading item: SK-A-718...
Done.
Downloading item: SK-C-206...
Done.
Downloading item: SK-A-385...
Done.
Downloading item: SK-A-4717...
Done.
Downloading item: SK-A-390...
Done.
Downloading item: SK-A-3467...
Done.
Downloading item: SK-C-187...
Done.
Downloading item: SK-C-301...
Done.
Downloading item: SK-A-2180...
Done.
Downloading item: SK-C-149...
Done.
Downloading item: SK-C-152...
Done.
Downloadin

In [44]:
# Save paintings metadata into file
json.dump( database, open( "paintings_metadata.json", 'w' ) )

In [45]:
database = json.load(open("paintings_metadata.json"))

In [46]:
# For descriptions missing in english, look for Dutch descriptions and translate them.
# For completely missing descriptions, replace None english description with en empty String

translator = GoogleTranslator(source = 'nl', target='en')

for id in object_names["object_id"]:
    print(id)
    desc = database[id]["artObject"]["label"]["description"]
    if desc == None:
        #print(id)
        desc_nl = database[id]["artObject"]["description"]
        #print(desc_nl)
        if desc_nl:
            desc_en = translator.translate(desc_nl)
        else:
            desc_en = ""
        database[id]["artObject"]["label"]["description"] = desc_en

SK-A-4830
SK-A-4821
SK-A-3059
SK-A-1627
SK-A-1451
SK-A-4646
SK-A-137
SK-A-4820
SK-C-1458
SK-A-335
SK-A-3988
RP-P-H-1086
SK-A-671
SK-A-1751
SK-A-4867
SK-A-4052
SK-A-718
SK-C-206
SK-A-385
SK-A-4717
SK-A-390
SK-A-3467
SK-C-187
SK-C-301
SK-A-2180
SK-C-149
SK-C-152
SK-C-147
SK-A-180
SK-A-4981
SK-A-3518
SK-A-86
SK-A-3924
SK-A-690
SK-A-3962
SK-C-229
SK-A-2344
SK-A-3948
SK-C-1672
SK-A-4941
SK-A-2205
SK-A-3930
SK-C-535
SK-A-4908
SK-A-4290
SK-A-4152
SK-C-185
SK-A-263
SK-A-4157
SK-A-752
SK-A-1857
SK-A-662
SK-A-1595
SK-A-4054
SK-A-4968
SK-A-3988
SK-A-2388
SK-A-4163
SK-C-149
SK-A-4118
SK-C-177
SK-C-112
SK-C-134
SK-A-98
SK-A-113
SK-A-1649
SK-A-3103
SK-A-3254
SK-A-3995
SK-A-2565
SK-C-610
SK-A-3103
SK-A-2836
SK-C-291
SK-A-2713
SK-A-4981
SK-A-4839
SK-A-3123
SK-A-4830
SK-A-4821
SK-A-2213
SK-A-4163
SK-A-613
SK-A-2152
SK-A-4646
SK-A-3930
SK-C-1672
SK-A-2150
SK-A-671
SK-A-718
SK-A-3347
SK-C-535
SK-C-177
SK-A-4908
SK-A-4098
SK-A-2962
SK-A-93
SK-A-1130
SK-A-129
SK-A-4941
SK-A-4646
SK-A-3930
SK-A-199
SK-A-490

In [47]:
# NLP
# For each painting, run three different nlp models (spacy, nltk, textblob) 
# and save the "dirty" findings along with IconClasses values extracted from metadata.
# Exceptions will occur on objects that have no text description 

nlp_spacy = spacy.load('en_core_web_sm')

uncleaned_objects = {}

for id in object_names["object_id"]:
    #print(id)
    
    desc = database[id]["artObject"]["label"]["description"]
    #print(desc)
    
    # get objects on painting from iconClass
    objects_icon = database[id]["artObject"]["classification"]["iconClassDescription"]
    
    #print(objects)
    nouns_spacy = []
    try:
        doc = nlp_spacy(desc.lower())
    
        for token in doc:
            if token.pos_=='NOUN' and (token.dep_ in ['nsubjpass','conj']):
                nouns_spacy.append(token.text)
    except:
        print("Error, spacy nlp failed on item: "+id)
    
    nouns_textblob = []
    try:
        blob = TextBlob(desc.lower())
        
        nouns_textblob = [word for word, tag in blob.tags if tag in ('NN')]
    except:
        print("Error, textBlob nlp failed on item: "+id)
    
    nouns_nltk = []
    try:
        tokenized = nltk.word_tokenize(desc.lower())
        tagged = nltk.pos_tag(tokenized)
        
        for token in tagged:
            if token[1]=='NN':
                nouns_nltk.append(token[0])
    except:
        print("Error, NLTK failed on item: "+id)
        
    uncleaned_objects[id] = list(set(objects_icon + nouns_spacy + nouns_nltk + nouns_textblob))
    
    #print(uncleaned_objects[id])

In [48]:
# Cleaning of "objects" found in paintings. Various cleaning techniques are applied.

sw = set(nltk.corpus.stopwords.words('english'))
puncs = list(set(string.punctuation)); puncs.append("‘"); puncs.append('–'); puncs.append('’')

objects = {}

# clean set of objects
for id in uncleaned_objects:
    #print(id)
    for word in uncleaned_objects[id]:
        # print(word)
        
        # remove if word is larger than 3 words
        if len(word.split()) > 3:
            continue
    
        # remove any punctuation
        word = re.sub(r'[^\w\s]','',word)
        tokenized = word.split()
        
        for token in tokenized:
            # skip letters, numbers, puncs
            if (len(token) < 3) or token.isdigit() or (token in sw) or (token in puncs):
                continue
            
            # add object in dict
            if (id in objects) and (token not in objects[id]):
                objects[id].append(token)
            else:
                objects[id] = list([token])

In [49]:
for id in objects:
    print(id)
    print(objects[id])
#print(objects)

SK-A-4830
['glass', 'interplay', 'heda', 'silver', 'pewter', 'bread', 'loaf']
SK-A-4821
['painter', 'van', 'type', 'cheese', 'extending', 'fruit', 'bread', 'pewter', 'edge', 'floris', 'table', 'reality', 'plate', 'dijck', 'painting', 'haarlem', 'illusion']
SK-A-3059
['painting', 'child']
SK-A-1627
['workshop', 'samson', 'lievens', 'strength', 'work', 'man', 'lay', 'source', 'rembrandt', 'pair', 'secret', 'asleep', 'delilah', 'hair', 'influence', 'rendering', 'lap']
SK-A-1451
['kitchenpersonnel', 'foreground', 'game', 'display', 'background', 'vegetables', 'kitchen', 'mary', 'poultry', 'left', 'plates', 'martha', 'spit', 'fruits', 'chicken', 'pitchers', 'jesus', 'fireplace', 'food']
SK-A-4646
['savoury', 'fruit', 'ginger', 'instance', 'plate', 'century', 'molluscs', 'oyster', 'painter', 'banquet', 'piece', 'claesz', 'pieter', 'shell', 'cloves', 'mace', 'porcelain', 'cinnamon', 'asia', 'beginning', 'presence']
SK-A-137
['wine', 'drinking']
SK-A-4820
['leaf', 'effect', 'candlelight', 'art

In [56]:
# wordnet usage analysis
# filter out anything that is not considered lexicographically as Food, Plant, Animal or Artifact
# also, convert any words in plural to singular

objects_new = {}

lem = WordNetLemmatizer()

for id in objects:
    #print(id)
    for word in objects[id]:
        
        word = word = lem.lemmatize(word)
        syns = wn.synsets(word, pos = wn.NOUN)
        
        for syn in syns:
            #print("SYN")
            if any(x in syn.lexname() for x in ['food','plant','animal','artifact']):
                #print(word, " -->", syn.lexname())
                if id in objects_new:
                    objects_new[id].append(word)
                else:
                    objects_new[id] = list([word])
                break
                #continue

In [57]:
objects_new

{'SK-A-4830': ['glass', 'silver', 'bread', 'loaf'],
 'SK-A-4821': ['painter',
  'van',
  'type',
  'cheese',
  'fruit',
  'bread',
  'edge',
  'table',
  'plate',
  'painting'],
 'SK-A-3059': ['painting'],
 'SK-A-1627': ['workshop',
  'work',
  'man',
  'source',
  'hair',
  'rendering',
  'lap'],
 'SK-A-1451': ['foreground',
  'game',
  'display',
  'background',
  'vegetable',
  'kitchen',
  'poultry',
  'left',
  'plate',
  'spit',
  'fruit',
  'chicken',
  'pitcher',
  'fireplace',
  'food'],
 'SK-A-4646': ['savoury',
  'fruit',
  'ginger',
  'plate',
  'mollusc',
  'oyster',
  'painter',
  'banquet',
  'piece',
  'shell',
  'clove',
  'mace',
  'porcelain',
  'cinnamon'],
 'SK-A-137': ['wine'],
 'SK-A-4820': ['leaf', 'scene', 'novelty', 'genre'],
 'SK-C-1458': ['baby', 'altarpiece', 'king', 'panel'],
 'SK-A-335': ['lobster', 'grape', 'orange'],
 'SK-A-3988': ['glass',
  'wine',
  'tobacco',
  'pitcher',
  'rummer',
  'knife',
  'test'],
 'RP-P-H-1086': ['candle'],
 'SK-A-671': ['k

In [58]:
# remove hypernyms. For exmaple, if both "insect" and "butterfly" are on list, remove "insect"

for id in objects_new:
    #print(id)
    for word in objects_new[id]:
        #print(word)
        syns = wn.synsets(word, pos = wn.NOUN)
        for syn in syns:
            for hypernym in syn.hypernyms():
                #print(hypernym.name())
                if any(x in hypernym.name() for x in objects_new[id]):               
                    for hyper_word in objects_new[id]:
                        if hyper_word in hypernym.name() and hyper_word != word:
                            #print(hyper_word)
                            #print(word)
                            objects_new[id].remove(hyper_word)

In [59]:
objects_new

{'SK-A-4830': ['glass', 'silver', 'loaf'],
 'SK-A-4821': ['painter',
  'van',
  'type',
  'cheese',
  'fruit',
  'bread',
  'edge',
  'table',
  'plate',
  'painting'],
 'SK-A-3059': ['painting'],
 'SK-A-1627': ['workshop', 'source', 'hair', 'rendering', 'lap'],
 'SK-A-1451': ['foreground',
  'game',
  'display',
  'background',
  'vegetable',
  'kitchen',
  'left',
  'plate',
  'spit',
  'fruit',
  'chicken',
  'pitcher',
  'fireplace',
  'food'],
 'SK-A-4646': ['savoury',
  'fruit',
  'ginger',
  'plate',
  'mollusc',
  'oyster',
  'painter',
  'banquet',
  'piece',
  'clove',
  'mace',
  'porcelain',
  'cinnamon'],
 'SK-A-137': ['wine'],
 'SK-A-4820': ['leaf', 'scene', 'novelty', 'genre'],
 'SK-C-1458': ['baby', 'altarpiece', 'panel'],
 'SK-A-335': ['lobster', 'grape', 'orange'],
 'SK-A-3988': ['wine', 'tobacco', 'pitcher', 'rummer', 'knife', 'test'],
 'RP-P-H-1086': ['candle'],
 'SK-A-671': ['spectator', 'baby', 'palace', 'painting'],
 'SK-A-1751': ['pomegranate', 'table', 'medlar'

In [60]:
# Save objects into file
json.dump( objects_new, open( "objects_nlp.json", 'w' ) )