In [64]:
import pandas as pd
import requests as rq
import json
import spacy
import nltk
# nltk.download('stopwords')
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
import re, string
from deep_translator import GoogleTranslator

import sqlite3

In [40]:
# Load Savvas's key to Rijks API
key = "3AzvvBJ0"

# Set language
culture = "en"

# Read Object codes list
object_names = pd.read_csv('object_names.txt', header = None)
object_names.columns = ["object_id"]

object_names.head()

Unnamed: 0,object_id
0,SK-A-4830
1,SK-A-4821
2,SK-A-3059
3,SK-A-1627
4,SK-A-1451


In [41]:
# Load paintings in list from Rijks API
database = {}

for id in object_names["object_id"]:
    print("Downloading item: "+id+"...")
    database[id] = rq.get("https://www.rijksmuseum.nl/api/"+culture+"/collection/"+id+"?key="+key).json()
    print("Done.")

Downloading item: SK-A-4830...
Done.
Downloading item: SK-A-4821...
Done.
Downloading item: SK-A-3059...
Done.
Downloading item: SK-A-1627...
Done.
Downloading item: SK-A-1451...
Done.
Downloading item: SK-A-4646...
Done.
Downloading item: SK-A-137...
Done.
Downloading item: SK-A-4820...
Done.
Downloading item: SK-C-1458...
Done.
Downloading item: SK-A-335...
Done.
Downloading item: SK-A-3988...
Done.
Downloading item: RP-P-H-1086...
Done.
Downloading item: SK-A-671...
Done.
Downloading item: SK-A-1751...
Done.
Downloading item: SK-A-4867...
Done.
Downloading item: SK-A-4052...
Done.
Downloading item: SK-A-718...
Done.
Downloading item: SK-C-206...
Done.
Downloading item: SK-A-385...
Done.
Downloading item: SK-A-4717...
Done.
Downloading item: SK-A-390...
Done.
Downloading item: SK-A-3467...
Done.
Downloading item: SK-C-187...
Done.
Downloading item: SK-C-301...
Done.
Downloading item: SK-A-2180...
Done.
Downloading item: SK-C-149...
Done.
Downloading item: SK-C-152...
Done.
Downloadin

In [42]:
# Save paintings metadata into file
json.dump( database, open( "paintings_metadata.json", 'w' ) )

In [43]:
database = json.load(open("paintings_metadata.json"))

In [44]:
# For descriptions missing in english, look for Dutch descriptions and translate them.
# For completely missing descriptions, replace None english description with en empty String

translator = GoogleTranslator(source = 'nl', target='en')

for id in object_names["object_id"]:
    #print(id)
    desc = database[id]["artObject"]["label"]["description"]
    if desc == None:
        #print(id)
        desc_nl = database[id]["artObject"]["description"]
        #print(desc_nl)
        if desc_nl:
            desc_en = translator.translate(desc_nl)
        else:
            desc_en = ""
        database[id]["artObject"]["label"]["description"] = desc_en

In [45]:
# NLP
# For each painting, run three different nlp models (spacy, nltk, textblob) 
# and save the "dirty" findings along with IconClasses values extracted from metadata.
# Exceptions will occur on objects that have no text description 

nlp_spacy = spacy.load('en_core_web_sm')

uncleaned_objects = {}

for id in object_names["object_id"]:
    #print(id)
    
    desc = database[id]["artObject"]["label"]["description"]
    #print(desc)
    
    # get objects on painting from iconClass
    objects_icon = database[id]["artObject"]["classification"]["iconClassDescription"]
    
    #print(objects)
    nouns_spacy = []
    try:
        doc = nlp_spacy(desc.lower())
    
        for token in doc:
            if token.pos_=='NOUN' and (token.dep_ in ['nsubjpass','conj']):
                nouns_spacy.append(token.text)
    except:
        print("Error, spacy nlp failed on item: "+id)
    
    nouns_textblob = []
    try:
        blob = TextBlob(desc.lower())
        
        nouns_textblob = [word for word, tag in blob.tags if tag in ('NN')]
    except:
        print("Error, textBlob nlp failed on item: "+id)
    
    nouns_nltk = []
    try:
        tokenized = nltk.word_tokenize(desc.lower())
        tagged = nltk.pos_tag(tokenized)
        
        for token in tagged:
            if token[1]=='NN':
                nouns_nltk.append(token[0])
    except:
        print("Error, NLTK failed on item: "+id)
        
    uncleaned_objects[id] = list(set(objects_icon + nouns_spacy + nouns_nltk + nouns_textblob))
    
    #print(uncleaned_objects[id])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# Cleaning of "objects" found in paintings. Various cleaning techniques are applied.

sw = set(nltk.corpus.stopwords.words('english'))
puncs = list(set(string.punctuation)); puncs.append("‘"); puncs.append('–'); puncs.append('’')

objects = {}

# clean set of objects
for id in uncleaned_objects:
    #print(id)
    for word in uncleaned_objects[id]:
        # print(word)
        
        # remove if word is larger than 3 words
        if len(word.split()) > 3:
            continue
    
        # remove any punctuation
        word = re.sub(r'[^\w\s]','',word)
        tokenized = word.split()
        
        for token in tokenized:
            # skip letters, numbers, puncs
            if (len(token) < 3) or token.isdigit() or (token in sw) or (token in puncs):
                continue
            
            # add object in dict
            if (id in objects) and (token not in objects[id]):
                objects[id].append(token)
            else:
                objects[id] = list([token])

In [None]:
for id in objects:
    print(id)
    print(objects[id])
#print(objects)

In [None]:
# Save objects into file
json.dump( objects, open( "objects_nlp.json", 'w' ) )

In [74]:
add_objs_to_db(objects)

SK-A-4830 with object glass added to db
SK-A-4830 with object wine added to db
SK-A-4830 with object palette added to db
SK-A-4830 with object silver added to db
SK-A-4821 with object edge added to db
SK-A-4821 with object cheese added to db
SK-A-4821 with object bread added to db
SK-A-4821 with object fruit added to db
SK-A-4821 with object table added to db
SK-A-4821 with object reality added to db
SK-A-4821 with object extending added to db
SK-A-4821 with object painter added to db
SK-A-4821 with object floris added to db
SK-A-4821 with object van added to db
SK-A-4821 with object dijck added to db
SK-A-4821 with object haarlem added to db
SK-A-4821 with object illusion added to db
SK-A-4821 with object plate added to db
SK-A-4821 with object painting added to db
SK-A-4821 with object pewter added to db
SK-A-4821 with object type added to db
SK-A-3059 with object painting added to db
SK-A-1627 with object influence added to db
SK-A-1627 with object lay added to db
SK-A-1627 with obj

SK-A-3467 with object sprinkler added to db
SK-C-187 with object spider added to db
SK-C-187 with object fruit added to db
SK-C-187 with object peaches added to db
SK-C-187 with object oysters added to db
SK-C-187 with object bun added to db
SK-C-187 with object butterflies added to db
SK-C-187 with object quinces added to db
SK-C-187 with object watch added to db
SK-C-187 with object ribbon added to db
SK-C-187 with object lemon added to db
SK-C-187 with object tabletop added to db
SK-C-187 with object plums added to db
SK-C-187 with object chestnuts added to db
SK-C-187 with object molluscs added to db
SK-C-187 with object oyster added to db
SK-C-187 with object front added to db
SK-C-187 with object life added to db
SK-C-187 with object glass added to db
SK-C-187 with object rummer added to db
SK-C-187 with object plate added to db
SK-C-187 with object gourds added to db
SK-C-187 with object vines added to db
SK-C-187 with object pomegranate added to db
SK-C-187 with object goblet a

SK-A-3948 with object heraldry added to db
SK-A-3948 with object art added to db
SK-A-3948 with object distance added to db
SK-A-3948 with object viewer added to db
SK-A-3948 with object shoulder added to db
SK-A-3948 with object lack added to db
SK-A-3948 with object family added to db
SK-A-3948 with object man added to db
SK-C-1672 with object vase added to db
SK-C-1672 with object hunting added to db
SK-C-1672 with object waist added to db
SK-C-1672 with object assink added to db
SK-C-1672 with object kruseman added to db
SK-C-1672 with object belt added to db
SK-C-1672 with object portraiture added to db
SK-C-1672 with object fashion added to db
SK-C-1672 with object society added to db
SK-A-4941 with object manner added to db
SK-A-4941 with object fahrende added to db
SK-A-4941 with object Leute added to db
SK-A-4941 with object vagrants added to db
SK-A-4941 with object tale added to db
SK-A-4941 with object mieris added to db
SK-A-4941 with object dutch added to db
SK-A-4941 wit

SK-A-4968 with object door added to db
SK-A-4968 with object haaxman added to db
SK-A-4968 with object coach added to db
SK-A-4968 with object kind added to db
SK-A-4968 with object reference added to db
SK-A-4968 with object colonial added to db
SK-A-4968 with object authority added to db
SK-A-4968 with object regime added to db
SK-A-4968 with object sultan added to db
SK-A-4968 with object firm added to db
SK-A-4968 with object painting added to db
SK-A-2388 with object carter added to db
SK-A-2388 with object greengrocer added to db
SK-A-2388 with object canal added to db
SK-A-2388 with object view added to db
SK-A-2388 with object zuiderhavendijk added to db
SK-A-2388 with object boat added to db
SK-A-2388 with object horse added to db
SK-A-2388 with object right added to db
SK-A-2388 with object zuiderspui added to db
SK-A-2388 with object enkhuizen added to db
SK-A-2388 with object street added to db
SK-A-2388 with object man added to db
SK-A-4163 with object bakhuyzen added to d

SK-A-3123 with object books added to db
SK-A-3123 with object crucifix added to db
SK-A-3123 with object holder added to db
SK-A-3123 with object papers added to db
SK-A-2213 with object fountain added to db
SK-A-2213 with object crockery added to db
SK-A-2213 with object corn added to db
SK-A-613 with object amsterdam added to db
SK-A-613 with object son added to db
SK-A-613 with object headquarters added to db
SK-A-613 with object command added to db
SK-A-613 with object insubordination added to db
SK-A-613 with object picture added to db
SK-A-613 with object consul added to db
SK-A-613 with object admiralty added to db
SK-A-613 with object story added to db
SK-A-613 with object image added to db
SK-A-2152 with object insects added to db
SK-A-2152 with object butterfly added to db
SK-A-2152 with object shell added to db
SK-A-2152 with object glaze added to db
SK-A-2152 with object grasshoppers added to db
SK-A-2152 with object transparent added to db
SK-A-2152 with object viewer adde

In [69]:
# objects = json.load(open("objects_nlp.json"))

In [65]:
# open db
con = sqlite3.connect('rijksstudio.db')
cur = con.cursor()

In [73]:
def add_objs_to_db(objects):
    for painting in objects:
        for obj in objects[painting]:
            q = cur.execute("""INSERT OR IGNORE INTO Objects 
                   VALUES (?,?,?,?,?,?,?,?,?);""", (create_obj_id(painting,obj), painting, obj, 0, None, None, None, None, "NLP"))

            con.commit()
            print(painting + " with object " + obj + " added to db")

In [71]:
# turns a combination of painting and object name to an id
def create_obj_id(painting,obj_name):
    obj_id = ''.join([str(ord(char)-96) for char in obj_name])    # convert chars to numbers
    return (painting.split(".")[0] + obj_id) + "N"    # + N for NLP