In [64]:
import pandas as pd
import requests as rq
import json
import spacy
import nltk
# nltk.download('stopwords')
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
import re, string
from deep_translator import GoogleTranslator

import sqlite3

In [40]:
# Load Savvas's key to Rijks API
key = "3AzvvBJ0"

# Set language
culture = "en"

# Read Object codes list
object_names = pd.read_csv('object_names.txt', header = None)
object_names.columns = ["object_id"]

object_names.head()

Unnamed: 0,object_id
0,SK-A-4830
1,SK-A-4821
2,SK-A-3059
3,SK-A-1627
4,SK-A-1451


In [41]:
# Load paintings in list from Rijks API
database = {}

for id in object_names["object_id"]:
    print("Downloading item: "+id+"...")
    database[id] = rq.get("https://www.rijksmuseum.nl/api/"+culture+"/collection/"+id+"?key="+key).json()
    print("Done.")

Downloading item: SK-A-4830...
Done.
Downloading item: SK-A-4821...
Done.
Downloading item: SK-A-3059...
Done.
Downloading item: SK-A-1627...
Done.
Downloading item: SK-A-1451...
Done.
Downloading item: SK-A-4646...
Done.
Downloading item: SK-A-137...
Done.
Downloading item: SK-A-4820...
Done.
Downloading item: SK-C-1458...
Done.
Downloading item: SK-A-335...
Done.
Downloading item: SK-A-3988...
Done.
Downloading item: RP-P-H-1086...
Done.
Downloading item: SK-A-671...
Done.
Downloading item: SK-A-1751...
Done.
Downloading item: SK-A-4867...
Done.
Downloading item: SK-A-4052...
Done.
Downloading item: SK-A-718...
Done.
Downloading item: SK-C-206...
Done.
Downloading item: SK-A-385...
Done.
Downloading item: SK-A-4717...
Done.
Downloading item: SK-A-390...
Done.
Downloading item: SK-A-3467...
Done.
Downloading item: SK-C-187...
Done.
Downloading item: SK-C-301...
Done.
Downloading item: SK-A-2180...
Done.
Downloading item: SK-C-149...
Done.
Downloading item: SK-C-152...
Done.
Downloadin

In [42]:
# Save paintings metadata into file
json.dump( database, open( "paintings_metadata.json", 'w' ) )

In [43]:
database = json.load(open("paintings_metadata.json"))

In [44]:
# For descriptions missing in english, look for Dutch descriptions and translate them.
# For completely missing descriptions, replace None english description with en empty String

translator = GoogleTranslator(source = 'nl', target='en')

for id in object_names["object_id"]:
    #print(id)
    desc = database[id]["artObject"]["label"]["description"]
    if desc == None:
        #print(id)
        desc_nl = database[id]["artObject"]["description"]
        #print(desc_nl)
        if desc_nl:
            desc_en = translator.translate(desc_nl)
        else:
            desc_en = ""
        database[id]["artObject"]["label"]["description"] = desc_en

In [45]:
# NLP
# For each painting, run three different nlp models (spacy, nltk, textblob) 
# and save the "dirty" findings along with IconClasses values extracted from metadata.
# Exceptions will occur on objects that have no text description 

nlp_spacy = spacy.load('en_core_web_sm')

uncleaned_objects = {}

for id in object_names["object_id"]:
    #print(id)
    
    desc = database[id]["artObject"]["label"]["description"]
    #print(desc)
    
    # get objects on painting from iconClass
    objects_icon = database[id]["artObject"]["classification"]["iconClassDescription"]
    
    #print(objects)
    nouns_spacy = []
    try:
        doc = nlp_spacy(desc.lower())
    
        for token in doc:
            if token.pos_=='NOUN' and (token.dep_ in ['nsubjpass','conj']):
                nouns_spacy.append(token.text)
    except:
        print("Error, spacy nlp failed on item: "+id)
    
    nouns_textblob = []
    try:
        blob = TextBlob(desc.lower())
        
        nouns_textblob = [word for word, tag in blob.tags if tag in ('NN')]
    except:
        print("Error, textBlob nlp failed on item: "+id)
    
    nouns_nltk = []
    try:
        tokenized = nltk.word_tokenize(desc.lower())
        tagged = nltk.pos_tag(tokenized)
        
        for token in tagged:
            if token[1]=='NN':
                nouns_nltk.append(token[0])
    except:
        print("Error, NLTK failed on item: "+id)
        
    uncleaned_objects[id] = list(set(objects_icon + nouns_spacy + nouns_nltk + nouns_textblob))
    
    #print(uncleaned_objects[id])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# Cleaning of "objects" found in paintings. Various cleaning techniques are applied.

sw = set(nltk.corpus.stopwords.words('english'))
puncs = list(set(string.punctuation)); puncs.append("‘"); puncs.append('–'); puncs.append('’')

objects = {}

# clean set of objects
for id in uncleaned_objects:
    #print(id)
    for word in uncleaned_objects[id]:
        # print(word)
        
        # remove if word is larger than 3 words
        if len(word.split()) > 3:
            continue
    
        # remove any punctuation
        word = re.sub(r'[^\w\s]','',word)
        tokenized = word.split()
        
        for token in tokenized:
            # skip letters, numbers, puncs
            if (len(token) < 3) or token.isdigit() or (token in sw) or (token in puncs):
                continue
            
            # add object in dict
            if (id in objects) and (token not in objects[id]):
                objects[id].append(token)
            else:
                objects[id] = list([token])

In [None]:
for id in objects:
    print(id)
    print(objects[id])
#print(objects)

In [None]:
# Save objects into file
json.dump( objects, open( "objects_nlp.json", 'w' ) )

In [72]:
add_objs_to_db(objects)

NameError: name 'null' is not defined

In [69]:
objects = json.load(open("objects_nlp.json"))

In [65]:
# open db
con = sqlite3.connect('rijksstudio.db')
cur = con.cursor()

In [67]:
cur.execute("""SELECT * FROM Objects""").fetchall()

[('SK-A-2152-101195G',
  'SK-A-2152.png',
  'Vase',
  0.825371503829956,
  0.7948744297027588,
  0.5773231387138367,
  0.8968730568885803,
  0.8031973242759705,
  'GOOGLE_VISION'),
 ('SK-A-2152-261821920G',
  'SK-A-2152.png',
  'Fruit',
  0.6683477163314819,
  0.03586866334080696,
  0.6656956076622009,
  0.14711251854896545,
  0.8308190703392029,
  'GOOGLE_VISION'),
 ('SK-A-2344-16518191514G',
  'SK-A-2344.png',
  'Person',
  0.8338317275047302,
  0.3135993778705597,
  0.152947798371315,
  0.7459824085235596,
  0.9234931468963623,
  'GOOGLE_VISION'),
 ('SK-A-2344-301524G',
  'SK-A-2344.png',
  'Box',
  0.8195826411247253,
  0.7758808732032776,
  0.8272160887718201,
  0.9191758036613464,
  0.9314410090446472,
  'GOOGLE_VISION'),
 ('SK-A-385-2621181492021185G',
  'SK-A-385.png',
  'Furniture',
  0.6197009086608887,
  0.7483448386192322,
  0.6829424500465393,
  0.9953608512878418,
  0.9885066151618958,
  'GOOGLE_VISION'),
 ('SK-A-390-121516G',
  'SK-A-390.png',
  'Top',
  0.62104630470275

In [70]:
def add_objs_to_db(objects):
    for painting in objects:
        for obj in objects[painting]:
            q = cur.execute("""INSERT OR IGNORE INTO Objects 
                   VALUES (?,?,?,?,?,?,?,?,?);""", (create_obj_id(painting,obj), painting, obj, 0, null, null, null, null, "NLP"))

            con.commit()
            print(painting + " with object " + obj + " added to db")

In [71]:
# turns a combination of painting and object name to an id
def create_obj_id(painting,obj_name):
    obj_id = ''.join([str(ord(char)-96) for char in obj_name])    # convert chars to numbers
    return (painting.split(".")[0] + obj_id) + "N"    # + N for NLP