In [None]:
!pip install 

In [1]:
import json
import os
import re, string

import nltk
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
import pandas as pd
import requests as rq
import spacy
from textblob import TextBlob


In [2]:
# Load the Rijks API key from the environment so the credential is never stored
# in the notebook (or leaked through its saved outputs / version control).
key = os.environ.get("RIJKSMUSEUM_API_KEY")
if not key:
    raise RuntimeError(
        "Set the RIJKSMUSEUM_API_KEY environment variable to your Rijksmuseum API key "
        "before running this notebook."
    )

# Set language
culture = "en"

# Read Object codes list: a header-less text file with one object code per line
object_names = pd.read_csv('object_names.txt', header = None)
object_names.columns = ["object_id"]

object_names.head()

Unnamed: 0,object_id
0,SK-A-4830
1,SK-A-4821
2,SK-A-3059
3,SK-A-1627
4,SK-A-1451


In [3]:
# Load paintings in list from Rijks API.
# `database` maps each object code to the decoded JSON payload returned for it.
database = {}

# A single session reuses the HTTP connection across all requests.
with rq.Session() as session:
    for object_id in object_names["object_id"]:
        print("Downloading item: "+object_id+"...")
        response = session.get(
            "https://www.rijksmuseum.nl/api/"+culture+"/collection/"+object_id,
            params={"key": key},   # let requests build and escape the query string
            timeout=30,            # never hang forever on a stalled connection
        )
        # Fail loudly on an HTTP error instead of storing an error payload
        # (or crashing later with an opaque JSON decode error).
        response.raise_for_status()
        database[object_id] = response.json()
        print("Done.")

Downloading item: SK-A-4830...
Done.
Downloading item: SK-A-4821...
Done.
Downloading item: SK-A-3059...
Done.
Downloading item: SK-A-1627...
Done.
Downloading item: SK-A-1451...
Done.
Downloading item: SK-A-4646...
Done.
Downloading item: SK-A-137...
Done.
Downloading item: SK-A-4820...
Done.
Downloading item: SK-C-1458...
Done.
Downloading item: SK-A-335...
Done.
Downloading item: SK-A-3988...
Done.
Downloading item: RP-P-H-1086...
Done.
Downloading item: SK-A-671...
Done.
Downloading item: SK-A-1751...
Done.
Downloading item: SK-A-4867...
Done.
Downloading item: SK-A-4052...
Done.
Downloading item: SK-A-718...
Done.
Downloading item: SK-C-206...
Done.
Downloading item: SK-A-385...
Done.
Downloading item: SK-A-4717...
Done.
Downloading item: SK-A-390...
Done.
Downloading item: SK-A-3467...
Done.
Downloading item: SK-C-187...
Done.
Downloading item: SK-C-301...
Done.
Downloading item: SK-A-2180...
Done.
Downloading item: SK-C-149...
Done.
Downloading item: SK-C-152...
Done.
Downloadin

In [6]:
# Save paintings metadata into file.
# The context manager guarantees the file is flushed and closed, even on error.
with open("paintings_metadata.json", 'w') as metadata_file:
    json.dump(database, metadata_file)

In [5]:
# NLP
# For each painting, run three different nlp models (spacy, nltk, textblob)
# and save the "dirty" findings along with IconClasses values extracted from metadata.
# Objects that have no text description keep only their IconClasses values.

nlp_spacy = spacy.load('en_core_web_sm')


def _nouns_spacy(text):
    """Tokens spaCy tags as NOUN whose dependency label is 'nsubjpass' or 'conj'."""
    return [token.text for token in nlp_spacy(text)
            if token.pos_ == 'NOUN' and token.dep_ in ('nsubjpass', 'conj')]


def _nouns_textblob(text):
    """Words TextBlob tags as 'NN'."""
    return [word for word, tag in TextBlob(text).tags if tag == 'NN']


def _nouns_nltk(text):
    """Tokens NLTK's pos_tag tags as 'NN'."""
    return [word for word, tag in nltk.pos_tag(nltk.word_tokenize(text)) if tag == 'NN']


# (message printed on failure, extractor) -- each model is tried independently so
# that one failing model does not discard the findings of the others.
noun_extractors = [
    ("Error, spacy nlp failed on item: ", _nouns_spacy),
    ("Error, textBlob nlp failed on item: ", _nouns_textblob),
    ("Error, NLTK failed on item: ", _nouns_nltk),
]

# object code -> sorted list of unique candidate object names
objects = {}

for object_id in object_names["object_id"]:
    print(object_id)
    art_object = database[object_id]["artObject"]

    desc = art_object["label"]["description"]

    # get objects on painting from iconClass (tolerate a null value)
    found = set(art_object["classification"]["iconClassDescription"] or [])

    if desc:
        text = desc.lower()
        for error_prefix, extract in noun_extractors:
            try:
                found.update(extract(text))
            except Exception as err:
                # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works
                print(error_prefix + object_id + " (" + repr(err) + ")")
    else:
        print("No description for item: " + object_id)

    # sorted -> the result (and the JSON saved from it) is reproducible across runs
    objects[object_id] = sorted(found)

    print(objects[object_id])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [30]:
# Cleaning of "objects" found in paintings. Various cleaning techniques are applied.

puncs = list(set(string.punctuation)) + ["‘", "–", "’"]


def _clean_object_name(word):
    """Return the cleaned form of `word`, or None when it should be discarded.

    Applied in order:
      * phrases longer than 3 words are discarded;
      * every "(s)" and "'s" is removed;
      * trailing punctuation (and surrounding whitespace) is stripped;
      * whatever ends up empty or purely numeric is discarded.
    """
    # remove if word is larger than 3 words
    if len(word.split()) > 3:
        return None

    # remove unnecessary parts of words
    word = word.replace("(s)", "").replace("'s", "").strip()

    # remove EOW punctuation; a punctuation-only token shrinks to "" here
    while word and word[-1] in puncs:
        word = word[:-1].rstrip()

    # remove "word" if it's not a word
    if not word or word.isdigit():
        return None
    return word


# Build a fresh list per painting instead of removing/appending while iterating:
# mutating the list being looped over silently skips the element after each removal.
for object_id in objects:
    cleaned = (_clean_object_name(word) for word in objects[object_id])
    # dict.fromkeys drops duplicates created by the cleaning while keeping order
    objects[object_id] = list(dict.fromkeys(word for word in cleaned if word is not None))

In [34]:
# Save objects into file.
# The context manager guarantees the file is flushed and closed, even on error.
with open("objects_nlp.json", 'w') as objects_file:
    json.dump(objects, objects_file)