In [None]:
import pandas as pd
import requests as rq
import json
import spacy
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob
import re, string
from deep_translator import GoogleTranslator
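# One-time setup: uncomment to download the NLTK corpora on first run.
# nltk.download('stopwords')
# nltk.download('brown')
# nltk.download('punkt')
# NOTE: 'en_core_web_sm' is a spaCy model, not an NLTK resource; install it with
#   python -m spacy download en_core_web_sm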

In [None]:
# Load the raw painting metadata from its save file.
# The context manager closes the file handle as soon as parsing is done.
with open("paintings_metadata_raw.json") as raw_file:
    database = json.load(raw_file)

In [None]:
# -----------Data Preprocessing-------------------
# For descriptions missing in English, look for the Dutch description and translate it.
# For completely missing descriptions, replace the None English description with an empty string.

translator = GoogleTranslator(source = 'nl', target='en')

for painting_id in database:
    art_object = database[painting_id]["artObject"]

    # Only entries whose English description is absent (None) need back-filling.
    if art_object["label"]["description"] is None:
        desc_nl = art_object["description"]
        # Translate the Dutch text when there is one; otherwise fall back to "".
        desc_en = translator.translate(desc_nl) if desc_nl else ""
        art_object["label"]["description"] = desc_en

# Save processed metadata into file. The context manager makes sure the data is
# flushed and the handle closed before a later cell reads the file back.
with open("paintings_metadata.json", 'w') as processed_file:
    json.dump(database, processed_file)

In [None]:
# Load processed data from save file.
# The context manager releases the file handle right after parsing.
with open("paintings_metadata.json") as processed_file:
    database = json.load(processed_file)

In [None]:
# Per-painting lookup tables, keyed by the same ids as `database`.
paintings_descriptions = {}
paintings_painter = {}

for painting_id in database:
    art_object = database[painting_id]["artObject"]

    # A falsy value (None or "") means no description is available in Dutch or English.
    paintings_descriptions[painting_id] = art_object["label"]["description"] or ""

    # A falsy value means the painter is unavailable.
    paintings_painter[painting_id] = art_object["principalMaker"] or ""

# Save descriptions into file (context managers flush and close the handles).
with open("Paintings_descriptions.json", 'w') as descriptions_file:
    json.dump(paintings_descriptions, descriptions_file)
# Save painters into file
with open("Paintings_painters.json", 'w') as painters_file:
    json.dump(paintings_painter, painters_file)

# Both tables were only needed for the exports above.
del paintings_descriptions,paintings_painter

In [None]:
# ------------NLP-----------------
# For each painting, run three different NLP models (spaCy, NLTK, TextBlob)
# and save the "dirty" findings along with the IconClass values extracted from metadata.
# A model that fails on an item is reported (with the error) and contributes no nouns
# for that item; the other models still run.

nlp_spacy = spacy.load('en_core_web_sm')

uncleaned_objects = {}

for painting_id in database:
    art_object = database[painting_id]["artObject"]

    # Lower-case the description once. A missing (None) description is treated like
    # an empty one, so the taggers simply find nothing instead of raising.
    text = (art_object["label"]["description"] or "").lower()

    # Objects on the painting according to its IconClass annotations
    # (a missing value is treated as "no annotations").
    objects_icon = art_object["classification"]["iconClassDescription"] or []

    # spaCy: nouns that are passive subjects or conjuncts.
    nouns_spacy = []
    try:
        nouns_spacy = [
            token.text
            for token in nlp_spacy(text)
            if token.pos_ == 'NOUN' and token.dep_ in ('nsubjpass', 'conj')
        ]
    except Exception as exc:  # keep going, but do not swallow KeyboardInterrupt/SystemExit
        print(f"Error, spacy nlp failed on item: {painting_id} ({exc!r})")

    # TextBlob: words tagged as singular common noun ('NN').
    nouns_textblob = []
    try:
        # `tag == 'NN'` replaces `tag in ('NN')`, which was a substring test on a string.
        nouns_textblob = [word for word, tag in TextBlob(text).tags if tag == 'NN']
    except Exception as exc:
        print(f"Error, textBlob nlp failed on item: {painting_id} ({exc!r})")

    # NLTK: words tagged as singular common noun ('NN').
    nouns_nltk = []
    try:
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        nouns_nltk = [word for word, tag in tagged if tag == 'NN']
    except Exception as exc:
        print(f"Error, NLTK failed on item: {painting_id} ({exc!r})")

    # De-duplicate the candidates; sorting makes the result reproducible across runs
    # (plain set iteration order of strings changes with hash randomisation).
    uncleaned_objects[painting_id] = sorted(
        set(objects_icon + nouns_spacy + nouns_nltk + nouns_textblob)
    )


# The metadata is no longer needed once the candidates have been extracted.
del database

In [None]:
def appendValueToDictEntry(val, dest_dict, id):
    """Add ``val`` to the list stored under ``dest_dict[id]``, skipping duplicates.

    Args:
        val: Value to record.
        dest_dict: Dictionary mapping keys to lists; modified in place.
        id: Key whose list should receive ``val``. A new single-element list
            is created when the key is not present yet.

    Returns:
        None. ``dest_dict`` is updated in place.
    """
    # Create the list on first use, then append only unseen values. The previous
    # if/else reset the whole list to [val] whenever val was already present,
    # silently dropping every value collected before it.
    values = dest_dict.setdefault(id, [])
    if val not in values:
        values.append(val)

In [None]:
# Clean the raw "objects" found for every painting: drop long phrases, strip
# punctuation, split into single tokens and discard tokens that carry no meaning.

sw = set(nltk.corpus.stopwords.words('english'))
puncs = list(set(string.punctuation))
puncs += ["‘", '–', '’']

objects_cleaned = {}

for painting_id, phrases in uncleaned_objects.items():
    for phrase in phrases:
        # Phrases of more than 3 words are dropped entirely.
        if len(phrase.split()) > 3:
            continue

        # Strip every character that is neither a word character nor whitespace.
        stripped = re.sub(r'[^\w\s]','',phrase)

        for token in stripped.split():
            # Keep only tokens of 3+ characters that are not pure digits,
            # stop words or punctuation marks.
            is_meaningful = (
                len(token) >= 3
                and not token.isdigit()
                and token not in sw
                and token not in puncs
            )
            if is_meaningful:
                appendValueToDictEntry(token, objects_cleaned, painting_id)

#del uncleaned_objects

In [None]:
# WordNet filtering:
# keep only words that have at least one noun sense whose lexicographer file name
# mentions food, plant, animal or artifact. Words are lemmatized first, which also
# turns plurals into singulars.

objects_final = {}

lem = WordNetLemmatizer()
wanted_categories = ['food','plant','animal','artifact']

for painting_id, candidates in objects_cleaned.items():
    for candidate in candidates:
        lemma = lem.lemmatize(candidate)

        # One matching sense is enough; any() stops at the first hit.
        has_wanted_sense = any(
            category in synset.lexname()
            for synset in wn.synsets(lemma, pos = wn.NOUN)
            for category in wanted_categories
        )
        if has_wanted_sense:
            appendValueToDictEntry(lemma, objects_final, painting_id)

#del objects_cleaned

In [None]:
# Remove hypernyms. For example, if both "insect" and "butterfly" are on a painting's
# list, remove "insect" and keep the more specific "butterfly".
#
# Fixes over the previous version of this cell:
#  * it iterated over `objects_final` (the painting ids) instead of the words of the
#    current painting, so no word was ever looked up in WordNet;
#  * it removed items from the list while scanning, and a second match for the same
#    word would have raised ValueError;
#  * it used substring matching on the synset name, so e.g. "ant" matched "plant.n.02".

for painting_id in objects_final:
    words = objects_final[painting_id]

    # Words of this painting that name a direct hypernym of another word on the list.
    redundant = set()
    for word in words:
        for syn in wn.synsets(word, pos = wn.NOUN):
            for hypernym in syn.hypernyms():
                # Synset names look like "lepidopterous_insect.n.01": drop the
                # ".<pos>.<nn>" suffix and split multi-word heads so that whole
                # words (here "insect") are compared rather than substrings.
                head = hypernym.name().rsplit('.', 2)[0]
                for part in head.split('_'):
                    if part != word and part in words:
                        redundant.add(part)

    # Rebuild the list once, preserving the original order of the kept words.
    objects_final[painting_id] = [word for word in words if word not in redundant]

In [None]:
# Save objects into file. The context manager flushes and closes the handle,
# so the JSON is completely written when the cell finishes.
with open("objects_nlp.json", 'w') as objects_file:
    json.dump(objects_final, objects_file)

In [None]:
# for id in objects_final:
#     print("ID: ", id)
#     print("Uncleaned: ", uncleaned_objects[id])
#     print("Cleaned: ", objects_cleaned[id])
#     print("Final: ", objects_final[id])
#     print("--------------------------------------------")