# MonaLIA Postprocessing

In [1]:
from __future__ import print_function
import torch

import os
import sys

import numpy as np
import pandas as pd

from rdflib import Graph, URIRef, BNode, Literal
from rdflib import RDF, RDFS, XSD
from rdflib.namespace import SKOS


In [2]:
# The MonaLIA module lives next to this notebook: put the working directory on
# sys.path (once) so that it can be imported.
import importlib

working_dir = os.getcwd()
if working_dir not in sys.path:
    sys.path.append(working_dir)

import MonaLIA

# Re-execute the module so the kernel uses its current source.
importlib.reload(MonaLIA)

<module 'MonaLIA' from 'C:\\Users\\abobashe\\Documents\\MonaLIA\\Python Scripts\\MonaLIA.py'>

## Read the classification results from the IPython store of the previous script

In [3]:
%store -r classified test_set
print(classified.shape)
print(classified[0:4])

torch.Size([232, 10])
tensor([[    0,     3,     0,     1,     2,  3422,  4787,  3422,   973,
           816],
        [    0,     0,     3,     1,     2,  5461,  5461,  2048,  1677,
           813],
        [    0,     0,     1,     2,     3,  9574,  9574,   370,    45,
             8],
        [    0,     0,     1,     3,     2,  5919,  5919,  2624,   981,
           474]])


## Convert the classification result tensor to a DataFrame

In [4]:
# The first half of the tensor columns holds class indices, the second half the
# matching scores; keep only as many labels as there are columns in each half.
half_width = classified.shape[1] // 2

class_col_names = ['target', 'prediction', 'class2', 'class3', 'class4', 'class5'][:half_width]
prob_col_names = ['target_prob', 'pred_prob', 'prob2', 'prob3', 'prob4', 'prob5'][:half_width]
col_names = class_col_names + prob_col_names

classified_df = pd.DataFrame(classified.numpy(), columns=col_names)

# Rescale the stored scores by 0.0001.
classified_df[prob_col_names] = classified_df[prob_col_names].apply(pd.to_numeric) * 0.0001

# Swap every class index for its label in test_set.classes (column by column).
classified_df[class_col_names] = classified_df[class_col_names].apply(
    lambda column: column.map(lambda class_idx: test_set.classes[class_idx])
)

print(classified_df.shape)
classified_df.head()

(232, 10)


Unnamed: 0,target,prediction,class2,class3,class4,target_prob,pred_prob,prob2,prob3,prob4
0,genre iconographique,représentation scientifique,genre iconographique,ornementation,représentation non figurative,0.3422,0.4787,0.3422,0.0973,0.0816
1,genre iconographique,genre iconographique,représentation scientifique,ornementation,représentation non figurative,0.5461,0.5461,0.2048,0.1677,0.0813
2,genre iconographique,genre iconographique,ornementation,représentation non figurative,représentation scientifique,0.9574,0.9574,0.037,0.0045,0.0008
3,genre iconographique,genre iconographique,ornementation,représentation scientifique,représentation non figurative,0.5919,0.5919,0.2624,0.0981,0.0474
4,genre iconographique,genre iconographique,représentation scientifique,ornementation,représentation non figurative,0.9955,0.9955,0.0036,0.0006,0.0001


## Add image properties 

In [5]:
# Image path with the dataset root stripped off (a convenience column: the path can be inferred).
classified_df['imageFile'] = pd.Series(test_set.samples).map(
    lambda sample: sample[0].replace(test_set.root, '')
)
# Notice reference: the image file name without its directory and extension.
classified_df['ref'] = classified_df['imageFile'].map(
    lambda image_path: os.path.splitext(os.path.basename(image_path))[0]
)

##### Read image sizes from flat file (faster)

TODO: get the image size from the RDF file

In [6]:
# Flat file with one row per image (ref, imagePath, width, height).
# The path is assembled with os.path.join: the former backslash literal relied on
# unrecognised escape sequences in a non-raw string and only resolved on Windows.
csv_file_name = os.path.join('..', 'Joconde', 'main_image_size.csv')
image_size_df = pd.read_csv(csv_file_name)

print(image_size_df.shape)
image_size_df.head()

(298511, 5)


Unnamed: 0.1,Unnamed: 0,ref,imagePath,width,height
0,0,AG021125,/0059/m079057_0001758_p.jpg,215,481
1,1,AG021121,/0059/m079057_0001663_p.jpg,183,512
2,2,AG019906,/0059/m079057_0002134_p.jpg,170,488
3,3,AG019903,/0059/m079057_0002122_p.jpg,204,502
4,4,AG019754,/0059/m079057_0001895_p.jpg,197,512


##### Merge the size information into the classification dataset

In [7]:
# Left-join width and height onto the classification rows through the shared 'ref' key.
size_columns = image_size_df[['ref', 'width', 'height']]
classified_df = classified_df.merge(size_columns, how='left', on='ref')
classified_df.head()

Unnamed: 0,target,prediction,class2,class3,class4,target_prob,pred_prob,prob2,prob3,prob4,imageFile,ref,width,height
0,genre iconographique,représentation scientifique,genre iconographique,ornementation,représentation non figurative,0.3422,0.4787,0.3422,0.0973,0.0816,\genre iconographique\00000088532.jpg,88532,746,512
1,genre iconographique,genre iconographique,représentation scientifique,ornementation,représentation non figurative,0.5461,0.5461,0.2048,0.1677,0.0813,\genre iconographique\00000101480.jpg,101480,756,512
2,genre iconographique,genre iconographique,ornementation,représentation non figurative,représentation scientifique,0.9574,0.9574,0.037,0.0045,0.0008,\genre iconographique\00000102103.jpg,102103,756,512
3,genre iconographique,genre iconographique,ornementation,représentation scientifique,représentation non figurative,0.5919,0.5919,0.2624,0.0981,0.0474,\genre iconographique\00000102722.jpg,102722,382,512
4,genre iconographique,genre iconographique,représentation scientifique,ornementation,représentation non figurative,0.9955,0.9955,0.0036,0.0006,0.0001,\genre iconographique\00000106034.jpg,106034,673,512


## Combine the classification results with the KB data

In [8]:
# Identifier of the classifier being post-processed; the alternative value is kept in the trailing comment.
classifier_name = "REPR" #"DOMN"
# Service URL handed to the MonaLIA *_service helpers
# (presumably a local SPARQL query endpoint -- TODO confirm).
wds = "http://localhost:3030/Joconde/query"

In [9]:
from MonaLIA import monalia, ns

# Start from the graph provided by the MonaLIA helper module.
g = MonaLIA.create_graph()

# Declare the classifier class and attach its vocabulary id.
clsfier = URIRef(monalia.RepresentedSubjectClassifier)
class_declaration = (clsfier, RDF.type, RDFS.Class)
vocab_id_statement = (clsfier, monalia.vocabID, Literal('REPR'))  # alternative id: 'DOMN'
g.add(class_declaration)
g.add(vocab_id_statement)

#g.namespace_manager.bind('skos', SKOS,  override=False)

In [10]:

# Look up the Joconde term for every class label of the test set.
# dtype=object is stated explicitly: the Series ends up holding term URIs, and
# relying on the implicit dtype of a data-less Series is deprecated in pandas.
class_terms = pd.Series(index=test_set.classes, dtype=object)

for t in class_terms.index:
    class_terms[t] = MonaLIA.getJocondeTermByLabel_service(wds, t)

class_terms

genre iconographique             http://data.culture.fr/thesaurus/resource/ark:...
ornementation                    http://data.culture.fr/thesaurus/resource/ark:...
représentation non figurative    http://data.culture.fr/thesaurus/resource/ark:...
représentation scientifique      http://data.culture.fr/thesaurus/resource/ark:...
dtype: object

In [11]:
%time
ref_list = []
for i, row in classified_df.iterrows():
    
    ref = row.ref
    
    classifier_bn = BNode()
    
    #g.add( (ns[ref], RDF.type , monalia.MissClassified))
    
    g.add( (ns[ref], monalia.imageClassifier, classifier_bn))
    
    g.add( (classifier_bn, RDF.type,   clsfier ))
    g.add( (classifier_bn, monalia.targetClass,          class_terms[row.target]))
    g.add( (classifier_bn, monalia.targetProbability,    Literal(round(row.target_prob,4) , datatype=XSD.float)))
    g.add( (classifier_bn, monalia.predictedClass,       class_terms[row.prediction]))
    g.add( (classifier_bn, monalia.predictedProbability, Literal(round(row.pred_prob, 4) , datatype=XSD.float)))
    g.add( (classifier_bn, monalia.imageWidth,           Literal(row.width, datatype=XSD.int)))
    g.add( (classifier_bn, monalia.imageHeight,          Literal(row.height, datatype=XSD.int)))
    
    if i % 100 == 0:
        print (i, end=', ')
    
    #Read notices' properties from the Joconde KB in batches of 1000
    ref_list.append(ref)
        

    if (i+1) % 1000 == 0:
        g = g + MonaLIA.describeJocondeNoticeList_service(wds, ref_list)
        ref_list = []

#read the remainder
if len(ref_list) > 0:
    g = g + MonaLIA.describeJocondeNoticeList_service(wds, ref_list)
        

Wall time: 0 ns
0, 100, 200, 

In [None]:
# Serialize the graph as N3 with UTF-8 encoding, then decode the result for printing.
n3_bytes = g.serialize(format='n3', encoding='utf-8')
print(n3_bytes.decode("utf-8"))

## Store the classification results in RDF

In [13]:
# Destination file for the graph, serialized as N3.
# Built with os.path.join: the former backslash literal relied on unrecognised
# escape sequences in a non-raw string and only resolved on Windows.
rdf_file_name = os.path.join('.', 'Classification Results', 'Themes_4_Dataset.ttl')
g.serialize(destination=rdf_file_name, format='n3', encoding='utf-8')

##### Read the graph if necessary

In [42]:
# NOTE(review): this cell replaces `g` with the content of a previously saved file
# (a different file from the one written in the cell above); run it only when the
# graph has to be restored rather than rebuilt.
# The path is built with os.path.join: the former backslash literal relied on
# unrecognised escape sequences in a non-raw string and only resolved on Windows.
rdf_file_name = os.path.join('.', 'Classification Results', 'class_artform_7.ttl')
g = MonaLIA.create_graph()
g.parse(rdf_file_name, format='n3', encoding='utf-8')

<Graph identifier=MonaLIA (<class 'rdflib.graph.Graph'>)>

## Update the DataFrame from the Joconde KB

### Load Thesauri

In [14]:
# Create both thesaurus graphs first, then fill each one from its RDF/XML file.
domn = Graph(identifier='DOMN')
repr = Graph(identifier='REPR')  # NOTE(review): this name shadows the built-in repr()

domn.parse('../Joconde/domnskos.rdf', format='xml', encoding='utf-8')
repr.parse('../Joconde/reprskos.rdf', format='xml', encoding='utf-8')

<Graph identifier=REPR (<class 'rdflib.graph.Graph'>)>

#### Run query against the kb data subset and thesauri

In [15]:
# SPARQL query returning one row per notice with its descriptive attributes:
#   * raw jcl: properties (domain, represented subject, technique, denomination,
#     photo credit, museum);
#   * domain terms split into art form / function / discipline according to the
#     prefLabel of a skos:broader+ ancestor;
#   * the representation type (term whose direct skos:broader parent is labelled
#     "genre de la représentation");
#   * the technique string split on ";" or "," into up to three lower-cased values;
#   * the denomination cut at the first "(", ";" or ",".
# The regular expressions use four backslashes because the text is unescaped twice:
# once as a Python string literal and once as a SPARQL string literal.
# NOTE(review): ?noticeDenomination is bound both by the plain jcl:noticeDeno optional
# and by a later BIND, and ?noticeRepr is matched but never selected -- confirm that
# both are intended.
query_str = ''' 
prefix skos: <http://www.w3.org/2004/02/skos/core#> 
prefix jcl: <http://jocondelab.iri-research.org/ns/jocondelab/>
    
select ?noticeRef 
       ?noticeDomain
       ?noticeRepresentedSubject
       ?noticeTechnique
       ?noticeDeno

       ?noticeArtForm
       ?noticeFunction
       ?noticeDiscipline

       ?noticeRepresentationType

       ?noticePhotocredit
       ?noticeMuseum 

       ?noticeTechnique1
       ?noticeTechnique2
       ?noticeTechnique3

       ?noticeDenomination
       #?noticeDenoDetails

where {

#link to notice data
#VALUES ?v {%s}.
#?notice jcl:noticeRef ?v.
?notice jcl:noticeRef ?noticeRef.



#link to domain attributes
optional{?notice jcl:noticeDomn ?noticeDomain.}
optional{?notice jcl:noticeDomnTerm ?noticeDomnTerm.
         ?noticeDomnTerm skos:prefLabel ?noticeArtForm.
         ?noticeDomnTerm skos:broader+ ?domainParentTerm.
         ?domainParentTerm skos:prefLabel "domaine par support de conservation"@fr.}
optional{?notice jcl:noticeDomnTerm ?noticeDomnTerm1.
         ?noticeDomnTerm1 skos:prefLabel ?noticeFunction.
         ?noticeDomnTerm1 skos:broader+ ?domainParentTerm1.
         ?domainParentTerm1 skos:prefLabel "domaine par type de fonction"@fr.}
optional{?notice jcl:noticeDomnTerm ?noticeDomnTerm2.
         ?noticeDomnTerm2 skos:prefLabel ?noticeDiscipline.
         ?noticeDomnTerm2 skos:broader+ ?domainParentTerm2.
         ?domainParentTerm2 skos:prefLabel "domaine disciplinaire"@fr.}
         optional{?notice jcl:noticeRepr ?noticeRepr}

#link to notice subject representation
optional{?notice jcl:noticeRepr ?noticeRepresentedSubject. }

optional{?notice jcl:noticeReprTerm ?noticeReprTerm.
         ?noticeReprTerm skos:prefLabel ?noticeRepresentationType.
         ?noticeReprTerm skos:broader ?reprParentTerm.
         ?reprParentTerm skos:prefLabel "genre de la représentation"@fr.}

optional{?notice jcl:noticePhot ?noticePhotocredit.}
optional{?notice jcl:noticeMuseo ?noticeMuseum. }

optional{?notice jcl:noticeDeno ?noticeDenomination. }

            #link to notice state of preservation -- too many missing values
            #optional{?notice jcl:noticeEtat ?noticeState.
            # reduce NoticeState classes
            #bind (lcase(?noticeState) as  ?temp)
            #bind( 
            #            if(contains( ?temp, "bon" ) , "bon",
            #            if(contains( ?temp, "moyen" ) , "moyen",
            #            if(contains( ?temp, "mauvais" ) , "mauvais",
            #           ?temp)))     as ?noticeStateReduced ). }

#link to notice techniques
optional {?notice jcl:noticeTech ?noticeTechnique.
        #split into 3 techniques
        bind( replace(?noticeTechnique, ",", ";") as ?technique ).
        bind( if(contains (?technique, ";"),  strbefore( ?technique, ";" ) , ?technique) as ?technique1 ).
        bind( if(contains (?technique, ";"),  strafter( ?technique, ";" ), "")  as ?temp1 ).  
        bind( if(contains (?temp1, ";"),  strbefore( ?temp1, ";" ) , ?temp1) as ?technique2 ).
        bind( if(contains (?temp1, ";"),  strafter( ?temp1, ";" ), "")  as ?temp2 ).
        bind( if(contains (?temp2, ";"),  strbefore( ?temp2, ";" ) , ?temp2) as ?technique3 ).       

        #remove leading, trailing and double spaces
        bind("^\\\\s+(.*?)\\\\s*$|^(.*?)\\\\s+$"  as ?regexp).
        bind( lcase( replace(?technique1, ?regexp, '$1$2'))  AS ?noticeTechnique1).
        bind( lcase( replace(?technique2, ?regexp, '$1$2'))  AS ?noticeTechnique2).
        bind( lcase( replace(?technique3, ?regexp, '$1$2'))  AS ?noticeTechnique3).    
        }

#link to notice denomination
optional {?notice jcl:noticeDeno ?noticeDeno. 
        bind( replace(replace( ?noticeDeno , ";",  "(" ), "," , "(") as ?denomination ).
        bind( if(contains (?denomination, "("),  strbefore( ?denomination, "(" ) , ?denomination) as ?denomination1 ).
        bind( if(contains (?denomination, "("),  strafter( ?denomination, "(" ), "")  as ?denomination2 ).  

        #removing leading, trailing and double spaces
        bind("^\\\\s+(.*?)\\\\s*$|^(.*?)\\\\s+$" as ?regexRemoveSpaces).
        bind( lcase( replace(?denomination1, ?regexRemoveSpaces, '$1$2'))  AS ?noticeDenomination).
        bind( lcase( replace(?denomination2, ?regexRemoveSpaces, '$1$2'))  AS ?noticeDenoDetails).
        }

       }'''

In [16]:
# Run the query over the union of the classification graph and both thesauri.
combined_graph = g + domn + repr
kb_df = MonaLIA.sparql_graph_to_dataframe(combined_graph, query_str)

query returned 262 entries


In [17]:
# Turn the literal string "None" into a proper missing value.
kb_df = kb_df.replace("None", np.nan)


In [18]:
print(kb_df.shape)
kb_df.head()

(262, 15)


Unnamed: 0,noticeRef,noticeDomain,noticeRepresentedSubject,noticeTechnique,noticeDeno,noticeArtForm,noticeFunction,noticeDiscipline,noticeRepresentationType,noticePhotocredit,noticeMuseum,noticeTechnique1,noticeTechnique2,noticeTechnique3,noticeDenomination
0,M0805001156,céramique,"ornementation (fleur, pétale, filet)",faïence ; glaçure opaque ; décor de grand feu ...,assiette (ronde),céramique,,,ornementation,© Marion Kalt,M0805,faïence,glaçure opaque,décor de grand feu,assiette
1,01720002107,sculpture,représentation non figurative,tôle : acier,maquette de sculpture,sculpture,,,représentation non figurative,© Thomas Georges - utilisation soumise à autor...,M0172,tôle : acier,,,maquette de sculpture
2,01720004544,archéologie,ornementation (fleur),"calcaire, taille directe",chancel (élément) ; plaque,,,archéologie,ornementation,"© J C. Culas, Musées de Mâcon",M0172,calcaire,taille directe,,chancel
3,000PE025272,peinture,"paysage (Piémont, lac)",peinture à l'huile ; bois,tableau,peinture,,,,© Réunion des musées nationaux - utilisation s...,M0065,peinture à l'huile,bois,,tableau
4,000PE020339,peinture,"figures bibliques (femme, Yaël, Sisera, de tro...",peinture à l'huile ; toile,tableau,peinture,,,,© Lysiane Gauthier,M0065,peinture à l'huile,toile,,tableau


In [None]:
# Count the rows per noticeRef and keep the refs that occur more than once.
c = pd.crosstab(index=kb_df.noticeRef, columns="cnt")
repeated_refs = c[c.cnt > 1]
print(repeated_refs.shape)
repeated_refs.head()

In [19]:
# Left-join the KB attributes onto the classification rows (ref <-> noticeRef).
classified_df = classified_df.merge(
    kb_df,
    how='left',
    left_on='ref',
    right_on='noticeRef',
)


In [20]:
# Preview the first rows of the merged frame.
classified_df.head()

Unnamed: 0,target,prediction,class2,class3,class4,target_prob,pred_prob,prob2,prob3,prob4,...,noticeArtForm,noticeFunction,noticeDiscipline,noticeRepresentationType,noticePhotocredit,noticeMuseum,noticeTechnique1,noticeTechnique2,noticeTechnique3,noticeDenomination
0,genre iconographique,représentation scientifique,genre iconographique,ornementation,représentation non figurative,0.3422,0.4787,0.3422,0.0973,0.0816,...,dessin,,,,© Jean de Calan,M5044,mine de plomb,papier (beige),,élément d'ensemble
1,genre iconographique,genre iconographique,représentation scientifique,ornementation,représentation non figurative,0.5461,0.5461,0.2048,0.1677,0.0813,...,dessin,,,,© Claudine Pigot - utilisation soumise à autor...,M1096,crayon feutre,papier,,
2,genre iconographique,genre iconographique,ornementation,représentation non figurative,représentation scientifique,0.9574,0.9574,0.037,0.0045,0.0008,...,dessin,,,,© Claudine Pigot - utilisation soumise à autor...,M1096,gouache,papier,couleur,
3,genre iconographique,genre iconographique,ornementation,représentation scientifique,représentation non figurative,0.5919,0.5919,0.2624,0.0981,0.0474,...,dessin,,,,© Claudine Pigot - utilisation soumise à autor...,M1096,crayon gras,papier,,
4,genre iconographique,genre iconographique,représentation scientifique,ornementation,représentation non figurative,0.9955,0.9955,0.0036,0.0006,0.0001,...,photographie,,,,© musée Condé,M5052,,,,tirage photographique


In [21]:
# Persist classified_df in the IPython store so that it can be restored with `%store -r`.
%store classified_df

Stored 'classified_df' (DataFrame)


In [22]:
# Export the merged frame as a UTF-8, tab-separated file.
classified_df.to_csv(
    'themes_262.tsv',
    encoding='utf-8',
    sep="\t",
)

In [None]:
# Remove every column that came from kb_df.
classified_df = classified_df.drop(columns=kb_df.columns)

In [23]:
# Summarise column dtypes and non-null counts of the merged frame.
classified_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262 entries, 0 to 261
Data columns (total 29 columns):
target                      262 non-null object
prediction                  262 non-null object
class2                      262 non-null object
class3                      262 non-null object
class4                      262 non-null object
target_prob                 262 non-null float64
pred_prob                   262 non-null float64
prob2                       262 non-null float64
prob3                       262 non-null float64
prob4                       262 non-null float64
imageFile                   262 non-null object
ref                         262 non-null object
width                       262 non-null int64
height                      262 non-null int64
noticeRef                   262 non-null object
noticeDomain                262 non-null object
noticeRepresentedSubject    262 non-null object
noticeTechnique             245 non-null object
noticeDeno                  