# MonaLIA Full Set Scoring

In [4]:
from __future__ import print_function
import torch

import os
import sys

import numpy as np
import pandas as pd

from rdflib import Graph, URIRef, BNode, Literal
from rdflib import RDF, RDFS, XSD
from rdflib.namespace import SKOS

from itertools import compress

import torch
import torch.nn as nn

import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.models

In [5]:
# Import MonaLIA library from the package in the subfolder of the notebook folder
# The directory two levels above the current working directory is appended to
# sys.path (only once) so that the MonaLIA package can be imported from there.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import importlib 
import MonaLIA.util.metadata_helpers 
import MonaLIA.data.image_dataset
import MonaLIA.model.train as model

# Reload the helpers module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(MonaLIA.util.metadata_helpers)
#importlib.reload(MonaLIA.data.image_dataset)

from MonaLIA.data.image_dataset import JocondeDataset
from MonaLIA.util import metadata_helpers as helpers
from MonaLIA.util.metadata_helpers import monalia, jcl, notice, thesaurus


In [3]:
# Report the versions of the RDF libraries installed in this environment.
import rdflib
print('rdflib ver.', rdflib.__version__)

import SPARQLWrapper
print('SPARQLWrapper ver.', SPARQLWrapper.__version__)

rdflib ver. 5.0.0
SPARQLWrapper ver. 1.8.5


## Load Data

In [6]:
# Run configuration: dataset locations, loader options and the model checkpoint.
dataset = 'Joconde'


# NOTE(review): absolute, machine-specific paths — adjust for the local environment.
images_root = 'C:/Joconde/joconde'
descr_path = 'C:/Datasets/Joconde/forty classes'
image_description_file = os.path.join(descr_path, 'dataset1.csv')

multi_label = True   # passed to JocondeDataset as multiple_labels
multi_crop = False   # True selects the FiveCrop transform pipeline
batch_size = 4

model_name = 'inception_v3'
#model_param_file = '../output/Inception_v3_Joconde_20_classes.1000.1.pth'
model_checkpoint_file = '../../MonaLIA/output/inception_v3_Joconde_40_classes.test.1000.3.4.checkpoint.pth.tar'
model_image_size = 299   # size given to Resize / CenterCrop / FiveCrop

In [5]:
# Pick the normalization statistics for the selected model, build the evaluation
# transform, then create the dataset and its loader and print a summary.
if model_name == 'inception_v3':
    dataset_mean =  [0.5, 0.5, 0.5]
    dataset_std  =  [0.5, 0.5, 0.5]

elif model_name == 'vgg16_bn':
    # NOTE(review): `image_transforms` is not imported in any visible cell; this
    # branch would raise NameError unless it is defined elsewhere — confirm.
    dataset_mean =  image_transforms.joconde_mean_animals 
    dataset_std  =  image_transforms.joconde_std_animals 
       
else:
    raise ValueError('unexplored model')
    
if (multi_crop):
    # Five crops per image, stacked into one tensor and normalized together.
    # NOTE(review): `NormalizeMultiCrop` is not imported in any visible cell;
    # this branch would raise NameError unless it is defined elsewhere — confirm.
    test_trans = transforms.Compose([
                    transforms.Resize(max(256, model_image_size)),
                    transforms.FiveCrop(model_image_size),
                    transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])), # returns a 4D tensor
                    NormalizeMultiCrop(mean = dataset_mean,
                                         std = dataset_std)
                    ])
else:

    # Single centre crop: resize, crop to the model input size, convert to a
    # tensor and normalize.
    test_trans = transforms.Compose([
        #PadToSquare(padding_mode='wrap'),
        #transforms.Resize((model_image_size, model_image_size)), 
        transforms.Resize(model_image_size),
        transforms.CenterCrop(model_image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean = dataset_mean,
                             std = dataset_std),
    ])
    


# The 'usage' filter is commented out, so no usage-based filtering is requested.
# The extra 'ref' and 'repr' columns are requested through add_columns.
test_set = JocondeDataset(image_description_file, 
                        images_root,
                        dataset_name = 'all_classes',
                        exclude_labels= []  ,
                        label_column='label',
                        multiple_labels = multi_label, 
                        #filter_dict= {'usage': ['test']}, 
                        add_columns=['ref', 'repr'],
                        transform=test_trans)



# shuffle=False keeps the loader in dataset order; later cells pair scores[i]
# with test_set.samples[i] by position.
test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        num_workers=2)

class_count = len(test_set.classes)
class_names = test_set.classes

print('Test', test_set)
print('    Labels:', test_set.labels_count)
print()

Test Dataset JocondeDataset
    Number of datapoints: 85797
    Root location: C:/Joconde/joconde
    Description file: C:/Datasets/Joconde/forty classes\dataset1.csv
    Number of classes: 40
    Number of uniqie labels: 4893
    StandardTransform
Transform: Compose(
               Resize(size=299, interpolation=PIL.Image.BILINEAR)
               CenterCrop(size=(299, 299))
               ToTensor()
               Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
           )
    Labels: {'ange': 3382, 'arbre': 10537, 'armure': 1805, 'bateau': 4956, 'bateau à voiles': 1678, 'casque': 1524, 'cavalier': 2333, 'chapeau': 2701, 'cheval': 7928, 'chien': 3344, 'château': 2898, 'couronne': 3193, 'croix': 2296, 'de face': 3766, 'de profil': 5329, 'drapeau': 1248, 'draperie': 3083, 'en buste': 10458, 'feuille': 1390, 'fleur': 5972, 'lion': 1274, 'livre': 3074, 'main': 2272, 'maison': 5164, 'mer': 1944, 'montagne': 2209, 'mouton': 1290, 'nu': 8009, 'nuage': 1291, 'nudité': 2218, 'oiseau': 48

## Load Model

In [5]:
# Load the training checkpoint and list the entries it contains.
# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a
# machine without CUDA; load_state_dict later copies the weights into the model
# on whatever device the model lives on.
# NOTE: torch.load unpickles the file — only open checkpoints from a trusted source.
checkpoint = torch.load(model_checkpoint_file, map_location='cpu')
print(checkpoint.keys())

dict_keys(['epoch', 'arch', 'state_dict', 'best_acc', 'classes', 'threshold', 'elapsed_time'])


In [9]:
# Display the class names stored in the checkpoint.
checkpoint['classes']

['not mer', 'mer']

In [24]:
# Keep the second class name stored in the checkpoint.
theClass = checkpoint['classes'][1]

In [11]:
# Select the compute device: the first GPU when CUDA is available, else the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Using cuda? ', use_cuda)

# Build the network with one output per checkpoint class, move it to the
# device, then restore the trained weights from the checkpoint.
net = model.load_net(model_name=model_name,
                     class_count=len(checkpoint['classes']))
net = net.to(device)
net.load_state_dict(checkpoint['state_dict'])

print(net.transform_input)

Using cuda?  True
False


In [13]:
# Run model.score over test_loader with the network, using softmax as the activation.
# NOTE(review): a later cell reads the scores from '..\\output\\scores.pt'; where
# model.score writes 'scores.pt' is not visible here — confirm the two paths agree.
activation = torch.softmax
scores = model.score(net, test_loader, activation, save_to_file='scores.pt')

images total: 1000 of 85797
images total: 2000 of 85797
images total: 3000 of 85797
images total: 4000 of 85797
images total: 5000 of 85797
images total: 6000 of 85797
images total: 7000 of 85797
images total: 8000 of 85797
images total: 9000 of 85797
images total: 10000 of 85797
images total: 11000 of 85797
images total: 12000 of 85797
images total: 13000 of 85797
images total: 14000 of 85797
images total: 15000 of 85797
images total: 16000 of 85797
images total: 17000 of 85797
images total: 18000 of 85797
images total: 19000 of 85797
images total: 20000 of 85797
images total: 21000 of 85797
images total: 22000 of 85797
images total: 23000 of 85797
images total: 24000 of 85797
images total: 25000 of 85797
images total: 26000 of 85797
images total: 27000 of 85797
images total: 28000 of 85797
images total: 29000 of 85797
images total: 30000 of 85797
images total: 31000 of 85797
images total: 32000 of 85797
images total: 33000 of 85797
images total: 34000 of 85797
images total: 35000 of 

## Read the scores file

In [6]:
# Read the saved scores back as a CPU tensor.
# os.path.join keeps the relative path portable (hard-coded backslashes only
# resolve on Windows). map_location='cpu' allows a tensor that was saved from a
# GPU to be loaded on a machine without CUDA; the trailing .cpu() is then a
# no-op and is kept only as a safeguard.
scores = torch.load(os.path.join('..', 'output', 'scores.pt'), map_location='cpu').cpu()

In [7]:
# Display the shape of the loaded score tensor.
scores.shape

torch.Size([85797, 2])

## Calculate mAP

In [15]:
# Macro-averaged average precision of the scores against the dataset targets,
# computed only on rows whose target vector sums to more than zero.
import sklearn.metrics as metrics
# Targets are truncated to the number of score rows that are available.
y_true = np.array(test_set.targets[:scores.shape[0]])
y_score = scores.cpu().detach().numpy()

# Boolean mask selecting the rows with a non-zero target sum.
labeled = np.sum(y_true ,  axis=1) > 0

# NOTE(review): y_true comes from test_set while the y_score columns come from the
# scored model; confirm both use the same classes in the same column order.
val_mAP = metrics.average_precision_score(y_true[labeled], y_score[labeled], average='macro')
val_mAP

0.10830765294429619

## Combine the classification results with the  KB data

In [8]:
# Query endpoint of the local Joconde dataset (presumably SPARQL — confirm); it is
# passed to the helper that looks up thesaurus terms by label.
wds = "http://localhost:3030/Joconde/query"

In [27]:
# Look up the thesaurus term for every class label stored in the checkpoint.
# The result is a Series keyed by class label; a label for which the lookup
# returns None keeps None as its value.
#class_terms =  pd.Series(index=test_set.classes)
class_terms = pd.Series(index=checkpoint['classes'], dtype=object)

for class_label in class_terms.index:
    class_terms[class_label] = helpers.getJocondeTermByLabel_service(wds, class_label)

# Remove the column width limit so long values are displayed in full.
pd.set_option('display.max_colwidth', None)
class_terms

not mer                                                             None
mer        http://data.culture.fr/thesaurus/resource/ark:/67717/T523-618
dtype: object

In [91]:
# Display the class labels used as the index of class_terms.
class_terms.index

Index(['not mer', 'mer'], dtype='object')

In [47]:
# Metadata for the multi-label "10 classes" classifier.
# NOTE(review): the following cell reassigns every one of these variables for the
# binary classifier, so in a top-to-bottom run these values are overwritten
# before the graph is built.
classifier_vocab = "REPR" #"DOMN"
classifier_name = '10 classes'
classifier_descr = "Classifier trained on images labeled by the top 10 most populated terms from the MiC's list of 100"
classifier_type = monalia.classifierRepresentedSubjectMultiLabel
classifier_id = monalia.classifierTenClasses

In [82]:
# Metadata for the binary classifier of the second class stored in the checkpoint.
theClass = checkpoint['classes'][1]

# File name (without directories) of the checkpoint, quoted in the description.
param_file_name = os.path.basename(model_checkpoint_file)

classifier_vocab = "REPR" #"DOMN"
classifier_name = theClass
classifier_type = monalia.classifierRepresentedSubjectBinary
classifier_id = monalia['classifier_%s' % theClass]
classifier_descr = "Binary classifier for category '%s'@fr. Param file: %s " % (theClass , param_file_name)

In [97]:
# Display the dataset name given when the dataset was created.
test_set.name

'all_classes'

In [83]:
# Start a new graph and describe the classifier in it.
g = helpers.create_graph()

# Two resources are declared: the generic classifier class and the specific
# classifier, which is a subclass of it.
clsfier = URIRef(classifier_type)
clsfier_sp = URIRef(classifier_id)

# Generic classifier: an RDFS class tagged with the vocabulary it predicts.
g.add((clsfier, RDF.type, RDFS.Class))
g.add((clsfier, monalia.vocabID, Literal(classifier_vocab)))

# Specific classifier: subclass link plus its label and description.
g.add((clsfier_sp, RDFS.subClassOf, clsfier))
g.add((clsfier_sp, RDFS.label, Literal(classifier_name)))
g.add((clsfier_sp, RDFS.comment, Literal(classifier_descr)))

In [84]:
# Attach to the specific classifier one blank node per class it can detect.
# Each node is typed with the class's thesaurus term and labelled in French.
# NOTE(review): the predicate is spelled 'conatainsClass'; it is kept as is
# because renaming it would change the generated RDF.
for class_label, term_uri in class_terms.items():

    # Classes without a thesaurus term (None) are not declared.
    if term_uri is None:
        continue

    class_node = BNode()
    g.add((class_node, RDF.type, term_uri))
    g.add((class_node, SKOS.prefLabel, Literal(class_label, lang='fr')))
    g.add((clsfier_sp, monalia.conatainsClass, class_node))
    

In [85]:
# Print the graph built so far, serialized as N3 text.
print(g.serialize(format='n3', encoding='utf-8').decode("utf-8"))

@prefix jcl: <http://jocondelab.iri-research.org/ns/jocondelab/> .
@prefix ml: <http://ns.inria.fr/monalia/> .
@prefix n: <https://jocondelab.iri-research.org/data/notice/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix t: <http://data.culture.fr/thesaurus/resource/ark:/67717/> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ml:classifierRepresentedSubjectBinary a rdfs:Class ;
    ml:vocabID "REPR" .

ml:classifier_mer rdfs:label "mer" ;
    ml:conatainsClass [ a t:T523-618 ;
            skos:prefLabel "mer"@fr ] ;
    rdfs:comment "Binary classifier for category 'mer'@fr. Param file: inception_v3_Joconde_40_classes.mer.1000.4.checkpoint.pth.tar " ;
    rdfs:subClassOf ml:classifierRepresentedSubjectBinary .




In [86]:
# Annotate notices with the classifier's predictions.
#
# For each scored image a blank node typed with the specific classifier is
# attached to the image's notice, and one `detected` node (thesaurus term plus
# score) is added for every predicted class that has a thesaurus term.

# Number of predictions kept per image; scores.shape[1] keeps every class.
top_k = scores.shape[1]

# Number of images after which the graph is captured as text in `sample_RDF`.
preview_size = 3
# Cap on the number of images processed. It defaults to the preview size, which
# reproduces the original 3-image run; set it to None to annotate every image.
max_samples = preview_size

sample_RDF = None

# zip() stops at the shorter of the two sequences, so a scores tensor with fewer
# rows than the dataset cannot cause an out-of-range index.
for i, row in zip(range(scores.shape[0]), test_set.samples):

    if max_samples is not None and i >= max_samples:
        break

    # assumes row[2] is the 'ref' column requested through add_columns — TODO confirm
    ref = row[2]

    classifier_bn = BNode()

    g.add((notice[ref], monalia.imageClassifier, classifier_bn))
    g.add((classifier_bn, RDF.type, clsfier_sp))

    # Highest top_k scores together with the class indices they belong to.
    pred_scores, pred_labels = torch.topk(scores[i], top_k, 0)

    for class_index, score in zip(pred_labels.tolist(), pred_scores.numpy()):
        label = class_terms.index[class_index]
        term = class_terms[label]

        # A class without a thesaurus term cannot be typed, so it is not recorded.
        if term is None:
            continue

        label_key_value = BNode()
        g.add((classifier_bn, monalia.detected, label_key_value))
        g.add((label_key_value, RDF.type, term))
        g.add((label_key_value, monalia.score, Literal(round(score, 4), datatype=XSD.float)))

    if i % 1000 == 0:
        print(i, end=', ')

    # Capture a small text sample of the graph once the preview size is reached.
    if i + 1 == preview_size:
        sample_RDF = g.serialize(format='n3', encoding='utf-8').decode("utf-8")

# Fewer images than preview_size were processed: capture whatever was produced,
# so that sample_RDF is always defined after this cell.
if sample_RDF is None:
    sample_RDF = g.serialize(format='n3', encoding='utf-8').decode("utf-8")

print("Done")
        

0, Done


In [87]:
# Print the N3 text captured in sample_RDF.
print(sample_RDF)

@prefix jcl: <http://jocondelab.iri-research.org/ns/jocondelab/> .
@prefix ml: <http://ns.inria.fr/monalia/> .
@prefix n: <https://jocondelab.iri-research.org/data/notice/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix t: <http://data.culture.fr/thesaurus/resource/ark:/67717/> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ml:classifierRepresentedSubjectBinary a rdfs:Class ;
    ml:vocabID "REPR" .

<https://jocondelab.iri-research.org/data/notice/50170000659> ml:imageClassifier [ a ml:classifier_mer ;
            ml:detected [ a t:T523-618 ;
                    ml:score "0.0337"^^xsd:float ] ] .

<https://jocondelab.iri-research.org/data/notice/50350109897> ml:imageClassifier [ a ml:classifier_mer ;
            ml:detected [ a t:T523-618 ;
                    ml:score "0.1147"^^xsd:float ] ]

## Store the classification results in RDF

In [7]:
# Display the checkpoint path currently configured.
model_checkpoint_file

'../../MonaLIA/output/inception_v3_Joconde_40_classes.test.1000.3.4.checkpoint.pth.tar'

In [59]:
# Write the graph to a file in the results folder, serialized as N3.
results_dir = './Classification Results'
# Create the folder first so that writing does not fail when it does not exist yet.
os.makedirs(results_dir, exist_ok=True)

rdf_file_name = os.path.join(results_dir, 'full_dataset.Inception_v3_Joconde_40_classes.test.1000.3.4.ttl')
g.serialize(destination=rdf_file_name, format='n3', encoding='utf-8')

##### Read the graph if necessary

In [114]:
# Load a previously saved graph from disk into a separate Graph object.
import rdflib
from rdflib import Graph

print('rdflib ver.', rdflib.__version__)

# NOTE(review): this reassigns rdf_file_name (also used as the output path when
# storing the results) and points at a different file, given relative to the
# current working directory.
rdf_file_name = 'humans and horses and birds and dogs.ttl'
g_test = Graph()
g_test.parse(rdf_file_name, format='n3', encoding='utf-8')

<Graph identifier=N21241ded5fd8438bb86790fdbfd1ed59 (<class 'rdflib.graph.Graph'>)>

# Scrapbook

In [103]:
# Scratch check: encode a label containing a non-ASCII character to ASCII bytes,
# replacing that character with a backslash escape.
'voiture à attelage'.encode("ascii",  errors="backslashreplace")

b'voiture \\xe0 attelage'