# MonaLIA Get and Save the image size data for Joconde images

This script provides a tool to retrieve and link the actual image sizes from the Joconde image set with the Joconde KB.

It only has to be ran if the new images become available.

It saves the image sizes in two formats flat csv file and RDF N3 file.

If the image files that are referenced by Joconde KB are not present in the image set at the path specified then their width and height are set to 0.

In [87]:
from __future__ import print_function

import os
from PIL import Image

import numpy as np
import pandas as pd

from rdflib import Graph, URIRef, BNode, Literal
from rdflib import RDF, RDFS, XSD
from rdflib.namespace import Namespace, NamespaceManager
from rdflib.plugins import sparql as SPARQL

import json
from SPARQLWrapper import SPARQLWrapper, JSON, N3, XML, POST

In [25]:
jcl  = Namespace('http://jocondelab.iri-research.org/ns/jocondelab/')
ns = Namespace("https://jocondelab.iri-research.org/data/notice/")
monalia = Namespace("http://ns.inria.fr/monalia/")

def get_sparql_dataframe_service(service, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    sparql = SPARQLWrapper(service)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)

    return pd.DataFrame(out, columns=cols)

def get_sparql_dataframe_graph(graph, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    q =  SPARQL.prepareQuery(query, initNs = { 'monalia': monalia, 'skos': SKOS, 'jcl': jcl })

    res = graph.query(q) 
   
    print('query returned %d entries' % len(res))

    cols = pd.Series(res.vars).apply(str).values

    out = []
    for row in res:
        item = []
        for col in cols:
            item.append(str(row[col]))
        out.append(item)

    return pd.DataFrame(out, columns=cols)

### Specify local service

In [6]:
wds = "http://localhost:3030/Joconde/query"

### SPARQL Query String 
to retrieve all the main images from the Joconde KB

In [7]:
qs = '''
prefix skos: <http://www.w3.org/2004/02/skos/core#> 
prefix jcl: <http://jocondelab.iri-research.org/ns/jocondelab/>

select ?imagePath ?noticeReference where { 
?notice jcl:noticeHasImage true.               
?notice jcl:noticeImage [ jcl:noticeImageIsMain true ; jcl:noticeImagePath ?imagePath].
?notice jcl:noticeRef ?noticeReference.
}
'''

### Run Query

In [63]:
image_set_df = get_sparql_dataframe_service(wds, qs)

In [64]:
col_names = ['imagePath', 'ref' ]
image_set_df.columns = col_names

print(image_set_df.shape)
image_set_df.head()

(298511, 2)


Unnamed: 0,imagePath,ref
0,/0059/m079057_0001758_p.jpg,AG021125
1,/0059/m079057_0001663_p.jpg,AG021121
2,/0059/m079057_0002134_p.jpg,AG019906
3,/0059/m079057_0002122_p.jpg,AG019903
4,/0059/m079057_0001895_p.jpg,AG019754


### Read Saved Size Data

In [None]:
use_rdf_data = False

#### From Flat File

In [14]:
csv_file_name = '..\Joconde\main_image_size.csv'
image_size_df = pd.read_csv(csv_file_name)

print(image_size_df.shape)
image_size_df.head()

Wall time: 0 ns
(157239, 5)


Unnamed: 0.1,Unnamed: 0,ref,imagePath,width,height
0,0,01370001071,/0332/m013704_0004737_p.jpg,398,512
1,1,50350039685,/0122/m503501_d0039685-000_p.jpg,341,512
2,2,01720008704,/0372/m017201_0004127_p.jpg,400,512
3,3,M0435000598,/0407/m043501_00601_p.jpg,683,512
4,4,000PE011138,/0638/m507704_94de51048_p.jpg,496,600


#### From RDF file

In [19]:
g = Graph()
g.parse('../Joconde/main_image_size.ttl', format='n3', encoding='utf-8')

<Graph identifier=N0547ae546a634b08acbf3a64a0c1d19d (<class 'rdflib.graph.Graph'>)>

In [38]:
qs1 = '''
prefix monalia: <http://ns.inria.fr/monalia/>
prefix jcl: <http://jocondelab.iri-research.org/ns/jocondelab/>
prefix : <https://jocondelab.iri-research.org/data/notice/>

select ?notice ?imagePath ?width ?height where
{

    ?notice monalia:noticeImage [ monalia:imagePath ?imagePath; monalia:imageWidth ?width; monalia:imageHeight ?height ].
}
'''
if (use_rdf_data):
    image_size_df = get_sparql_dataframe_graph(g, qs1)

Wall time: 0 ns
query returned 157239 entries


In [57]:
if (use_rdf_data):
    print(image_size_df.shape)
    image_size_df['ref'] = image_size_df.notice.str.split('/', expand=True).iloc[:,5] 

    image_size_df.head()

(157239, 5)


Unnamed: 0,notice,imagePath,width,height,ref
0,https://jocondelab.iri-research.org/data/notic...,/0654/m500202_atpico060188_p.jpg,800,542,5002E011527
1,https://jocondelab.iri-research.org/data/notic...,/0427/m075903_017366_p.jpg,341,512,M0759003245
2,https://jocondelab.iri-research.org/data/notic...,/0632/m074801_0017843_p.jpg,449,600,07480002675
3,https://jocondelab.iri-research.org/data/notic...,/0532/m501104_92-001732_p.jpg,474,600,50110000862
4,https://jocondelab.iri-research.org/data/notic...,/0332/m013704_0008579_p.jpg,768,493,01370001897


### Merge KB image data and image size data

In [67]:
image_set_df = pd.merge(image_set_df,
        image_size_df[['ref', 'width', 'height']],
        on='ref',
        how='left')
print(image_set_df.shape)
image_set_df.head()

(298511, 8)


Unnamed: 0,imagePath,ref,width_x,height_x,width_y,height_y,width,height
0,/0059/m079057_0001758_p.jpg,AG021125,,,,,,
1,/0059/m079057_0001663_p.jpg,AG021121,,,,,,
2,/0059/m079057_0002134_p.jpg,AG019906,,,,,,
3,/0059/m079057_0002122_p.jpg,AG019903,,,,,,
4,/0059/m079057_0001895_p.jpg,AG019754,,,,,,
5,/0059/m079057_0001802_p.jpg,AG019707,213.0,487.0,213.0,487.0,213.0,487.0
6,/0059/m079057_0001793_p.jpg,AG019699,,,,,,
7,/0059/m079057_0001790_p.jpg,AG019698,,,,,,
8,/0059/m079057_0001785_p.jpg,AG019695,,,,,,
9,/0059/m079057_0001783_p.jpg,AG019693,,,,,,


#### List the unseen images

In [73]:
image_set_df[image_set_df.width.isnull()]

Unnamed: 0,imagePath,ref,width_x,height_x,width_y,height_y,width,height
0,/0059/m079057_0001758_p.jpg,AG021125,,,,,,
1,/0059/m079057_0001663_p.jpg,AG021121,,,,,,
2,/0059/m079057_0002134_p.jpg,AG019906,,,,,,
3,/0059/m079057_0002122_p.jpg,AG019903,,,,,,
4,/0059/m079057_0001895_p.jpg,AG019754,,,,,,
6,/0059/m079057_0001793_p.jpg,AG019699,,,,,,
7,/0059/m079057_0001790_p.jpg,AG019698,,,,,,
8,/0059/m079057_0001785_p.jpg,AG019695,,,,,,
9,/0059/m079057_0001783_p.jpg,AG019693,,,,,,
10,/0059/m079057_0001781_p.jpg,AG019692,,,,,,


#### Open the unseen images and read their size

In [95]:
new_count = 0
error_count = 0

image_root = 'C:/Joconde/joconde'

for i, row in image_set_df[image_set_df.width.isnull()].iterrows():
    
    if os.path.isfile(image_root + row.imagePath) :

        try:
            image = Image.open(image_root + row.imagePath)
            
            # get image size
            image_set_df.loc[i, ['width']] = image.size[0]
            image_set_df.loc[i, ['height']] = image.size[1]

            image.close()
            new_count += 1
            
        except:
            error_count += 1
    else:
        image_set_df.loc[i, ['width']] = 0
        image_set_df.loc[i, ['height']] = 0
        
    if i % 1000 == 0:
        print (i, end=', ')
        

print()            
print('Tried to read %d files; detected %d errors;  %d new records' %  (i, error_count, new_count))
            

1000, 2000, 8000, 10000, 
Tried to read 294922 files; detected 0 errors;  0 new records


#### View the updates

In [112]:
image_set_df[image_set_df.width_x.isnull()].head()

Unnamed: 0,imagePath,ref,width_x,height_x,width_y,height_y,width,height
0,/0059/m079057_0001758_p.jpg,AG021125,,,,,215,481
1,/0059/m079057_0001663_p.jpg,AG021121,,,,,183,512
2,/0059/m079057_0002134_p.jpg,AG019906,,,,,170,488
3,/0059/m079057_0002122_p.jpg,AG019903,,,,,204,502
4,/0059/m079057_0001895_p.jpg,AG019754,,,,,197,512


### Save Updated Dataset

#### To flat file

In [104]:
csv_file_name = '..\Joconde\main_image_size.csv'

image_set_df[['ref' ,'imagePath', 'width', 'height']].drop_duplicates().to_csv(csv_file_name, mode='w')

#### To RDF file

In [109]:
rdf_file_name = '..\Joconde\main_image_size.ttl'

g.namespace_manager = NamespaceManager(Graph())
g.namespace_manager.bind('jcl', jcl, override=False)
g.namespace_manager.bind('monalia', monalia, override=False)

new_count = 0

for i, row in image_set_df[image_set_df.width_x.isnull()].drop_duplicates(subset=['ref' ,'imagePath', 'width', 'height'] ).iterrows():
    
    ref = row.ref
    
    image_size_bn = BNode()
    
    g.add( (ns[ref], monalia.noticeImage, image_size_bn))
    
    g.add( (image_size_bn, monalia.imagePath,            Literal(row.imagePath)))
    g.add( (image_size_bn, monalia.imageWidth,           Literal(row.width, datatype=XSD.int)))
    g.add( (image_size_bn, monalia.imageHeight,          Literal(row.height, datatype=XSD.int)))
    
    new_count += 1
    
    if i % 1000 == 0:
        print (i, end=', ')
          
    
#print(g.serialize(format='n3', encoding='utf-8').decode("utf-8"))
g.serialize(destination=rdf_file_name, format='n3', encoding='utf-8')

0, 1000, 2000, 8000, 10000, 40000, 41000, 42000, 44000, 54000, 55000, 56000, 57000, 58000, 61000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 71000, 74000, 76000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 98000, 99000, 100000, 102000, 105000, 106000, 107000, 108000, 109000, 110000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 140000, 141000, 142000, 144000, 145000, 146000, 147000, 149000, 150000, 151000, 154000, 155000, 157000, 158000, 161000, 162000, 163000, 164000, 165000, 166000, 167000, 174000, 178000, 179000, 180000, 181000, 182000, 183000, 184000, 185000, 186000, 189000, 191000, 192000, 193000, 194000, 198000, 200000, 201000, 202000, 203000, 209000, 211000, 213000, 214000, 215000, 216000, 218000, 219000, 220000, 225000, 23400

In [110]:
new_count

141272

### Alternative Way to read the image size
In this implementation only works for JPEG images but can be extended

In [None]:
#function to read the size of the JPEG inage w/o Pillow library 
# there is no benefit of using it.
# it seems that the Pillow uses the same loading technique
# not really worth using this one

def get_jpeg_size(file_path):
    with open(file_path, "rb") as input:
        h = -1
        w = -1
        data = input.read(2)
        if data.startswith(b'\377\330'):
            b1 = data[1]
            b2 = input.read(1)[0]
            
            try:
                while (b1 != 0xDA):

                    while (b1 != 0xFF or (b2 != 0xC0 and b2 != 0xC2)):
                        b1 = b2
                        b2 = input.read(1)[0]

                    input.read(3)
                    h = int.from_bytes(input.read(2), byteorder='big', signed=False)
                    w = int.from_bytes(input.read(2), byteorder='big', signed=False)
                    break
            except Exception as e:
                raise Exception(e.__class__.__name__ + file_path)
        return  (w, h)
                
                