In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json
import util

In [2]:
# Directory that holds the JSON result files read by the loading cell below.
dirname = 'data/size/'

# Use a large default canvas for every figure drawn in this notebook.
figure_defaults = {'figure.figsize': (16, 12)}
sns.set(rc=figure_defaults)

In [3]:
# Collect one serialized size per (method, dataset) pair from every result
# file found in `dirname`.
exps = dict()

for filename in sorted(os.listdir(dirname)):
    # os.path.join works whether or not `dirname` ends with a separator.
    path = os.path.join(dirname, filename)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which would
    # otherwise make open() fail.
    if not os.path.isfile(path):
        continue

    with open(path) as fp:
        data = json.load(fp)

    ds_name = util.get_ds_name(data)
    for key, size in data['result']['sizes'].items():
        # Wrap in a single-element list so from_dict builds a one-row frame.
        exps[(key, ds_name)] = [size]

# One-row frame with (method, dataset) tuple columns, unstacked into a Series
# that the next cell turns into a tidy table.
s_exps = pd.DataFrame.from_dict(exps).unstack()

In [4]:
# Flatten the unstacked series into a tidy frame. The column names are
# assigned directly because the `inplace` keyword of `set_axis` was removed in
# pandas 2.0; reset_index() already returns a fresh frame, so nothing else is
# mutated.
dt_exps = s_exps.reset_index()
dt_exps.columns = ['method', 'dataset', 'non', 'size']
# 'non' is the leftover positional row level produced by unstack(); drop it.
dt_exps = dt_exps[['method', 'dataset', 'size']]

# The N3 size of each dataset is the 100 % reference.
baselines = dt_exps[dt_exps.method == 'n3'].set_index('dataset')[['size']]

# Attach the baseline as 'size_base' and express every size relative to it.
dt_exps = dt_exps.join(baselines, on='dataset', rsuffix='_base')
dt_exps['compression'] = 100.0 * dt_exps['size'] / dt_exps['size_base']

dt_exps

Unnamed: 0,method,dataset,size,size_base,compression
0,rdf-xml,identica,2095286,3444964,60.821710
1,turtle,identica,3062054,3444964,88.884935
2,n3,identica,3444964,3444964,100.000000
3,jelly-noprefix-sm,identica,1732210,3444964,50.282383
4,jelly-noprefix-gzip,identica,469190,3444964,13.619591
...,...,...,...,...,...
155,jelly-norepeat,nevada_10m,439123392,2064208968,21.273204
156,jelly-noprefix,nevada_10m,421902060,2064208968,20.438922
157,jena-proto,nevada_10m,2181884788,2064208968,105.700771
158,jelly-norepeat-gzip,nevada_10m,69245400,2064208968,3.354573


In [5]:
# Show only the Jelly variants that are not gzip-compressed.
is_jelly = dt_exps['method'].str.startswith('jelly')
is_gzipped = dt_exps['method'].str.endswith('gzip')
dt_exps[is_jelly & ~is_gzipped]

Unnamed: 0,method,dataset,size,size_base,compression
3,jelly-noprefix-sm,identica,1732210,3444964,50.282383
8,jelly-full,identica,1628865,3444964,47.282497
11,jelly-norepeat,identica,1941671,3444964,56.362592
12,jelly-noprefix,identica,1711259,3444964,49.67422
19,jelly-noprefix-sm,mix,4555910,12305408,37.02364
24,jelly-full,mix,3754377,12305408,30.509976
27,jelly-norepeat,mix,4243368,12305408,34.483765
28,jelly-noprefix,mix,4116881,12305408,33.455868
35,jelly-noprefix-sm,wikipedia,18986766,34183225,55.544104
40,jelly-full,wikipedia,16942469,34183225,49.563694


In [6]:
# Arithmetic and geometric mean of the compression ratio per method, taken
# over all datasets. Named aggregation produces the flat 'mean' / 'gmean'
# columns directly, so no set_axis(..., inplace=False) call is needed (that
# keyword was removed in pandas 2.0). Passing the string 'mean' instead of
# np.mean avoids the FutureWarning raised by pandas >= 2.1.
dt_exps_agg = dt_exps.groupby(by='method')['compression'].agg(
    mean='mean',
    gmean=stats.gmean,
)
dt_exps_agg

Unnamed: 0_level_0,mean,gmean
method,Unnamed: 1_level_1,Unnamed: 2_level_1
jelly-full,27.671808,25.545875
jelly-full-gzip,5.327854,3.405951
jelly-noprefix,30.001991,27.574291
jelly-noprefix-gzip,5.350713,3.438206
jelly-noprefix-sm,31.837259,29.478558
jelly-noprefix-sm-gzip,5.573533,3.688505
jelly-norepeat,31.441705,29.103693
jelly-norepeat-gzip,7.071251,5.486199
jena-proto,108.354843,108.334977
jena-proto-gzip,8.250078,6.705212


In [7]:
# Tab-separated table with one row per dataset and one column for each of
# eri-4k, eri-4k-nodict and hdt-4k.
extra_sizes_path = 'data/sizes_eri_hdt.tsv'
dt_extra = pd.read_csv(extra_sizes_path, sep='\t')
dt_extra

Unnamed: 0,dataset,eri-4k,eri-4k-nodict,hdt-4k
0,identica,8.4,8.0,16.4
1,mix,5.2,5.1,10.6
2,wikipedia,7.5,7.7,13.4
3,aemet-1,1.2,0.8,4.4
4,migr_reschange,0.5,0.5,2.6
5,tour_cap_nuts_3,0.5,0.6,2.6
6,aemet-2,1.1,1.1,3.8
7,petrol,2.9,2.6,9.9
8,flickr_10m,6.6,6.3,14.4
9,nevada_10m,1.5,1.3,4.9


In [8]:
# Arithmetic and geometric mean per column, transposed so that each column of
# dt_extra becomes a row with 'mean' / 'gmean' columns (the same layout as
# dt_exps_agg). `.agg` is the direct API for a list of reducers, and the
# string 'mean' avoids the FutureWarning pandas >= 2.1 emits for np.mean.
extra_columns = ['eri-4k', 'eri-4k-nodict', 'hdt-4k']
dt_extra_agg = dt_extra[extra_columns].agg(['mean', stats.gmean]).T
dt_extra_agg

Unnamed: 0,mean,gmean
eri-4k,3.54,2.234545
eri-4k-nodict,3.4,2.112035
hdt-4k,8.3,6.705993


In [9]:
# Stack both summaries into one table and round every value to two decimals.
pd.concat([dt_exps_agg, dt_extra_agg]).round(2)

Unnamed: 0,mean,gmean
jelly-full,27.67,25.55
jelly-full-gzip,5.33,3.41
jelly-noprefix,30.0,27.57
jelly-noprefix-gzip,5.35,3.44
jelly-noprefix-sm,31.84,29.48
jelly-noprefix-sm-gzip,5.57,3.69
jelly-norepeat,31.44,29.1
jelly-norepeat-gzip,7.07,5.49
jena-proto,108.35,108.33
jena-proto-gzip,8.25,6.71


In [10]:
# Wide view of the compression ratios: one row per method, one column per
# dataset.
dt_pivot = dt_exps.pivot(
    values='compression',
    columns='dataset',
    index='method',
)
dt_pivot

dataset,aemet-1,aemet-2,flickr_10m,identica,migr_reschange,mix,nevada_10m,petrol,tour_cap_nuts3,wikipedia
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
jelly-full,19.404953,22.067569,31.558461,47.282497,15.762003,30.509976,19.165369,25.504277,15.89928,49.563694
jelly-full-gzip,1.800741,1.585588,10.695717,13.746936,0.750982,6.947401,2.27199,4.50701,0.932999,10.039181
jelly-noprefix,18.949152,22.99352,32.83942,49.67422,17.197702,33.455868,20.438922,31.114678,17.490885,55.865542
jelly-noprefix-gzip,1.769707,1.582508,10.405967,13.619591,0.775855,7.144842,2.325314,4.695189,0.937486,10.250674
jelly-noprefix-sm,20.562517,23.371771,33.439573,50.282383,17.099687,37.02364,28.657134,35.010527,17.381259,55.544104
jelly-noprefix-sm-gzip,2.018094,1.749579,10.38531,13.608996,0.804673,7.822569,2.820073,5.28486,0.969331,10.271851
jelly-norepeat,24.313987,24.371531,35.891476,56.362592,18.081784,34.483765,21.273204,28.381949,18.213821,53.042944
jelly-norepeat-gzip,3.82619,2.934659,12.705222,15.328056,2.04461,8.843762,3.354573,6.797645,2.152904,12.724894
jena-proto,110.154057,106.846809,108.291433,109.905764,106.293215,109.509542,105.700771,107.925817,106.285225,112.6358
jena-proto-gzip,5.55978,3.844618,12.875937,18.34292,2.600573,11.377339,4.160474,7.040552,3.083898,13.61469


In [11]:
# Export the method x dataset compression table as a LaTeX file.
latex_path = 'extra/latex/size.tex'
# Create the output directory if needed so the cell also runs on a fresh
# checkout.
os.makedirs(os.path.dirname(latex_path), exist_ok=True)
dt_pivot.to_latex(
    latex_path,
    float_format='%.2f',
    bold_rows=True,
    # Raw string: '\%' is not a valid Python escape sequence and triggers a
    # warning in a normal literal. The emitted text (backslash followed by a
    # percent sign) is unchanged.
    caption=r'Compression Ratio (\%)',
)

  dt_pivot.to_latex(
