In [1]:
import gzip
import os
import shutil
import tarfile

import requests
from rdflib import Graph, DCAT

In [2]:
# Version segment of the profile URL requested below.
# TODO: change to a stable version
RIVERBENCH_VERSION = 'dev'
# Name of the RiverBench profile whose metadata is downloaded.
RIVERBENCH_PROFILE = 'flat-triples'
# Common URL prefix; the profile URL is built as
# {RIVERBENCH_BASE_URL}/profiles/{RIVERBENCH_PROFILE}/{RIVERBENCH_VERSION}.
RIVERBENCH_BASE_URL = 'https://w3id.org/riverbench'

# Number of lines written to each output .nt file when a dataset is split.
BATCH_SIZE = 50_000

In [3]:
# Download the profile's metadata, asking the server for Turtle.
r = requests.get(
    f'{RIVERBENCH_BASE_URL}/profiles/{RIVERBENCH_PROFILE}/{RIVERBENCH_VERSION}',
    headers={'Accept': 'text/turtle'},
    # Without a timeout a stalled connection would block the kernel forever.
    timeout=60,
)
# Fail here with a clear HTTP error instead of handing an error page to the
# Turtle parser below.
r.raise_for_status()

# Parse the response body into an RDF graph; the cell displays the number of
# triples it contains.
g = Graph()
g.parse(data=r.text, format='turtle')
len(g)

28

In [10]:
# Collect each distinct object of a dcat:seriesMember triple in the profile
# graph; these are the dataset URLs processed further down.
datasets = [*g.objects(predicate=DCAT.seriesMember, unique=True)]

print('Using the following datasets:')

# One bullet per dataset; print() stringifies the URI exactly as str() would.
for dataset in datasets:
    print(' -', dataset)

Using the following datasets:
 - https://w3id.org/riverbench/datasets/muziekweb/dev
 - https://w3id.org/riverbench/datasets/lod-katrina/dev
 - https://w3id.org/riverbench/datasets/digital-agenda-indicators/dev
 - https://w3id.org/riverbench/datasets/citypulse-traffic/dev
 - https://w3id.org/riverbench/datasets/linked-spending/dev
 - https://w3id.org/riverbench/datasets/dbpedia-live/dev
 - https://w3id.org/riverbench/datasets/assist-iot-weather/dev
 - https://w3id.org/riverbench/datasets/politiquices/dev


In [11]:
# A progress line is printed once every this many batches.
PROGRESS_EVERY = 50
# (connect, read) timeouts in seconds, so a stalled connection raises an
# error instead of blocking the kernel forever.
REQUEST_TIMEOUT = (30, 300)

# For every dataset: stream the gzipped N-Triples dump, split it into files of
# BATCH_SIZE lines each, drop the incomplete final batch, and pack the result
# into data/<name>.tgz.
for dataset in datasets:
    # Dataset URLs have the form .../datasets/<name>/<version>.
    dataset_name = str(dataset).split('/')[-2]
    print(f'Fetching {dataset_name}...')

    dataset_dir = f'data/{dataset_name}'
    archive_path = f'{dataset_dir}.tgz'

    # A non-empty archive means this dataset was already processed. Only a
    # missing file is treated as "not fetched yet"; any other error surfaces.
    try:
        already_fetched = os.stat(archive_path).st_size > 0
    except FileNotFoundError:
        already_fetched = False
    if already_fetched:
        print('  Already fetched')
        print()
        continue

    # Start from an empty directory so batch files left behind by an
    # interrupted earlier run cannot end up in the archive.
    shutil.rmtree(dataset_dir, ignore_errors=True)
    os.makedirs(dataset_dir, exist_ok=True)

    counter = 0
    out_f = None
    with requests.get(
        str(dataset) + '/files/flat_full.nt.gz',
        stream=True,
        timeout=REQUEST_TIMEOUT,
    ) as r:
        # Do not try to gunzip an HTTP error page.
        r.raise_for_status()
        try:
            with gzip.GzipFile(fileobj=r.raw) as gzip_file:
                for line in gzip_file:
                    # Open the next batch file only once there is a line to
                    # put in it, so no empty trailing file is ever created.
                    if counter % BATCH_SIZE == 0:
                        batch_index = counter // BATCH_SIZE
                        f_name = f'{batch_index:05d}'
                        if batch_index % PROGRESS_EVERY == 0:
                            print(f'  Batch {f_name}')

                        if out_f:
                            out_f.close()
                        out_f = open(f'{dataset_dir}/{f_name}.nt', 'wb')
                    out_f.write(line)
                    counter += 1
        finally:
            # Close the current batch file even if the download fails midway.
            if out_f:
                out_f.close()

    print(f'  Fetched {counter} triples')
    # The final batch is kept only if it is full; a partial one is discarded.
    if counter % BATCH_SIZE != 0:
        print('  Removing last batch...')
        os.remove(f'{dataset_dir}/{counter // BATCH_SIZE:05d}.nt')

    print('  Recompressing the dataset')
    # Write under a temporary name and rename only on success, so a failed or
    # interrupted compression never leaves a partial archive that a later run
    # would mistake for a finished one.
    partial_archive_path = archive_path + '.part'
    # compresslevel=6 is the gzip default, which is what `tar czf` uses.
    with tarfile.open(partial_archive_path, 'w:gz', compresslevel=6) as archive:
        archive.add(dataset_dir)
    os.replace(partial_archive_path, archive_path)
    # The uncompressed batches are deleted only after the archive is in place.
    shutil.rmtree(dataset_dir)

    print('  Done')
    print()

Fetching muziekweb...
  Batch 00000
  Batch 00050
  Batch 00100
  Batch 00150
  Batch 00200
  Batch 00250
  Batch 00300
  Batch 00350
  Batch 00400
  Batch 00450
  Batch 00500
  Batch 00550
  Batch 00600
  Batch 00650
  Batch 00700
  Fetched 36195263 triples
  Removing last batch...
  Recompressing the dataset
  Done

Fetching lod-katrina...
  Batch 00000
  Batch 00050
  Batch 00100
  Batch 00150
  Batch 00200
  Batch 00250
  Batch 00300
  Batch 00350
  Batch 00400
  Batch 00450
  Batch 00500
  Batch 00550
  Batch 00600
  Batch 00650
  Batch 00700
  Batch 00750
  Batch 00800
  Batch 00850
  Batch 00900
  Batch 00950
  Batch 01000
  Batch 01050
  Batch 01100
  Batch 01150
  Batch 01200
  Batch 01250
  Batch 01300
  Batch 01350
  Batch 01400
  Batch 01450
  Batch 01500
  Batch 01550
  Batch 01600
  Batch 01650
  Batch 01700
  Batch 01750
  Batch 01800
  Batch 01850
  Batch 01900
  Batch 01950
  Batch 02000
  Batch 02050
  Batch 02100
  Batch 02150
  Batch 02200
  Batch 02250
  Batch 0230