In [1]:
import gzip
import os
import shutil
import subprocess

import requests
from rdflib import Graph, DCAT

In [2]:
# RiverBench release whose profile metadata is requested.
RIVERBENCH_VERSION = '2.0.1'
# Name of the profile whose member datasets are downloaded.
RIVERBENCH_PROFILE = 'flat-triples'
# Base IRI from which the profile URL is built.
RIVERBENCH_BASE_URL = 'https://w3id.org/riverbench'

# Number of lines (one triple per line) written to each batch file.
BATCH_SIZE = 50_000

In [6]:
# Download the profile's metadata, asking the server for Turtle via the
# Accept header.
r = requests.get(
    f'{RIVERBENCH_BASE_URL}/v/{RIVERBENCH_VERSION}/profiles/{RIVERBENCH_PROFILE}',
    headers={'Accept': 'text/turtle'},
    # Without a timeout a stalled connection would hang the kernel forever.
    timeout=60,
)
# Fail loudly on an HTTP error instead of feeding an error page to the
# Turtle parser, which would produce a misleading syntax error.
r.raise_for_status()

g = Graph()
g.parse(data=r.text, format='turtle')
# Display the number of triples loaded as the cell's output.
len(g)

43

In [7]:
# Every object of a dcat:seriesMember triple in the profile graph is a
# dataset to fetch; unique=True drops repeated values.
datasets = list(g.objects(predicate=DCAT.seriesMember, unique=True))

# Build the whole report first, then emit it with a single print call.
report_lines = ['Using the following datasets:']
report_lines.extend(' - ' + str(member) for member in datasets)
print('\n'.join(report_lines))

Using the following datasets:
 - https://w3id.org/riverbench/datasets/digital-agenda-indicators/1.0.2
 - https://w3id.org/riverbench/datasets/politiquices/1.0.2
 - https://w3id.org/riverbench/datasets/assist-iot-weather/1.0.2
 - https://w3id.org/riverbench/datasets/lod-katrina/1.0.2
 - https://w3id.org/riverbench/datasets/muziekweb/1.0.2
 - https://w3id.org/riverbench/datasets/citypulse-traffic/1.0.2
 - https://w3id.org/riverbench/datasets/dbpedia-live/1.0.2
 - https://w3id.org/riverbench/datasets/linked-spending/1.0.2


In [8]:
# For every dataset in the profile: stream its gzipped N-Triples dump, split
# it into files of BATCH_SIZE lines each, keep only the full batches, and pack
# them into data/<name>.tar.gz.
for dataset in datasets:
    # The dataset name is the second-to-last path segment of its IRI
    # (the last segment is the version).
    dataset_name = str(dataset.split("/")[-2])
    print(f'Fetching {dataset_name}...')

    dataset_dir = f'data/{dataset_name}'
    archive_path = f'{dataset_dir}.tar.gz'

    # Skip datasets whose archive already exists and is non-empty. Only a
    # filesystem error (typically "file not found") means "not fetched yet";
    # anything else, e.g. KeyboardInterrupt, must propagate.
    try:
        already_fetched = os.stat(archive_path).st_size > 0
    except OSError:
        already_fetched = False
    if already_fetched:
        print('  Already fetched')
        print()
        continue

    os.makedirs(dataset_dir, exist_ok=True)
    counter = 0
    out_f = None

    # The context manager releases the connection even if splitting fails.
    with requests.get(
        str(dataset) + '/files/flat_full.nt.gz',
        stream=True,
        # Without a timeout a stalled connection would hang the kernel forever.
        timeout=60,
    ) as r:
        # Surface HTTP errors directly rather than as a gzip decoding failure.
        r.raise_for_status()
        gzip_file = gzip.GzipFile(fileobj=r.raw)
        try:
            for line in gzip_file:
                # A new batch file is opened only once there is a line to put
                # in it, so no empty trailing file is left behind when the
                # triple count is zero or an exact multiple of BATCH_SIZE.
                if counter % BATCH_SIZE == 0:
                    f_name = f'{counter // BATCH_SIZE:05d}'
                    # Report progress every 50 batches only, to limit output.
                    if (counter // BATCH_SIZE) % 50 == 0:
                        print(f'  Batch {f_name}')

                    if out_f:
                        out_f.close()
                    out_f = open(f'{dataset_dir}/{f_name}.nt', 'wb')
                out_f.write(line)
                counter += 1
        finally:
            # Close the current batch file even when the download fails.
            if out_f:
                out_f.close()

    print(f'  Fetched {counter} triples')
    # The final batch is incomplete unless the count is an exact multiple of
    # BATCH_SIZE; drop it so every remaining file has exactly BATCH_SIZE lines.
    if counter % BATCH_SIZE != 0:
        print('  Removing last batch...')
        os.remove(f'{dataset_dir}/{counter // BATCH_SIZE:05d}.nt')

    print('  Recompressing the dataset')
    # Write to a temporary name and rename on success, so an interrupted or
    # failed tar run is never mistaken for a finished archive by the
    # "already fetched" check above. The argument list avoids shell
    # interpolation and check=True stops before the batches are deleted if
    # tar fails.
    partial_archive = archive_path + '.part'
    subprocess.run(['tar', 'czf', partial_archive, dataset_dir], check=True)
    os.replace(partial_archive, archive_path)
    shutil.rmtree(dataset_dir)

    print('  Done')
    print()

Fetching digital-agenda-indicators...
  Batch 00000
  Batch 00050
  Batch 00100
  Batch 00150
  Batch 00200
  Fetched 11669016 triples
  Removing last batch...
  Recompressing the dataset
  Done

Fetching politiquices...
  Batch 00000
  Fetched 159957 triples
  Removing last batch...
  Recompressing the dataset
  Done

Fetching assist-iot-weather...
  Batch 00000
  Batch 00050
  Batch 00100
  Batch 00150
  Batch 00200
  Batch 00250
  Batch 00300
  Batch 00350
  Batch 00400
  Batch 00450
  Batch 00500
  Batch 00550
  Batch 00600
  Batch 00650
  Batch 00700
  Batch 00750
  Batch 00800
  Batch 00850
  Batch 00900
  Batch 00950
  Batch 01000
  Batch 01050
  Batch 01100
  Batch 01150
  Batch 01200
  Batch 01250
  Batch 01300
  Batch 01350
  Batch 01400
  Batch 01450
  Batch 01500
  Batch 01550
  Batch 01600
  Fetched 80646970 triples
  Removing last batch...
  Recompressing the dataset
  Done

Fetching lod-katrina...
  Batch 00000
  Batch 00050
  Batch 00100
  Batch 00150
  Batch 00200
  Ba