In [None]:
def download_data(mnist=True, coherent=False, tiny=False):
    """
    Download one of the 6 possible versions of BSCCM dataset

    The archive is looked up through the Dryad v2 API, streamed to disk in
    chunks, extracted into a directory next to it, and the downloaded archive
    is then deleted.

    mnist: download BSCCMNIST (downsized and downsampled version of BSCCM)
    coherent: download BSCCM-coherent or BSCCM-coherent-tiny
    tiny: the tiny version or the full version

    NOTE(review): the three flags above are accepted but not consulted yet --
    the file that gets downloaded is selected by the hard-coded ``file_index``
    below. TODO: map the flags to the matching file once the listing order is
    confirmed.

    Raises requests.HTTPError if any of the API or download requests returns
    an error status.
    """
    location = '/home/hpinkard_waller/2tb_ssd/'
    doi_url = 'doi%3A10.5061%2Fdryad.9pg8d'
    version_index = -1  # last entry of the version listing
    file_index = 1

    # Get the version ID of the dataset
    api_url = "https://datadryad.org/api/v2/"
    versions = requests.get(api_url + 'datasets/{}/versions'.format(doi_url))
    # Fail on an HTTP error here rather than with a KeyError on the JSON below
    versions.raise_for_status()
    version_href = versions.json()['_embedded']['stash:versions'][version_index]['_links']['self']['href']
    # The ID is the final path segment of the href, whichever version was selected
    version_id = version_href.split('/')[-1]

    # Get the URL to download one particular file
    files = requests.get(api_url + 'versions/' + version_id + '/files')
    files.raise_for_status()
    file_info = files.json()['_embedded']['stash:files'][file_index]
    file_name = file_info['path']
    download_url = 'https://datadryad.org' + file_info['_links']['stash:download']['href']
    archive_path = location + file_name

    # Download in chunks (so that really big files can be downloaded)
    chunk_size = 1024 * 1024 * 8
    total_bytes = file_info['size']
    received = 0
    with requests.get(download_url, stream=True) as r:
        r.raise_for_status()
        with open(archive_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                received += len(chunk)
                # Report progress from bytes actually written so the last chunk shows 100%
                percent = min(100.0, 100 * received / total_bytes) if total_bytes else 100.0
                print('Downloading {}, {:.1f}%\r'.format(file_name, percent), end='')
    print('Finished downloading')

    # Extraction directory is the archive name without its '.tar.gz' suffix
    # (assumes the downloaded file is a .tar.gz)
    loc = location + file_name[:-len('.tar.gz')]
    print('Extracting to  {}...'.format(loc))
    # Context manager closes the archive even if extraction fails part-way
    with tarfile.open(archive_path) as archive:
        archive.extractall(loc)
    print('Cleaning up')
    os.remove(archive_path)
    print('Complete')

In [1]:
import os
import tarfile
from shutil import copyfileobj
from tqdm import tqdm

def split_tar_gz(path, file, chunk_size=2**30):
    """
    Split the file at ``path + file`` into consecutively numbered binary chunks.

    Chunks are written to a newly created directory named
    ``path + <file up to its first '.'> + '_chunks/'`` as ``chunk00000.bin``,
    ``chunk00001.bin``, ... Every chunk holds exactly ``chunk_size`` bytes
    except possibly the last one; no empty trailing chunk is written (an empty
    source file yields an empty chunk directory). Progress in bytes is shown
    with a tqdm bar.

    path: directory prefix of the source file; it is joined by plain string
        concatenation, so it must end with a path separator
    file: name of the file to split
    chunk_size: maximum number of bytes per chunk (default 1 GiB)

    Raises ValueError if chunk_size is not positive, and FileExistsError
    (from os.mkdir) if the chunk directory already exists.
    """
    if chunk_size <= 0:
        # A non-positive size could never fill a chunk and would loop forever
        raise ValueError('chunk_size must be a positive number of bytes')

    total_size = os.path.getsize(path + file)

    # make destination directory
    chunk_path = path + file.split('.')[0] + '_chunks/'
    os.mkdir(chunk_path)

    # Copy through a bounded buffer instead of holding a whole chunk in memory
    buffer_size = min(chunk_size, 16 * 2**20)
    chunk_number = 0
    with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
        with open(path + file, 'rb') as src:
            data = src.read(buffer_size)
            # Only open a new chunk file once there is data to put in it
            while data:
                with open('{}chunk{:05d}.bin'.format(chunk_path, chunk_number), 'wb') as dst:
                    written = 0
                    while data:
                        dst.write(data)
                        written += len(data)
                        pbar.update(len(data))
                        if written >= chunk_size:
                            # Chunk is full: prefetch the start of the next one
                            data = src.read(buffer_size)
                            break
                        # Never read past the end of the current chunk
                        data = src.read(min(buffer_size, chunk_size - written))
                chunk_number += 1



In [3]:
# Split every archive in the tars directory into chunks. Only regular files are
# processed: split_tar_gz creates its '<name>_chunks/' output directories inside
# this same directory, so on a re-run os.listdir would also return those
# directories and splitting them would fail.
tars_dir = '/home/hpinkard_waller/data/tars/'
for file in os.listdir(tars_dir):
    if os.path.isfile(tars_dir + file):
        split_tar_gz(path=tars_dir, file=file)

100%|██████████| 513M/513M [00:00<00:00, 834MB/s]
100%|██████████| 211G/211G [05:38<00:00, 624MB/s] 
100%|██████████| 13.2M/13.2M [00:00<00:00, 673MB/s]
100%|██████████| 6.07G/6.07G [00:11<00:00, 542MB/s]
100%|██████████| 486M/486M [00:00<00:00, 813MB/s]
100%|██████████| 26.4G/26.4G [00:45<00:00, 576MB/s]
