We need to download features from youtube8m

In [1]:
"""Downloads YouTube8M Dataset files for a specific partition from a mirror.

This download script will be served from http://data.yt8m.org/download.py. The
partitions are 1/{frame_level,video_level}/{train,validate,test}

To run locally, do:
  cat download.py | partition=2/video/train mirror=us python

Or to download just 1/1000th of the data:
  cat download.py | shard=1,1000 partition=2/video/train mirror=us python
"""

import hashlib
import json
import os
import sys

def md5sum(filename):
    """Computes the MD5 Hash for the contents of `filename`."""
    md5 = hashlib.md5()
    with open(filename, 'rb') as fin:
        for chunk in iter(lambda: fin.read(128 * md5.block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()


In [4]:
# download data
def dwnld(partition, shard, mirror):
    """download tr records we need partition, shards and mirror"""

    partition_parts = partition.split('/')

    assert mirror in {'us', 'eu', 'asia'}
    assert len(partition_parts) == 3
    assert partition_parts[1] in {'video_level', 'frame_level', 'video', 'frame'}
    assert partition_parts[2] in {'train', 'test', 'validate'}

    plan_url = 'http://data.yt8m.org/{}/download_plans/{}_{}.json'.format(partition_parts[0], partition_parts[1], partition_parts[2])

    num_shards = 1
    shard_id = 1
    
    #shard_id, num_shards = os.environ['shard'].split(',')
    shard_id, num_shards = shard.split(',')
    shard_id = int(shard_id)
    num_shards = int(num_shards)
    assert shard_id >= 1
    assert shard_id <= num_shards

    plan_filename = '%s_download_plan.json' % partition.replace('/', '_')

    if os.path.exists(plan_filename):
        print ('Resuming Download ...')
    else:
        print ('Starting fresh download in this directory. Please make sure you '
               'have >2TB of free disk space!')
        os.system('curl %s > %s' % (plan_url, plan_filename))

    download_plan = json.loads(open(plan_filename).read())

    files = [f for f in download_plan['files'].keys()
           if int(hashlib.md5(f.encode('utf-8')).hexdigest(), 16) % num_shards == shard_id - 1]

    print ('Files remaining %i' % len(files))
    for f in files:
        print ('Downloading: %s' % f)
        if os.path.exists(f) and md5sum(f) == download_plan['files'][f]:
            print ('Skipping already downloaded file %s' % f)
            continue

        download_url = 'http://%s.data.yt8m.org/%s/%s' % (mirror, partition, f)
        os.system('curl %s > %s' % (download_url, f))
        if md5sum(f) == download_plan['files'][f]:
            print ('Successfully downloaded %s\n\n' % f)
            del download_plan['files'][f]
            open(plan_filename, 'w').write(json.dumps(download_plan))
        else:
            print ('Error downloading %s. MD5 does not match!\n\n' % f)

    print ('All done. No more files to download.')
    return


In [16]:
partition = '1/video_level/train'
mirror = 'eu'
shard = '10,1000'

os.chdir("./dataset/features")

dwnld(partition, shard, mirror)

os.chdir("../..")

Resuming Download ...
Files remaining 6
Downloading: trainON.tfrecord
Error downloading trainON.tfrecord. MD5 does not match!


Downloading: trainpj.tfrecord
Error downloading trainpj.tfrecord. MD5 does not match!


Downloading: trainSA.tfrecord
Error downloading trainSA.tfrecord. MD5 does not match!


Downloading: trainpI.tfrecord
Error downloading trainpI.tfrecord. MD5 does not match!


Downloading: train8C.tfrecord
Error downloading train8C.tfrecord. MD5 does not match!


Downloading: trainCc.tfrecord
Error downloading trainCc.tfrecord. MD5 does not match!


All done. No more files to download.


In [15]:
os.chdir("./serval")
os.getcwd()

'/home/hugo/git/serval/serval'