# 1. Preprocess-Stage1-Download-TrainData

## Run name

In [1]:
import time

# Identify this run as <project>_<step>_<local timestamp> so that logs and
# artifacts produced by different executions can be told apart.
project_name = 'Google-LandMark-Rec2019'
step_name = '1-Preprocess-Stage1-Download-TrainData'
time_str = time.strftime("%Y%m%d-%H%M%S", time.localtime())
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ', run_name, sep='')

# Wall-clock start; the last cell reports the time elapsed since this point.
t0 = time.time()

run_name: Google-LandMark-Rec2019_1-Preprocess-Stage1-Download-TrainData_20190501-172856


## Important params

In [2]:
import multiprocessing

# Number of numbered shards to fetch; the download loops iterate over
# range(train_tar_count) to build the md5 and tar URLs.
train_tar_count = 500
# Logical CPU count of this machine, printed for reference.
cpu_amount = multiprocessing.cpu_count()

print('train_tar_count:', train_tar_count)
print('cpu_amount: ', cpu_amount)

train_tar_count: 500
cpu_amount:  1


## Import packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os, sys, gc, math, shutil, zipfile, pickle, h5py
import urllib
from tqdm import tqdm
import hashlib

## Project folders

In [4]:
# All project folders live directly under the current working directory.
cwd = os.getcwd()
feature_folder = os.path.join(cwd, 'feature')
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')

# Download targets used by this notebook.
md5sum_folder = os.path.join(input_folder, 'md5sum')
train_tar_folder = os.path.join(input_folder, 'train_tar')

# Create the download targets if they are missing. os.makedirs (rather than
# os.mkdir) also creates the parent `input` folder, so the cell works on a
# fresh checkout where `input` does not exist yet.
for new_folder in (md5sum_folder, train_tar_folder):
    if not os.path.exists(new_folder):
        os.makedirs(new_folder)
        print('create folder:', new_folder)

create folder: /data/landmark-recognition-2019/input/md5sum
create folder: /data/landmark-recognition-2019/input/train_tar


## Functions

In [5]:
def download_file(url, target_folder):
    """Download ``url`` into ``target_folder`` unless the file is already there.

    The local file name is the last ``/``-separated component of ``url``.
    Data is first written to ``<file>.part`` and renamed only after the
    transfer finishes, so an interrupted download never leaves a truncated
    file that a later run would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        target_folder: Existing directory that receives the file.

    Returns:
        True if the file is present in ``target_folder`` when the call
        returns (already existing or downloaded now), False if the download
        failed. Failures are reported with a printed warning, not raised.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    file_name = url.split('/')[-1]
    file_path = os.path.join(target_folder, file_name)
    if os.path.exists(file_path):
        print('File existed, skip it: %s' % file_path)
        return True

    part_path = file_path + '.part'
    try:
        urllib.request.urlretrieve(url, part_path)
        os.replace(part_path, file_path)
    except Exception as err:
        # Best effort: report and carry on. `Exception` (not a bare except)
        # lets KeyboardInterrupt stop a long-running download loop.
        print('Warning: Could not download file {} from {}: {}'.format(file_name, url, err))
        return False
    finally:
        # Drop any partial data left behind by a failed or interrupted transfer.
        if os.path.exists(part_path):
            os.remove(part_path)
    return True


## Download md5sum

In [6]:
%%time
for i in range(train_tar_count):
    url = 'https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_%03d.txt' % i
    download_file(url, md5sum_folder)
    print('Downloaded:', url)

Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_000.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_001.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_002.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_003.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_004.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_005.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_006.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_007.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_008.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_009.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_010.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train

Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_098.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_099.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_100.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_101.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_102.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_103.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_104.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_105.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_106.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_107.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_108.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train

Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_195.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_196.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_197.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_198.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_199.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_200.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_201.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_202.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_203.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_204.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_205.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train

Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_293.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_294.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_295.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_296.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_297.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_298.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_299.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_300.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_301.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_302.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_303.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train

Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_392.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_393.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_394.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_395.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_396.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_397.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_398.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_399.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_400.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_401.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_402.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train

Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_491.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_492.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_493.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_494.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_495.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_496.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_497.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_498.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_499.txt
CPU times: user 4.95 s, sys: 252 ms, total: 5.2 s
Wall time: 38.4 s


In [None]:
# Sanity check: long listing of the md5sum folder with human-readable sizes.
!ls ./input/md5sum -hl

total 2.0M
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_000.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_001.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_002.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_003.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_004.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_005.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_006.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_007.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_008.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_009.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_010.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_011.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_012.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_013.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_014.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.images_015.txt
-rw-r--r-- 1 root root 49 May  1 17:28 md5.i

## Download train tar

In [None]:
%%time
for i in range(train_tar_count):
    url = 'https://s3.amazonaws.com/google-landmark/train/images_%03d.tar' % i
    download_file(url, train_tar_folder)
    print('Downloaded:', url)

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_000.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_001.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_002.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_003.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_004.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_005.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_006.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_007.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_008.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_009.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_010.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_011.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_012.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_112.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_113.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_114.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_115.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_116.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_117.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_118.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_119.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_120.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_121.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_122.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_123.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_124.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_223.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_224.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_225.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_226.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_227.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_228.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_229.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_230.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_231.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_232.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_233.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_234.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_235.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_334.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_335.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_336.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_337.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_338.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_339.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_340.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_341.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_342.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_343.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_344.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_345.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_346.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_445.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_446.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_447.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_448.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_449.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_450.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_451.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_452.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_453.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_454.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_455.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_456.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_457.tar
Downloaded: https://s3.amazonaws.com/g

In [None]:
# Sanity check: long listing of the train_tar folder with human-readable sizes.
!ls ./input/train_tar -hl

total 498G
-rw-r--r-- 1 root root 1018M May  1 17:30 images_000.tar
-rw-r--r-- 1 root root 1015M May  1 17:30 images_001.tar
-rw-r--r-- 1 root root 1019M May  1 17:31 images_002.tar
-rw-r--r-- 1 root root 1019M May  1 17:32 images_003.tar
-rw-r--r-- 1 root root 1022M May  1 17:32 images_004.tar
-rw-r--r-- 1 root root 1019M May  1 17:32 images_005.tar
-rw-r--r-- 1 root root  1.0G May  1 17:33 images_006.tar
-rw-r--r-- 1 root root  1.1G May  1 17:34 images_007.tar
-rw-r--r-- 1 root root 1020M May  1 17:34 images_008.tar
-rw-r--r-- 1 root root 1020M May  1 17:35 images_009.tar
-rw-r--r-- 1 root root 1023M May  1 17:35 images_010.tar
-rw-r--r-- 1 root root 1021M May  1 17:36 images_011.tar
-rw-r--r-- 1 root root 1023M May  1 17:36 images_012.tar
-rw-r--r-- 1 root root 1021M May  1 17:37 images_013.tar
-rw-r--r-- 1 root root 1022M May  1 17:38 images_014.tar
-rw-r--r-- 1 root root 1011M May  1 17:38 images_015.tar
-rw-r--r-- 1 root root 1015M May  1 17:39 images_016.tar
-rw-r--r-- 1 root ro

-rw-r--r-- 1 root root  1.0G May  2 04:49 images_306.tar
-rw-r--r-- 1 root root 1017M May  2 04:52 images_307.tar
-rw-r--r-- 1 root root 1023M May  2 04:54 images_308.tar
-rw-r--r-- 1 root root  1.1G May  2 04:57 images_309.tar
-rw-r--r-- 1 root root 1023M May  2 04:59 images_310.tar
-rw-r--r-- 1 root root 1015M May  2 05:01 images_311.tar
-rw-r--r-- 1 root root  1.1G May  2 05:03 images_312.tar
-rw-r--r-- 1 root root 1017M May  2 05:06 images_313.tar
-rw-r--r-- 1 root root  1.0G May  2 05:08 images_314.tar
-rw-r--r-- 1 root root 1022M May  2 05:11 images_315.tar
-rw-r--r-- 1 root root  1.1G May  2 05:13 images_316.tar
-rw-r--r-- 1 root root 1020M May  2 05:15 images_317.tar
-rw-r--r-- 1 root root 1019M May  2 05:17 images_318.tar
-rw-r--r-- 1 root root 1017M May  2 05:20 images_319.tar
-rw-r--r-- 1 root root 1022M May  2 05:22 images_320.tar
-rw-r--r-- 1 root root 1017M May  2 05:24 images_321.tar
-rw-r--r-- 1 root root  1.1G May  2 05:27 images_322.tar
-rw-r--r-- 1 r

In [None]:
# Report total wall-clock time since t0 was recorded, then repeat the run
# name so the end of the log identifies the run.
elapsed_seconds = time.time() - t0
print('Time elapsed: %.1fs' % elapsed_seconds)
print(run_name)

Time elapsed: 67787.3s
Google-LandMark-Rec2019_1-Preprocess-Stage1-Download-TrainData_20190501-172856
