# 1. Preprocess-Stage1-Download-TrainData

## Run name

In [1]:
import time

# Identify this notebook run: project + step + local wall-clock timestamp.
project_name = 'Google-LandMark-Rec2019'
step_name = '1-Preprocess-Stage1-Download-TrainData'
# With no explicit time tuple, strftime formats the current local time.
time_str = time.strftime("%Y%m%d-%H%M%S")
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ' + run_name)
# Start of the run; the last cell subtracts this to report elapsed time.
t0 = time.time()

run_name: Google-LandMark-Rec2019_1-Preprocess-Stage1-Download-TrainData_20190421-133825


## Important params

In [2]:
import multiprocessing

# Number of training tar shards to fetch; the download loops iterate
# over range(train_tar_count).
train_tar_count = 500
# Logical CPU count reported by the host (printed for reference).
cpu_amount = multiprocessing.cpu_count()

print('train_tar_count:', train_tar_count)
print('cpu_amount: ', cpu_amount)

train_tar_count: 500
cpu_amount:  1


## Import PKGs

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os, sys, gc, math, shutil, zipfile, pickle, h5py
import urllib
from tqdm import tqdm
import hashlib

## Project folders

In [4]:
# All project folders live under the directory the kernel was started in.
cwd = os.getcwd()
feature_folder = os.path.join(cwd, 'feature')
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')

# Download targets used by this notebook.
md5sum_folder = os.path.join(input_folder, 'md5sum')
train_tar_folder = os.path.join(input_folder, 'train_tar')

for _download_folder in (md5sum_folder, train_tar_folder):
    if not os.path.exists(_download_folder):
        # makedirs (not mkdir) so a missing parent `input` folder is created
        # too instead of raising FileNotFoundError on a fresh checkout.
        os.makedirs(_download_folder, exist_ok=True)
        print('create folder:', _download_folder)

## Functions

In [5]:
def download_file(url, target_folder):
    """Download `url` into `target_folder`, skipping files that already exist.

    The local file name is the last path segment of `url`. The payload is
    first written to a ``<name>.part`` sibling and renamed into place only
    after the transfer completes, so an interrupted or failed download never
    leaves a truncated file that a later run would skip as "existed".

    Args:
        url: Address of the file to fetch.
        target_folder: Existing directory that receives the file.

    Returns:
        True if the file is present afterwards (already existed or was
        downloaded), False if the download failed. Failures are reported
        with a printed warning rather than raised.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    file_name = url.split('/')[-1]
    file_path = os.path.join(target_folder, file_name)
    if os.path.exists(file_path):
        print('File existed, skip it: %s' % file_path)
        return True

    partial_path = file_path + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
        return True
    except Exception as err:
        # Best effort: report and carry on so one bad URL does not abort a
        # long batch. KeyboardInterrupt is deliberately not caught here.
        print('Warning: Could not download file {} from {}: {}'.format(file_name, url, err))
        return False
    finally:
        # Remove leftovers from a failed or interrupted transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)


## Download md5sum

In [6]:
%%time
for i in range(train_tar_count):
    url = 'https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_%03d.txt' % i
    download_file(url, md5sum_folder)
    print('Downloaded:', url)

File existed, skip it: /data/landmark-recognition-2019/input/md5sum/md5.images_000.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_000.txt
File existed, skip it: /data/landmark-recognition-2019/input/md5sum/md5.images_001.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_001.txt
File existed, skip it: /data/landmark-recognition-2019/input/md5sum/md5.images_002.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_002.txt
File existed, skip it: /data/landmark-recognition-2019/input/md5sum/md5.images_003.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_003.txt
File existed, skip it: /data/landmark-recognition-2019/input/md5sum/md5.images_004.txt
Downloaded: https://s3.amazonaws.com/google-landmark/md5sum/train/md5.images_004.txt
File existed, skip it: /data/landmark-recognition-2019/input/md5sum/md5.images_005.txt
Downloaded: https://s3.amazonaws.com/google-landmark/

In [7]:
# List the md5sum download folder (path is relative to the kernel's working directory).
!ls ./input/md5sum -hl

total 63M
-rw-rw-r-- 1 ubuntu ubuntu 61M Apr 21 10:10 images_000.tar
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_000.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_001.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_002.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_003.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_004.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_005.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_006.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_007.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_008.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_009.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_010.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_011.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_012.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Apr 21 10:36 md5.images_013.txt
-rw-rw-r-- 1 ubuntu ubuntu  49 Ap

## Download train tars

In [8]:
%%time
for i in range(train_tar_count):
    url = 'https://s3.amazonaws.com/google-landmark/train/images_%03d.tar' % i
    download_file(url, train_tar_folder)
    print('Downloaded:', url)

File existed, skip it: /data/landmark-recognition-2019/input/train_tar/images_000.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_000.tar
File existed, skip it: /data/landmark-recognition-2019/input/train_tar/images_001.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_001.tar
File existed, skip it: /data/landmark-recognition-2019/input/train_tar/images_002.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_002.tar
File existed, skip it: /data/landmark-recognition-2019/input/train_tar/images_003.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_003.tar
File existed, skip it: /data/landmark-recognition-2019/input/train_tar/images_004.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_004.tar
File existed, skip it: /data/landmark-recognition-2019/input/train_tar/images_005.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_005.tar
File existed, skip it: /data/landmark-re

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_089.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_090.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_091.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_092.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_093.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_094.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_095.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_096.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_097.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_098.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_099.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_100.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_101.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_200.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_201.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_202.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_203.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_204.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_205.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_206.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_207.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_208.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_209.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_210.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_211.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_212.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_311.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_312.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_313.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_314.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_315.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_316.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_317.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_318.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_319.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_320.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_321.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_322.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_323.tar
Downloaded: https://s3.amazonaws.com/g

Downloaded: https://s3.amazonaws.com/google-landmark/train/images_422.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_423.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_424.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_425.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_426.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_427.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_428.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_429.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_430.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_431.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_432.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_433.tar
Downloaded: https://s3.amazonaws.com/google-landmark/train/images_434.tar
Downloaded: https://s3.amazonaws.com/g

In [9]:
# List the train tar download folder (path is relative to the kernel's working directory).
!ls ./input/train_tar -hl

total 498G
-rw-rw-r-- 1 ubuntu ubuntu 1018M Apr 21 10:38 images_000.tar
-rw-rw-r-- 1 ubuntu ubuntu 1015M Apr 21 10:40 images_001.tar
-rw-rw-r-- 1 ubuntu ubuntu 1019M Apr 21 10:41 images_002.tar
-rw-rw-r-- 1 ubuntu ubuntu 1019M Apr 21 10:42 images_003.tar
-rw-rw-r-- 1 ubuntu ubuntu 1022M Apr 21 10:43 images_004.tar
-rw-rw-r-- 1 ubuntu ubuntu 1019M Apr 21 10:44 images_005.tar
-rw-rw-r-- 1 ubuntu ubuntu  1.0G Apr 21 10:46 images_006.tar
-rw-rw-r-- 1 ubuntu ubuntu  1.1G Apr 21 10:47 images_007.tar
-rw-rw-r-- 1 ubuntu ubuntu 1020M Apr 21 10:48 images_008.tar
-rw-rw-r-- 1 ubuntu ubuntu 1020M Apr 21 10:49 images_009.tar
-rw-rw-r-- 1 ubuntu ubuntu 1023M Apr 21 10:51 images_010.tar
-rw-rw-r-- 1 ubuntu ubuntu 1021M Apr 21 10:52 images_011.tar
-rw-rw-r-- 1 ubuntu ubuntu 1023M Apr 21 10:53 images_012.tar
-rw-rw-r-- 1 ubuntu ubuntu 1021M Apr 21 10:55 images_013.tar
-rw-rw-r-- 1 ubuntu ubuntu 1022M Apr 21 10:56 images_014.tar
-rw-rw-r-- 1 ubuntu ubuntu 1011M Apr 21 10:57 images_015.tar
-rw-rw-r-- 1 

In [10]:
# Report wall-clock duration since t0 was recorded in the first cell,
# followed by the run identifier.
elapsed_seconds = time.time() - t0
print('Time elapsed: %.1fs' % elapsed_seconds)
print(run_name)

Time elapsed: 57234.1s
