## Preprocess ImageNet

In [1]:
import glob
import numpy as np
import pickle as pkl
import torchvision.transforms as transforms
import json

# Per-channel (R, G, B) mean and standard deviation applied to the image
# tensor as the final step of the pipeline below.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Preprocessing pipeline, applied in order: random 224-pixel resized crop,
# random horizontal flip, conversion to a tensor, then normalization.
# The first two steps are random, so repeated runs produce different outputs.
transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]
)

In [2]:
import os
import glob
import concurrent
import multiprocessing
import pandas as pd
from PIL import Image
from multiprocessing import Pool, cpu_count
import pickle

# Directory scanned for input images; the glob pattern for `img_paths` is built from it.
src_dir = "/home/cc/CNDLSysData/Imagenet-Mini"

def convert_img(chunk, image_paths):
    """Transform each image and save the result beside it as a ``.npy`` file.

    Every image is opened with PIL, converted to RGB, passed through the
    module-level ``transform`` and written with ``np.save`` to the source
    path with its file extension replaced by ``.npy``.

    Args:
        chunk: Index of the batch being processed. It is not used; it is
            accepted because the caller passes ``enumerate(...)`` pairs
            through ``Pool.starmap``.
        image_paths: Iterable of image file paths to convert.
    """
    for image_path in image_paths:
        with Image.open(image_path) as img:
            # Rebind to the RGB-converted image; that object is what gets
            # transformed once the file handle has been closed.
            img = img.convert("RGB")
        img = transform(img)
        # Swap only the extension. A plain str.replace('JPEG', 'npy') would
        # also rewrite "JPEG" appearing in a directory name or file stem and
        # point the output at a wrong (possibly non-existent) location.
        new_img_path = os.path.splitext(image_path)[0] + '.npy'
        np.save(new_img_path, img)

# Every *.JPEG file found at <src_dir>/<any dir>/samples/<any dir>/.
img_paths = glob.glob(f"{src_dir}/*/samples/*/*.JPEG")

def images(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: Any sliceable sequence supporting ``len()``.
        n: Slice length; the final slice may be shorter.

    Yields:
        ``lst[k:k + n]`` for k = 0, n, 2n, ... while k < len(lst).
    """
    offsets = range(0, len(lst), n)
    yield from (lst[lo:lo + n] for lo in offsets)
        
# Convert all images in parallel, one batch of paths per task.
with Pool(cpu_count()) as p:
    # Ceiling division, clamped to at least 1:
    #  * plain floor division yields 0 when there are fewer images than CPUs
    #    (or none at all), which makes range() in `images` raise ValueError;
    #  * rounding up keeps the number of batches <= cpu_count(), so there is
    #    no leftover batch.
    num_files_per_process = max(1, -(-len(img_paths) // cpu_count()))
    img_files = list(images(img_paths, num_files_per_process))
    # starmap unpacks each (batch_index, paths) pair into convert_img's arguments.
    p.starmap(convert_img, enumerate(img_files))

## Upload Data to S3

In [8]:
import boto3
import glob
import pickle
import multiprocessing
import concurrent.futures
import os


# Name of the destination S3 bucket for every upload in this section.
bucket = 'vuzhuangwei'

# boto3 session created without explicit arguments, and the S3 client
# derived from it; the client is shared by all upload calls below.
session = boto3.Session()
s3 = session.client("s3")

def preprocess(path):
    """Upload one local file to the module-level ``bucket``.

    The object key is ``path`` with the local data-root prefix removed; the
    key is printed before the upload starts.

    Args:
        path: Local filesystem path of the file to upload.
    """
    local_root = '/home/cc/CNDLSysData/'
    object_key = path.replace(local_root, '')
    print(object_key)
    s3.upload_file(Filename=path, Bucket=bucket, Key=object_key)
 
def upload_objects(folder):
    """Upload a single file, or every entry two levels below a directory.

    If ``folder`` is a directory, each path matching ``<folder>/*/*`` is
    uploaded; otherwise ``folder`` itself is treated as the one file to
    upload. Uploads run concurrently on a thread pool with one worker per
    CPU, each handled by ``preprocess``.

    Args:
        folder: Directory to scan, or path of a single file.

    Raises:
        Exception: The first exception raised by any upload is re-raised
            once all uploads have finished, so failures are not silently
            dropped.
    """
    if os.path.isdir(folder):
        imgs = glob.glob('{}/*/*'.format(folder))
    else:
        imgs = [folder]

    with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = [executor.submit(preprocess, path) for path in imgs]
    # Leaving the `with` block waits for every submitted upload to finish.

    # wait() alone never looks at a future's outcome; result() re-raises
    # whatever the worker raised, surfacing failed uploads to the caller.
    for future in futures:
        future.result()

# Upload the contents of the Imagenet-Mini-Numpy directory; each uploaded key is printed as it is processed.
upload_objects("/home/cc/CNDLSysData/Imagenet-Mini-Numpy")

Imagenet-Mini-Numpy/val/0.tar.gz
Imagenet-Mini-Numpy/train/1.tar.gz
Imagenet-Mini-Numpy/train/0.tar.gz
Imagenet-Mini-Numpy/train/2.tar.gz
Imagenet-Mini-Numpy/train/3.tar.gz
