#### If your download is unstable, reduce the number of threads (`DEFAULT_NUM_THREAD`).
You will need boto3 and tqdm to run this script.

In [1]:
import boto3
import os
from typing import Tuple, List
from tqdm import tqdm
import pathlib
import json
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

import numpy as np

# Number of worker threads used by the thread pools below; lower it if
# transfers are unstable.
DEFAULT_NUM_THREAD = 8
# Name of the S3 bucket the S3IO clients in this notebook are created for.
S3_BUCKET = 'origin-ai-medical-data'
# Credentials handed to boto3.client in S3IO.__init__. Fill these in before
# running, and keep real keys out of version control.
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''
# Timestamp pattern with microseconds; not referenced by the code visible here.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'


class S3IO:
    """Helper around a boto3 S3 client bound to a single bucket.

    Provides multi-threaded download of everything under a key prefix and
    multi-threaded upload of a local directory tree.
    """

    def __init__(self, bucket: str) -> None:
        """Create the boto3 client and remember the bucket name.

        Args:
            bucket: Name of the S3 bucket all operations act on.
        """
        self._client = boto3.client(
            's3',
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY
        )

        self._bucket = bucket

    # TODO: might need a lazy loading approach (generator) if dataset is huge
    def _get_uploaded_data(self, prefix: str) -> Tuple[List, List]:
        """List every key under ``prefix``, following pagination.

        Args:
            prefix: Key prefix to list.

        Returns:
            ``(keys, dirs)``: keys not ending in ``/`` and keys ending in
            ``/`` (folder placeholders). Both are empty when nothing matches.
        """
        keys = []
        dirs = []
        # '' means "first page"; None means "no more pages".
        next_token = ''
        base_kwargs = {
            'Bucket': self._bucket,
            'Prefix': prefix,
        }
        while next_token is not None:
            kwargs = base_kwargs.copy()
            if next_token != '':
                kwargs['ContinuationToken'] = next_token
            results = self._client.list_objects_v2(**kwargs)
            # 'Contents' is missing when no object matches the prefix, so
            # fall back to an empty list instead of iterating over None.
            for entry in results.get('Contents') or []:
                k = entry.get('Key')
                if k.endswith('/'):
                    dirs.append(k)
                else:
                    keys.append(k)
            next_token = results.get('NextContinuationToken')
        return keys, dirs

    def _check_and_download(self, key: str,
                            local: str) -> None:
        """Download ``key`` to ``<local>/<key>`` unless that file already exists.

        Args:
            key: S3 object key to fetch.
            local: Local root directory; the key is joined onto it.
        """
        dest_pathname = os.path.join(local, key)
        parent_dir = os.path.dirname(dest_pathname)
        # exist_ok avoids the race between worker threads creating the same
        # folder; a real failure (e.g. permissions) is raised here instead of
        # being swallowed. An empty parent means "current directory".
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Possible corner case: modified local file will still get ignored
        if not os.path.exists(dest_pathname):
            self._client.download_file(self._bucket, key, dest_pathname)

    # Modified solution from
    # https://stackoverflow.com/questions/31918960/boto3-to-download-all-files-from-a-s3-bucket
    def download_directory(self, prefix: str,
                           local: str) -> None:
        """Download every object under ``prefix`` into ``local``.

        The key layout of the bucket is recreated below ``local``. Files that
        already exist locally are skipped.

        Args:
            prefix: S3 key prefix to download.
            local: Local root directory.

        Raises:
            Exception: The first error raised by any worker thread.
        """
        keys, dirs = self._get_uploaded_data(prefix)

        # Recreate folder placeholders so empty folders also appear locally.
        for d in dirs:
            placeholder_dir = os.path.dirname(os.path.join(local, d))
            if placeholder_dir:
                os.makedirs(placeholder_dir, exist_ok=True)

        def download_function(key: str, local_dir: str = local) -> None:
            self._check_and_download(key, local_dir)  # type: ignore

        with ThreadPoolExecutor(
                max_workers=DEFAULT_NUM_THREAD) as executor:
            # Consume the result iterator: executor.map only re-raises a
            # worker's exception when its result is retrieved, so without
            # this failed downloads would go unnoticed.
            list(executor.map(download_function, keys))

    def _upload_file(self, file: str,
                     prefix: str) -> None:
        """Upload ``file`` under a key made from its trailing path components.

        The number of leading ``/``-separated components dropped from ``file``
        equals the number of components in ``prefix`` before its last ``/``.

        Args:
            file: Local file path, ``/``-separated.
            prefix: Path whose component count decides how much of ``file``
                is stripped.
        """
        num_dir_to_skip = len(prefix.split('/')[:-1])

        target_dir = '/'.join(file.split('/')[num_dir_to_skip:])

        self._client.upload_file(file, self._bucket,
                                 target_dir)

    def upload_directory(self, local_prefix: str, cloud_prefix: str,
                         num_local_dir_to_skip: int) -> None:
        """Upload every file below ``local_prefix`` using a thread pool.

        Args:
            local_prefix: Local directory to walk recursively.
            cloud_prefix: String prepended to each resulting key.
            num_local_dir_to_skip: How many leading ``/``-separated components
                to drop from each local path before appending it to
                ``cloud_prefix``. Count the separators in the local path; an
                absolute path starts with an empty component that counts too.
        """
        local_files = [str(file) for file
                       in pathlib.Path(local_prefix + '/').rglob('*')
                       if file.is_file()]

        def upload_function(file: str,
                            prefix_dir: str = cloud_prefix
                            ) -> None:
            local_dir = '/'.join(file.split('/')[num_local_dir_to_skip:])
            self.upload_file(file, prefix_dir + local_dir)

        with ThreadPoolExecutor(
                max_workers=DEFAULT_NUM_THREAD) as executor:
            # list(...) drains the iterator so the progress bar advances and
            # worker exceptions surface.
            list(tqdm(executor.map(
                upload_function, local_files),
                total=len(local_files), leave=False))

    def upload_file(self, local_file_path: str, cloud_file_path: str) -> None:
        """Upload one local file to ``cloud_file_path`` in the bucket."""
        self._client.upload_file(local_file_path, self._bucket,
                                 cloud_file_path)

    @staticmethod
    def download_from_list(file_list, local_target, max_workers=DEFAULT_NUM_THREAD):
        """Download each prefix in ``file_list`` into ``local_target``.

        Declared as a static method because the signature has no ``self``;
        this way it works both as ``S3IO.download_from_list(...)`` and on an
        instance.

        NOTE: the downloads go through the module-level ``s3`` client, not
        through the instance this is called on.

        Args:
            file_list: Sequence of S3 keys/prefixes to download.
            local_target: Local root directory for all downloads.
            max_workers: Number of prefixes processed concurrently.
        """
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            list(tqdm(executor.map(
                s3.download_directory, file_list,
                [local_target] * len(file_list)),
                total=len(file_list)))

# Shared client for S3_BUCKET; download_from_list refers to this global by name.
s3 = S3IO(S3_BUCKET)

ModuleNotFoundError: No module named 'boto3'

### Download files

In [2]:
# import pandas as pd
# df = pd.read_csv("")
# raw_files = df['image_path'].values.tolist()
# print(type(raw_files))
# print(len(raw_files))

# Parse the JSON manifest into a dictionary; the context manager closes the
# file handle, which the previous bare open() never did.
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

# One 'image_path' per entry under the top-level 'data' key
# (presumably the S3 keys to hand to download_from_list; verify).
file_list = [obj['image_path'] for obj in data['data']]
    

<class 'list'>
49209


In [3]:
# raw_files[0]

'tranche_5/extracted_resources/patient_788e7063d0224b9589ac2b1bf12f329f/2fe7984baed04dc891717ee45c91d82d_00470.png'

In [3]:

# Local folder to download into. Intended as the local_target argument of
# download_from_list; each S3 key is joined onto this path when downloading.
download_target = '/home/ubuntu/OH/OH-Classifier-Framework-Basics/first_trimester_data/train'

In [5]:
def download_from_list(file_list, local_target, max_workers=DEFAULT_NUM_THREAD):
    """Download every entry of ``file_list`` into ``local_target``.

    Each entry is passed as a prefix to ``s3.download_directory`` on a worker
    thread, and a tqdm bar tracks how many entries have finished.

    Args:
        file_list: Sequence of S3 keys/prefixes.
        local_target: Local root directory shared by all downloads.
        max_workers: Size of the thread pool.
    """
    entry_count = len(file_list)
    targets = [local_target] * entry_count
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        outcomes = pool.map(s3.download_directory, file_list, targets)
        # Draining the iterator drives the progress bar and re-raises any
        # exception from a worker.
        for _ in tqdm(outcomes, total=entry_count):
            pass

In [None]:
# This starts the download into the specified folder (this can take some time)
# download_from_list(raw_files, download_target)

In [13]:
# import tqdm
# from tqdm import tqdm
# import os
# df = pd.read_csv("/home/ubuntu/MODEL_CATALOG/Download_from_s3/csvs/model_catalog_train_test_val_data.csv")
# df.head()
# for i in tqdm(df['image_path']):
#     if os.path.exists(f"/home/ubuntu/MODEL_CATALOG/OH-classifier-framework/data/{i}"):
#         #print("true")
#         pass
#     else:
#         print("path doest not exist")
#         print(i)



100%|██████████| 49209/49209 [00:00<00:00, 229494.01it/s]


### Download Folder

In [7]:
# Download everything under an S3 prefix into a local folder.
s3_client = S3IO(S3_BUCKET)
# Key prefix inside the bucket (path shortened with '...'; replace with the real one).
s3_path = 'ai-dev/.../dataset_version_2/'
# Local root; each downloaded key is joined onto this path.
local_path = "/.../dataset_folders/"
s3_client.download_directory(s3_path, local_path)

### Upload folder

In [5]:
# Upload a local folder tree to the bucket.
s3_client = S3IO(S3_BUCKET)
# Local folder to upload (path shortened; replace with the real one).
local_file_path = "/hom..../ataset_version_2"
# String prepended to every uploaded key.
cloud_file_path = "ai-dev.../"

# 5 = number of leading '/'-separated components dropped from each local file
# path before it is appended to cloud_file_path; adjust to the real path depth.
s3_client.upload_directory(local_file_path, cloud_file_path,5)

  0%|          | 0/5 [00:00<?, ?it/s]

### get files

In [None]:
import os
import sys
import string
import shutil

def getfiles(path):
    """Yield the file paths to process for ``path``.

    If ``path`` is a directory, every file below it is yielded (recursively,
    joined with its containing folder). Otherwise ``path`` itself is yielded
    once, whether or not it exists.
    """
    if not os.path.isdir(path):
        yield path
        return
    for folder, _subfolders, filenames in os.walk(path):
        yield from (os.path.join(folder, filename) for filename in filenames)

def _copy_flattened(fromdir, destination):
    """Copy every file under ``fromdir`` into ``destination`` with a flat name.

    A file's new name is its path with the leading ``fromdir`` removed and
    every ``/`` replaced by ``_`` (``a/b/c.png`` -> ``a_b_c.png``), so files
    with the same base name in different sub-folders do not overwrite each
    other. Files are copied, not moved; the sources stay in place.

    Args:
        fromdir: Directory to walk (or a single file path).
        destination: Target folder, written with a trailing ``/`` because the
            new file name is appended to it directly.
    """
    # shutil.copy fails when the target folder is missing, so create it first.
    os.makedirs(destination, exist_ok=True)
    destination_root = os.path.abspath(destination) + os.sep
    for f in getfiles(fromdir):
        # The destination may sit inside fromdir (e.g. when walking the
        # current directory); skip files already there so they are not
        # re-copied under ever longer names.
        if os.path.abspath(f).startswith(destination_root):
            continue
        filename = f.replace(fromdir, "", 1).replace("/", "_")
        shutil.copy(f, destination + filename)


destination = "./anomalyimages/"
fromdir = ".//"
_copy_flattened(fromdir, destination)

destination = "./anomalyimages/"
fromdir = "./anomalytctv/anomaly_curation/"
_copy_flattened(fromdir, destination)