# Objective

* This is intended to be a script that downloads the training data from `cosmic-ai` so that it can be used locally
* It is presented as a notebook to demonstrate how it works
* **Warning:** this takes a very long time to run (over an hour in the run recorded below), and it does not yet have functionality to check for checkpoints or for files that have already been downloaded

In [16]:
import os
import sys

import boto3
import pandas as pd
from tqdm import tqdm
import yaml


# Record which interpreter produced the outputs in this notebook
print(sys.version)

# Read the AWS credentials mapping from the YAML file kept next to the repo's notebooks
with open("../keys/aws_credentials.yaml", "r") as credentials_file:
    credentials = yaml.safe_load(credentials_file)


3.11.8 (v3.11.8:db85d51d3e, Feb  6 2024, 18:02:37) [Clang 13.0.0 (clang-1300.0.29.30)]


In [17]:
# Create the local download directory up front (-p: also creates parents, no error if it exists)
!mkdir -p ..//data/raw/cosmic_data

In [18]:
# Where the data lives remotely and where it is mirrored locally
BUCKET_NAME = 'cosmicai-data' # where data should be
LOCAL_SAVE = "..//data/raw/cosmic_data"

# Authenticate through an explicit boto3 session built from the loaded credentials
session = boto3.Session(
    aws_access_key_id=credentials["aws_access_key_id"],
    aws_secret_access_key=credentials["aws_secret_access_key"],
    region_name=credentials["region"],
)

# S3 client used for listing and downloading objects
s3 = session.client('s3')

In [19]:
# TODO: this should come from utils

def download_s3_bucket(s3, bucket_name :str, local_dir:str = "tmp") -> None:
    """Download every object in an S3 bucket into a local directory.

    The key hierarchy of the bucket is reproduced underneath ``local_dir``.
    Objects are listed through the ``list_objects_v2`` paginator, so buckets
    holding more objects than fit in a single listing response are downloaded
    in full rather than being silently truncated to the first page.

    PARAMS:
        s3: a boto3/botocore S3 client (must provide ``get_paginator`` and
            ``download_file``)
        bucket_name: a valid S3 bucket readable by that client
        local_dir: where the bucket will get copied to; created if missing.
            If not specified, files are dumped into ``tmp/`` relative to where
            this script is called.

    RETURNS:
        None. Progress and completion messages are printed.

    For fine grained control use `s3.download_file` and for a list of valid
    buckets use `s3.list_buckets`.
    """

    # Ensure the local directory exists
    if not os.path.exists(local_dir):
        print(f"creating directory -- {local_dir}")
        os.makedirs(local_dir)

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page. The listing is collected up front so the progress bar has a total.
    paginator = s3.get_paginator("list_objects_v2")
    objects = [
        obj
        for page in paginator.paginate(Bucket=bucket_name)
        for obj in page.get("Contents", [])  # empty buckets/pages have no 'Contents'
    ]

    if not objects:
        print(f"No objects found in bucket {bucket_name}.")
        return

    with tqdm(total=len(objects), desc="Downloading files", unit="file") as pbar:
        for obj in objects:
            key = obj["Key"]
            local_file_path = os.path.join(local_dir, key)

            # Ensure the directory structure exists (also materialises
            # "folder" placeholder keys, which end with '/')
            parent_dir = os.path.dirname(local_file_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Only real objects are downloaded; keys ending in '/' are
            # directory placeholders with nothing to fetch
            if not key.endswith("/"):
                s3.download_file(bucket_name, key, local_file_path)

            # Update the progress bar
            pbar.update(1)
    print("Download complete.")


In [20]:
# Mirror the whole bucket into the local raw-data folder (long-running cell)
download_s3_bucket(s3, bucket_name=BUCKET_NAME, local_dir=LOCAL_SAVE)

Downloading files: 100%|██████████| 1000/1000 [1:07:02<00:00,  4.02s/file]

Download complete.





In [23]:
# Report the on-disk size of the downloaded data, per sub-directory (human-readable units)
!du -h ..//data/raw/cosmic_data

 13G	..//data/raw/cosmic_data/100MB
8.5G	..//data/raw/cosmic_data/10MB
 21G	..//data/raw/cosmic_data
