Grab the latest version of each chip that is currently being worked on.

In [None]:
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import pandas as pd
import rasterio
from pathlib import Path
from tqdm import tqdm
import shutil

In [None]:
# NOTE(review): presumably this makes GDAL (used underneath rasterio/geopandas)
# send unsigned S3 requests, so the s3:// reads in this notebook work without
# AWS credentials -- confirm the bucket is publicly readable.
os.environ["AWS_NO_SIGN_REQUEST"] = "YES"

In [None]:
# Load the image-calc table and reduce it to one row per image name:
#   1. discard rows that have no geometry,
#   2. order rows by image name and then upload date (ascending),
#   3. keep the last row of each image-name group, i.e. the row with the
#      greatest upload_date for that image.
df_image_calc = (
    gpd.read_file('s3://opera-calval-database-dswx/image_calc.geojson')
    .dropna(subset='geometry')
    .sort_values(by=['image_name', 'upload_date'], ascending=True)
    .groupby('image_name')
    .tail(1)
)
df_image_calc.head()

In [None]:
# Count every row of the de-duplicated table. To count only one processing
# level instead, filter first, e.g.:
#n = df_image_calc[df_image_calc.processing_level == 'Intermediate'].shape[0]
n = len(df_image_calc)
f'We have submitted {n} images currently'

# Extract Paths

In [None]:
def format_local_name(row) -> tuple:
    """Pick the S3 key to download for a row and derive its local location.

    ``row['s3_keys']`` is a comma-separated string of keys. When it holds more
    than one key, only keys containing ``'.tif'`` and not containing ``'diff'``
    are kept; if several still remain they are printed and the first is used.

    Parameters
    ----------
    row
        Mapping-like record providing ``'bucket'`` and ``'s3_keys'``.

    Returns
    -------
    tuple
        ``(s3_key, directory, filename)`` where ``directory`` is the bucket
        name joined with the key's parent prefix and ``filename`` is the last
        path component of the key.
    """
    bucket = row['bucket']
    candidates = row['s3_keys'].split(',')

    if len(candidates) > 1:
        candidates = [
            key for key in candidates
            if '.tif' in key and 'diff' not in key
        ]
        if len(candidates) > 1:
            # More than one plausible raster: surface the ambiguity.
            print(candidates)
    s3_key = candidates[0]

    # Split the key once at its final '/' into parent prefix and file name.
    parent, _, filename = s3_key.rpartition('/')
    directory = bucket + '/' + parent
    return s3_key, directory, filename

Source (unpacking a Series of tuples into separate columns): https://stackoverflow.com/questions/22799300/how-to-unpack-a-series-of-tuples-in-pandas

In [None]:
# Call format_local_name once per row (axis=1); each result is the tuple
# (s3_key, directory, filename), so `out` holds one tuple per row.
out = df_image_calc.aggregate(format_local_name, axis=1)
# Expand each tuple into its own columns and attach them to the table under
# the names the download step reads.
df_image_calc[['s3_key', 'directory', 'filename']] = out.apply(pd.Series)
df_image_calc.head()

# Download

In [None]:
def download_one(src_bucket: str,
                 src_key: str,
                 dst_dir_path: str,
                 dst_filename: str) -> str:
    """Copy one raster from S3 to a local file.

    The raster at ``s3://{src_bucket}/{src_key}`` is read fully into memory
    and re-written locally with the same rasterio profile.

    Parameters
    ----------
    src_bucket
        Name of the S3 bucket holding the raster.
    src_key
        Key of the raster inside the bucket.
    dst_dir_path
        Local directory to write into; created (with parents) if missing.
    dst_filename
        File name to give the local copy.

    Returns
    -------
    str
        ``f'{dst_dir_path}/{dst_filename}'`` -- a plain string, not a
        ``Path``; code later in this notebook calls ``.split('/')`` on it.
    """
    Path(dst_dir_path).mkdir(exist_ok=True, parents=True)

    with rasterio.open(f's3://{src_bucket}/{src_key}') as src:
        data = src.read()
        profile = src.profile

    # Build the destination path once and use it for both the write and the
    # return value so the two can never disagree.
    out_path = f'{dst_dir_path}/{dst_filename}'
    with rasterio.open(out_path, 'w', **profile) as dst:
        dst.write(data)
    return out_path

def download_one_from_record(data_record: dict) -> str:
    """Download the raster described by one table record.

    Parameters
    ----------
    data_record
        Mapping with the keys ``'bucket'``, ``'s3_key'``, ``'directory'`` and
        ``'filename'``.

    Returns
    -------
    str
        Whatever ``download_one`` returns for this record, i.e. the local
        path of the written file as a string.

    Raises
    ------
    KeyError
        If any of the four required keys is missing from ``data_record``.
    """
    return download_one(
        data_record['bucket'],
        data_record['s3_key'],
        data_record['directory'],
        data_record['filename'],
    )
    

In [None]:
# Turn the table into one dict per row, then download each row's raster in
# sequence; tqdm wraps the records to show a progress bar.
records = df_image_calc.to_dict(orient='records')
paths = [download_one_from_record(record) for record in tqdm(records)]

In [None]:
# Show the first local path returned by the download step.
paths[0]

In [None]:
# Everything before the first '/' of a downloaded path is taken as the
# top-level download directory; the table is saved there as GeoJSON.
top_dir = paths[0].partition('/')[0]
df_image_calc.to_file(f'{top_dir}/image_calc.geojson', driver='GeoJSON')

# Zip

Zips up the download and removes the original data.

In [None]:
# Create "<top_dir>.zip" from the contents of the download directory.
shutil.make_archive(base_name=top_dir, format='zip', root_dir=top_dir)

In [None]:
# Manual toggle: change True to False to keep the unzipped download directory.
# As written, the whole directory tree under top_dir is deleted.
if True:
    shutil.rmtree(top_dir)