# Product Confirmation Workflow

This notebook downloads DIST-ALERT products from S3, unzips them, and runs the confirmation workflow.

In [1]:
import pandas as pd
import shutil
import zipfile
import requests
from pathlib import Path
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from dist_s1 import run_sequential_confirmation_of_dist_products_workflow

In [2]:
# Working directories, all relative to the notebook's current directory.
# Each one is created right after it is defined; existing directories are
# left as they are.

# Holds the downloaded zip archives.
tmp_dir = Path('tmp')
tmp_dir.mkdir(exist_ok=True)

# Root for the extracted (not yet confirmed) products.
unconfirmed_products_dir = Path('unconfirmed_products')
unconfirmed_products_dir.mkdir(exist_ok=True)

# Root passed (with a tile sub-folder) as the workflow's output location.
confirmed_products_dir = Path('confirmed_products')
confirmed_products_dir.mkdir(exist_ok=True)

In [3]:
# Load the products CSV. The download cell further down reads the
# `zip_url` column of this table to decide what to fetch.
csv_path = Path('val_products_minus_one.csv')
df = pd.read_csv(csv_path)

# Fail fast, before any downloads are queued, if the column the rest of the
# notebook depends on is missing or has empty entries.
assert 'zip_url' in df.columns, f"{csv_path} has no 'zip_url' column"
assert df['zip_url'].notna().all(), f"{csv_path} has rows with a missing 'zip_url'"

df.head()

Unnamed: 0,zip_url,browse_url,product_request_time,processing_duration,high_confidence_alert_threshold,mgrs_tile_id,post_date_buffer_days,stride_for_norm_param_estimation,n_workers_for_norm_param_estimation,delta_lookback_days_mw,...,model_source,memory_strategy,batch_size_for_norm_param_estimation,post_date,track_number,low_confidence_alert_threshold,n_workers_for_despeckling,device,max_pre_imgs_per_burst_mw,model_compilation
0,https://hyp3-tibet-jpl-test-contentbucket-hrat...,https://hyp3-tibet-jpl-test-contentbucket-hrat...,2025-09-30T18:19:50+00:00,1492.944,4.5,45TUK,1,7,4,none,...,transformer_optimized,high,32,2024-05-05,114,2.5,4,best,none,False
1,https://hyp3-tibet-jpl-test-contentbucket-hrat...,https://hyp3-tibet-jpl-test-contentbucket-hrat...,2025-09-30T18:19:50+00:00,778.462,4.5,45TUK,1,7,4,none,...,transformer_optimized,high,32,2024-10-25,12,2.5,4,best,none,False
2,https://hyp3-tibet-jpl-test-contentbucket-hrat...,https://hyp3-tibet-jpl-test-contentbucket-hrat...,2025-09-30T18:19:50+00:00,1609.243,4.5,45TUK,1,7,4,none,...,transformer_optimized,high,32,2024-11-13,114,2.5,4,best,none,False
3,https://hyp3-tibet-jpl-test-contentbucket-hrat...,https://hyp3-tibet-jpl-test-contentbucket-hrat...,2025-09-30T18:19:50+00:00,590.068,4.5,45TUK,1,7,4,none,...,transformer_optimized,high,32,2024-12-12,12,2.5,4,best,none,False
4,https://hyp3-tibet-jpl-test-contentbucket-hrat...,https://hyp3-tibet-jpl-test-contentbucket-hrat...,2025-09-30T18:19:50+00:00,791.816,4.5,45TUK,1,7,4,none,...,transformer_optimized,high,32,2024-08-14,12,2.5,4,best,none,False


In [4]:
def download_file(url, destination_path, timeout=60):
    """Stream the content at ``url`` into ``destination_path``.

    Args:
        url: HTTP(S) URL to fetch.
        destination_path: Local path the response body is written to
            (opened in binary write mode, so an existing file is replaced).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the calling
            worker thread indefinitely.

    Returns:
        ``destination_path``, exactly as it was passed in.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # The context manager releases the connection even if writing fails
    # part-way through the stream.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        with open(destination_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                # Skip empty chunks so nothing zero-length is written.
                if chunk:
                    file.write(chunk)

    return destination_path


def unzip_file(zip_path, extract_to):
    """Extract a zip archive into a folder named after the archive.

    The archive's contents are placed in ``extract_to / <archive stem>``;
    that folder (and any missing parents) is created if needed.

    Args:
        zip_path: Path to the ``.zip`` file (string or ``Path``).
        extract_to: Directory under which the per-archive folder is created.

    Returns:
        ``Path`` of the folder the archive was extracted into.
    """
    archive = Path(zip_path)
    target_dir = Path(extract_to) / archive.stem
    target_dir.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(archive, mode='r') as archive_file:
        archive_file.extractall(path=target_dir)

    return target_dir

In [None]:
# Pair every product URL with the local path its archive is saved to
# (the URL's final path component, placed inside `tmp_dir`).
download_tasks = [
    (zip_url, tmp_dir / Path(zip_url).name)
    for zip_url in df['zip_url']
]

# Download all files concurrently. Every task is submitted up front; the
# results are then collected in submission order, so `downloaded_files`
# lines up with `download_tasks`.
downloaded_files = []
with ThreadPoolExecutor(max_workers=10) as executor, \
        tqdm(total=len(download_tasks), desc="Downloading files", unit="file") as pbar:
    pending = [
        executor.submit(download_file, zip_url, dest_path)
        for zip_url, dest_path in download_tasks
    ]

    for job in pending:
        # `result()` re-raises any exception raised inside the worker.
        downloaded_files.append(job.result())
        pbar.update(1)

print(f"Downloaded {len(downloaded_files)} files")

Downloading files:   0%|          | 0/5303 [00:00<?, ?file/s]

In [None]:
# Parse the tile ID from the first downloaded archive's file name: take the
# 4th '_'-separated field and drop its first character (presumably a leading
# 'T' prefix on the MGRS tile -- confirm against the product naming convention).
mgrs_tile_id = downloaded_files[0].name.split('_')[3][1:]

In [None]:

# Unzip every downloaded archive into a directory named after its tile.
for zip_path in downloaded_files:
    # Parse the tile ID from *this* archive's file name (4th '_'-separated
    # field, first character dropped). Reading it from `downloaded_files[0]`
    # would file every archive under the first archive's tile.
    mgrs_tile_id = zip_path.name.split('_')[3][1:]

    # NOTE: this rebinds the notebook-level `unconfirmed_products_dir`; any
    # later cell that reads it sees the most recently extracted tile's folder.
    unconfirmed_products_dir = Path(f'unconfirmed_products/{mgrs_tile_id}')
    # `parents=True` keeps the cell working even if the root folder is missing.
    unconfirmed_products_dir.mkdir(parents=True, exist_ok=True)
    unzip_file(zip_path, unconfirmed_products_dir)

In [None]:
# List the tile directories that exist under `unconfirmed_products`.
# `is_dir()` drops stray files (on Python < 3.11 a trailing '/' in a pathlib
# glob pattern does not restrict matches to directories), and sorting makes
# the order -- and therefore `mgrs_tiles_unzipped[0]` -- deterministic.
subdirs = sorted(p for p in Path('unconfirmed_products').glob('*/') if p.is_dir())
mgrs_tiles_unzipped = [subdir.name for subdir in subdirs]
mgrs_tiles_unzipped[:3]

In [None]:
%%time

# Run the confirmation workflow for one tile. The input and output folders
# are both derived from the same tile ID, so they always refer to the same
# tile and do not depend on whatever value a previous cell last left in
# `unconfirmed_products_dir`.
tile_id = mgrs_tiles_unzipped[0]
run_sequential_confirmation_of_dist_products_workflow(
    Path('unconfirmed_products') / tile_id,
    confirmed_products_dir / tile_id,
)

In [None]:
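# Optional cleanup: uncomment the lines below to delete `tmp_dir` (and the
# downloaded zip archives in it) once everything has been extracted.
# cleanup_temp = True
# if cleanup_temp:
#     shutil.rmtree(tmp_dir)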