In [0]:
import os

# Export the AWS settings into this process's environment in one call.
# NOTE(review): every value below is a placeholder equal to its own variable
# name — presumably meant to be substituted before running; confirm, and
# prefer loading real values from a secret store over writing them here.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY": "AWS_SECRET_ACCESS_KEY",
        "AWS_REGION": "AWS_REGION",
    }
)

In [0]:
import boto3
import os
import zipfile
import shutil
from urllib.parse import urlparse
from botocore.client import Config
from datetime import datetime

# --- CONFIG ---
# HTTPS URL of the S3 location to export, and the bucket used for every S3 call below.
s3_output_path = "https://testing-pyspark-sairam.s3.eu-north-1.amazonaws.com/staging_data/"
bucket_name = "testing-pyspark-sairam"

# Local-time stamp with one-second resolution; it is embedded in every name
# below so each run writes to its own key and temp paths.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
zip_s3_key = f"detections_zipped/output_detections_{timestamp}.zip"

# Local temp paths (Databricks driver node)
local_tmp_dir = f"/tmp/detection_output_{timestamp}"
local_zip_path = f"/tmp/output_detections_{timestamp}.zip"

# --- Parse prefix ---
# Only the URL's path component is used; dropping the leading "/" turns it
# into the key prefix handed to the listing call.
parsed = urlparse(s3_output_path)
prefix = parsed.path.lstrip("/")

# --- Clean and create local tmp dir ---
# Start from an empty directory: remove anything already at this path first.
if os.path.exists(local_tmp_dir):
    shutil.rmtree(local_tmp_dir)
os.makedirs(local_tmp_dir, exist_ok=True)

# --- Initialize S3 client ---
# The client is pinned to eu-north-1 and explicitly configured with the
# "s3v4" signature version.
sigv4_config = Config(signature_version="s3v4")
s3 = boto3.client("s3", region_name="eu-north-1", config=sigv4_config)

# --- List all files using paginator ---
# Walk every page of the listing under `prefix` and download each data file
# into local_tmp_dir, preserving its path relative to the prefix.
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

download_count = 0

for page in pages:
    # "Contents" may be absent from a page; treat that as no objects.
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if "_delta_log" in key:
            continue  # ❌ Skip delta log files
        if not key.endswith((".parquet", ".json", ".csv")):
            continue  # Only .parquet / .json / .csv objects are exported
        # Mirror the key's layout below the prefix inside the local temp dir.
        relative_path = key[len(prefix):].lstrip("/")
        local_file_path = os.path.join(local_tmp_dir, relative_path)
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
        s3.download_file(bucket_name, key, local_file_path)
        print(f"⬇️ Downloaded: {key}")
        download_count += 1

if download_count == 0:
    print(f"❌ No data files found under prefix: {prefix}")
    # Raise SystemExit rather than calling exit(): exit() is the
    # interactive-session helper installed by `site`, and under an
    # IPython/Jupyter kernel it only requests a shutdown without raising, so
    # the zip/upload steps after this check would still run on an empty
    # directory. Raising stops execution here and keeps exit status 1 when
    # this code is run as a plain script.
    raise SystemExit(1)

# --- Zip downloaded files ---
# Every file under local_tmp_dir is added with a name relative to that
# directory, so the archive mirrors the downloaded folder layout.
with zipfile.ZipFile(local_zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for dirpath, _dirnames, filenames in os.walk(local_tmp_dir):
        for filename in filenames:
            source_path = os.path.join(dirpath, filename)
            archive.write(source_path, arcname=os.path.relpath(source_path, local_tmp_dir))
print(f"✅ Zipped {download_count} files to: {local_zip_path}")

# --- Upload ZIP to S3 ---
# The archive is stored in the same bucket under zip_s3_key.
s3.upload_file(Filename=local_zip_path, Bucket=bucket_name, Key=zip_s3_key)
print(f"✅ Uploaded ZIP to s3://{bucket_name}/{zip_s3_key}")

# --- Generate pre-signed URL (valid 1 hour) ---
# ExpiresIn is given in seconds; 3600 matches the "1 hour" quoted in the
# message printed below.
url = s3.generate_presigned_url(
    ClientMethod="get_object",
    Params=dict(Bucket=bucket_name, Key=zip_s3_key),
    ExpiresIn=3600,
)
print(f"\n📦 Download ZIP here (valid 1 hour):\n{url}")