## Imports and set directory path

In [10]:
import sys
import os
import io
import boto3
import logging
from PIL import Image
import pillow_heif

In [11]:
# os.path.abspath('') expands to the current working directory, i.e. the folder
# the notebook kernel was started in.
notebook_dir = os.path.abspath('') 
# The shared helper modules live in global_scripts/, two levels above this notebook.
project_root = os.path.join(notebook_dir, '../../')
global_scripts_path = os.path.join(project_root, 'global_scripts')

# Make global_scripts importable; the membership check keeps sys.path free of
# duplicate entries when this cell is re-run.
if global_scripts_path not in sys.path:
    sys.path.append(global_scripts_path)

# NOTE(review): the star imports hide where names used below come from
# (setup_logging, ENDPOINT_URL, LANDING_ZONE_BUCKET, ... are presumably defined in
# these two modules - verify). Explicit imports would make the dependencies visible.
from utils import *
from consts import *

In [12]:
# Configure logging through the shared helper; the argument is the log file name.
setup_logging("format_images.log")

# Register the HEIF/HEIC plugin with Pillow. Importing pillow_heif alone does not
# hook it into Image.open, so without this call HEIC images from the landing zone
# cannot be decoded on a fresh kernel. Calling it again is harmless.
pillow_heif.register_heif_opener()

## Connect to MinIO client

In [14]:
# Build the S3 client used by every cell below. ENDPOINT_URL and the AWS_* values
# are not defined in this notebook (presumably they come from the consts star
# import - verify).
s3_client = boto3.client(
    "s3",
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
# NOTE(review): no request has been sent to the server at this point, so this
# message records client creation rather than a verified connection - confirm.
logging.info("Connected to MinIO.")

In [15]:
## Retrieve images from landing zone
# A single list_objects_v2 call returns at most 1000 keys, so walk every page.
landing_prefix = f'{PERSISTENT_SUB_BUCKET}/media/image/'
landing_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket=LANDING_ZONE_BUCKET, Prefix=landing_prefix
)
# Keep the {'Contents': [...]} shape read by the conversion loop further down.
# 'Contents' is omitted from a page when nothing matches the prefix, hence the
# empty-list default (an empty landing zone now prints 0 instead of raising KeyError).
objects = {'Contents': [obj for page in landing_pages for obj in page.get('Contents', [])]}
print(len(objects['Contents']))

500


## Unify format for all images

In [17]:
def delete_images_from_formatted(s3_client):
    """Delete every object under the image prefix of the formatted zone.

    Keys are listed with a paginator, because one list_objects_v2 call returns at
    most 1000 keys, and deleted in batches of at most 1000, the per-request limit
    of delete_objects. Nothing is raised to the caller: failures are logged and
    reported through the return value.

    Args:
        s3_client: boto3 S3 client used for listing and deleting.

    Returns:
        True if the prefix was already empty or every object was deleted,
        False if any deletion failed or an exception occurred.
    """
    # Imported locally so the handler below does not rely on ClientError leaking
    # in through a star import.
    from botocore.exceptions import ClientError

    logging.info(f"Preparing to delete all objects in sub-bucket {FORMATTED_ZONE_BUCKET}/{MEDIA_SUB_BUCKET}/image")
    prefix_images = f"{MEDIA_SUB_BUCKET}/image/"
    batch_size = 1000  # maximum number of keys accepted by one delete_objects call
    try:
        # Collect all keys first so deletion never interferes with pagination.
        paginator = s3_client.get_paginator('list_objects_v2')
        keys = [
            {'Key': obj['Key']}
            for page in paginator.paginate(Bucket=FORMATTED_ZONE_BUCKET, Prefix=prefix_images)
            for obj in page.get('Contents', [])
        ]
        if not keys:
            logging.warning(f"No objects found with prefix '{prefix_images}'. Nothing to delete.")
            return True

        failed = 0
        for start in range(0, len(keys), batch_size):
            batch = keys[start:start + batch_size]
            response = s3_client.delete_objects(Bucket=FORMATTED_ZONE_BUCKET, Delete={'Objects': batch})
            errors = response.get('Errors') or []
            if errors:
                logging.error("An error occurred during bulk delete.")
                for error in errors:
                    logging.error(f" - Could not delete '{error['Key']}': {error['Message']}")
                failed += len(errors)

        if failed:
            return False

        logging.info(f"Successfully deleted {len(keys)} objects from '{prefix_images}'.")
        return True
    except ClientError as e:
        logging.error(f"A Boto3 client error occurred: {e}")
        return False
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return False

In [18]:
def format_image(s3_client, img_name):
    """Convert one landing-zone image to JPEG and upload it to the formatted zone.

    The object is downloaded into memory, re-encoded as JPEG with Pillow and
    uploaded as '<MEDIA_SUB_BUCKET>/image/<original file name without extension>.jpg'.
    Any failure is logged and swallowed so that one bad image does not stop a
    batch run.

    Args:
        s3_client: boto3 S3 client used for the download and the upload.
        img_name: Full key of the source object in the landing-zone bucket.

    Returns:
        None.
    """
    # Modes Pillow's JPEG encoder can write directly. Everything else (alpha,
    # palette, 16/32-bit, ...) is converted to RGB before saving.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    try:
        # Download img to memory
        resp = s3_client.get_object(
            Bucket=LANDING_ZONE_BUCKET,
            Key=img_name
        )
        image_data = resp['Body'].read()

        # Open img with Pillow
        with Image.open(io.BytesIO(image_data)) as img:
            converted = img if img.mode in jpeg_modes else img.convert('RGB')

            # Save converted img to JPG buffer
            output_buffer = io.BytesIO()
            converted.save(output_buffer, format='JPEG')
            output_buffer.seek(0)

        # Create new key for formatted image: keep only the file name and strip
        # just the final extension, so 'a/b/photo.v2.png' becomes 'photo.v2.jpg'.
        file_name = img_name.rsplit('/', 1)[-1]
        base_name = os.path.splitext(file_name)[0]
        new_key = f'{MEDIA_SUB_BUCKET}/image/{base_name}.jpg'

        s3_client.upload_fileobj(output_buffer, FORMATTED_ZONE_BUCKET, new_key)
        logging.info(f"Successfully converted '{img_name}' to '{new_key}'")

    except Exception as e:
        # Log the key that was actually being processed; the handler itself must
        # not reference undefined names or it would mask the original error.
        logging.error(f"Failed to process image '{img_name}'. Error: {e}")

In [19]:
def move_to_formatted_zone(s3_client, img_name):
    """Copy a landing-zone object into the formatted zone, keeping its file name.

    Despite the name, this is a server-side copy: the source object stays in the
    landing zone.

    Args:
        s3_client: boto3 S3 client used to issue the copy.
        img_name: Full key of the source object in the landing-zone bucket.
    """
    # Only the last path segment of the source key is kept in the destination key.
    file_name = img_name.rsplit('/', 1)[-1]
    new_key = f'{MEDIA_SUB_BUCKET}/image/{file_name}'
    source = {"Bucket": LANDING_ZONE_BUCKET, "Key": img_name}

    s3_client.copy_object(Bucket=FORMATTED_ZONE_BUCKET, Key=new_key, CopySource=source)
    logging.info(f"Copied object {img_name} to {new_key} in formatted zone.")

Reset the Formatted Zone by deleting any images left over from a previous run.

In [20]:
# Clear the formatted-zone image prefix before converting. The function reports
# failure by returning False instead of raising, so check the displayed value
# before running the cells below.
delete_images_from_formatted(s3_client)

True

Image formatting: any image that is not already in JPG format is converted to JPG and written to the Formatted Zone. Images that are already JPG are copied to the Formatted Zone unchanged.

In [22]:
logging.info("Starting image format transformation...")
for entry in objects['Contents']:
    source_key = entry['Key']
    # Text after the last dot, compared case-insensitively (the whole key when
    # it contains no dot).
    extension = source_key.rsplit('.', 1)[-1].lower()
    if extension == 'jpg':
        # Already JPG: copy it to the formatted zone as-is.
        move_to_formatted_zone(s3_client, source_key)
    else:
        # Anything else: re-encode as JPG and upload to the formatted zone.
        format_image(s3_client, source_key)

In [23]:
# Sanity check: count every object now stored under the formatted image prefix.
# Paginate because a single list_objects_v2 call is capped at 1000 keys, and
# default to an empty list because 'Contents' is omitted when nothing matches
# (so a failed run prints 0 instead of raising KeyError).
# NOTE: this rebinds `objects`; re-running the conversion loop after this cell
# would iterate formatted-zone keys instead of landing-zone keys.
formatted_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket=FORMATTED_ZONE_BUCKET, Prefix=f'{MEDIA_SUB_BUCKET}/image/'
)
objects = {'Contents': [obj for page in formatted_pages for obj in page.get('Contents', [])]}
print(len(objects['Contents']))

500
