## Imports and set directory path

In [2]:
import sys
import os
import io
import boto3
import logging
from moviepy.editor import VideoFileClip
import tempfile

ModuleNotFoundError: No module named 'moviepy.editor'

In [8]:
notebook_dir = os.path.abspath('') 
project_root = os.path.join(notebook_dir, '../../')
global_scripts_path = os.path.join(project_root, 'global_scripts')

if global_scripts_path not in sys.path:
    sys.path.append(global_scripts_path)

from utils import *
from consts import *

In [9]:
setup_logging("format_videos.log")

## Connect to MinIO client

In [10]:
s3_client = boto3.client(
    "s3",
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
logging.info("Connected to MinIO.")

In [11]:
## Retrieve images from landing zone
objects = s3_client.list_objects_v2(Bucket=LANDING_ZONE_BUCKET, Prefix=f'{PERSISTENT_SUB_BUCKET}/media/video/')
print(len(objects['Contents']))

74


## Unify format for all videos

In [12]:
def delete_videos_from_formatted(s3_client):
    logging.info(f"Preparing to delete all objects in sub-bucket {FORMATTED_ZONE_BUCKET}/{MEDIA_SUB_BUCKET}/video")
    prefix_videos = f"{MEDIA_SUB_BUCKET}/video/"
    try:
        # list objects to delete
        objects_to_delete = s3_client.list_objects_v2(Bucket=FORMATTED_ZONE_BUCKET, Prefix=prefix_videos)
        if 'Contents' not in objects_to_delete:
            logging.warning(f"No objects found with prefix '{prefix_videos}'. Nothing to delete.")
            return True
        delete_keys = {'Objects': [{'Key': obj['Key']} for obj in objects_to_delete['Contents']]}

        # delete them
        response = s3_client.delete_objects(Bucket=FORMATTED_ZONE_BUCKET, Delete=delete_keys)

        if 'Errors' in response:
            logging.error("An error occurred during bulk delete.")
            for error in response['Errors']:
                logging.error(f" - Could not delete '{error['Key']}': {error['Message']}")
            return False

        logging.info(f"Successfully deleted {len(delete_keys['Objects'])} objects from '{prefix_videos}'.")
        return True
    except ClientError as e:
        logging.error(f"A Boto3 client error occurred: {e}")
        return False
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return False

In [18]:
def format_video(s3_client, video_name):
    try:
        # Download img to memory
        resp = s3_client.get_object(
            Bucket=LANDING_ZONE_BUCKET,
            Key=video_name
        )
        video_data = resp['Body'].read()

        # Open img with Pillow
        with Image.open(io.BytesIO(image_data)) as img:
            # RGB does not support transparency channel
            if img.mode in ('RGBA', 'LA', 'P'):
                img = img.convert('RGB')

            # Save converted img to JPG buffer
            output_buffer = io.BytesIO()
            img.save(output_buffer, format='JPEG')
            output_buffer.seek(0)

        # Create new key for formatted video
        base_name = img_name.split('/')[-1].split('.')
        new_key = f'{MEDIA_SUB_BUCKET}/image/{base_name}.jpg'

        s3_client.upload_fileobj(output_buffer, FORMATTED_ZONE_BUCKET, new_key)
        logging.info(f"Successfully converted '{key}' to '{new_key}'")

    except Exception as e:
        logging.error(f"Failed to process video '{key}'. Error: {e}")

In [13]:
def move_to_formatted_zone(s3_client, video_name):
    base_name = video_name.split('/')[-1]
    new_key = f'{MEDIA_SUB_BUCKET}/video/{base_name}'
    s3_client.copy_object(
        Bucket=FORMATTED_ZONE_BUCKET,
        CopySource={
            "Bucket": LANDING_ZONE_BUCKET,
            "Key": video_name
        },
        Key=new_key
    )
    logging.info(f"Copied object {video_name} to {new_key} in formatted zone.")

Update formatted zone by deleting previous content if needed.

In [14]:
delete_videos_from_formatted(s3_client)

True

Formatting of videos: if the video has a format different from MP4, it is formatted and inserted into Formatted zone. Otherwise, if the video has already MP4 format, it is copied directly to the Formatted Zone.

In [22]:
logging.info("Starting video format transformation...")
for vid in objects['Contents']:
    name_video = vid['Key']
    format_video = name_video.split('.')[-1].lower()
    if format_video != TARGET_VIDEO_FORMAT:
        format_video(s3_client, name_video) # format and move to formatted zone
    else:
        move_to_formatted_zone(s3_client, name_video) # just move to formatted zone

In [23]:
objects = s3_client.list_objects_v2(Bucket=FORMATTED_ZONE_BUCKET, Prefix=f'{MEDIA_SUB_BUCKET}/video/')
print(len(objects['Contents']))

500
