# EMR Serverless estimator

Welcome to the EMR Serverless estimator. Follow the steps below to generate the estimation files.

1. Initializes configuration parameters (AWS region, user email, company, data retrieval period).

In [None]:
# AWS region to query. Required: validation fails when this is left empty.
region_name = ""
# Contact e-mail recorded in the execution details. Optional: an empty value only logs a warning.
email = ""
# Company name recorded in the execution details. Optional: an empty value only logs a warning.
company = ""
# Look-back window, in days, for applications and job runs (must be at least 1).
runs_for_last_days = 15  # Default 15 days

# Imports and constants

2. Imports necessary Python libraries for data manipulation, AWS interaction, etc.

In [None]:
import base64
import io
import logging
import uuid
import zipfile

from collections.abc import Iterator
from datetime import datetime, timedelta, timezone

import boto3
import pandas as pd

from IPython.display import HTML, display

3. Defines string constants for the AWS response fields that are read and for the column names of the generated files.

In [None]:
# String literal constants
# Field names looked up in the dictionaries handed to EMRSEstimatorManager:
# application details, job run details and Cost Explorer results.
APPLICATION_ID_LITERAL = "applicationId"
NAME_LITERAL = "name"
TYPE_LITERAL = "type"
STATE_LITERAL = "state"
ARCHITECTURE_LITERAL = "architecture"
CREATED_AT_LITERAL = "createdAt"
UPDATED_AT_LITERAL = "updatedAt"
INITIAL_CAPACITY_LITERAL = "initialCapacity"
EXECUTOR_LITERAL = "Executor"
WORKER_COUNT_LITERAL = "workerCount"
WORKER_CONFIGURATION_LITERAL = "workerConfiguration"
CPU_LITERAL = "cpu"
MEMORY_LITERAL = "memory"
DISK_LITERAL = "disk"
DRIVER_LITERAL = "Driver"
MAXIMUM_CAPACITY_LITERAL = "maximumCapacity"
CONFIG_OVERRIDES_LITERAL = "configurationOverrides"
MONITORING_CONFIG_LITERAL = "monitoringConfiguration"
S3_MONITORING_CONFIG_LITERAL = "s3MonitoringConfiguration"
MANAGED_PERSISTENCE_MONITORING_CONFIG_LITERAL = "managedPersistenceMonitoringConfiguration"
CLOUD_WATCH_LOGGING_CONFIG_LITERAL = "cloudWatchLoggingConfiguration"
ENABLED_LITERAL = "enabled"
JOB_RUN_ID_LITERAL = "jobRunId"
MODE_LITERAL = "mode"
ATTEMPT_LITERAL = "attempt"
STARTED_AT_LITERAL = "startedAt"
ENDED_AT_LITERAL = "endedAt"
QUEUED_DURATION_MS_LITERAL = "queuedDurationMilliseconds"
TOTAL_RESOURCE_UTILIZATION_LITERAL = "totalResourceUtilization"
VCPU_HOUR_LITERAL = "vCPUHour"
MEMORY_GB_HOUR_LITERAL = "memoryGBHour"
STORAGE_GB_HOUR_LITERAL = "storageGBHour"
BILLED_RESOURCE_UTILIZATION_LITERAL = "billedResourceUtilization"
TOTAL_EXECUTION_DURATION_SECONDS_LITERAL = "totalExecutionDurationSeconds"
EXECUTION_TIMEOUT_MINUTES_LITERAL = "executionTimeoutMinutes"
# NOTE(review): the next four values duplicate the *_KEY_DICT constants defined
# further down and are not referenced by the code visible in this notebook.
JOB_TYPE_LITERAL = "job_type"
S3_MONITORING_ENABLED_LITERAL = "s3_monitoring_enabled"
MP_MONITORING_ENABLED_LITERAL = "mp_monitoring_enabled"
CLOUD_WATCH_ENABLED_LITERAL = "cloud_watch_enabled"
# Field names read from each Cost Explorer result entry.
KEYS_LITERAL = "Keys"
METRICS_LITERAL = "Metrics"
UNBLENDED_COST_LITERAL = "UnblendedCost"
AMOUNT_LITERAL = "Amount"
UNIT_LITERAL = "Unit"
GROUPS_LITERAL = "Groups"
TIME_PERIOD_LITERAL = "TimePeriod"
START_LITERAL = "Start"

# Dictionary key constants
# Keys of the records stored by EMRSEstimatorManager; they become the column
# names of the generated CSV files.
APP_ID_KEY = "application_id"
APP_NAME_KEY = "application_name"
APP_TYPE_KEY = "application_type"
APP_STATE_KEY = "application_state"
ARCHITECTURE_KEY = "architecture"
CREATED_AT_KEY = "created_at"
UPDATED_AT_KEY = "updated_at"
EXECUTOR_WORKER_COUNT_KEY = "executor_worker_count"
EXECUTOR_WORKER_CPU_KEY = "executor_worker_cpu"
EXECUTOR_WORKER_MEMORY_KEY = "executor_worker_memory"
EXECUTOR_WORKER_DISK_KEY = "executor_worker_disk"
DRIVER_WORKER_COUNT_KEY = "driver_worker_count"
DRIVER_WORKER_CPU_KEY = "driver_worker_cpu"
DRIVER_WORKER_MEMORY_KEY = "driver_worker_memory"
DRIVER_WORKER_DISK_KEY = "driver_worker_disk"
MAXIMUM_CPU_KEY = "maximum_cpu"
MAXIMUM_MEMORY_KEY = "maximum_memory"
MAXIMUM_DISK_KEY = "maximum_disk"
JOB_RUN_ID_KEY = "job_run_id"
JOB_NAME_KEY = "job_name"
JOB_MODE_KEY = "job_mode"
JOB_STATE_KEY = "job_state"
ATTEMPT_KEY = "attempt"
STARTED_AT_KEY = "started_at"
ENDED_AT_KEY = "ended_at"
QUEUED_DURATION_MS_KEY = "queued_duration_milliseconds"
# "tru" = total resource utilization, "bru" = billed resource utilization.
TRU_VCPU_HOUR_KEY = "tru_vcpu_hour"
TRU_MEMORY_GB_HOUR_KEY = "tru_memory_gb_hour"
TRU_STORAGE_GB_HOUR_KEY = "tru_storage_gb_hour"
BRU_VCPU_HOUR_KEY = "bru_vcpu_hour"
BRU_MEMORY_GB_HOUR_KEY = "bru_memory_gb_hour"
BRU_STORAGE_GB_HOUR_KEY = "bru_storage_gb_hour"
TOTAL_EXECUTION_SECONDS_KEY = "total_execution_duration_seconds"
EXECUTION_TIMEOUT_MINUTES_KEY = "execution_timeout_minutes"
JOB_TYPE_KEY_DICT = "job_type"
S3_MONITORING_ENABLED_KEY_DICT = "s3_monitoring_enabled"
MP_MONITORING_ENABLED_KEY_DICT = "mp_monitoring_enabled"
CLOUD_WATCH_ENABLED_KEY_DICT = "cloud_watch_enabled"
USAGE_TYPE_KEY = "usage_type"
AMOUNT_KEY = "amount"
UNIT_KEY = "unit"
DATE_KEY = "date"

# Execution details constants
# Version string written to the "version" field of the execution details.
NOTEBOOK_VERSION = "0.1.0"

# Logging Setup
4. This section configures the "emr_serverless_estimator" logger to display informational messages, warnings, and errors directly in your notebook's output, aiding in monitoring the EMR Serverless estimation process.

In [None]:
# Lowest severity that is emitted by both the logger and its handler.
MIN_LOG_LEVEL = logging.INFO

logger = logging.getLogger("emr_serverless_estimator")
logger.setLevel(MIN_LOG_LEVEL)

# Attach the handler only once: re-running this cell would otherwise stack
# handlers on the same named logger and print every message several times.
if not logger.handlers:
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(MIN_LOG_LEVEL)

    formatter = logging.Formatter(
        fmt="{asctime} - {name} - {levelname} - {message}",
        datefmt="%Y-%m-%d %H:%M:%S",
        style="{",
    )
    stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)

# EMR Serverless Estimator Manager

5. Defines the **EMRSEstimatorManager** class to encapsulate estimation logic and data handling.

In [None]:
class EMRSEstimatorManager:
    """Collect EMR Serverless applications, job runs and costs, and export them.

    Records are accumulated in memory through the ``add_*`` methods and turned
    into a downloadable ZIP archive of CSV files by ``show_output``.

    Attributes:
        region_name (str): AWS region name.
        email (str): User email for identification.
        company (str): Company name for identification.
        execution_id (str): Unique identifier (UUID4) for the execution.
        application_list (list): One dictionary per application.
        job_runs_info (list): One dictionary per job run attempt.
        application_costs_list (list): One dictionary per application cost entry.

    """

    def __init__(
        self,
        region_name: str,
        email: str,
        company: str,
    ):
        """Initialize the EMRSEstimatorManager with region, email, and company.

        Args:
            region_name (str): AWS region name.
            email (str): User email.
            company (str): Company name.

        """
        self.application_list = []
        self.job_runs_info = []
        self.region_name = region_name
        self.email = email
        self.company = company
        self.execution_id = str(uuid.uuid4())
        self.application_costs_list = []

    @staticmethod
    def _dig(source, *path, default="N/A"):
        """Walk nested dictionaries along ``path`` without raising.

        Args:
            source: Outermost dictionary.
            *path: Keys to follow, outermost first.
            default: Value returned when a key is missing or an intermediate
                value is not a dictionary (for example ``None``).

        Returns:
            The value stored at the end of ``path``, or ``default``.

        """
        current = source
        for key in path:
            if not isinstance(current, dict) or key not in current:
                return default
            current = current[key]
        return current

    def add_application(self, app_info: dict):
        """Add an application to the application list.

        Missing fields are stored as ``"N/A"``. Nothing is added (and an error
        is logged) when ``app_info`` is not a dictionary.

        Args:
            app_info (dict): Dictionary containing application details.

        """
        if not isinstance(app_info, dict):
            logger.error("Error adding application: expected a dict, got %s", type(app_info).__name__)
            return

        try:
            dig = self._dig
            executor = (INITIAL_CAPACITY_LITERAL, EXECUTOR_LITERAL)
            driver = (INITIAL_CAPACITY_LITERAL, DRIVER_LITERAL)
            executor_config = (*executor, WORKER_CONFIGURATION_LITERAL)
            driver_config = (*driver, WORKER_CONFIGURATION_LITERAL)

            # Key order defines the column order of the exported CSV file.
            new_application = {
                APP_ID_KEY: dig(app_info, APPLICATION_ID_LITERAL),
                APP_NAME_KEY: dig(app_info, NAME_LITERAL),
                APP_TYPE_KEY: dig(app_info, TYPE_LITERAL),
                APP_STATE_KEY: dig(app_info, STATE_LITERAL),
                ARCHITECTURE_KEY: dig(app_info, ARCHITECTURE_LITERAL),
                CREATED_AT_KEY: str(dig(app_info, CREATED_AT_LITERAL)),
                UPDATED_AT_KEY: str(dig(app_info, UPDATED_AT_LITERAL)),
                EXECUTOR_WORKER_COUNT_KEY: dig(app_info, *executor, WORKER_COUNT_LITERAL),
                EXECUTOR_WORKER_CPU_KEY: dig(app_info, *executor_config, CPU_LITERAL),
                EXECUTOR_WORKER_MEMORY_KEY: dig(app_info, *executor_config, MEMORY_LITERAL),
                EXECUTOR_WORKER_DISK_KEY: dig(app_info, *executor_config, DISK_LITERAL),
                DRIVER_WORKER_COUNT_KEY: dig(app_info, *driver, WORKER_COUNT_LITERAL),
                DRIVER_WORKER_CPU_KEY: dig(app_info, *driver_config, CPU_LITERAL),
                DRIVER_WORKER_MEMORY_KEY: dig(app_info, *driver_config, MEMORY_LITERAL),
                DRIVER_WORKER_DISK_KEY: dig(app_info, *driver_config, DISK_LITERAL),
                MAXIMUM_CPU_KEY: dig(app_info, MAXIMUM_CAPACITY_LITERAL, CPU_LITERAL),
                MAXIMUM_MEMORY_KEY: dig(app_info, MAXIMUM_CAPACITY_LITERAL, MEMORY_LITERAL),
                MAXIMUM_DISK_KEY: dig(app_info, MAXIMUM_CAPACITY_LITERAL, DISK_LITERAL),
            }
            self.application_list.append(new_application)
        except Exception as e:
            # Best-effort collection: this method never raises to its caller.
            logger.exception("Error adding application: %s", e)

    def add_job_run(self, job_info: dict, job_type: str):
        """Add a job run attempt to the job runs list.

        Missing fields are stored as ``"N/A"``. Nothing is added (and an error
        is logged) when ``job_info`` is not a dictionary.

        Args:
            job_info (dict): Dictionary containing job run details.
            job_type (str): Type of the job run.

        """
        if not isinstance(job_info, dict):
            logger.error("Error adding job run: expected a dict, got %s", type(job_info).__name__)
            return

        try:
            dig = self._dig
            monitoring = (CONFIG_OVERRIDES_LITERAL, MONITORING_CONFIG_LITERAL)
            total = TOTAL_RESOURCE_UTILIZATION_LITERAL
            billed = BILLED_RESOURCE_UTILIZATION_LITERAL

            # Key order defines the column order of the exported CSV files.
            new_job_run = {
                JOB_RUN_ID_KEY: dig(job_info, JOB_RUN_ID_LITERAL),
                APP_ID_KEY: dig(job_info, APPLICATION_ID_LITERAL),
                JOB_NAME_KEY: dig(job_info, NAME_LITERAL),
                JOB_MODE_KEY: dig(job_info, MODE_LITERAL),
                CREATED_AT_KEY: dig(job_info, CREATED_AT_LITERAL),
                UPDATED_AT_KEY: dig(job_info, UPDATED_AT_LITERAL),
                JOB_TYPE_KEY_DICT: job_type,
                S3_MONITORING_ENABLED_KEY_DICT: dig(
                    job_info,
                    *monitoring,
                    S3_MONITORING_CONFIG_LITERAL,
                    ENABLED_LITERAL,
                ),
                MP_MONITORING_ENABLED_KEY_DICT: dig(
                    job_info,
                    *monitoring,
                    MANAGED_PERSISTENCE_MONITORING_CONFIG_LITERAL,
                    ENABLED_LITERAL,
                ),
                CLOUD_WATCH_ENABLED_KEY_DICT: dig(
                    job_info,
                    *monitoring,
                    CLOUD_WATCH_LOGGING_CONFIG_LITERAL,
                    ENABLED_LITERAL,
                ),
                ATTEMPT_KEY: dig(job_info, ATTEMPT_LITERAL),
                JOB_STATE_KEY: dig(job_info, STATE_LITERAL),
                STARTED_AT_KEY: dig(job_info, STARTED_AT_LITERAL),
                ENDED_AT_KEY: dig(job_info, ENDED_AT_LITERAL),
                QUEUED_DURATION_MS_KEY: dig(job_info, QUEUED_DURATION_MS_LITERAL),
                TRU_VCPU_HOUR_KEY: dig(job_info, total, VCPU_HOUR_LITERAL),
                TRU_MEMORY_GB_HOUR_KEY: dig(job_info, total, MEMORY_GB_HOUR_LITERAL),
                TRU_STORAGE_GB_HOUR_KEY: dig(job_info, total, STORAGE_GB_HOUR_LITERAL),
                BRU_VCPU_HOUR_KEY: dig(job_info, billed, VCPU_HOUR_LITERAL),
                BRU_MEMORY_GB_HOUR_KEY: dig(job_info, billed, MEMORY_GB_HOUR_LITERAL),
                BRU_STORAGE_GB_HOUR_KEY: dig(job_info, billed, STORAGE_GB_HOUR_LITERAL),
                TOTAL_EXECUTION_SECONDS_KEY: dig(job_info, TOTAL_EXECUTION_DURATION_SECONDS_LITERAL),
                EXECUTION_TIMEOUT_MINUTES_KEY: dig(job_info, EXECUTION_TIMEOUT_MINUTES_LITERAL),
            }
            self.job_runs_info.append(new_job_run)
        except Exception as e:
            # Best-effort collection: this method never raises to its caller.
            logger.exception("Error adding job run: %s", e)

    def add_application_cost_list(self, cost_list: list, application_id: str) -> None:
        """Add cost information for an application.

        Each entry of ``cost_list`` produces one row per usage-type group, or a
        single zero-amount row when the entry has no groups.

        Args:
            cost_list (list): List of per-period cost dictionaries.
            application_id (str): ID of the application.

        """
        try:
            for cost in cost_list:
                start_period = cost.get(TIME_PERIOD_LITERAL, {}).get(
                    START_LITERAL,
                    "N/A",
                )
                groups = cost.get(GROUPS_LITERAL, [])

                if not groups:
                    # Keep a placeholder row so periods without cost stay visible.
                    self.application_costs_list.append(
                        {
                            APP_ID_KEY: application_id,
                            DATE_KEY: start_period,
                            USAGE_TYPE_KEY: "",
                            AMOUNT_KEY: 0,
                            UNIT_KEY: "USD",
                        },
                    )
                    continue

                for group in groups:
                    costs_object = group.get(METRICS_LITERAL, {}).get(
                        UNBLENDED_COST_LITERAL,
                        {},
                    )
                    self.application_costs_list.append(
                        {
                            APP_ID_KEY: application_id,
                            DATE_KEY: start_period,
                            USAGE_TYPE_KEY: " | ".join(group.get(KEYS_LITERAL, [])),
                            AMOUNT_KEY: costs_object.get(AMOUNT_LITERAL, "0"),
                            UNIT_KEY: costs_object.get(UNIT_LITERAL, "N/A"),
                        },
                    )
        except Exception as e:
            logger.exception("Error adding application cost list: %s", e)

    @staticmethod
    def get_buffer_output(data: pd.DataFrame) -> io.StringIO:
        """Convert a DataFrame to a CSV buffer.

        Args:
            data (pd.DataFrame): DataFrame to convert.

        Returns:
            io.StringIO: Buffer positioned at its start and containing the CSV
            data. An empty buffer is returned (and the error logged) when the
            conversion fails, so callers always receive a usable buffer.

        """
        buffer = io.StringIO()
        try:
            data.to_csv(buffer, index=False, encoding="utf-8")
        except Exception as e:
            logger.exception("Error getting buffer output: %s", e)
            # Discard any partially written content.
            return io.StringIO()
        buffer.seek(0)
        return buffer

    def get_execution_info(self) -> dict:
        """Retrieve execution metadata.

        The timestamp is taken in UTC. ``runs_for_last_days`` is read from the
        module-level configuration variable of the same name.

        Returns:
            dict: Dictionary containing execution metadata.

        """
        return {
            "execution_id": self.execution_id,
            "email": self.email,
            "company": self.company,
            "region": self.region_name,
            "version": NOTEBOOK_VERSION,
            "timestamp": datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"),
            "runs_for_last_days": runs_for_last_days,
        }

    @staticmethod
    def get_output_config() -> dict:
        """Retrieve the output configuration.

        Returns:
            dict: Download button title, the file names used inside the ZIP
            archive, and the column subsets of the two job run files.

        """
        return {
            "title": "Download Analysis Data",
            "execution_detail_name_file": "execution_detail.csv",
            "application_list_name_file": "application_list.csv",
            "application_costs_list_name_file": "application_costs_list.csv",
            "job_runs_list_name_file": "job_runs_list.csv",
            "job_runs_attempts_name_file": "job_runs_attempts.csv",
            "job_runs_list_columns": [
                JOB_RUN_ID_KEY,
                APP_ID_KEY,
                JOB_NAME_KEY,
                JOB_MODE_KEY,
                CREATED_AT_KEY,
                UPDATED_AT_KEY,
                JOB_TYPE_KEY_DICT,
                S3_MONITORING_ENABLED_KEY_DICT,
                MP_MONITORING_ENABLED_KEY_DICT,
                CLOUD_WATCH_ENABLED_KEY_DICT,
            ],
            "job_runs_attempts_columns": [
                JOB_RUN_ID_KEY,
                APP_ID_KEY,
                ATTEMPT_KEY,
                JOB_STATE_KEY,
                STARTED_AT_KEY,
                ENDED_AT_KEY,
                QUEUED_DURATION_MS_KEY,
                TRU_VCPU_HOUR_KEY,
                TRU_MEMORY_GB_HOUR_KEY,
                TRU_STORAGE_GB_HOUR_KEY,
                BRU_VCPU_HOUR_KEY,
                BRU_MEMORY_GB_HOUR_KEY,
                BRU_STORAGE_GB_HOUR_KEY,
                TOTAL_EXECUTION_SECONDS_KEY,
                EXECUTION_TIMEOUT_MINUTES_KEY,
            ],
        }

    @staticmethod
    def compress_data(
        output_config: dict,
        execution_info_buffer: io.StringIO,
        application_list_buffer: io.StringIO,
        application_costs_list_buffer: io.StringIO,
        job_runs_list_buffer: io.StringIO,
        job_runs_attempts_buffer: io.StringIO,
    ) -> str:
        """Compress multiple CSV buffers into a single ZIP file encoded in Base64.

        Args:
            output_config (dict): Configuration dictionary containing file names for the ZIP archive.
            execution_info_buffer (io.StringIO): Buffer containing execution metadata in CSV format.
            application_list_buffer (io.StringIO): Buffer containing application list data in CSV format.
            application_costs_list_buffer (io.StringIO): Buffer containing application cost data in CSV format.
            job_runs_list_buffer (io.StringIO): Buffer containing job runs list data in CSV format.
            job_runs_attempts_buffer (io.StringIO): Buffer containing job runs attempts data in CSV format.

        Returns:
            str: Base64-encoded string of the compressed ZIP file, or an empty
            string when the archive could not be built (the error is logged).

        """
        try:
            # Archive members, in the order they are written.
            members = (
                ("execution_detail_name_file", execution_info_buffer),
                ("application_list_name_file", application_list_buffer),
                ("application_costs_list_name_file", application_costs_list_buffer),
                ("job_runs_list_name_file", job_runs_list_buffer),
                ("job_runs_attempts_name_file", job_runs_attempts_buffer),
            )
            zip_buffer = io.BytesIO()
            with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
                for config_key, csv_buffer in members:
                    zf.writestr(output_config[config_key], csv_buffer.getvalue())

            return base64.b64encode(zip_buffer.getvalue()).decode()
        except Exception as e:
            logger.exception("Error compressing data: %s", e)
            return ""

    def show_output(self, output_file_name: str = None) -> None:
        """Generate and display a download button for a ZIP file with the analysis data.

        No button is displayed when the archive could not be built.

        Args:
            output_file_name (str, optional): Name of the output ZIP file.
                Defaults to ``emr_we_output_<UTC timestamp>.zip``.

        """
        # Local import so the method does not depend on a file-level import.
        from html import escape

        try:
            if output_file_name is None:
                timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
                output_file_name = f"emr_we_output_{timestamp}.zip"

            output_config = self.get_output_config()

            application_df = pd.DataFrame(self.application_list)
            application_list_buffer = self.get_buffer_output(application_df)

            application_costs_list_buffer = self.get_buffer_output(
                pd.DataFrame(self.application_costs_list),
            )

            job_runs_info_df = pd.DataFrame(self.job_runs_info)
            job_runs_list_df = pd.DataFrame()
            job_runs_attempts_df = pd.DataFrame()

            if not job_runs_info_df.empty:
                # One row per job run: attempts of the same run share these columns.
                job_runs_list_df = job_runs_info_df[output_config["job_runs_list_columns"]].drop_duplicates(
                    subset=[JOB_RUN_ID_KEY],
                    keep="first",
                )
                # One row per attempt.
                job_runs_attempts_df = job_runs_info_df[output_config["job_runs_attempts_columns"]]

            job_runs_list_buffer = self.get_buffer_output(job_runs_list_df)
            job_runs_attempts_buffer = self.get_buffer_output(job_runs_attempts_df)

            execution_info_df = pd.DataFrame(
                [
                    {
                        **self.get_execution_info(),
                        "total_applications": application_df.shape[0],
                        "total_job_runs": job_runs_list_df.shape[0],
                        "total_job_runs_attempts": job_runs_attempts_df.shape[0],
                    },
                ],
            )
            execution_info_buffer = self.get_buffer_output(execution_info_df)

            payload = self.compress_data(
                output_config,
                execution_info_buffer,
                application_list_buffer,
                application_costs_list_buffer,
                job_runs_list_buffer,
                job_runs_attempts_buffer,
            )
            if not payload:
                # A link built from an empty payload would download a corrupt file.
                logger.error("Error showing output: the ZIP archive could not be built.")
                return

            # The file name ends up inside an HTML attribute, so quotes and
            # angle brackets must be escaped.
            safe_file_name = escape(str(output_file_name))
            markup = (
                f'<html><div style="display:flex;justify-content: center;">'
                f'<a download="{safe_file_name}" '
                f'href="data:application/zip;base64,{payload}" '
                f'target="_blank">'
                f'<button style="background-color:#249edc;color: #fff;'
                f"border:1px solid #249edc;cursor:pointer;border-radius:45px;"
                f'font-weight:800;line-height:18px;padding: 8px 16px" '
                f'type="button">{output_config.get("title")}</button>'
                f"</a></div></html>"
            )
            display(HTML(markup))
        except Exception as e:
            logger.exception("Error showing output: %s", e)

# Utility functions

6. Defines utility functions for retrieving data from AWS (applications, job runs, costs).

In [None]:
def get_application_costs(arn, lookback_days: int = 14) -> list:
    """Retrieve daily cost information for a specific application.

    Both ends of the queried period are derived from the same UTC date, so the
    window is always exactly ``lookback_days`` long, and every result page is
    fetched.

    Args:
        arn (str): ARN of the application.
        lookback_days (int): Number of days to look back, clamped to 1..14.
            NOTE(review): 14 is assumed to be the maximum look-back of the
            resource-level Cost Explorer API - confirm against the AWS docs.

    Returns:
        list: List of cost details by time period, or an empty list when the
        request fails (the error is logged).

    """
    try:
        window_days = max(1, min(int(lookback_days), 14))
        today = datetime.now(timezone.utc).date()  # noqa: UP017

        request = {
            "Granularity": "DAILY",
            "Filter": {
                "Dimensions": {"Key": "RESOURCE_ID", "Values": [arn]},
            },
            "TimePeriod": {
                "Start": (today - timedelta(days=window_days)).isoformat(),
                "End": today.isoformat(),
            },
            "Metrics": ["UnblendedCost"],
            "GroupBy": [
                {
                    "Type": "DIMENSION",
                    "Key": "USAGE_TYPE",
                },
            ],
        }

        results = []
        while True:
            response = cost_explorer_client.get_cost_and_usage_with_resources(**request)
            results.extend(response.get("ResultsByTime", []))
            # Follow the pagination token until the service stops returning one.
            next_page_token = response.get("NextPageToken")
            if not next_page_token:
                return results
            request["NextPageToken"] = next_page_token
    except Exception as e:
        logger.exception("Error getting application costs: %s", e)
        return []


def list_applications(
    runs_for_last_days_ago: datetime,
) -> Iterator[str]:
    """List applications updated on or after the given cutoff.

    For every matching application the details and costs are retrieved and
    registered in ``estimator_manager`` before its ID is yielded. A failure
    while retrieving one application is logged and that application is
    skipped; the remaining applications are still processed.

    Args:
        runs_for_last_days_ago (datetime): Cutoff timestamp; applications whose
            ``updatedAt`` is older (or missing) are ignored.

    Yields:
        str: Application ID.

    """
    try:
        paginator = emr_serverless_client.get_paginator("list_applications")
        for page in paginator.paginate():
            for app in page.get("applications") or []:
                updated_at = app.get("updatedAt")
                if updated_at is None or updated_at < runs_for_last_days_ago:
                    continue

                try:
                    app_id = app["id"]
                    # Fetch the details first so that costs are only recorded
                    # for applications that also appear in the application list.
                    application = emr_serverless_client.get_application(
                        applicationId=app_id,
                    )["application"]
                    application_costs = get_application_costs(app["arn"])
                except Exception as e:
                    logger.exception(
                        "Error retrieving application %s: %s",
                        app.get("id", "N/A"),
                        e,
                    )
                    continue

                estimator_manager.add_application(application)
                estimator_manager.add_application_cost_list(
                    application_costs,
                    app_id,
                )
                yield app_id
    except Exception as e:
        logger.exception("Error listing applications: %s", e)


def list_jobs(application_id: str, runs_for_last_days_ago: datetime) -> Iterator[dict]:
    """Yield the job runs of an application updated on or after a cutoff.

    Args:
        application_id (str): ID of the application.
        runs_for_last_days_ago (datetime): Cutoff timestamp; job runs without
            ``updatedAt`` or with an older value are skipped.

    Yields:
        dict: Job run summary as returned by the ``list_job_runs`` paginator.

    """
    try:
        pages = emr_serverless_client.get_paginator("list_job_runs").paginate(
            applicationId=application_id,
        )
        for page in pages:
            if "jobRuns" not in page:
                continue
            yield from (
                run
                for run in page["jobRuns"]
                if "updatedAt" in run and run["updatedAt"] >= runs_for_last_days_ago
            )
    except Exception as e:
        logger.exception(f"Error listing job runs: {e}")


def add_job_run_info(job_run: dict, job_attempt: int):
    """Fetch one attempt of a job run and register it in ``estimator_manager``.

    Args:
        job_run (dict): Job run summary; its ``applicationId``, ``id`` and
            ``type`` entries are read (``"N/A"`` is used when one is missing).
        job_attempt (int): Attempt number of the job run.

    """
    try:
        response = emr_serverless_client.get_job_run(
            applicationId=job_run.get("applicationId", "N/A"),
            jobRunId=job_run.get("id", "N/A"),
            attempt=job_attempt,
        )
        if "jobRun" not in response:
            return
        estimator_manager.add_job_run(
            response["jobRun"],
            job_run.get("type", "N/A"),
        )
    except Exception as e:
        logger.exception(f"Error adding job run info: {e}")


def validate_user_info() -> bool:
    """Validate the user information required for the application.

    Reads the module-level ``region_name``, ``runs_for_last_days``, ``email``
    and ``company`` settings. A blank region (empty, whitespace-only or not a
    string) or a look-back window that is not a number of at least 1 is an
    error; a blank email or company only produces a warning.

    Returns:
        bool: True if the user information is valid, False otherwise.

    """

    def is_blank(value) -> bool:
        """Return True when ``value`` is not a string or has no visible characters."""
        return not isinstance(value, str) or not value.strip()

    is_valid = True
    if is_blank(region_name):
        logger.error("Region is empty. Please provide a valid region.")
        is_valid = False

    try:
        has_valid_days = runs_for_last_days >= 1
    except TypeError:
        # Non-numeric values (None, strings, ...) cannot be compared with 1.
        has_valid_days = False
    if not has_valid_days:
        logger.error("Runs for last days must be greater than 0.")
        is_valid = False

    if is_blank(email):
        logger.warning("Email is empty. Please provide a valid email.")
    if is_blank(company):
        logger.warning("Company is empty. Please provide a valid company.")

    return is_valid

# Generate estimation

7. Executes the main estimation workflow: instantiates the manager, retrieves data, and generates the output.

In [None]:
# Main workflow: runs only when the configuration cell passed validation.
if validate_user_info():
    # Clients and manager are module-level on purpose: the utility functions
    # above look them up by name.
    emr_serverless_client = boto3.client("emr-serverless", region_name=region_name)
    cost_explorer_client = boto3.client("ce", region_name=region_name)

    estimator_manager = EMRSEstimatorManager(region_name, email, company)
    now = datetime.now(timezone.utc)  # noqa: UP017
    # Applications and job runs updated before this instant are ignored.
    runs_for_last_days_ago = now - timedelta(days=runs_for_last_days)

    logger.info("Retrieving data...")
    logger.info(f"Start date: {runs_for_last_days_ago.strftime('%Y-%m-%d')}")

    count_apps = 0
    count_jobs = 0
    for app_id in list_applications(runs_for_last_days_ago):
        for job_run in list_jobs(app_id, runs_for_last_days_ago):
            # A summary without an attempt number is treated as a single
            # attempt instead of crashing the whole run in range(None).
            attempts = job_run.get("attempt") or 1
            for attempt in range(1, attempts + 1):
                add_job_run_info(job_run, attempt)
            count_jobs += 1
        count_apps += 1

    logger.info(f"Total applications: {count_apps}")
    logger.info(f"Total job runs: {count_jobs}")

    estimator_manager.show_output()