In [None]:
import json
from io import StringIO
from typing import Any

import boto3
import pandas as pd

In [None]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the taxi trips data and add the hourly timestamp column

    The input DataFrame is NOT modified; all steps are applied to a new
    DataFrame, so the function can be called repeatedly on the same input.

    Steps, in order:
    1. drop the census tract and centroid location columns,
    2. drop every row that still contains a missing value,
    3. rename the community area columns to their ``*_id`` names,
    4. add ``datetime_for_weather``: ``trip_start_timestamp`` parsed as a
       datetime and floored to the hour.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data

    Returns
    -------
    pd.DataFrame
        The transformed DataFrame (row labels of the surviving rows are kept)

    Raises
    ------
    TypeError
        If taxi_trips is not a DataFrame
    KeyError
        If one of the columns to drop or ``trip_start_timestamp`` is missing
    """
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips should be a DataFrame")

    unused_columns = [
        "pickup_census_tract",
        "dropoff_census_tract",
        "pickup_centroid_location",
        "dropoff_centroid_location",
    ]
    renamed_columns = {
        "pickup_community_area": "pickup_community_area_id",
        "dropoff_community_area": "dropoff_community_area_id",
    }

    # Every step returns a new frame, so the caller's DataFrame stays intact.
    # The unused columns are dropped before dropna() so that missing values in
    # them do not cause rows to be discarded.
    return (
        taxi_trips
        .drop(columns=unused_columns)
        .dropna()
        .rename(columns=renamed_columns)
        .assign(
            datetime_for_weather=lambda trips: pd.to_datetime(trips["trip_start_timestamp"]).dt.floor("h")
        )
    )

In [None]:
def update_master(taxi_trips: pd.DataFrame, master: pd.DataFrame, id_column: str, value_column: str) -> pd.DataFrame:
    """
    Extend a master table with the values of taxi_trips that it does not contain yet

    Works for any master table (e.g. company or payment type): every value of
    ``taxi_trips[value_column]`` that is missing from ``master[value_column]``
    is appended with a freshly generated id. Neither input is modified.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data
    master: pd.DataFrame
        The DataFrame containing the master data
    id_column: str
        The id column of the master DataFrame.
    value_column: str
        The name of the column (in both DataFrames) containing the values.

    Returns
    -------
    pd.DataFrame
        The updated master DataFrame, with a fresh 0..n-1 index
    """
    known_values = set(master[value_column].values)

    # Keep the order of first appearance so the generated ids are deterministic
    # (iterating a set of strings gives a different order in every process).
    # Missing values are skipped: they are not a real master entry.
    candidates = taxi_trips[value_column].dropna().drop_duplicates()
    new_values_list = [value for value in candidates if value not in known_values]

    if not new_values_list:
        # Nothing to add; avoid concatenating an empty frame.
        return master.reset_index(drop=True)

    # max() is NaN for an empty (or all-missing) id column; ids then start at 1.
    max_id = master[id_column].max()
    next_id = 1 if pd.isna(max_id) else int(max_id) + 1

    new_values_df = pd.DataFrame({
        id_column: range(next_id, next_id + len(new_values_list)),
        value_column: new_values_list
    })

    if master.empty:
        # Keep the master's column layout without concatenating an empty frame.
        return new_values_df.reindex(columns=master.columns)

    return pd.concat([master, new_values_df], ignore_index=True)


In [None]:
def update_taxi_trips_with_master_data(taxi_trips: pd.DataFrame, payment_type_master: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    '''
    Replace the payment type and company strings of the taxi trips with the master table columns

    Both joins use pandas' default (inner) merge, so a trip whose
    "payment_type" or "company" has no match in the corresponding master
    table is not present in the result.

    Parameters
    ----------
    taxi_trips: pd.DataFrame
        The DataFrame containing the taxi trips data
    payment_type_master: pd.DataFrame
        The payment_type master table; joined on its "payment_type" column
    company_master: pd.DataFrame
        The company master table; joined on its "company" column

    Returns
    -------
    pd.DataFrame
        A new DataFrame holding the taxi trips plus the remaining master
        columns, without the "payment_type" and "company" string columns
    '''
    string_columns = ["payment_type", "company"]

    return (
        taxi_trips
        .merge(payment_type_master, on="payment_type")
        .merge(company_master, on="company")
        .drop(columns=string_columns)
    )

In [None]:
def transform_weather_data(weather_data: dict) -> pd.DataFrame:
    """
    Turn the hourly section of the weather payload into a DataFrame

    Parameters
    ----------
    weather_data: dict
        The decoded weather payload. Its "hourly" entry must contain the
        "time", "temperature_2m", "wind_speed_10m", "rain" and
        "precipitation" series.

    Returns
    -------
    pd.DataFrame
        One row per entry of the hourly series, with the columns "datetime"
        (parsed to datetimes), "temperature", "wind_speed", "rain" and
        "precipitation"

    Raises
    ------
    KeyError
        If "hourly" or one of the series listed above is missing
    """
    hourly = weather_data['hourly']

    # output column name -> key of the series inside the "hourly" section
    column_sources = {
        "datetime": 'time',
        "temperature": 'temperature_2m',
        "wind_speed": 'wind_speed_10m',
        "rain": 'rain',
        "precipitation": 'precipitation',
    }

    weather_df = pd.DataFrame({column: hourly[source] for column, source in column_sources.items()})
    weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

    return weather_df

In [None]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """
    Download a CSV object from S3 and parse it into a DataFrame

    Parameters
    ----------
    bucket: str
        The name of the S3 bucket
    path: str
        The key prefix of the CSV file; it is concatenated with filename
        as-is, so it has to end with the separator (e.g. "folder/")
    filename: str
        The name of the CSV file

    Returns
    -------
    pd.DataFrame
        The DataFrame containing the CSV data
    """
    client = boto3.client('s3')

    object_key = f"{path}{filename}"
    response = client.get_object(Bucket=bucket, Key=object_key)

    # The object body is read completely and decoded as UTF-8 before parsing.
    csv_text = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_text))

In [None]:
def upload_dataframe_to_s3(bucket: str, path: str, dataframe: pd.DataFrame) -> None:
    """
    Serialize a DataFrame to CSV and store it as an S3 object

    Parameters
    ----------
    bucket: str
        The name of the S3 bucket
    path: str
        The full key of the object to write (prefix and file name together)
    dataframe: pd.DataFrame
        The DataFrame to be uploaded; its index is not written to the CSV

    Returns
    -------
    None
    """
    # Without a target buffer to_csv returns the CSV content as a string.
    csv_content = dataframe.to_csv(index=False)

    client = boto3.client('s3')
    client.put_object(Bucket=bucket, Key=path, Body=csv_content)

In [None]:
def upload_master_table_to_s3(bucket: str, path: str, file_type: str, dataframe: pd.DataFrame) -> None:
    """
    Back up the current master table on S3, then overwrite it with the new one

    The existing "<path><file_type>_master.csv" object is first copied to
    "transformed_data/master_table_previous_versions/<file_type>_master_previous_version.csv"
    (only one previous version is kept), then the DataFrame is uploaded to
    the original key.

    Parameters
    ----------
    bucket: str
        Name of the S3 bucket, where we want to store the files
    path: str
        Key prefix of the master file; concatenated with the file name as-is
    file_type: str
        Either "company" or "payment_type"
    dataframe: pd.DataFrame
        The dataframe to be uploaded

    Returns
    -------
    None
    """
    current_key = f"{path}{file_type}_master.csv"
    backup_key = f"transformed_data/master_table_previous_versions/{file_type}_master_previous_version.csv"
    copy_source = {'Bucket': bucket, 'Key': current_key}

    client = boto3.client('s3')

    # Save the version that is about to be replaced.
    client.copy_object(Bucket=bucket, CopySource=copy_source, Key=backup_key)

    upload_dataframe_to_s3(bucket, current_key, dataframe)

In [None]:
def upload_and_move_file_on_s3(
    dataframe: pd.DataFrame, 
    datetime_col: str, 
    bucket: str, 
    file_type: str,
    filename: str,
    source_path: str,
    target_path_raw: str,
    target_path_transformed: str) -> None:
    """
    Upload the transformed DataFrame and move the raw source file to its processed location

    The transformed file is named "<file_type>_<YYYY-MM-DD>.csv", where the
    date is taken from the first row of datetime_col. The raw file is moved
    with a copy followed by a delete of the original object.

    Parameters
    ----------
    dataframe: pd.DataFrame
        The transformed DataFrame to be uploaded; must have at least one row
    datetime_col: str
        The name of the column containing the datetime
    bucket: str
        The name of the S3 bucket
    file_type: str
        The type of the file (e.g., "weather", "taxi")
    filename: str
        The name of the raw file
    source_path: str
        The key prefix where the raw file is currently located
    target_path_raw: str
        The key prefix the raw file is moved to
    target_path_transformed: str
        The key prefix the transformed CSV is uploaded to

    Returns
    -------
    None
    """
    client = boto3.client('s3')

    # The date of the first row names the transformed file.
    first_timestamp = dataframe[datetime_col].iloc[0]
    date_label = first_timestamp.strftime("%Y-%m-%d")
    transformed_key = f"{target_path_transformed}{file_type}_{date_label}.csv"

    upload_dataframe_to_s3(bucket, transformed_key, dataframe)

    # Move the raw file: copy it to the target prefix, then remove the original.
    raw_source_key = f"{source_path}{filename}"
    raw_target_key = f"{target_path_raw}{filename}"

    client.copy_object(
        Bucket=bucket,
        CopySource={'Bucket': bucket, 'Key': raw_source_key},
        Key=raw_target_key)

    client.delete_object(Bucket=bucket, Key=raw_source_key)

In [None]:
def read_json_from_s3(bucket: str, path: str) -> Any:
    """
    Read a JSON file from S3 and return its decoded content

    Parameters
    ----------
    bucket: str
        The name of the S3 bucket
    path: str
        The full key of the json file to read

    Returns
    -------
    Any
        The decoded JSON document: whatever ``json.loads`` produces for the
        file, e.g. a dict for a JSON object or a list for a JSON array
    """
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket=bucket, Key=path)

    # The object body is read completely and decoded as UTF-8 before parsing.
    raw_text = response["Body"].read().decode('utf-8')

    return json.loads(raw_text)

In [None]:
def _list_json_files(s3, bucket: str, prefix: str) -> list:
    """
    List the JSON files stored under a prefix of the bucket

    Parameters
    ----------
    s3:
        The boto3 S3 client used for the listing
    bucket: str
        The name of the S3 bucket
    prefix: str
        The key prefix to list

    Returns
    -------
    list
        One (file_key, filename) tuple per key whose file name ends with
        ".json"; keys with an empty file name (e.g. the folder placeholder
        object) are skipped. Empty when nothing matches the prefix.
    """
    # NOTE: only the first page of the listing is read, as before.
    listing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

    json_files = []
    # 'Contents' is left out of the response when no key matches the prefix.
    for entry in listing.get('Contents', []):
        file_key = entry['Key']
        filename = file_key.split('/')[-1]

        # Check the real extension: file names may contain further dots.
        if filename.strip() != "" and filename.endswith(".json"):
            json_files.append((file_key, filename))

    return json_files


#
#
# MAIN FUNCTION
#
#
def lambda_handler(event, context):
    """
    Transform every raw taxi trips and weather JSON file waiting on S3

    For each taxi file: the trips are transformed, the company and payment
    type master tables are extended with the new values, the master tables
    are uploaded, then the transformed trips are uploaded and the raw file
    is moved to the processed folder. For each weather file: the data is
    transformed, uploaded and the raw file is moved.

    Parameters
    ----------
    event:
        The Lambda event (not used)
    context:
        The Lambda context (not used)

    Returns
    -------
    None
    """
    s3 = boto3.client('s3')
    bucket = 'cubix-chicago-taxi-34'

    raw_weather_folder = 'raw_data/to_processed/weather_data/'
    raw_taxi_trips_folder = 'raw_data/to_processed/taxi_data/'

    target_taxi_trips_folder = 'raw_data/processed/taxi_data/'
    target_weather_folder = 'raw_data/processed/weather_data/'

    transformed_taxi_trips_folder = 'transformed_data/taxi_trips/'
    transformed_weather_folder = 'transformed_data/weather/'
    
    payment_type_master_folder = 'transformed_data/payment_type/'
    company_master_folder = 'transformed_data/company/'

    payment_type_master_filename = 'payment_type_master.csv'
    company_master_filename = 'company_master.csv'

    payment_type_master = read_csv_from_s3(bucket, payment_type_master_folder, payment_type_master_filename)
    company_master = read_csv_from_s3(bucket, company_master_folder, company_master_filename)

    # TAXI TRIPS DATA TRANSFORMATION AND LOADING
    for file_key, filename in _list_json_files(s3, bucket, raw_taxi_trips_folder):
        taxi_trips_data_json = read_json_from_s3(bucket, file_key)

        taxi_trips_data_raw = pd.DataFrame(taxi_trips_data_json)
        taxi_trips_transformed = taxi_trips_transformations(taxi_trips_data_raw)

        # Carry the extended masters over to the next file. Otherwise a later
        # file would start from the stale tables again, hand out the same ids
        # to different values and overwrite the entries added for this file.
        company_master = update_master(taxi_trips_transformed, company_master, "company_id", "company")
        payment_type_master = update_master(taxi_trips_transformed, payment_type_master, "payment_type_id", "payment_type")

        taxi_trips_updated = update_taxi_trips_with_master_data(taxi_trips_transformed, payment_type_master, company_master)

        # Persist the masters before the raw file is moved away: if a later
        # step fails, the raw file is still waiting and the run can be
        # repeated without leaving trips that reference unsaved ids.
        upload_master_table_to_s3(bucket, payment_type_master_folder, "payment_type", payment_type_master)
        print("Payment type master updated")

        upload_master_table_to_s3(bucket, company_master_folder, "company", company_master)
        print("Company master updated")

        upload_and_move_file_on_s3(
            taxi_trips_updated,
            "datetime_for_weather",
            bucket,
            "taxi",
            filename,
            raw_taxi_trips_folder,
            target_taxi_trips_folder,
            transformed_taxi_trips_folder
        )
        print("taxi trips is uploaded and moved")

    # WEATHER DATA TRANSFORMATION AND LOADING
    for file_key, filename in _list_json_files(s3, bucket, raw_weather_folder):
        weather_data_json = read_json_from_s3(bucket, file_key)
        weather_data = transform_weather_data(weather_data_json)

        upload_and_move_file_on_s3(
            weather_data,
            "datetime",
            bucket,
            "weather",
            filename,
            raw_weather_folder,
            target_weather_folder,
            transformed_weather_folder
        )
        print("weather data is uploaded and moved")
