In [239]:
# import necessary libraries for data processing

import pandas as pd
import numpy as np

In [240]:
# Load the three source sheets into their own dataframes.
# The workbook is opened and parsed only once: passing a list to `sheet_name`
# makes read_excel return a dict of {sheet name: DataFrame}, instead of
# re-reading the same file once per sheet.
SOURCE_WORKBOOK = 'Sample_Data_from_IODA_v2 (China Post).xlsx'

source_sheets = pd.read_excel(
    SOURCE_WORKBOOK,
    sheet_name=[
        'fct_cgo_gls_mail_cardit',
        'dim_cgo_cargo_awb_master',
        'fct_cgo_mail_cardit_receptacle',
    ],
)

cardit_df = source_sheets['fct_cgo_gls_mail_cardit']
master_df = source_sheets['dim_cgo_cargo_awb_master']
event_df = source_sheets['fct_cgo_mail_cardit_receptacle']

## Cardit Table Data Sanitisation 

In [241]:
# Keep only the rows that are both China Post (post_office) and first leg (leg_number)
cardit_df = cardit_df.loc[
    (cardit_df['post_office'] == 'China Post')
    & (cardit_df['leg_number'] == 1)
]

In [242]:
# Trim cardit_df down to the listed fields; every other column is dropped
columns_to_keep_cardit = [
    "dim_cgo_cargo_awb_master_sk",
    "receptacle_charge_weight",
    "shipment_origin",
    "shipment_dest",
    "fct_cgo_gls_mail_cardit_sk",
]

cardit_df = cardit_df.loc[:, columns_to_keep_cardit]

## Event Table Data Sanitisation

In [243]:
# Treat the literal placeholder string "[Null]" as a missing value across the whole table
event_df = event_df.replace("[Null]", np.nan)

# Parse both timestamp columns BEFORE filtering on nulls. errors='coerce' turns any
# value that cannot be parsed into NaT; filtering first would let such rows through
# and could leave a receptacle_id whose departures are all NaT, which breaks the
# idxmax() lookup further down.
event_df = event_df.assign(
    actual_depart_datetime_local=pd.to_datetime(event_df['actual_depart_datetime_local'], errors='coerce'),
    actual_arrive_datetime_local=pd.to_datetime(event_df['actual_arrive_datetime_local'], errors='coerce'),
)

# Remove rows without a usable departure time (we want the last departure time on EZY post).
# .copy() makes the filtered frame independent, so adding a column below does not
# write into a slice of the previous frame.
event_df = event_df[event_df['actual_depart_datetime_local'].notna()].copy()

# New field: the latest actual_arrive_datetime_local seen for each receptacle_id.
# It is computed before the first-leg filter so that rows from later legs contribute.
# NOTE(review): this is the maximum arrival time, which is presumably the arrival of
# the last leg_number - confirm that local arrival times always increase with leg_number.
event_df['Arrive Datetime'] = event_df.groupby('receptacle_id')['actual_arrive_datetime_local'].transform('max')

# Filter leg_number field for first leg only
event_df = event_df[event_df['leg_number'] == 1]

# Keep a single row per receptacle_id: the one with the latest actual_depart_datetime_local
event_df = event_df.loc[
    event_df.groupby('receptacle_id')['actual_depart_datetime_local'].idxmax()
]

In [244]:
# Restrict event_df to the listed fields; every other column is dropped
columns_to_keep_event = [
    "fct_cgo_gls_mail_cardit_sk",
    "carrier_code",
    "flight_number",
    "actual_depart_datetime_local",
    "Arrive Datetime",
    "receptacle_id",
]

event_df = event_df.loc[:, columns_to_keep_event]

## Master Table Data Sanitisation

In [245]:
# Restrict master_df to its surrogate key and the AWB number
columns_to_keep_master = [
    "dim_cgo_cargo_awb_master_sk",
    "awb_number",
]

master_df = master_df.loc[:, columns_to_keep_master]

## Merging Tables

In [246]:
# Inner join: master (left) with cardit (right), matched on dim_cgo_cargo_awb_master_sk
master_inner_cardit_df = pd.merge(
    master_df,
    cardit_df,
    on='dim_cgo_cargo_awb_master_sk',
    how='inner',
)

In [247]:
# Inner join: master+cardit (left) with event (right), matched on fct_cgo_gls_mail_cardit_sk
master_cardit_inner_event_df = pd.merge(
    master_inner_cardit_df,
    event_df,
    on='fct_cgo_gls_mail_cardit_sk',
    how='inner',
)

### Post Merge Processing

In [248]:
"""
Rename actual_depart_datetime_local to Depart DateTime
OR
Splitting actual_depart_datetime_local into Flight Date only and removing the time data
"""

# Give actual_depart_datetime_local its output name, "Depart Datetime"
master_cardit_inner_event_df = master_cardit_inner_event_df.rename(
    {"actual_depart_datetime_local": "Depart Datetime"},
    axis="columns",
)

# OR

# # Create a new column with just the date
# master_cardit_inner_event_df['Flight Date'] = master_cardit_inner_event_df['actual_depart_datetime_local'].dt.date

# # Drop the original datetime column
# master_cardit_inner_event_df = master_cardit_inner_event_df.drop(columns=['actual_depart_datetime_local'])


In [249]:
"""
Dealing with case where there are multiple different AWB numbers for the same receptacle_id
If left unresolved, receptacle_id will not be unique which will cause issues when joining with CNP template
Concatenate multiple AWB numbers into a single cell separated by commas where there is only one row for each receptacle_id
"""


def _join_unique_awbs(awb_values):
    """Return the distinct values of one group as strings, sorted and joined with ', '."""
    return ', '.join(sorted(set(awb_values.astype(str))))


# Flag every row whose receptacle_id occurs more than once
# (keep=False marks all occurrences, not only the repeats)
duplicated_ids = master_cardit_inner_event_df['receptacle_id'].duplicated(keep=False)

# Separate the rows that need collapsing from the rows that are already unique
duplicates_df = master_cardit_inner_event_df.loc[duplicated_ids]
non_duplicates_df = master_cardit_inner_event_df.loc[~duplicated_ids]

# Collapse each duplicated receptacle_id to one row: AWB numbers are concatenated,
# every other listed field takes the first value of its group.
# Columns not listed here are absent from the aggregated rows, so they come out
# as NaN for those rows after the concat below.
grouped_duplicates = (
    duplicates_df
    .groupby('receptacle_id')
    .agg({
        'awb_number': _join_unique_awbs,
        **dict.fromkeys(
            [
                'shipment_origin',
                'shipment_dest',
                'receptacle_charge_weight',
                'carrier_code',
                'flight_number',
                'Depart Datetime',
                'Arrive Datetime',
            ],
            'first',
        ),
    })
    .reset_index()
)

# Stack the untouched unique rows on top of the collapsed rows, with a fresh index
master_cardit_inner_event_df = pd.concat(
    [non_duplicates_df, grouped_duplicates],
    ignore_index=True,
)

In [250]:
# # Removing unnecessary fields that were previously used for merging (keys)
# master_cardit_inner_event_df.drop(columns=['dim_cgo_cargo_awb_master_sk', 'fct_cgo_gls_mail_cardit_sk'], inplace=True)

In [251]:
# Map the source field names onto the column headings used by the CNP template
master_cardit_inner_event_df = master_cardit_inner_event_df.rename(
    {
        'awb_number': 'AWB Number',
        'receptacle_charge_weight': 'Weight',
        'shipment_origin': 'Departure Station',
        'shipment_dest': 'Destination',
        'carrier_code': 'Airline',
        'flight_number': 'Flight Number',
        'receptacle_id': 'Receptacle ID',
    },
    axis='columns',
)


## Export

In [252]:
# Column sequence expected by the CNP template; selecting with this list both
# reorders the columns and drops any column that is not listed
desired_order = [
    "AWB Number", "Departure Station", "Destination",
    "Weight", "Airline", "Flight Number",
    "Depart Datetime", "Arrive Datetime", "Receptacle ID",
]

master_cardit_inner_event_df = master_cardit_inner_event_df.loc[:, desired_order]


In [253]:
# Write the final table to an Excel workbook, leaving out the dataframe index

master_cardit_inner_event_df.to_excel(
    excel_writer='master_cardit_inner_event_df.xlsx',
    index=False,
)

In [254]:
# Sanity check: True when no Receptacle ID value appears on more than one row
master_cardit_inner_event_df.loc[:, 'Receptacle ID'].is_unique

True