In [126]:
from datetime import datetime, date, time
import numpy as np
import pandas as pd
import os
import re

## Data ingestion

In [128]:
# Location and name of the source workbook.
file_path = '../data'
file_name = 'Non Delivered 1Aug-1Nov24..xlsx'

# Use an "HHhMM" stamp embedded in the workbook name when there is one;
# otherwise fall back to the current wall-clock time.
time_match = re.search(r'(\d{2})h(\d{2})', file_name)
if time_match:
    hour, minute = time_match.groups()
    time_str = f'{hour}h{minute}'
else:
    time_str = pd.Timestamp.now().strftime("%Hh%M")

# Output folders: a dated/timed folder for frequency reports, a fixed one for booking reports.
file_path_frequency = f'../data/frequency-reports-{pd.Timestamp.now().strftime("%Y-%m-%d")} {time_str}'
file_path_booking = '../data/booking-reports'

# Create both output folders up front; existing folders are left untouched.
for out_dir in (file_path_frequency, file_path_booking):
    os.makedirs(out_dir, exist_ok=True)

In [129]:
# Read the source workbook into the working DataFrame.
source_workbook = os.path.join(file_path, file_name)
df = pd.read_excel(source_workbook)

In [130]:
# Show the column labels of the loaded workbook; later cells select columns by these names.
df.columns

Index(['Account', 'Waybill', 'Waybill Date', 'Service', 'Reference', 'Shipper',
       'Consignee', 'Orig Hub', 'Orig Place', 'Dest Hub', 'Dest Place',
       'Pieces', 'Chrg Mass', 'Due Date', 'POD Recipient', 'POD Date',
       'POD Time', 'Booking Date', 'SLA Transit Days', 'POD Image Present',
       'Last Event', 'Last Event Date', 'Delivery Agent'],
      dtype='object')

## Data cleaning

In [131]:
def safe_to_datetime(x):
    """Convert a single cell value to a pandas timestamp without raising on bad values.

    Parameters
    ----------
    x : scalar
        A cell value: a missing value, a ``pd.Timestamp`` / ``np.datetime64`` /
        ``datetime``, a bare ``datetime.time``, or anything else
        ``pd.to_datetime`` accepts (for example a date string).

    Returns
    -------
    pd.Timestamp or pd.NaT
        ``pd.NaT`` when ``x`` is missing or cannot be converted; otherwise the
        result of ``pd.to_datetime``. A bare ``time`` is combined with today's
        date before conversion.
    """
    if pd.isna(x):
        return pd.NaT

    # A time-of-day carries no date, so anchor it to today's date first.
    if isinstance(x, time):
        x = datetime.combine(date.today(), x)

    # Every conversion goes through the same guarded call, so a datetime-like
    # value that pandas rejects (e.g. outside the supported timestamp range)
    # becomes NaT exactly like an unparseable string, instead of raising.
    try:
        return pd.to_datetime(x)
    except Exception:
        return pd.NaT

In [132]:
# Columns whose cells are normalised to pandas timestamps (NaT where conversion fails).
date_columns = ['Due Date', 'Waybill Date', 'Last Event Date']
for col in date_columns:
    # Convert cell by cell, then write the converted column back under the same name.
    normalized = df[col].apply(safe_to_datetime)
    df[col] = normalized

## Data manipulation

In [87]:
# TODO: build rules based on other columns (not implemented yet)

## Data output

### Frequency reports

In [8]:
# Write one frequency workbook per account, split into current and completed deliveries.
account_list = df['Account'].unique()

# A delivery counts as completed when its last event is one of these POD events.
pod_events = ["POD Details Captured", "POD Image Scanned"]

for account in account_list:
    # unique() also yields NaN when 'Account' has blanks. NaN never compares equal
    # to anything, so those rows must be selected with isna(); otherwise the
    # workbook for the missing account is written with headers only and its rows are lost.
    if pd.isna(account):
        account_mask = df['Account'].isna()
    else:
        account_mask = df['Account'] == account

    # Rows for this account, ordered by 'Last Event' (ascending) and newest 'Waybill Date' first.
    df_account = df[account_mask].sort_values(by=['Last Event', 'Waybill Date'], ascending=[True, False])

    # Evaluate the POD test once and reuse it for both halves of the split.
    is_pod = df_account['Last Event'].isin(pod_events)
    df_not_pod = df_account[~is_pod]
    df_pod = df_account[is_pod]

    # Save to an Excel file with one sheet per half.
    with pd.ExcelWriter(f'{file_path_frequency}/{account}.xlsx') as writer:
        df_not_pod.to_excel(writer, sheet_name='Current deliveries', index=False)
        df_pod.to_excel(writer, sheet_name='Completed deliveries', index=False)

In [134]:
# Backlog variant: one frequency workbook covering every account, for reports
# that need the frequency-report processing but not the per-account split.
# Requires `df` from the cells above.

# Order all rows by 'Last Event' (ascending), newest 'Waybill Date' first within each event.
df_sorted = df.sort_values(by=['Last Event', 'Waybill Date'], ascending=[True, False])

# Rows whose last event is a POD event are completed; everything else is current.
completed_events = ["POD Details Captured", "POD Image Scanned"]
is_completed = df_sorted['Last Event'].isin(completed_events)
df_not_pod = df_sorted.loc[~is_completed]
df_pod = df_sorted.loc[is_completed]

# Both sheets go into a single workbook named after the source file.
backlog_report = f'{file_path_frequency}/{file_name}'
with pd.ExcelWriter(backlog_report) as writer:
    for sheet_title, sheet_rows in (('Current deliveries', df_not_pod), ('Completed deliveries', df_pod)):
        sheet_rows.to_excel(writer, sheet_name=sheet_title, index=False)


### Booking reports

In [9]:
def _excel_sheet_name(value, used_names):
    """Return a valid Excel sheet title for ``value`` that is not yet in ``used_names``.

    Excel sheet titles must be text, at most 31 characters, must not contain
    any of ``\\ / ? * [ ] :`` and must be unique ignoring case. Values that
    already satisfy these rules are returned unchanged. ``used_names`` is a set
    of lower-cased titles handed out so far and is updated in place.
    """
    # Replace forbidden characters, drop edge apostrophes, and cap the length.
    base = re.sub(r'[\\/?*\[\]:]', '_', str(value)).strip("'")[:31] or 'Sheet'
    name, suffix = base, 1
    # Truncation or replacement can make two values collide; add a short
    # numeric tail (still within 31 characters) until the title is unique.
    while name.lower() in used_names:
        tail = f'~{suffix}'
        name = f'{base[:31 - len(tail)]}{tail}'
        suffix += 1
    used_names.add(name.lower())
    return name


location_hub_list = df['Dest Hub'].unique()

# Sort the DataFrame by 'Booking Date' column
df_booking_date = df.sort_values(by='Booking Date', ascending=True)

# Titles already written to the workbook (lower-cased), to keep sheet names unique.
used_sheet_names = set()

# Create an Excel writer object
with pd.ExcelWriter(f'{file_path_booking}/booking-report-{pd.Timestamp.now().strftime("%Y-%m-%d")}.xlsx', engine='openpyxl') as writer:
    # Group the DataFrame by 'Dest Hub' and write each group to a separate sheet.
    # NOTE(review): groupby leaves out rows with a missing 'Dest Hub' by default,
    # so such rows do not appear on any sheet - confirm this is intended.
    for category, group in df_booking_date.groupby('Dest Hub'):
        # The raw hub value may be non-text, too long, or contain characters
        # Excel rejects in sheet titles, so normalise it before writing.
        group.to_excel(writer, sheet_name=_excel_sheet_name(category, used_sheet_names), index=False)