In [9]:
from datetime import datetime, date, time
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, Border, Side
import numpy as np
import pandas as pd
import os
import re

## Data ingestion

In [10]:
# Folder that receives the generated workbook; created up front if it does not exist yet
file_path_bulk = '../data/bulk-reports'
os.makedirs(file_path_bulk, exist_ok=True)

# Folder and file name of the source report
file_path = '../data'
file_name = 'FR Report 07 Nov 24 12h00..xlsx'

In [11]:
# Build the full path of the source report, then load it into a DataFrame
report_location = os.path.join(file_path, file_name)
df = pd.read_excel(report_location)

In [12]:
# Show the column labels of the loaded report (bare last expression, so the notebook renders it)
df.columns

Index(['Customer', 'Account', 'Waybill Date', 'Waybill', 'Service',
       'Reference', 'Shipper', 'Consignee', 'Delivery Agent', 'Orig Hub',
       'Orig Place', 'Dest Hub', 'Dest Place', 'Pieces', 'Chrg Mass',
       'Due Date', 'POD Recipient', 'POD Date', 'POD Time', 'SLA Transit Days',
       'Booking Date', 'Last Event', 'Last Event Hub', 'Last Event Date',
       'Last Event Time', 'Last Manual Event', 'Last Manual Event Date',
       'Last Manual Event Time'],
      dtype='object')

## Data cleaning

In [13]:
def safe_to_datetime(x):
    """Convert a single value to a pandas datetime, falling back to ``pd.NaT``.

    Parameters
    ----------
    x : object
        Scalar value to convert (intended for use with ``Series.apply``).

    Returns
    -------
    pandas.Timestamp or NaT
        * ``pd.NaT`` when ``x`` is missing according to ``pd.isna``.
        * ``pd.to_datetime(x)`` when ``x`` is already a ``pd.Timestamp``,
          ``np.datetime64`` or ``datetime.datetime``.
        * For a ``datetime.time``, that time combined with today's date.
        * For anything else, ``pd.to_datetime(x)`` if it succeeds, otherwise
          ``pd.NaT``.
    """
    if pd.isna(x):
        return pd.NaT
    elif isinstance(x, (pd.Timestamp, np.datetime64, datetime)):
        return pd.to_datetime(x)
    elif isinstance(x, time):
        # A bare time has no date part; it is anchored to the day the code runs,
        # so the result of this branch depends on the execution date.
        return pd.to_datetime(datetime.combine(date.today(), x))
    else:
        try:
            return pd.to_datetime(x)
        except Exception:
            # Best-effort parse: any conversion error yields NaT. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently turned into NaT.
            return pd.NaT

In [14]:
# Columns that are run through safe_to_datetime, one value at a time
date_columns = ['Due Date', 'Waybill Date', 'Last Event Date']
for column_label in date_columns:
    parsed_values = df[column_label].apply(safe_to_datetime)
    df[column_label] = parsed_values

## Data output

In [15]:
# Order rows by 'Last Event' ascending, then by 'Waybill Date' descending
df_sorted = df.sort_values(
    by=['Last Event', 'Waybill Date'],
    ascending=[True, False],
)

# Rows whose last event is one of the two POD events go to df_pod; all others to df_not_pod
is_pod = df_sorted['Last Event'].isin(["POD Details Captured", "POD Image Scanned"])
df_pod = df_sorted[is_pod]
df_not_pod = df_sorted[~is_pod]

# Write both subsets to one workbook: 'Current deliveries' first, then 'Completed deliveries'
output_file = f'{file_path_bulk}/{file_name}'
with pd.ExcelWriter(output_file) as writer:
    for sheet_title, sheet_frame in (
        ('Current deliveries', df_not_pod),
        ('Completed deliveries', df_pod),
    ):
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)
