#### Imports and Paths

In [None]:
import os
import sys
import pandas as pd
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Notebook-wide display defaults: never truncate DataFrame columns when
# rendering, and draw seaborn plots with the "whitegrid" style.
pd.set_option("display.max_columns", None)
sns.set_style(style="whitegrid")

In [5]:
# -----------------------------
# Project paths (relative, so they resolve against the current working directory)
# -----------------------------

# Output: folder for saved figures, created up front if it does not exist yet
FIGURES_DIR = "../reports/figures"
os.makedirs(name=FIGURES_DIR, exist_ok=True)

# Input: processed-data folder and the feature table stored inside it
PROCESSED_DIR = "../data/processed"
FEATURES_CSV = os.path.join(PROCESSED_DIR, "corridor_features.csv")

#### Load Feature Dataset

In [6]:
# -----------------------------
# Read the engineered feature table (the output of Notebook 02)
# -----------------------------
df = pd.read_csv(filepath_or_buffer=FEATURES_CSV)

# Report the (rows, columns) shape, then preview the first five rows
print("Loaded:", df.shape)
df.head(n=5)

Loaded: (1000, 22)


Unnamed: 0,shipment_id,truck_id,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator,travel_to_border_hours,clearance_hours,travel_to_destination_hours,total_transit_hours,depart_month,depart_day_of_week_num,depart_hour,is_weekend,is_high_value,is_heavy,rolling_clearance_mean_30,rolling_clearance_median_30,rolling_delay_rate_30,goods_category,delay_reason_group,is_delayed
0,SHP00823,TRK600,28351.5,34054.16,1.38,1,11.285224,8.522,4.270258,24.077482,1,0,3,0,0,0,6.162153,5.187192,0.107,Fuel,Other,0
1,SHP00640,TRK331,10666.48,12530.05,1.6,1,13.724932,4.780147,7.271907,25.776986,1,0,12,0,0,0,6.162153,5.187192,0.107,Machinery,Other,0
2,SHP00639,TRK688,17593.27,24469.33,1.49,0,12.245634,20.0,7.083963,39.329596,1,0,16,0,0,0,6.162153,5.187192,0.107,Machinery,Weather,1
3,SHP00842,TRK761,3041.22,3616.17,1.45,1,10.859174,11.657312,7.60578,30.122266,1,0,19,0,0,0,6.162153,5.187192,0.107,Pharmaceuticals,Other,0
4,SHP00652,TRK182,5562.38,5554.66,1.45,1,9.076477,12.444743,4.356565,25.877785,1,1,0,0,0,0,6.162153,5.187192,0.107,Pharmaceuticals,Vehicle/Mechanical,1


#### Quick Data Checks

In [7]:
# -----------------------------
# Sanity checks: column dtypes / non-null counts and missing-value share
# -----------------------------
df.info()

# Fraction of missing values per column, largest first; show the top 15
missing = df.isna().mean().sort_values(ascending=False)
display(missing.head(n=15))

# Mean of the is_delayed flag expressed as a percentage (2 decimal places)
print("Delay rate (%):", round(100 * df["is_delayed"].mean(), 2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   shipment_id                  1000 non-null   object 
 1   truck_id                     1000 non-null   object 
 2   weight_kg                    1000 non-null   float64
 3   declared_value_usd           1000 non-null   float64
 4   fuel_price_usd_per_litre     1000 non-null   float64
 5   rainfall_indicator           1000 non-null   int64  
 6   travel_to_border_hours       1000 non-null   float64
 7   clearance_hours              1000 non-null   float64
 8   travel_to_destination_hours  1000 non-null   float64
 9   total_transit_hours          1000 non-null   float64
 10  depart_month                 1000 non-null   int64  
 11  depart_day_of_week_num       1000 non-null   int64  
 12  depart_hour                  1000 non-null   int64  
 13  is_weekend         

shipment_id                    0.0
truck_id                       0.0
weight_kg                      0.0
declared_value_usd             0.0
fuel_price_usd_per_litre       0.0
rainfall_indicator             0.0
travel_to_border_hours         0.0
clearance_hours                0.0
travel_to_destination_hours    0.0
total_transit_hours            0.0
depart_month                   0.0
depart_day_of_week_num         0.0
depart_hour                    0.0
is_weekend                     0.0
is_high_value                  0.0
dtype: float64

Delay rate (%): 10.7
