#### Imports & Paths

In [1]:
import os
import pandas as pd
import numpy as np
import sys

In [2]:
RAW_PATH = "../data/raw/northern_corridor_synthetic_dataset.csv"
PROCESSED_DIR = "../data/processed"

os.makedirs(PROCESSED_DIR, exist_ok=True)

#### Load Dataset (with timestamp parsing)

In [3]:
date_cols = ["depart_time", "arrival_time", "border_arrival_time", "border_exit_time"]
df = pd.read_csv(RAW_PATH, parse_dates=date_cols)
print("Dataset loaded successfully!")

df.head()

Dataset loaded successfully!


Unnamed: 0,shipment_id,truck_id,origin_port,border,destination,depart_time,border_arrival_time,border_exit_time,arrival_time,goods_category,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator,delay_reason
0,SHP00001,TRK754,Mombasa,Malaba,Kampala,2024-01-29 00:00:00,2024-01-29 12:26:57.490795,2024-01-29 19:37:49.228206,2024-01-30 00:36:35.670901,Electronics,16047.07,20438.99,1.55,0,
1,SHP00002,TRK704,Mombasa,Malaba,Kampala,2024-04-18 01:00:00,2024-04-18 09:10:43.619940,2024-04-18 13:39:43.839026,2024-04-18 18:32:12.225863,Construction Materials,27040.37,32255.69,1.46,1,
2,SHP00003,TRK325,Mombasa,Malaba,Kampala,2024-04-24 18:00:00,2024-04-25 03:40:08.919298,2024-04-25 07:48:57.582009,2024-04-25 15:17:35.506628,Textiles,10252.37,13212.2,1.27,0,
3,SHP00004,TRK320,Mombasa,Malaba,Kampala,2024-03-27 03:00:00,2024-03-27 11:33:23.310217,2024-03-27 15:42:12.174996,2024-03-27 20:05:24.890822,Textiles,6095.6,6005.17,1.12,1,
4,SHP00005,TRK649,Mombasa,Malaba,Kampala,2024-02-01 12:00:00,2024-02-01 20:28:22.084278,2024-02-02 10:25:18.999930,2024-02-02 15:35:40.767182,Pharmaceuticals,5711.12,8013.81,1.39,0,Customs Inspection


#### Basic Structure Checks

In [4]:
#High-level info: columns, dtypes, missing counts
print("Shape:", df.shape)
display(df.info())

print("\n# Quick stats for numeric columns:")
display(df.describe(include=[np.number]))

print("\n# Quick stats for categorical columns:")
display(df.describe(include=['object']))


Shape: (1000, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   shipment_id               1000 non-null   object        
 1   truck_id                  1000 non-null   object        
 2   origin_port               1000 non-null   object        
 3   border                    1000 non-null   object        
 4   destination               1000 non-null   object        
 5   depart_time               1000 non-null   datetime64[ns]
 6   border_arrival_time       1000 non-null   datetime64[ns]
 7   border_exit_time          1000 non-null   datetime64[ns]
 8   arrival_time              1000 non-null   datetime64[ns]
 9   goods_category            1000 non-null   object        
 10  weight_kg                 1000 non-null   float64       
 11  declared_value_usd        1000 non-null   float64       
 12  fue

None


# Quick stats for numeric columns:


Unnamed: 0,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator
count,1000.0,1000.0,1000.0,1000.0
mean,16522.17515,19091.65712,1.34966,0.515
std,9802.057215,12043.994288,0.144432,0.500025
min,2000.51,1892.1,1.1,0.0
25%,7590.775,8709.5875,1.22,0.0
50%,15769.69,16621.175,1.35,1.0
75%,23564.3975,27413.3675,1.47,1.0
max,39985.22,57822.71,1.6,1.0



# Quick stats for categorical columns:


Unnamed: 0,shipment_id,truck_id,origin_port,border,destination,goods_category,delay_reason
count,1000,1000,1000,1000,1000,1000,107
unique,1000,597,1,1,1,7,6
top,SHP00001,TRK452,Mombasa,Malaba,Kampala,Pharmaceuticals,Customs Inspection
freq,1,6,1000,1000,1000,161,23


#### Define Expected Schema (columns + types)

In [5]:
# Expected columns (schema check)
expected_cols = [
    "shipment_id", "truck_id",
    "origin_port", "border", "destination",
    "depart_time", "border_arrival_time", "border_exit_time", "arrival_time",
    "goods_category", "weight_kg", "declared_value_usd",
    "fuel_price_usd_per_litre", "rainfall_indicator", "delay_reason"
]

missing_cols = [c for c in expected_cols if c not in df.columns]
extra_cols = [c for c in df.columns if c not in expected_cols]

print("\nMissing columns:", missing_cols)
print("Extra columns:", extra_cols)

assert len(missing_cols) == 0, "Schema error: Missing columns detected!"


Missing columns: []
Extra columns: []


#### Missing Values & Duplicates

In [6]:
# Missing values
nulls = df.isnull().sum().sort_values(ascending=False)
print("Missing values per column:")
display(nulls[nulls > 0])

Missing values per column:


delay_reason    893
dtype: int64

In [7]:
#Duplicate check
dup_shipments = df["shipment_id"].duplicated().sum()
dup_rows = df.duplicated().sum()
print(f"\nDuplicate shipment_ids: {dup_shipments}")
print(f"Duplicate full-row count: {dup_rows}")

# If shipment_id should be unique, enforce it:
# (For synthetic data it should be unique; in real data you might dedupe)
assert dup_shipments == 0, "Data error: shipment_id is not unique."


Duplicate shipment_ids: 0
Duplicate full-row count: 0


#### Validate Timestamp Logic (ordering + negative durations)

In [8]:
# -----------------------------
# Timestamp ordering rules
# -----------------------------
# depart_time <= border_arrival_time <= border_exit_time <= arrival_time

bad_order = df[
    (df["depart_time"] > df["border_arrival_time"]) |
    (df["border_arrival_time"] > df["border_exit_time"]) |
    (df["border_exit_time"] > df["arrival_time"])
]

print("Rows with invalid timestamp order:", len(bad_order))
display(bad_order.head())

# If you want strict enforcement:
assert len(bad_order) == 0, "Data error: invalid timestamp ordering found."

# -----------------------------
# Compute durations (for validation + later analysis)
# -----------------------------
df["travel_to_border_hours"] = (
    df["border_arrival_time"] - df["depart_time"]
).dt.total_seconds() / 3600

df["clearance_hours"] = (
    df["border_exit_time"] - df["border_arrival_time"]
).dt.total_seconds() / 3600

df["travel_to_destination_hours"] = (
    df["arrival_time"] - df["border_exit_time"]
).dt.total_seconds() / 3600

df["total_transit_hours"] = (
    df["arrival_time"] - df["depart_time"]
).dt.total_seconds() / 3600

# -----------------------------
# Negative duration checks (should not happen)
# -----------------------------
neg_duration = df[
    (df["travel_to_border_hours"] < 0) |
    (df["clearance_hours"] < 0) |
    (df["travel_to_destination_hours"] < 0) |
    (df["total_transit_hours"] < 0)
]

print("Rows with negative durations:", len(neg_duration))
display(neg_duration.head())

assert len(neg_duration) == 0, "Data error: negative durations found."

Rows with invalid timestamp order: 0


Unnamed: 0,shipment_id,truck_id,origin_port,border,destination,depart_time,border_arrival_time,border_exit_time,arrival_time,goods_category,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator,delay_reason


Rows with negative durations: 0


Unnamed: 0,shipment_id,truck_id,origin_port,border,destination,depart_time,border_arrival_time,border_exit_time,arrival_time,goods_category,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator,delay_reason,travel_to_border_hours,clearance_hours,travel_to_destination_hours,total_transit_hours


#### Validate Allowed Values (categoricals)

In [9]:
# -----------------------------
# Expected fixed corridor values
# -----------------------------
assert df["origin_port"].nunique() == 1 and df["origin_port"].iloc[0] == "Mombasa"
assert df["border"].nunique() == 1 and df["border"].iloc[0] == "Malaba"
assert df["destination"].nunique() == 1 and df["destination"].iloc[0] == "Kampala"

# -----------------------------
# Rainfall indicator should be 0 or 1
# -----------------------------
bad_rain = df[~df["rainfall_indicator"].isin([0, 1])]
print("Invalid rainfall_indicator rows:", len(bad_rain))
display(bad_rain.head())
assert len(bad_rain) == 0, "Data error: rainfall_indicator must be 0 or 1."

# -----------------------------
# Delay reason sanity check
# -----------------------------
# If delay_reason == "None", typically clearance should be <= 12
# If clearance > 12, delay_reason should not be "None"
inconsistent_delay = df[
    ((df["clearance_hours"] > 12) & (df["delay_reason"] == "None")) |
    ((df["clearance_hours"] <= 12) & (df["delay_reason"] != "None"))
]

print("Inconsistent delay_reason rows:", len(inconsistent_delay))
display(inconsistent_delay.head())

# For synthetic dataset you might enforce strictly:
# If you want strict:
# assert len(inconsistent_delay) == 0, "Data error: delay_reason inconsistent with clearance_hours."

Invalid rainfall_indicator rows: 0


Unnamed: 0,shipment_id,truck_id,origin_port,border,destination,depart_time,border_arrival_time,border_exit_time,arrival_time,goods_category,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator,delay_reason,travel_to_border_hours,clearance_hours,travel_to_destination_hours,total_transit_hours


Inconsistent delay_reason rows: 893


Unnamed: 0,shipment_id,truck_id,origin_port,border,destination,depart_time,border_arrival_time,border_exit_time,arrival_time,goods_category,weight_kg,declared_value_usd,fuel_price_usd_per_litre,rainfall_indicator,delay_reason,travel_to_border_hours,clearance_hours,travel_to_destination_hours,total_transit_hours
0,SHP00001,TRK754,Mombasa,Malaba,Kampala,2024-01-29 00:00:00,2024-01-29 12:26:57.490795,2024-01-29 19:37:49.228206,2024-01-30 00:36:35.670901,Electronics,16047.07,20438.99,1.55,0,,12.449303,7.181038,4.979567,24.609909
1,SHP00002,TRK704,Mombasa,Malaba,Kampala,2024-04-18 01:00:00,2024-04-18 09:10:43.619940,2024-04-18 13:39:43.839026,2024-04-18 18:32:12.225863,Construction Materials,27040.37,32255.69,1.46,1,,8.178783,4.483394,4.874552,17.536729
2,SHP00003,TRK325,Mombasa,Malaba,Kampala,2024-04-24 18:00:00,2024-04-25 03:40:08.919298,2024-04-25 07:48:57.582009,2024-04-25 15:17:35.506628,Textiles,10252.37,13212.2,1.27,0,,9.669144,4.146851,7.477201,21.293196
3,SHP00004,TRK320,Mombasa,Malaba,Kampala,2024-03-27 03:00:00,2024-03-27 11:33:23.310217,2024-03-27 15:42:12.174996,2024-03-27 20:05:24.890822,Textiles,6095.6,6005.17,1.12,1,,8.556475,4.146907,4.386866,17.090247
5,SHP00006,TRK777,Mombasa,Malaba,Kampala,2024-02-28 09:00:00,2024-02-28 22:54:40.784846,2024-02-29 07:30:41.212140,2024-02-29 14:55:57.787323,Textiles,3909.01,3887.83,1.42,1,,13.911329,8.600119,7.421271,29.932719


#### Validate Numeric Ranges (sanity checks)

In [10]:
# -----------------------------
# Basic numeric sanity rules (tune these if needed)
# -----------------------------
range_rules = {
    "weight_kg": (100, 100000),                 # shipments should have positive weight
    "declared_value_usd": (100, 5_000_000),     # declared value should be positive, reasonable
    "fuel_price_usd_per_litre": (0.3, 3.0),     # typical plausible band
    "travel_to_border_hours": (1, 48),
    "clearance_hours": (0, 72),
    "travel_to_destination_hours": (1, 48),
    "total_transit_hours": (1, 120)
}

bad_ranges = pd.DataFrame(columns=list(df.columns) + ["issue"])

for col, (lo, hi) in range_rules.items():
    bad = df[(df[col] < lo) | (df[col] > hi)]
    if len(bad) > 0:
        tmp = bad.copy()
        tmp["issue"] = f"{col} out of range [{lo}, {hi}]"
        bad_ranges = pd.concat([bad_ranges, tmp], axis=0)

print("Out-of-range rows:", len(bad_ranges))

if bad_ranges.empty:
    print("✅ No out-of-range values found.")
else:
    display(bad_ranges[["shipment_id", "issue"]].head(20))

Out-of-range rows: 0
✅ No out-of-range values found.


##### Create a Data Quality Summary

In [11]:
# -----------------------------
# Quality summary you can paste into your report/README
# -----------------------------
quality_summary = {
    "rows": len(df),
    "columns": df.shape[1],
    "missing_values_total": int(df.isnull().sum().sum()),
    "duplicate_shipment_ids": int(df["shipment_id"].duplicated().sum()),
    "avg_clearance_hours": float(df["clearance_hours"].mean()),
    "p90_clearance_hours": float(df["clearance_hours"].quantile(0.90)),
    "delay_rate_pct": float((df["clearance_hours"] > 12).mean() * 100),
}

quality_summary

{'rows': 1000,
 'columns': 19,
 'missing_values_total': 893,
 'duplicate_shipment_ids': 0,
 'avg_clearance_hours': 6.162153496750833,
 'p90_clearance_hours': 12.312373795611112,
 'delay_rate_pct': 10.7}

#### Cleaned Dataset to data/processed/

In [12]:
# -----------------------------
# Save processed copy
# -----------------------------
processed_csv = os.path.join(PROCESSED_DIR, "corridor_shipments_clean.csv")
df.to_csv(processed_csv, index=False)

print("Saved cleaned dataset to:", processed_csv)

Saved cleaned dataset to: ../data/processed\corridor_shipments_clean.csv


In [13]:
# Optional Parquet save
processed_parquet = os.path.join(PROCESSED_DIR, "corridor_shipments_clean.parquet")
df.to_parquet(processed_parquet, index=False)

print("Saved Parquet dataset to:", processed_parquet)

Saved Parquet dataset to: ../data/processed\corridor_shipments_clean.parquet
