In [None]:
# Nairobi Logistics Data Generator Script (Exact Replica)
import pandas as pd
import numpy as np
import random
from faker import Faker

# Initialize Faker and set seeds
# A single seed value is shared by all three random sources used in this
# notebook (Faker, the stdlib `random` module and numpy's global state).
SEED = 42
fake = Faker()
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Parameters
# Row counts for each generated table.
num_drivers = 100
num_vehicles = 50
num_routes = 200
num_deliveries = 10_000
num_weather = 500

In [None]:
# Nairobi Locations and GPS Coordinates
# Each row is (area name, latitude, longitude). Row order is significant:
# it fixes the order of `location_names`, which the seeded random.choice()
# calls elsewhere in the notebook index into.
_location_rows = [
    ("Westlands", -1.2648, 36.8148),
    ("Kasarani", -1.2321, 36.8991),
    ("Embakasi", -1.3176, 36.8943),
    ("CBD", -1.2833, 36.8167),
    ("Kilimani", -1.2921, 36.7834),
    ("Karen", -1.3201, 36.7206),
    ("Eastleigh", -1.2763, 36.8541),
    ("Rongai", -1.3936, 36.7512),
    ("Upper Hill", -1.2986, 36.8077),
    ("South B", -1.3097, 36.8481),
    ("Parklands", -1.2584, 36.8165),
    ("Lang'ata", -1.3626, 36.7442),
]

# Mapping of area name -> (latitude, longitude) tuple.
nairobi_locations = {area: (lat, lon) for area, lat, lon in _location_rows}
# Area names in insertion order.
location_names = [*nairobi_locations]

In [None]:
# Driver Dimension
# Columns are generated one at a time, in table order, so each seeded
# generator (Faker, random, numpy) is consumed in a fixed sequence.
driver_ids = range(1, num_drivers + 1)
driver_names = [fake.name() for _ in driver_ids]
driver_regions = [random.choice(location_names) for _ in driver_ids]
# randint's upper bound is exclusive, so values fall in 1..10.
driver_experience = np.random.randint(1, 11, num_drivers)

drivers = pd.DataFrame({
    'DriverID': driver_ids,
    'Name': driver_names,
    'Region': driver_regions,
    'ExperienceYears': driver_experience,
})


In [None]:
# Vehicle Dimension
# Type, fuel efficiency and emission factor are drawn independently of
# each other; the two numpy draws must stay in this order to keep the
# seeded stream unchanged.
vehicle_type_options = ['Van', 'Truck', 'Bike']
vehicle_types = [random.choice(vehicle_type_options) for _ in range(num_vehicles)]
fuel_efficiency = np.random.uniform(3, 25, num_vehicles).round(2)
emission_factor = np.random.uniform(60, 300, num_vehicles).round(1)

vehicles = pd.DataFrame({
    'VehicleID': range(1, num_vehicles + 1),
    'Type': vehicle_types,
    'FuelEfficiency_L_per_100km': fuel_efficiency,
    'EmissionFactor_gCO2_per_km': emission_factor,
})


In [None]:
# Route Dimension with GPS
def generate_route(i):
    """Build one synthetic route record between two distinct locations.

    Reads the module-level ``location_names`` and ``nairobi_locations`` and
    draws from the stdlib ``random`` module (two ``choice`` calls followed
    by one ``uniform`` call).

    Args:
        i: Value stored as the record's ``RouteID``.

    Returns:
        dict with the keys ``RouteID``, ``Start_Location``, ``End_Location``,
        ``DistanceKM``, ``Start_Lat``, ``Start_Lon``, ``End_Lat``, ``End_Lon``.
    """
    origin = random.choice(location_names)
    # The destination is picked from every location except the origin.
    other_locations = [name for name in location_names if name != origin]
    destination = random.choice(other_locations)
    # The distance is a random draw rounded to 0.1; it is not computed
    # from the endpoint coordinates.
    distance_km = round(random.uniform(5, 25), 1)

    origin_point = nairobi_locations[origin]
    destination_point = nairobi_locations[destination]

    record = {'RouteID': i}
    record['Start_Location'] = origin
    record['End_Location'] = destination
    record['DistanceKM'] = distance_km
    record['Start_Lat'] = origin_point[0]
    record['Start_Lon'] = origin_point[1]
    record['End_Lat'] = destination_point[0]
    record['End_Lon'] = destination_point[1]
    return record

# One record per RouteID in 1..num_routes, generated in ascending ID order.
routes = pd.DataFrame(list(map(generate_route, range(1, num_routes + 1))))

In [None]:
# Weather Dimension
# Columns are generated in table order so the seeded generators are
# consumed in a fixed sequence.
# NOTE(review): '-3y' and 'now' are relative dates, so the timestamps
# presumably depend on when the notebook is run even with fixed seeds —
# confirm if byte-identical output across days is required.
weather_times = [
    fake.date_time_between(start_date='-3y', end_date='now')
    for _ in range(num_weather)
]
weather_regions = [random.choice(location_names) for _ in range(num_weather)]
rain_levels = np.random.uniform(0, 50, num_weather).round(1)
wind_speeds = np.random.uniform(5, 80, num_weather).round(1)

weather = pd.DataFrame({
    'WeatherID': range(1, num_weather + 1),
    'DateTime': weather_times,
    'Region': weather_regions,
    'RainLevel_mm': rain_levels,
    'WindSpeed_kmph': wind_speeds,
})


In [None]:
# Deliveries (Fact Table)
# Foreign keys are sampled with replacement from the dimension tables.
# NOTE(review): DeliveryID is 0-based while every dimension ID starts at 1;
# left unchanged so the generated data stays the same — confirm whether
# this is intended.
# NOTE(review): '-3y' / '-1d' are relative dates, so StartTime presumably
# depends on when the notebook is run even with fixed seeds.
deliveries = pd.DataFrame({
    'DeliveryID': range(num_deliveries),
    'DriverID': np.random.choice(drivers['DriverID'], num_deliveries),
    'VehicleID': np.random.choice(vehicles['VehicleID'], num_deliveries),
    'RouteID': np.random.choice(routes['RouteID'], num_deliveries),
    'StartTime': [fake.date_time_between(start_date='-3y', end_date='-1d') for _ in range(num_deliveries)]
})

# Join route info and calculate EndTime
# validate='many_to_one' makes the merge fail loudly if RouteID is ever
# duplicated in `routes`; a duplicate would add rows and break the
# positional pairing with the num_deliveries random factors drawn below.
deliveries = deliveries.merge(
    routes[['RouteID', 'DistanceKM']],
    on='RouteID',
    how='left',
    validate='many_to_one',
)

# Duration is DistanceKM times a random factor in [1.2, 2.0), interpreted
# as minutes. It is kept in a local Series rather than a temporary column,
# so nothing has to be dropped afterwards.
duration_minutes = deliveries['DistanceKM'] * np.random.uniform(1.2, 2.0, num_deliveries)
deliveries['EndTime'] = deliveries['StartTime'] + pd.to_timedelta(duration_minutes, unit='m')

In [None]:
# Inject 5% missing values for cleaning practice
# A separate 5% sample of rows is drawn for each column, so the gaps in
# different columns are independent of each other.
# NOTE: this cell mutates `deliveries` in place and is not idempotent;
# re-running it blanks out a further sample of rows.
for col in ['DriverID', 'VehicleID', 'StartTime', 'EndTime']:
    missing_idx = deliveries.sample(frac=0.05).index
    # Integer columns cannot hold NaN. Cast them to float64 explicitly
    # instead of relying on pandas' silent upcast during assignment, which
    # is deprecated and slated to become an error. The resulting values
    # are the same as with the implicit upcast.
    if pd.api.types.is_integer_dtype(deliveries[col]):
        deliveries[col] = deliveries[col].astype('float64')
    # For the datetime columns NaN is stored as NaT.
    deliveries.loc[missing_idx, col] = np.nan

In [None]:
# Save to CSV files
# One file per table, written relative to the current working directory
# and without the DataFrame index column.
csv_outputs = [
    ("drivers.csv", drivers),
    ("vehicles.csv", vehicles),
    ("routes.csv", routes),
    ("weather.csv", weather),
    ("deliveries.csv", deliveries),
]
for csv_name, table in csv_outputs:
    table.to_csv(csv_name, index=False)
