In [2]:
import numpy as np
import pandas as pd

# Shared random generator, seeded with a fixed value so that a fresh run of
# this file reproduces the same pseudo-random stream.
rng = np.random.default_rng(seed=42)

# Number of rows generated when the file is executed as a script.
N_SAMPLES = 1_000

def generate_bus_data(n_samples=1000, seed=None):
    """Generate a synthetic campus-bus trip dataset.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of rows (trips) to simulate.
    seed : None, int, or anything accepted by ``numpy.random.default_rng``
        When ``None`` (the default) the draws come from the module-level
        ``rng``, so the result depends on how much of that shared stream has
        already been consumed.  When given, a private generator is created
        from it, which makes the call reproducible on its own and leaves the
        module-level ``rng`` untouched.

    Returns
    -------
    pandas.DataFrame
        One row per simulated trip with the columns:

        - ``route_id``: integer in {1, 2, 3}.
        - ``distance_km``: per-route base distance (5.0, 8.0, 3.5 km) plus
          Gaussian noise with standard deviation 0.5.
        - ``hour_of_day``: integer hour from 7 to 21 inclusive.
        - ``day_of_week``: integer from 0 (Monday) to 6 (Sunday).
        - ``is_peak_hour``: 1 when the hour is 8-10 or 17-19, else 0.
        - ``weather``: 0 = sunny, 1 = cloudy, 2 = rain, drawn with
          probabilities 0.5 / 0.3 / 0.2.
        - ``traffic_level``: 1 + peak flag + rain flag, clipped to [1, 3].
        - ``previous_delay_min``: Gaussian with mean ``3*peak + 2*rain`` and
          standard deviation 2, clipped below at 0.
        - ``arrival_delay_min``: simulated actual travel time minus the
          scheduled travel time, clipped to [-10, 40] minutes (the target).
    """
    # A private generator keeps seeded calls independent of the shared stream.
    gen = rng if seed is None else np.random.default_rng(seed)

    # NOTE: the order of the random draws below is kept fixed so that results
    # for a given generator state stay the same across versions of this code.

    # 3 campus routes, labelled 1, 2, 3 (upper bound of integers() is exclusive)
    route_ids = gen.integers(1, 4, size=n_samples)

    # Approx base distance of each route (km); index i holds route i + 1, so
    # the whole column is looked up in one vectorised indexing operation.
    base_distance_km = np.array([5.0, 8.0, 3.5])
    distance_km = base_distance_km[route_ids - 1] + gen.normal(0, 0.5, size=n_samples)

    # Hour of day (7 AM to 9 PM, i.e. 7..21)
    hour_of_day = gen.integers(7, 22, size=n_samples)

    # Day of week (0 = Monday ... 6 = Sunday)
    day_of_week = gen.integers(0, 7, size=n_samples)

    # Peak hour flag: hours 8..10 and 17..19 (both ends inclusive)
    morning_peak = (hour_of_day >= 8) & (hour_of_day <= 10)
    evening_peak = (hour_of_day >= 17) & (hour_of_day <= 19)
    is_peak_hour = np.where(morning_peak | evening_peak, 1, 0)

    # Weather: 0 = sunny, 1 = cloudy, 2 = rain
    weather = gen.choice([0, 1, 2], size=n_samples, p=[0.5, 0.3, 0.2])
    is_rain = weather == 2

    # Traffic level: 1 = low, 2 = medium, 3 = high (rain & peak => higher traffic)
    traffic_level = np.clip(1 + is_peak_hour + is_rain.astype(int), 1, 3)

    # Previous stop delay (minutes); negative draws are floored at zero
    previous_delay = gen.normal(loc=3 * is_peak_hour + 2 * is_rain, scale=2.0, size=n_samples)
    previous_delay = np.clip(previous_delay, 0, None)

    # Ideal speed (km/h) defines the timetable: travel time with no traffic
    # and no bad weather.
    ideal_speed = 25.0
    scheduled_travel_time = distance_km / ideal_speed * 60  # minutes

    # Actual speed: each traffic level above 1 costs 15 %, rain another 10 %
    speed_factor = 1.0 - 0.15 * (traffic_level - 1) - 0.10 * is_rain.astype(float)
    speed_factor = np.clip(speed_factor, 0.4, 1.0)
    actual_speed = ideal_speed * speed_factor

    base_travel_time = distance_km / actual_speed * 60  # minutes

    # A random share (50 %-120 %) of the previous-stop delay carries over,
    # plus zero-mean Gaussian noise.
    extra_delay = previous_delay * gen.uniform(0.5, 1.2, size=n_samples)
    noise = gen.normal(0, 2.0, size=n_samples)

    actual_travel_time = base_travel_time + extra_delay + noise

    # Target: arrival delay (minutes) = actual - scheduled, limited to at most
    # 10 minutes early and 40 minutes late.
    arrival_delay_min = np.clip(actual_travel_time - scheduled_travel_time, -10, 40)

    return pd.DataFrame({
        "route_id": route_ids,
        "distance_km": distance_km,
        "hour_of_day": hour_of_day,
        "day_of_week": day_of_week,
        "is_peak_hour": is_peak_hour,
        "weather": weather,
        "traffic_level": traffic_level,
        "previous_delay_min": previous_delay,
        "arrival_delay_min": arrival_delay_min,
    })

if __name__ == "__main__":
    # Simulate N_SAMPLES trips and write them to disk without the index column.
    df = generate_bus_data(n_samples=N_SAMPLES)
    df.to_csv("bus_data.csv", index=False)

    # Report what was written, followed by a preview of the first five rows.
    print("Saved bus_data.csv with shape:", df.shape)
    print(df.head(5))

Saved bus_data.csv with shape: (1000, 9)
   route_id  distance_km  hour_of_day  day_of_week  is_peak_hour  weather  \
0         1     5.716607           11            5             0        1   
1         3     3.545760           14            0             0        0   
2         2     8.290389            9            2             1        1   
3         2     7.971608           20            0             0        2   
4         2     7.914796           17            3             1        0   

   traffic_level  previous_delay_min  arrival_delay_min  
0              1            3.850026           1.813084  
1              1            3.964521           4.599542  
2              2            3.093026           9.901383  
3              2            2.127955           4.530515  
4              2            2.253114           6.847969  
