In [3]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# -----------------------------
# 1. BASIC SETTINGS
# -----------------------------
# Size of the synthetic dataset and the date that all schedules are offset from.
n = 100_000  # number of records
start_date = datetime(year=2025, month=1, day=1)

# Seed both generators used below (stdlib `random` and NumPy's legacy global
# RNG) so that re-running the notebook from a fresh kernel is repeatable.
random.seed(42)
np.random.seed(42)

# -----------------------------
# 2. CATEGORICAL OPTIONS
# -----------------------------
# Allowed values for each categorical column. List order matters wherever a
# probability vector `p=[...]` is paired with one of these lists later on.
traffic_levels = [
    "Low",
    "Medium",
    "High",
]
weathers = [
    "Clear",
    "Rainy",
    "Stormy",
    "Foggy",
]
road_conditions = [
    "good",
    "average",
    "poor",
]
seasons = [
    "summer",
    "monsoon",
    "winter",
]
bus_types = [
    "AC",
    "Non-AC",
    "Electric",
]
days = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

# City names embedded in the generated stop identifiers.
cities = [
    "Chennai",
    "Salem",
    "Madurai",
    "Coimbatore",
    "Trichy",
    "Vellore",
    "Erode",
    "Tirunelveli",
    "Thanjavur",
    "Kanchipuram",
    "Tiruvannamalai",
]

# -----------------------------
# 3. GENERATE CORE FEATURES
# -----------------------------
# Identifier columns. np.random.randint excludes its upper bound, so bus
# numbers fall in 1000-1998, route numbers in 100-159 and stop numbers in 1-59.
bus_ids = [f"TN-{np.random.randint(1000,1999)}" for _ in range(n)]
route_ids = [f"R{np.random.randint(100,160)}" for _ in range(n)]
stop_ids = [f"S-{random.choice(cities)}-{np.random.randint(1,60)}" for _ in range(n)]

# Spread the schedule over a full year of minutes. Drawing offsets from a
# single day (0-1439 minutes) put every record on start_date itself, which made
# `day_of_week` a constant column with no value as a feature.
schedule_span_days = 365
minutes_per_day = 24 * 60
# .tolist() converts NumPy integers to Python ints, which timedelta requires.
minute_offsets = np.random.randint(0, schedule_span_days * minutes_per_day, n).tolist()
scheduled_arrivals = [start_date + timedelta(minutes=offset)
                      for offset in minute_offsets]

# Categorical features; the `p` vectors line up positionally with the option lists.
traffic = np.random.choice(traffic_levels, n, p=[0.4, 0.4, 0.2])
weather = np.random.choice(weathers, n, p=[0.5, 0.2, 0.15, 0.15])
road_condition = np.random.choice(road_conditions, n, p=[0.5, 0.3, 0.2])
# NOTE(review): season is sampled uniformly and independently of the scheduled
# date, so it will not agree with the calendar month — confirm this is intended.
season = np.random.choice(seasons, n)
bus_type = np.random.choice(bus_types, n)
# Weekday name derived from the scheduled timestamp so the two columns agree.
day_of_week = [d.strftime("%A") for d in scheduled_arrivals]

# -----------------------------
# 4. NUMERICAL FEATURES
# -----------------------------
# Integer-valued route and vehicle attributes (upper bounds are exclusive).
route_distance = np.random.randint(low=5, high=120, size=n)
num_stops = np.random.randint(low=5, high=60, size=n)
bus_age = np.random.randint(low=1, high=15, size=n)

# Binary flags; each `p` gives the probabilities of 0 and 1 respectively.
railway_crossing = np.random.choice([0, 1], size=n, p=[0.7, 0.3])
is_holiday = np.random.choice([0, 1], size=n, p=[0.9, 0.1])
festival_flag = np.random.choice([0, 1], size=n, p=[0.85, 0.15])

# Continuous weather readings; wind speed is folded to be non-negative.
temperature = np.random.normal(loc=30, scale=3, size=n)
wind_speed = np.absolute(np.random.normal(loc=15, scale=10, size=n))

# Rain intensity is non-zero only for Rainy / Stormy rows. Both candidate
# arrays are drawn for every row (Rainy first, then Stormy) and then masked.
_rainy_draw = np.random.uniform(10, 50, n)
_stormy_draw = np.random.uniform(30, 70, n)
rain_intensity = np.where(
    weather == "Rainy",
    _rainy_draw,
    np.where(weather == "Stormy", _stormy_draw, 0),
)

# -----------------------------
# 5. DELAY LOGIC (IMPORTANT FOR ML)
# -----------------------------
# Vectorised delay model. Each component is computed for all n rows at once
# instead of a Python loop with one scalar RNG call per row. The rules and
# ranges are the same as before; only the order in which random numbers are
# consumed differs, so the exact seeded values are not the same.

# traffic impact ("Low" contributes nothing)
traffic_delay = np.select(
    [traffic == "Medium", traffic == "High"],
    [np.random.randint(5, 15, n), np.random.randint(15, 35, n)],
    default=0,
)

# weather impact ("Clear" contributes nothing)
weather_delay = np.select(
    [weather == "Rainy", weather == "Stormy", weather == "Foggy"],
    [
        np.random.randint(5, 20, n),
        np.random.randint(15, 40, n),
        np.random.randint(5, 15, n),
    ],
    default=0,
)

# route & bus condition: linear in distance and bus age, flat penalty for poor roads
distance_delay = route_distance * 0.05
age_delay = bus_age * 0.5
road_delay = np.where(road_condition == "poor", 10, 0)

# railway crossings add a random wait only on rows that have one
crossing_delay = np.where(railway_crossing == 1, np.random.randint(3, 10, n), 0)

# holidays and festivals share a single penalty (applied once if either flag is set)
event_delay = np.where(
    (is_holiday == 1) | (festival_flag == 1),
    np.random.randint(5, 15, n),
    0,
)

total_delay = (
    traffic_delay
    + weather_delay
    + distance_delay
    + age_delay
    + road_delay
    + crossing_delay
    + event_delay
)

# Truncate to whole minutes; every component is non-negative, so this matches
# applying int() to each row's total.
delay = total_delay.astype(int)

# -----------------------------
# 6. ACTUAL ARRIVAL TIME
# -----------------------------
# Actual arrival = scheduled time shifted by that row's delay in whole minutes.
# int() converts the NumPy integer to a Python int, which timedelta requires.
actual_arrival = [
    scheduled + timedelta(minutes=int(delay_minutes))
    for scheduled, delay_minutes in zip(scheduled_arrivals, delay)
]

# -----------------------------
# 7. CREATE DATAFRAME
# -----------------------------
# Assemble every generated column into a single frame; the dict order below
# is the column order of the resulting DataFrame and of the exported CSV.
df = pd.DataFrame(
    data={
        # identifiers and schedule
        "bus_id": bus_ids,
        "route_id": route_ids,
        "stop_id": stop_ids,
        "scheduled_arrival": scheduled_arrivals,
        # conditions at the time of the trip
        "traffic_level": traffic,
        "weather": weather,
        "day_of_week": day_of_week,
        # target and derived timestamp
        "delay_minutes": delay,
        "actual_arrival": actual_arrival,
        # vehicle and route attributes
        "bus_age": bus_age,
        "bus_type": bus_type,
        "route_distance_km": route_distance,
        "num_stops": num_stops,
        "road_condition": road_condition,
        "has_railway_crossing": railway_crossing,
        # calendar flags
        "is_holiday": is_holiday,
        "season": season,
        "festival_flag": festival_flag,
        # continuous weather readings
        "rain_intensity": rain_intensity,
        "temperature": temperature,
        "wind_speed": wind_speed,
    }
)

# -----------------------------
# 8. SAVE
# -----------------------------
# Write the frame to a CSV in the current working directory, leaving out the
# row index, then print a confirmation message.
df.to_csv(
    "synthetic_bus_arrival_data.csv",
    index=False,
)

print("✅ Synthetic dataset generated successfully!")



✅ Synthetic dataset generated successfully!


In [4]:
# Quick structural check of the generated frame: (rows, columns) first.
print(df.shape)

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would add a stray "None" line after the summary.
df.info()

(100000, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   bus_id                100000 non-null  object        
 1   route_id              100000 non-null  object        
 2   stop_id               100000 non-null  object        
 3   scheduled_arrival     100000 non-null  datetime64[ns]
 4   traffic_level         100000 non-null  object        
 5   weather               100000 non-null  object        
 6   day_of_week           100000 non-null  object        
 7   delay_minutes         100000 non-null  int64         
 8   actual_arrival        100000 non-null  datetime64[ns]
 9   bus_age               100000 non-null  int64         
 10  bus_type              100000 non-null  object        
 11  route_distance_km     100000 non-null  int64         
 12  num_stops             100000 non-null  int64  

In [5]:
# Offer the generated CSV as a browser download when running in Google Colab.
# The google.colab package is not importable elsewhere, so the import is
# guarded to keep the notebook runnable end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; the file remains at synthetic_bus_arrival_data.csv")
else:
    files.download("synthetic_bus_arrival_data.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>