# Predict incident probability on transport line
---

## Imports

In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn import linear_model
import pickle
import datetime as dt

## Utils

In [4]:
def random_date(from_date, to_date):
    """Generate a random datetime between `from_date` and `to_date`"""
    delta = to_date - from_date
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = np.random.randint(int_delta)
    return from_date + dt.timedelta(seconds=random_second)

## Type

In [5]:
class Incident:
    def __init__(self, location: str, num: int, time, retard_time):
        self.location = location
        self.num = num
        self.time = time
        self.retard_time = retard_time


## Example dataset

New Dataset

In [6]:
features = ["hour", "day_of_week", "is_weekend", "is_rush_hour", "location"]
known_locations = ["line 15", "line 67", "line 20", "line 1", "line 5", "line 2", "line 3"]
def generate_coherent_retard_time(location, hour, day_of_week, is_weekend, is_rush_hour):
    """
    Generate retard_time based on realistic patterns:
    - Rush hours have more delays
    - Weekdays have different patterns than weekends
    - Certain locations are more prone to delays
    - Morning/evening peaks have different characteristics
    """

    base_delay = 0
    
    # 1. Location factor (some lines are more problematic)
    location_factors = {
        "line 1": 2.3,   # Busiest line, most delays
        "line 15": 1.8,  # Moderate delays
        "line 5": 1.3,   # Average
        "line 20": 0.5,  # Fewer delays
        "line 67": 0.2,   # Least delays
        "line 2" : 0.0,
        "line 3" : 0.8
    }
    location_factor = location_factors.get(location, 1.0)
    base_delay += location_factor * 3
    
    # 2. Rush hour effect (major contributor)
    if is_rush_hour:
        if hour in [7, 8, 9]:  # Morning rush
            base_delay += np.random.uniform(8, 15)
        else:  # Evening rush (16, 17, 18)
            base_delay += np.random.uniform(8, 15)
    
    # 3. Time of day pattern
    if 6 <= hour <= 9:    # Morning
        base_delay += np.random.uniform(2, 5)
    elif 16 <= hour <= 19: # Evening
        base_delay += np.random.uniform(2, 5)
    elif 20 <= hour <= 22: # Late evening
        base_delay += np.random.uniform(0, 3)
    else:  # Night/Overnight
        base_delay += np.random.uniform(0, 2)
    
    # 4. Day of week pattern
    if day_of_week == 0:  # Monday
        base_delay += np.random.uniform(0, 5)
    elif day_of_week == 4:  # Friday
        base_delay += np.random.uniform(0, 5)
    elif is_weekend:  # Weekend
        if day_of_week == 5:  # Saturday
            base_delay += np.random.uniform(0, 3)
        else:  # Sunday
            base_delay += np.random.uniform(0, 2)
    else:  # Tuesday-Thursday
        base_delay += np.random.uniform(0, 5)
    
    # 5. Random incident factor (occasional large delays)
    incident_probability = 0.05  # 15% chance of significant incident
    if np.random.random() < incident_probability:
        # Major incident causing large delay
        base_delay += np.random.uniform(15, 40)
    
    # 6. Add some random noise
    base_delay += np.random.uniform(-2, 2)
    
    # Ensure non-negative delay
    retard_time = max(0, base_delay)
    

    return round(retard_time, 1)

# def calculate_delay(features):
#     retard = 0.0
#     if features[3] == 1 and features[2] == 0:
#         retard += np.random.randint(5,10)
#     elif features[2] and 11 > features[0] > 13:
#         retard += np.random.randint(3,8)
#     elif features[4] == 3:
#         retard = 0.0
#     elif features[4] == 1:
#         retard = 0.0
#     else:
#         retard += np.random.randint(0,2)
#     return retard


def create_dataset():
    timeStamp = pd.DataFrame(columns = ["hour", "day_of_week", "is_weekend", "is_rush_hour", "location", "retard_time"])
    for i in range(10000):
        hour = np.random.randint(24)

        day_of_week = np.random.randint(7)

        is_weekend = np.random.randint(0,2)

        is_rush_hour = np.random.randint(0,2)

        location = np.random.choice(known_locations)

        retard_time = generate_coherent_retard_time(location, hour, day_of_week, is_weekend, is_rush_hour)

        timeStamp = pd.concat([timeStamp, pd.DataFrame({
            "hour" : hour,
            "day_of_week" : day_of_week,
            "is_weekend" : is_weekend,
            "is_rush_hour" : is_rush_hour,
            "location" : location,
            "retard_time" : retard_time,
        }, index = [0])],ignore_index = True)

    return timeStamp

df = create_dataset()

df.head()

  timeStamp = pd.concat([timeStamp, pd.DataFrame({


Unnamed: 0,hour,day_of_week,is_weekend,is_rush_hour,location,retard_time
0,16,6,0,0,line 15,11.6
1,16,4,1,1,line 2,15.5
2,12,5,0,0,line 67,3.5
3,1,0,0,1,line 3,19.7
4,1,4,0,1,line 67,12.6


In [None]:
N = 100

df = pd.DataFrame(columns=["location", "num", "time", "retard_time"])

# Store all generated incident location-time pairs to avoid duplicates
incident_times = set()

# Generate incidents (incident_occured = 1)
for i in range(N):
    time = random_date(dt.datetime(2025, 1, 1), dt.datetime(2025, 10, 5))
    location = np.random.choice(known_locations)
    
    # Create a unique identifier for this location-time combination
    time_key = f"{location}_{time.strftime('%Y-%m-%d-%H')}"
    
    # Ensure we don't have duplicate incidents at same location and time
    while time_key in incident_times:
        time = random_date(dt.datetime(2025, 1, 1), dt.datetime(2025, 10, 5))
        time_key = f"{location}_{time.strftime('%Y-%m-%d-%H')}"
    
    incident_times.add(time_key)
    
    retard_time = np.random.uniform(0, 60)
    df = pd.concat([df, pd.DataFrame({
        "location": location, 
        "num": np.random.randint(1, 100), 
        "time": time, 
        "retard_time": retard_time, 
    }, index=[0])], ignore_index=True)

# Generate non-incidents (incident_occured = 0) - ensuring no overlap with incidents
for i in range(N):
    time = random_date(dt.datetime(2025, 1, 1), dt.datetime(2025, 10, 5))
    location = np.random.choice(known_locations)
    
    # Create a unique identifier for this location-time combination
    time_key = f"{location}_{time.strftime('%Y-%m-%d-%H')}"
    
    # Ensure this non-incident doesn't overlap with any incident
    while time_key in incident_times:
        time = random_date(dt.datetime(2025, 1, 1), dt.datetime(2025, 10, 5))
        time_key = f"{location}_{time.strftime('%Y-%m-%d-%H')}"
    
    df = pd.concat([df, pd.DataFrame({
        "location": location, 
        "num": 0,  # No reports when no incident
        "time": time, 
        "retard_time": 0,  # No delay when no incident
    }, index=[0])], ignore_index=True)

# Shuffle the dataset to mix incidents and non-incidents
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Dataset shape: {df.shape}")

# Verify no duplicates
df['location_time_key'] = df['location'] + '_' + df['time'].astype(str)
print(f"Unique location-time combinations: {df['location_time_key'].nunique()} out of {len(df)}")
print(f"Any duplicates: {df['location_time_key'].duplicated().any()}")

df = df.drop('location_time_key', axis=1)  # Remove the temporary key
df.head()

Dataset shape: (200, 4)
Unique location-time combinations: 200 out of 200
Any duplicates: False


  df = pd.concat([df, pd.DataFrame({


Unnamed: 0,location,num,time,retard_time
0,line 1,81,2025-07-09 18:39:07,15.144522
1,line 20,42,2025-02-03 13:29:47,8.188592
2,line 1,93,2025-09-28 15:38:25,27.260302
3,line 5,0,2025-06-22 11:52:43,0.0
4,line 15,0,2025-02-25 10:57:30,0.0


## Data pre-processing

In [7]:
# df['hour'] = pd.to_datetime(df['time']).dt.hour
# df['day_of_week'] = pd.to_datetime(df['time']).dt.dayofweek
# df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 16, 17, 18]).astype(int)


# Features and target
le = LabelEncoder()
df['location_encoded'] = le.fit_transform(df['location'])

features = ['hour', 'day_of_week', 'is_weekend', 'is_rush_hour', 'location_encoded']
X = df[features]
y = df['retard_time'].astype(float)
df.head()

Unnamed: 0,hour,day_of_week,is_weekend,is_rush_hour,location,retard_time,location_encoded
0,16,6,0,0,line 15,11.6,1
1,16,4,1,1,line 2,15.5,2
2,12,5,0,0,line 67,3.5,6
3,1,0,0,1,line 3,19.7,4
4,1,4,0,1,line 67,12.6,6


## Training

In [11]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train model
model = KNeighborsRegressor(n_neighbors=1)
model.fit(X_train, y_train)

0,1,2
,n_neighbors,1
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


## Testing

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate basic accuracy metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("=== KNN REGRESSOR EVALUATION ===")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")
print(f"R² Score: {r2:.4f}")

# Percentage accuracy (how close predictions are to actual values)
# Using Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Basic accuracy within a tolerance
tolerance = 5  # minutes
within_tolerance = np.abs(y_test - y_pred) <= tolerance
accuracy_within_tolerance = np.mean(within_tolerance) * 100
print(f"Accuracy within ±{tolerance} minutes: {accuracy_within_tolerance:.2f}%")

# Compare a few actual vs predicted values
print("\n=== SAMPLE PREDICTIONS ===")
sample_indices = np.random.choice(len(y_test), 5, replace=False)
for i in sample_indices:
    print(f"Actual: {y_test.iloc[i]:.1f} min | Predicted: {y_pred[i]:.1f} min | Difference: {abs(y_test.iloc[i] - y_pred[i]):.1f} min")

# Save the trained model
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(model, f)



=== KNN REGRESSOR EVALUATION ===
Mean Absolute Error (MAE): 5.22 minutes
Root Mean Squared Error (RMSE): 9.54 minutes
R² Score: 0.0365
Mean Absolute Percentage Error (MAPE): inf%
Accuracy within ±5 minutes: 76.50%

=== SAMPLE PREDICTIONS ===
Actual: 13.9 min | Predicted: 12.8 min | Difference: 1.1 min
Actual: 11.2 min | Predicted: 20.3 min | Difference: 9.1 min
Actual: 19.5 min | Predicted: 17.2 min | Difference: 2.3 min
Actual: 15.2 min | Predicted: 20.4 min | Difference: 5.2 min
Actual: 5.4 min | Predicted: 6.7 min | Difference: 1.3 min


In [None]:
with open('knn_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

