# Dataset

In [503]:
import os
import sys
sys.path.append(os.path.abspath(".."))
import random
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from db import IncidentRepository, Database
from datetime import datetime
import numpy as np

db = Database('../test.db')
incident_repository = IncidentRepository(db)
incidents = incident_repository.list_incidents()
print(incidents)

[{'id': 4751, 'location_id': 40, 'type_id': 2, 'avg_delay': 24.630959908404346, 'trust_score': 0.5318414510587776, 'status': 'active', 'created_at': '2025-10-05 04:37:38.290096', 'last_updated': '2025-10-05 04:37:38.290096'}, {'id': 4477, 'location_id': 39, 'type_id': 5, 'avg_delay': 13.710952879727138, 'trust_score': 0.7118598797500187, 'status': 'active', 'created_at': '2025-10-05 04:37:30.040387', 'last_updated': '2025-10-05 04:37:30.040387'}, {'id': 5816, 'location_id': 38, 'type_id': 5, 'avg_delay': 10.688354982945548, 'trust_score': 0.23348509933907013, 'status': 'resolved', 'created_at': '2025-10-05 04:33:23.180246', 'last_updated': '2025-10-05 04:33:23.180246'}, {'id': 5281, 'location_id': 33, 'type_id': 1, 'avg_delay': 20.310027861910772, 'trust_score': 0.16550934664097783, 'status': 'pending', 'created_at': '2025-10-05 04:33:17.976410', 'last_updated': '2025-10-05 04:33:17.976410'}, {'id': 2115, 'location_id': 30, 'type_id': 3, 'avg_delay': 20.76711143158328, 'trust_score': 0

In [504]:
X = []
y = []
weights = []

for inc in incidents:
    dt_obj = datetime.fromisoformat(inc['created_at'])
    hour = dt_obj.hour
    day_of_week = dt_obj.weekday()
    is_rush_hour = int(hour in [7, 8, 9, 16, 17, 18])
    if inc['status'] == 'resolved':
        status = 1
        if inc['trust_score'] > 0.3:
            X.append([
                inc['location_id'],
                inc['type_id'],
                inc['trust_score'],
                status,
                hour,
                day_of_week,
                is_rush_hour
            ])
            y.append(inc['avg_delay'])
            if inc['avg_delay'] == 0.0:
                weights.append(0.5 * inc['trust_score'])
            else:
                weights.append(inc['trust_score'])
print(X)
print(y)

[[23, 3, 0.4670169770212458, 1, 4, 6, 0], [21, 1, 0.9096893619245269, 1, 3, 6, 0], [40, 5, 0.8993880855308226, 1, 2, 6, 0], [25, 5, 0.9340592042127327, 1, 1, 6, 0], [28, 1, 0.7807060113645363, 1, 1, 6, 0], [22, 1, 0.7500135856893059, 1, 0, 6, 0], [25, 2, 0.5377035400997713, 1, 0, 6, 0], [34, 5, 0.3030909604734101, 1, 0, 6, 0], [38, 2, 0.7705301891531025, 1, 23, 5, 0], [26, 1, 0.614878442471856, 1, 23, 5, 0], [27, 1, 0.6323891541165416, 1, 21, 5, 0], [36, 4, 0.5321505646557253, 1, 21, 5, 0], [8, 1, 0.8139571779613811, 1, 20, 5, 0], [29, 2, 0.8195728613583267, 1, 20, 5, 0], [33, 3, 0.8589588238806889, 1, 20, 5, 0], [23, 3, 0.7523216831208187, 1, 19, 5, 0], [21, 2, 0.5586245491764973, 1, 18, 5, 1], [25, 2, 0.8556170061501299, 1, 18, 5, 1], [30, 4, 0.4303005135769761, 1, 17, 5, 1], [31, 3, 0.8410737006490647, 1, 17, 5, 1], [37, 3, 0.6729240535858593, 1, 16, 5, 1], [17, 1, 0.6361741407197151, 1, 16, 5, 1], [21, 1, 0.50645770281305, 1, 16, 5, 1], [22, 3, 0.4486308641490143, 1, 16, 5, 1], [29

Train

In [505]:
# Train/test split
X_train, X_test, y_train, y_test,w_train, w_test = train_test_split(X, y, weights, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(
    n_estimators=500,      # More trees
    max_depth=10,          # Limit depth to prevent overfitting
    min_samples_leaf=5,    # Each leaf must have at least 5 samples
    random_state=42
)
y_transformed = np.sqrt(y)

model.fit(X_train, y_train, sample_weight = w_train)

y_pred_transformed = model.predict(X_test)
y_pred = y_pred_transformed ** 2

Eval

In [506]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate basic accuracy metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("=== KNN REGRESSOR EVALUATION ===")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")
print(f"R² Score: {r2:.4f}")


y_test_arr = np.array(y_test)
y_pred_arr = np.array(y_pred)
nonzero_mask = y_test_arr != 0

# Percentage accuracy (how close predictions are to actual values)
# Using Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((y_test_arr[nonzero_mask] - y_pred_arr[nonzero_mask]) / y_test_arr[nonzero_mask])) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Basic accuracy within a tolerance
tolerance = 5  # minutes
within_tolerance = np.abs(y_test - y_pred) <= tolerance
accuracy_within_tolerance = np.mean(within_tolerance) * 100
print(f"Accuracy within ±{tolerance} minutes: {accuracy_within_tolerance:.2f}%")

# Compare a few actual vs predicted values
print("\n=== SAMPLE PREDICTIONS ===")
n_samples = min(5, len(y_test))
sample_indices = np.random.choice(len(y_test), n_samples, replace=False)
for i in sample_indices:
    print(f"Actual: {y_test[i]:.1f} min | Predicted: {y_pred[i]:.1f} min | Difference: {abs(y_test[i] - y_pred[i]):.1f} min")

# Save the trained model
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(model, f)



=== KNN REGRESSOR EVALUATION ===
Mean Absolute Error (MAE): 8.91 minutes
Root Mean Squared Error (RMSE): 10.09 minutes
R² Score: -0.0368
Mean Absolute Percentage Error (MAPE): 209.00%
Accuracy within ±5 minutes: 24.48%

=== SAMPLE PREDICTIONS ===
Actual: 4.2 min | Predicted: 11.1 min | Difference: 6.9 min
Actual: 0.0 min | Predicted: 14.8 min | Difference: 14.8 min
Actual: 13.2 min | Predicted: 15.1 min | Difference: 1.9 min
Actual: 1.6 min | Predicted: 14.8 min | Difference: 13.2 min
Actual: 16.0 min | Predicted: 13.5 min | Difference: 2.5 min


In [507]:
with open('knn_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
y_pred_loaded = loaded_model.predict(X_test)
print(y_pred_loaded)

[14.15386138 15.99025534 15.66017285 15.2303285  13.71862452 18.0394943
 10.73230729 15.25998668  9.49320794 18.27646394 14.0425289  13.53760224
 17.91484747 11.28602243 14.6471898  14.71802721 13.11248743 15.87174095
 10.34120583 15.54415127  8.89269787 14.87404678 12.03762119 11.22063812
 15.86326445 13.64735795 13.90956542 11.39216705 11.1386213  15.8801945
 16.00394599 15.20498853 13.78916421 13.32414608 17.05286696 10.68331772
 15.07767575 15.21619585 11.65732164 16.21593553 14.9174294  14.12484084
 12.13836096  6.71773238 16.24976107 19.12390647  6.88150172 14.38400889
 18.21351785 14.34486613 16.49155925 15.15946455 14.4039508   9.39674316
 13.98329127 16.17746015 16.85423574 16.24919793 12.5023562  11.72634274
 16.40790898 10.14644849 15.53288004 11.37613847 11.82379994 15.64903089
 10.7817336  15.2172754  15.10171659 13.86869136  7.61531347  8.10669869
 14.86518159 12.04386199  4.79434312 11.22627228 16.88158153 13.87800191
 11.73268091 15.5494476  14.63998687 14.02635047 19.2