# Dataset

In [332]:
import os
import sys
sys.path.append(os.path.abspath(".."))
import random
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from db import IncidentRepository, Database
from datetime import datetime
import numpy as np

db = Database('../test.db')
incident_repository = IncidentRepository(db)
incidents = incident_repository.list_incidents()
print(incidents)

[{'id': 344, 'location_id': 5, 'type_id': 2, 'avg_delay': 0.0, 'trust_score': 0.3753936934415564, 'status': 'active', 'created_at': '2025-10-05 06:10:08.091496', 'last_updated': '2025-10-05 06:10:08.091496'}, {'id': 2242, 'location_id': 2, 'type_id': 5, 'avg_delay': 0.0, 'trust_score': 0.9992590383013172, 'status': 'pending', 'created_at': '2025-10-05 06:09:50.640244', 'last_updated': '2025-10-05 06:09:50.640244'}, {'id': 2713, 'location_id': 8, 'type_id': 3, 'avg_delay': 0.0, 'trust_score': 0.04540732544930559, 'status': 'active', 'created_at': '2025-10-05 05:57:11.095735', 'last_updated': '2025-10-05 05:57:11.095735'}, {'id': 1388, 'location_id': 8, 'type_id': 3, 'avg_delay': 0.0, 'trust_score': 0.05370496690182258, 'status': 'active', 'created_at': '2025-10-05 05:56:39.353878', 'last_updated': '2025-10-05 05:56:39.353878'}, {'id': 1878, 'location_id': 1, 'type_id': 2, 'avg_delay': 0.0, 'trust_score': 0.919375206341861, 'status': 'pending', 'created_at': '2025-10-05 05:55:40.642190',

In [333]:
X = []
y = []
weights = []

for inc in incidents:
    dt_obj = datetime.fromisoformat(inc['created_at'])
    hour = dt_obj.hour
    day_of_week = dt_obj.weekday()
    is_rush_hour = int(hour in [7, 8, 9, 16, 17, 18])
    if inc['status'] == 'resolved':
        status = 1
        if inc['trust_score'] > 0.3:
            X.append([
                inc['location_id'],
                inc['type_id'],
                inc['trust_score'],
                status,
                hour,
                day_of_week,
                is_rush_hour
            ])
            y.append(inc['avg_delay'])
            if inc['avg_delay'] == 0.0:
                weights.append(0.7 * inc['trust_score'])
            else:
                weights.append(inc['trust_score'])
print(X)
print(y)

[[8, 5, 0.4971134474908464, 1, 2, 6, 0], [7, 5, 0.9243909643507867, 1, 1, 6, 0], [9, 4, 0.3454426525009664, 1, 1, 6, 0], [2, 4, 0.3068789674032485, 1, 0, 6, 0], [3, 1, 0.4983123798529394, 1, 0, 6, 0], [1, 1, 0.6539574291320095, 1, 23, 5, 0], [10, 4, 0.8861161953448735, 1, 21, 5, 0], [3, 1, 0.6761618485133202, 1, 21, 5, 0], [10, 1, 0.8200256033410094, 1, 19, 5, 0], [5, 2, 0.8942362702070762, 1, 18, 5, 1], [4, 5, 0.7177053599108353, 1, 18, 5, 1], [6, 1, 0.6398652625063543, 1, 15, 5, 0], [10, 4, 0.9199849848529489, 1, 15, 5, 0], [8, 1, 0.7665608891774244, 1, 14, 5, 0], [9, 1, 0.45216970988525784, 1, 14, 5, 0], [9, 2, 0.599341286403451, 1, 11, 5, 0], [1, 3, 0.3174724774052876, 1, 11, 5, 0], [6, 3, 0.6871087194165646, 1, 11, 5, 0], [6, 4, 0.6985029993714295, 1, 8, 5, 1], [3, 4, 0.8093732547248862, 1, 8, 5, 1], [7, 2, 0.43353614968785514, 1, 8, 5, 1], [3, 3, 0.37081392058908924, 1, 7, 5, 1], [5, 2, 0.8218277720030748, 1, 5, 5, 0], [10, 1, 0.6541159320682192, 1, 5, 5, 0], [7, 2, 0.89985007254

Train

In [334]:
# Train/test split
X_train, X_test, y_train, y_test,w_train, w_test = train_test_split(X, y, weights, test_size=0.2, random_state=42)

# Train model
model = KMeans(
    n_clusters=8,          # More clusters = more complexityS
    init='k-means++',     # Better than random
    n_init=20,            # Run multiple times and keep best
    max_iter=500,         # More iterations = more convergence
    random_state=42
)
y_transformed = np.sqrt(y)

model.fit(X_train, y_train, sample_weight = w_train)

y_pred_transformed = model.predict(X_test)
y_pred = y_pred_transformed ** 2

Eval

In [335]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate basic accuracy metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("=== KNN REGRESSOR EVALUATION ===")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")
print(f"R² Score: {r2:.4f}")


y_test_arr = np.array(y_test)
y_pred_arr = np.array(y_pred)
nonzero_mask = y_test_arr != 0

# Percentage accuracy (how close predictions are to actual values)
# Using Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((y_test_arr[nonzero_mask] - y_pred_arr[nonzero_mask]) / y_test_arr[nonzero_mask])) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Basic accuracy within a tolerance
tolerance = 5  # minutes
within_tolerance = np.abs(y_test - y_pred) <= tolerance
accuracy_within_tolerance = np.mean(within_tolerance) * 100
print(f"Accuracy within ±{tolerance} minutes: {accuracy_within_tolerance:.2f}%")

# Compare a few actual vs predicted values
print("\n=== SAMPLE PREDICTIONS ===")
n_samples = min(5, len(y_test))
sample_indices = np.random.choice(len(y_test), n_samples, replace=False)
for i in sample_indices:
    print(f"Actual: {y_test[i]:.1f} min | Predicted: {y_pred[i]:.1f} min | Difference: {abs(y_test[i] - y_pred[i]):.1f} min")

# Save the trained model
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(model, f)



=== KNN REGRESSOR EVALUATION ===
Mean Absolute Error (MAE): 3.46 minutes
Root Mean Squared Error (RMSE): 4.15 minutes
R² Score: 0.0000
Mean Absolute Percentage Error (MAPE): nan%
Accuracy within ±5 minutes: 79.10%

=== SAMPLE PREDICTIONS ===
Actual: 0.0 min | Predicted: 4.0 min | Difference: 4.0 min
Actual: 0.0 min | Predicted: 0.0 min | Difference: 0.0 min
Actual: 0.0 min | Predicted: 1.0 min | Difference: 1.0 min
Actual: 0.0 min | Predicted: 5.0 min | Difference: 5.0 min
Actual: 0.0 min | Predicted: 5.0 min | Difference: 5.0 min


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [336]:
with open('knn_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
y_pred_loaded = loaded_model.predict(X_test)
print(y_pred_loaded)

[2 7 2 5 7 2 4 0 4 5 2 0 4 0 5 0 4 6 7 3 7 3 4 2 5 0 5 5 7 4 0 1 4 5 5 4 1
 2 4 4 6 7 5 1 7 7 4 6 4 5 1 7 4 1 4 4 6 2 4 3 5 6 1 6 4 0 0 7 2 1 1 7 0 1
 0 5 3 0 5 4 1 4 1 0 5 7 0 0 2 4 4 6 2 3 1 7 0 4 0 0 3 3 5 2 2 6 5 2 6 3 3
 4 4 7 4 0 2 1 1 5 5 3 2 0 7 5 2 1 7 6 4 7 1 7]
