In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import numpy as np
import pandas as pd


file_path = "eTraffic_lag2.csv"
df = pd.read_csv(file_path)

# Select features and target variables
target_column = "GDP"
feature_columns = [
    "pedal_cycles", "two_wheeled_motor_vehicles", "cars_and_taxis",
    "buses_and_coaches", "LGVs", "HGVs_2_rigid_axle", "HGVs_3_rigid_axle",
    "HGVs_4_or_more_rigid_axle", "HGVs_3_or_4_articulated_axle",
    "HGVs_5_articulated_axle", "HGVs_6_articulated_axle",
    "all_HGVs", "all_motor_vehicles", "local_authority_name_encoded"
]


# Data Splitting
X = df[feature_columns]
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train KNN regression model
k = 3
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, y_train)

# Forecast
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

# Evaluation model
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

metrics = {
    "MAE": mean_absolute_error,
    "MSE": mean_squared_error,
    "MAPE": mean_absolute_percentage_error,
    "R²": r2_score
}

results = {}
for name, metric in metrics.items():
    results[f"{name}_train"] = metric(y_train, y_train_pred)
    results[f"{name}_test"] = metric(y_test, y_test_pred)

# Output evaluation results
print(f"training set:")
print(f"   - MAE: {results['MAE_train']:.4f}")
print(f"   - MSE: {results['MSE_train']:.4f}")
print(f"   - MAPE: {results['MAPE_train']:.2f}%")
print(f"   - R²: {results['R²_train']:.4f}")

print(f"\n testing set:")
print(f"   - MAE: {results['MAE_test']:.4f}")
print(f"   - MSE: {results['MSE_test']:.4f}")
print(f"   - MAPE: {results['MAPE_test']:.2f}%")
print(f"   - R²: {results['R²_test']:.4f}")

print(knn.get_params())

# Overfitting Check
if results["R²_train"] - results["R²_test"] > 0.2:
    print("\n may have overfitting")
elif results["R²_train"] < 0.5 and results["R²_test"] < 0.5:
    print("\n may have underfitting")
else:
    print("\n perform well")


training set:
   - MAE: 3.8004
   - MSE: 25.1095
   - MAPE: 4.58%
   - R²: 0.7904

 testing set:
   - MAE: 5.0285
   - MSE: 41.3683
   - MAPE: 6.09%
   - R²: 0.6590
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}

 perform well


In [2]:
# K value selection
for k in [3, 5, 7, 10, 15]:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print(f"k={k}, R² Score: {r2_score(y_test, y_pred):.4f}")


k=3, R² Score: 0.6590
k=5, R² Score: 0.6149
k=7, R² Score: 0.5998
k=10, R² Score: 0.5400
k=15, R² Score: 0.4763


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor


target_column = "GDP"
feature_columns = [
    "pedal_cycles", "two_wheeled_motor_vehicles", "cars_and_taxis",
    "buses_and_coaches", "LGVs","HGVs_2_rigid_axle","HGVs_3_rigid_axle","HGVs_4_or_more_rigid_axle","HGVs_3_or_4_articulated_axle",
    "HGVs_5_articulated_axle","HGVs_6_articulated_axle",
    "all_HGVs","all_motor_vehicles", "local_authority_name_encoded"
]
df = df.dropna(subset=feature_columns + [target_column])
X = df[feature_columns]
y = df[target_column]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Use RandomForestRegressor for feature importance
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importances
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance using RandomForestRegressor:")
feature_importance_df



Feature Importance using RandomForestRegressor:


Unnamed: 0,Feature,Importance
8,HGVs_3_or_4_articulated_axle,0.217337
10,HGVs_6_articulated_axle,0.179163
5,HGVs_2_rigid_axle,0.130058
4,LGVs,0.07874
7,HGVs_4_or_more_rigid_axle,0.068702
13,local_authority_name_encoded,0.068681
0,pedal_cycles,0.048493
1,two_wheeled_motor_vehicles,0.04291
3,buses_and_coaches,0.042298
6,HGVs_3_rigid_axle,0.031401


In [4]:
# Feature selection based on importance threshold
importance_threshold = 0.05
selected_features = feature_importance_df[feature_importance_df['Importance'] >= importance_threshold]['Feature'].tolist()

print(f"\nSelected features (importance >= {importance_threshold}):")
print(selected_features)

# Retrain the model with selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Scale the data again using the selected features
scaler = StandardScaler()
X_train_selected_scaled = scaler.fit_transform(X_train_selected)
X_test_selected_scaled = scaler.transform(X_test_selected)

# Retrain the KNN model
knn_selected = KNeighborsRegressor(n_neighbors=5)
knn_selected.fit(X_train_selected_scaled, y_train)

# Make predictions
y_pred_selected = knn_selected.predict(X_test_selected_scaled)

# Evaluate the model with selected features
mse_selected = mean_squared_error(y_test, y_pred_selected)
r2_selected = r2_score(y_test, y_pred_selected)

print(f"\nModel performance with selected features:")
print(f"   - Mean Squared Error (MSE): {mse_selected:.4f}")
print(f"   - R-squared (R2): {r2_selected:.4f}")



Selected features (importance >= 0.05):
['HGVs_3_or_4_articulated_axle', 'HGVs_6_articulated_axle', 'HGVs_2_rigid_axle', 'LGVs', 'HGVs_4_or_more_rigid_axle', 'local_authority_name_encoded']

Model performance with selected features:
   - Mean Squared Error (MSE): 41.0973
   - R-squared (R2): 0.6612
