In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Train one gradient-boosting regressor per target column and report the
# mean squared error of each on a held-out 20% test split.
# ---------------------------------------------------------------------------

# Columns to predict; every other column in the CSV is treated as a feature.
target_columns = ['congestion_level', 'maxspeed_real', 'min_time']

# Load dataset
data = pd.read_csv("your_dataset.csv")

# Separate features (independent variables) and target variables
X = data.drop(columns=target_columns)
y = data[target_columns]

# Split data into training and testing sets.
# The split happens BEFORE any statistic is computed from the targets so that
# nothing derived from the test rows can influence training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing target values with the median of the TRAINING rows only.
# Computing the median over the full dataset (as was done previously) leaks
# test-set information into the training targets.
# NOTE(review): test rows with an imputed target are scored against the imputed
# value rather than a real measurement; consider dropping those rows instead.
train_target_medians = y_train.median()
y_train = y_train.fillna(train_target_medians)
y_test = y_test.fillna(train_target_medians)
y = y.fillna(train_target_medians)  # keep `y` NaN-free for any later use

# Define preprocessing steps for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Identify numerical and categorical columns.
# 'number' covers every numeric width (int32, float32, ...), not only the
# 64-bit ones; 'category' and 'bool' columns are one-hot encoded together with
# 'object' columns. ColumnTransformer drops any column that is not listed in
# one of its transformers, so a too-narrow selection silently loses features.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns

# Combine transformers for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model (seeded so repeated runs give the same trees)
model = GradientBoostingRegressor(random_state=42)

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters to tune.
# Every list holds a single value, so the "search" evaluates exactly one
# configuration with 5-fold cross-validation; add values here to really tune.
param_grid = {
    'model__n_estimators': [100],
    'model__learning_rate': [0.1],
    'model__max_depth': [3],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [1]
}

# Perform GridSearchCV for each target separately, then predict on the test
# set and compute the test MSE. GridSearchCV clones the estimator it is given,
# so the same `pipeline` object can safely be reused for every target.
grid_searches = {}
predictions = {}
test_mse = {}
for target in target_columns:
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    search.fit(X_train, y_train[target])
    grid_searches[target] = search
    predictions[target] = search.predict(X_test)
    test_mse[target] = mean_squared_error(y_test[target], predictions[target])

# Per-target names used by the rest of the script.
grid_search_congestion = grid_searches['congestion_level']
grid_search_maxspeed = grid_searches['maxspeed_real']
grid_search_mintime = grid_searches['min_time']

predictions_congestion = predictions['congestion_level']
predictions_maxspeed = predictions['maxspeed_real']
predictions_mintime = predictions['min_time']

mse_congestion = test_mse['congestion_level']
mse_maxspeed = test_mse['maxspeed_real']
mse_mintime = test_mse['min_time']

# NOTE(review): the MSE values recorded for this cell are on the order of
# 1e-10 to 1e-8, which is suspiciously small. Verify that the feature columns
# do not contain values derived from the targets -- cannot be confirmed here.
print("Mean Squared Error - Congestion Level:", mse_congestion)
print("Mean Squared Error - Max Speed:", mse_maxspeed)
print("Mean Squared Error - Min Time:", mse_mintime)

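# Calculate the number of correctly predicted and incorrectly predicted values for each target variable
# NOTE: the disabled loop below compares predictions_congestion against every target; if it is
# re-enabled, each target must be compared with its own predictions (predictions_maxspeed, predictions_mintime).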
#for target in ['congestion_level', 'maxspeed_real', 'min_time']:
    #correct_count = np.sum(predictions_congestion.round() == y_test[target].values)
    #incorrect_count = len(y_test) - correct_count
    #print(f"Correctly Predicted Count - {target}: {correct_count}")
    #print(f"Incorrectly Predicted Count - {target}: {incorrect_count}")

# Report the mean cross-validated MSE of the selected configuration.
# GridSearchCV.best_score_ is the mean score over the HELD-OUT folds of the
# cross-validation, so this is a validation estimate, not a training error.
# The scorer is 'neg_mean_squared_error', hence the sign flip.
# The avg_train_mse_* names are historical and are kept unchanged in case
# other cells refer to them.
avg_train_mse_congestion = -grid_search_congestion.best_score_
avg_train_mse_maxspeed = -grid_search_maxspeed.best_score_
avg_train_mse_mintime = -grid_search_mintime.best_score_

print("Mean CV MSE - Congestion Level:", avg_train_mse_congestion)
print("Mean CV MSE - Max Speed:", avg_train_mse_maxspeed)
print("Mean CV MSE - Min Time:", avg_train_mse_mintime)

# Report the hyperparameters selected by each grid search
print("Best parameters - Congestion Level:", grid_search_congestion.best_params_)
print("Best parameters - Max Speed:", grid_search_maxspeed.best_params_)
print("Best parameters - Min Time:", grid_search_mintime.best_params_)

# Assess model complexity
#print("Number of features:", len(X.columns))
#print("Number of estimators - Congestion Level:", grid_search_congestion.best_params_['model__n_estimators'])
#print("Maximum depth - Congestion Level:", grid_search_congestion.best_params_['model__max_depth'])
#print("Number of estimators - Max Speed:", grid_search_maxspeed.best_params_['model__n_estimators'])
#print("Maximum depth - Max Speed:", grid_search_maxspeed.best_params_['model__max_depth'])
#print("Number of estimators - Min Time:", grid_search_mintime.best_params_['model__n_estimators'])
#print("Maximum depth - Min Time:", grid_search_mintime.best_params_['model__max_depth'])

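# Mean cross-validated RMSE per hyperparameter candidate. cv_results_['mean_test_score'] is already
# averaged over the folds (one value per candidate); per-fold scores are in the 'split<k>_test_score' keys.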
#all_rmse_congestion = np.sqrt(-grid_search_congestion.cv_results_['mean_test_score'])
#all_rmse_maxspeed = np.sqrt(-grid_search_maxspeed.cv_results_['mean_test_score'])
#all_rmse_mintime = np.sqrt(-grid_search_mintime.cv_results_['mean_test_score'])

# Plot RMSE values for all folds for Congestion Level
#plt.figure(figsize=(10, 6))
#plt.scatter(np.arange(1, len(all_rmse_congestion) + 1), all_rmse_congestion, marker='o', label='Congestion Level')
#plt.title('RMSE for Congestion Level (All Folds)')
#plt.xlabel('Fold')
#plt.ylabel('RMSE')
#plt.legend()
#plt.grid(True)
#plt.show()

# Plot RMSE values for all folds for Max Speed
#plt.figure(figsize=(10, 6))
#plt.scatter(np.arange(1, len(all_rmse_maxspeed) + 1), all_rmse_maxspeed, marker='o', label='Max Speed')
#plt.title('RMSE for Max Speed (All Folds)')
#plt.xlabel('Fold')
#plt.ylabel('RMSE')
#plt.legend()
#plt.grid(True)
#plt.show()

# Plot RMSE values for all folds for Min Time
#plt.figure(figsize=(10, 6))
#plt.scatter(np.arange(1, len(all_rmse_mintime) + 1), all_rmse_mintime, marker='o', label='Min Time')
#plt.title('RMSE for Min Time (All Folds)')
#plt.xlabel('Fold')
#plt.ylabel('RMSE')
#plt.legend()
#plt.grid(True)
#plt.show()


Mean Squared Error - Congestion Level: 3.2073294200614434e-10
Mean Squared Error - Max Speed: 8.089374539643065e-09
Mean Squared Error - Min Time: 2.4355498247558716e-09
Average Train MSE - Congestion Level: 3.2425622491730516e-10
Average Train MSE - Max Speed: 7.408841688530866e-09
Average Train MSE - Min Time: 2.897997410410135e-09
Best parameters - Congestion Level: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best parameters - Max Speed: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best parameters - Min Time: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
