# General Imports

**Importing all libraries**

In [9]:
import numpy as np
import pandas as pd
import warnings

import plotly.graph_objects as go
from bayes_opt import BayesianOptimization
import statsmodels.api as sm

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance

Disable warnings

In [10]:
# Disable all warnings
warnings.filterwarnings("ignore")

# Enable warnings again
## warnings.filterwarnings("default")

Generate train and test data

In [11]:
# Read Data
df = pd.read_csv("data_finish_prep.csv")

# Select only the usefull part of the df
df.drop(columns=['verschil_Lengte',
               'verschil_6 MWT', 'verschil_Gewicht',
               'verschil_BMI', 'verschil_Conditie',
               'verschil_Lenigheid', 'verschil_Knijpkracht'], inplace=True)

# Drop NaN values
df.dropna(inplace=True)

# Define X and y
X = df.drop(columns=['verschil_TUG'])
y = df['verschil_TUG']

# Split data into train & test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    regressor = model(**params)
    
    # Assuming you have X_train, y_train defined for regression
    scores = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    return -scores.mean()

# MLR

**feature selection**

In [13]:
# Fit the Full Model
X =  sm.add_constant(X_train)  # Add constant term for the intercept
model = sm.OLS(y_train, X_train).fit()  # Fit the MLR model
best_model = model  # Initialize the best model
selected_features = X_train.columns.tolist()
best_features = X_train.columns.tolist()
best_features_rmse = np.sqrt(-cross_val_score(LinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error').mean())

# Iteratively remove one feature at a time based on p-values
while True:
    # Compute p-values
    p_values = model.pvalues[1:]  # Exclude the constant term

    # Identify Insignificant Features
    insignificant_feature = p_values.idxmax()
    max_p_value = p_values.max()

    # Remove Insignificant Feature
    X = X.drop(insignificant_feature, axis=1)
    selected_features.remove(insignificant_feature)

    # When there a no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    model = sm.OLS(y_train, X).fit()
    
    # Evaluate Model Performance
    rmse = np.sqrt(-cross_val_score(LinearRegression(), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error').mean())

    # Check if the model performs better without the insignificant features
    if rmse <= best_features_rmse:
        best_features_rmse = rmse
        best_features = selected_features.copy()
        best_model = model

# Save results
best_model_mlr = LinearRegression().fit(X_train[best_features], y_train)
best_features_mlr = best_features

# Show results
print("Best features:", best_features)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features)))

Best features: ['AfstandTotHuisartsenpraktijk', 'AfstandTotCafetariaED', 'AfstandTotOpenNatTerreinTotaal', 'AfstandTotVerblijfsrecreatiefTerrein', 'AfstandTotPoppodium', 'AfstandTotZonnebank', 'ErvarenGezondheidGoedZeerGoed', 'EenOfMeerLangdurigeAandoeningen', 'MatigOvergewicht']
Dropped features: ['AfstandTotOpenDroogNatTerrein', 'AfstandTotOpenbaarGroenTotaal', 'ErnstigOvergewichtObesitas', 'AfstandTotKunstijsbaan', 'GemiddeldeWoningwaarde', 'VoldoetAanBeweegrichtlijn', 'AfstandTotHuisartsenpost', 'AfstandTotVolkstuin', 'AfstandTotSportterrein', 'AfstandTotBegraafplaats', 'VoldoetAanRichtlijnAlcoholgebruik', 'OppervlakteWater', 'AfstandTotRestaurant', 'AfstandTotBibliotheek', 'InBezitWoningcorporatie', 'EigendomOnbekend', 'AfstandTotApotheek', 'ALandbouwBosbouwEnVisserij', 'Rokers', 'AfstandTotBrandweerkazerne', 'PercentageOnbewoond', 'AfstandTotHotelED', 'AfstandTotBioscoop', 'PersonenautoSNaarOppervlakte', 'VernielingMisdrijfTegenOpenbareOrde', 'AfstandTotCafeED', 'AfstandTotBos', 

*Er zijn geen hyperparameters om te optimaliseren.*

# Support Vector Machines

**Hyperparameter optimalisatie**

In [14]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    classifier = model(**params)
    
    # Calculating accuracy based on cross-validation
    scores = cross_val_score(classifier, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    return scores.mean()

params_ranges = {
    'C': (0.1, 10),
    'kernel_int': (1, 4),
    'gamma': (0.001, 0.1)
}

kernel_mapping = {
    1: 'linear',
    2: 'rbf',
    3: 'poly',
    4: 'sigmoid'
}

# Example usage with SVM
model = SVR
model_bo = BayesianOptimization(f=lambda C, kernel_int, gamma:
                                    bo_params_generic(model, {
                                        'C': C,
                                        'kernel': kernel_mapping[int(kernel_int)],
                                        'gamma': gamma
                                    }, X_train, y_train),
                             pbounds=params_ranges)

results = model_bo.maximize(n_iter=40, init_points=10)
params = model_bo.max['params']

# Creating a model with the best hyperparameters
best_model_svm = model(
    C=params['C'],
    kernel=kernel_mapping[int(params['kernel_int'])],
    gamma=params['gamma']
)

# Fit the model
best_model_svm.fit(X_train, y_train)


|   iter    |  target   |     C     |   gamma   | kernel... |
-------------------------------------------------------------


| [0m1        [0m | [0m-1.597   [0m | [0m6.867    [0m | [0m0.09667  [0m | [0m1.501    [0m |
| [95m2        [0m | [95m-1.592   [0m | [95m1.652    [0m | [95m0.04881  [0m | [95m2.119    [0m |
| [0m3        [0m | [0m-1.594   [0m | [0m3.027    [0m | [0m0.03789  [0m | [0m3.982    [0m |
| [95m4        [0m | [95m-1.589   [0m | [95m4.279    [0m | [95m0.03372  [0m | [95m2.418    [0m |
| [0m5        [0m | [0m-1.597   [0m | [0m8.404    [0m | [0m0.08358  [0m | [0m1.021    [0m |
| [0m6        [0m | [0m-1.593   [0m | [0m9.647    [0m | [0m0.05252  [0m | [0m2.492    [0m |
| [0m7        [0m | [0m-1.593   [0m | [0m9.087    [0m | [0m0.004731 [0m | [0m3.018    [0m |
| [0m8        [0m | [0m-1.594   [0m | [0m1.902    [0m | [0m0.02902  [0m | [0m1.1      [0m |
| [0m9        [0m | [0m-1.599   [0m | [0m9.795    [0m | [0m0.0921   [0m | [0m1.357    [0m |
| [0m10       [0m | [0m-1.59    [0m | [0m1.673    [0m | [0m0.0713

**Feature Selection**

In [15]:
# Get the accuracy score of the model with all features
best_accuracy = cross_val_score(best_model_svm, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()
best_model = best_model_svm
selected_features = X_train.columns
best_features = selected_features

while len(selected_features) > 1:
    try:
        # Retrieve the coefficients from the SVM model
        coefficients = best_model_svm.coef_[0]

        # Sort features based on their coefficient magnitudes
        sorted_indices = np.argsort(np.abs(coefficients))
        selected_features = [X_train.columns[i] for i in sorted_indices[::-1]]

        # Remove the least important feature
        selected_features = selected_features[:-1]
        
        # Refit the linear SVM classifier
        best_model_svm_classification.fit(X_train[selected_features], y_train)
        
        # Evaluate Model Performance
        accuracy = cross_val_score(best_model_svm, X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

        # Check if the model performs better without the insignificant features
        if accuracy >= best_accuracy:
            best_accuracy = accuracy
            best_features = selected_features.copy()
            best_model = best_model_svm
    except:
        print("Scince the kernal is not linear, this feature selection is not possible") 
        break

# Save results
best_model_svm = best_model
best_features_svm = best_features

# Show results
print("Best features:", best_features_svm)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_svm)))

Scince the kernal is not linear, this feature selection is not possible
Best features: Index(['NederlandseAntillenEnAruba', 'GeboorteRelatief', 'SterfteRelatief',
       'GemiddeldeWoningwaarde', 'PercentageOnbewoond',
       'InBezitWoningcorporatie', 'InBezitOverigeVerhuurders',
       'EigendomOnbekend', 'BouwjaarVanaf2000', 'kPersonenMetLaagsteInkomen',
       'ALandbouwBosbouwEnVisserij', 'KLFinancieleDienstenOnroerendGoed',
       'PersonenautoSOverigeBrandstof', 'PersonenautoSPerHuishouden',
       'PersonenautoSNaarOppervlakte', 'AfstandTotHuisartsenpraktijk',
       'AfstandTotKinderdagverblijf', 'AfstandTotSchool', 'OppervlakteWater',
       'TotaalDiefstalUitWoningSchuurED',
       'VernielingMisdrijfTegenOpenbareOrde', 'GeweldsEnSeksueleMisdrijven',
       'AfstandTotHuisartsenpost', 'AfstandTotApotheek',
       'AfstandTotConsultatiebureau', 'AfstandTotOvDagelLevensmiddelen',
       'AfstandTotWarenhuis', 'AfstandTotCafeED', 'AfstandTotCafetariaED',
       'AfstandTotResta

# Random Forest

**Feature selection**

In [16]:
# Get the accuracy score of the model with all the features
rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
best_rf = rf
selected_features = X_train.columns.tolist()
best_features = selected_features
best_features_accuracy = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0), X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

while True:
    # Make DataFrame of feature impotances
    result = permutation_importance(
            rf, X_train[selected_features], y_train, n_repeats=10, random_state=42, n_jobs=-1
        )
    df_importances = pd.DataFrame({'feature': X_train[selected_features].columns, 'importance': result.importances_mean, 'std': result.importances_std})
    df_importances = df_importances.sort_values('importance')

    # Remove least important feature
    selected_features.remove(df_importances['feature'].iloc[0])

    # When there are no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train[selected_features], y_train)

    # Evaluate Model Performance
    accuracy = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    # Check if the model performs better without the insignificant features
    if accuracy >= best_features_accuracy:
        best_features_accuracy = accuracy
        best_features = selected_features.copy()
        best_model = rf

# Save results
best_model_rf = best_model
best_features_rf = best_features

# Show results
print("Best features:", best_features_rf)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_rf)))

Best features: ['KLFinancieleDienstenOnroerendGoed', 'AfstandTotAttractie', 'EenOfMeerLangdurigeAandoeningen']
Dropped features: ['AfstandTotOpenDroogNatTerrein', 'AfstandTotOpenbaarGroenTotaal', 'MatigOvergewicht', 'ErnstigOvergewichtObesitas', 'AfstandTotKunstijsbaan', 'GemiddeldeWoningwaarde', 'VoldoetAanBeweegrichtlijn', 'AfstandTotHuisartsenpost', 'AfstandTotOpenNatTerreinTotaal', 'AfstandTotVolkstuin', 'AfstandTotSportterrein', 'AfstandTotBegraafplaats', 'VoldoetAanRichtlijnAlcoholgebruik', 'OppervlakteWater', 'AfstandTotRestaurant', 'AfstandTotCafetariaED', 'AfstandTotBibliotheek', 'InBezitWoningcorporatie', 'EigendomOnbekend', 'AfstandTotApotheek', 'ALandbouwBosbouwEnVisserij', 'Rokers', 'AfstandTotBrandweerkazerne', 'AfstandTotZonnebank', 'PercentageOnbewoond', 'AfstandTotHotelED', 'AfstandTotBioscoop', 'PersonenautoSNaarOppervlakte', 'VernielingMisdrijfTegenOpenbareOrde', 'AfstandTotCafeED', 'ErvarenGezondheidGoedZeerGoed', 'AfstandTotBos', 'UrenMantelzorgPerWeek', 'AfstandTo

**Hyperparameter optimalisatie**

In [17]:
def bo_params_rf(min_samples_split,max_depth,max_features,n_estimators):
    
    params = {
        'min_samples_split': round(min_samples_split),
        'max_depth': round(max_depth),
        'max_features': max_features,
        'n_estimators':round(n_estimators)
    }    
        
    clf = RandomForestRegressor(min_samples_split=int(params['min_samples_split']), 
                                 max_depth=int(params['max_depth']), 
                                 max_features=params['max_features'],
                                 n_estimators=int(params['n_estimators']), 
                                 bootstrap=False,
                                 n_jobs = -1)
    return cross_val_score(clf, X_train[best_features_rf], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

# Create Bayesian Optimization
rf_bo = BayesianOptimization(f=bo_params_rf, pbounds={
    'n_estimators': (100, 1000),
    'max_depth': (1, 20),
    'max_features': (0.1, 1),
    'min_samples_split': (2, 12)
})

# Bayesian optimization
results = rf_bo.maximize(n_iter=40, init_points=10)

# Getting best hyperparameters from bayesian optimization
params = rf_bo.max['params']
params['n_estimators']= round(params['n_estimators'])
params['min_samples_split']= round(params['min_samples_split'])
params['max_depth']= round(params['max_depth'])

# Making random forest model with the best hyperparameters
# criterion entropy is after the hyperparameter optimization since it's more accurate but increases training time
best_model_rf_classification = RandomForestRegressor(min_samples_split=params['min_samples_split'],
                                                max_depth=params['max_depth'],
                                                max_features=params['max_features'],
                                                n_estimators=params['n_estimators'],
                                                bootstrap=False)

best_model_rf_classification.fit(X_train[best_features_rf], y_train)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-1.579   [0m | [0m18.55    [0m | [0m0.8457   [0m | [0m8.545    [0m | [0m425.3    [0m |
| [95m2        [0m | [95m-1.569   [0m | [95m1.607    [0m | [95m0.7625   [0m | [95m2.742    [0m | [95m460.5    [0m |
| [0m3        [0m | [0m-1.576   [0m | [0m6.085    [0m | [0m0.5858   [0m | [0m3.413    [0m | [0m186.8    [0m |
| [0m4        [0m | [0m-1.572   [0m | [0m5.005    [0m | [0m0.2828   [0m | [0m3.536    [0m | [0m417.6    [0m |
| [0m5        [0m | [0m-1.572   [0m | [0m4.645    [0m | [0m0.7627   [0m | [0m10.0     [0m | [0m290.2    [0m |
| [95m6        [0m | [95m-1.568   [0m | [95m4.052    [0m | [95m0.3993   [0m | [95m8.585    [0m | [95m300.6    [0m |
| [0m7        [0m | [0m-1.578   [0m | [0m7.539    [0m | [0m0.8402   [0m | [0m8.293    [0m | [0m22

# neural network

**Feature selection**

In [18]:
# Get the accuracy score of the model with all the features
mlp = MLPRegressor(random_state=0).fit(X_train, y_train)
best_model = mlp
selected_features = X_train.columns.tolist()
best_features = selected_features
best_features_accuracy = cross_val_score(MLPRegressor(random_state=0), X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

while True:
    # Retrieve the learned weights
    weights = mlp.coefs_[0]  # Weights connecting input features to the first hidden layer

    # Normalize the weights
    normalized_weights = np.abs(weights) / np.sum(np.abs(weights), axis=0)

    # Calculate feature importance
    feature_importance = np.mean(normalized_weights, axis=1)

    # Sort features based on their importance
    sorted_indices = np.argsort(feature_importance)
    selected_features = [selected_features[i] for i in sorted_indices[:-1]]

    # When there are no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    mlp = MLPRegressor(random_state=0).fit(X_train[selected_features], y_train)

    # Evaluate Model Performance
    accuracy = cross_val_score(MLPRegressor(random_state=0), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    # Check if the model performs better without the insignificant features
    if accuracy >= best_features_accuracy:
        best_features_accuracy = accuracy
        best_features = selected_features.copy()
        best_model = mlp

# Save results
best_model_nn = best_model
best_features_nn = best_features

# Show results
print("Best features:", best_features_nn)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_nn)))

Best features: ['VoldoetAanBeweegrichtlijn', 'AfstandTotVerblijfsrecreatiefTerrein', 'AfstandTotOpenbaarGroenTotaal', 'AfstandTotWarenhuis', 'BeperkingInZien', 'AfstandTotBrandweerkazerne', 'AfstandTotOpenDroogNatTerrein', 'OppervlakteWater', 'AfstandTotApotheek', 'BeperkingInHoren', 'AfstandTotDagrecreatiefTerrein', 'AfstandTotPoppodium', 'AfstandTotZonnebank', 'ALandbouwBosbouwEnVisserij', 'UrenMantelzorgPerWeek', 'GeboorteRelatief', 'AfstandTotCafeED', 'SterfteRelatief', 'EenOfMeerLangdurigeAandoeningen', 'AfstandTotOvDagelLevensmiddelen', 'AfstandTotHuisartsenpost', 'OvermatigDrinker', 'AfstandTotHotelED', 'AfstandTotKunstijsbaan', 'NederlandseAntillenEnAruba', 'AfstandTotBuitenschoolseOpvang', 'VernielingMisdrijfTegenOpenbareOrde', 'ErnstigOvergewichtObesitas', 'PersonenautoSPerHuishouden', 'AfstandTotAttractie', 'AfstandTotBioscoop', 'AfstandTotSemiOpenbaarGroenTotaal', 'AfstandTotVolkstuin', 'PersonenautoSNaarOppervlakte', 'AfstandTotOpenNatTerreinTotaal', 'KLFinancieleDienstenO

**Hyperparameter optimalisatie**

In [19]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    regressor = model(**params)
    
    # Calculating rmse based on
    scores = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    return scores.mean()

params_ranges = {
    'hidden_layer_sizes': (10, 100),
    'alpha': (0.0001, 0.1),
    'learning_rate_init': (0.001, 0.1),
    'max_iter': (100, 1000)
}

# Example usage with Neural Network
model = MLPRegressor
model_bo = BayesianOptimization(f=lambda hidden_layer_sizes, alpha, learning_rate_init, max_iter:
                                    bo_params_generic(model, {
                                        'hidden_layer_sizes': (int(round(hidden_layer_sizes)),),
                                        'alpha': alpha,
                                        'learning_rate_init': learning_rate_init,
                                        'max_iter': int(round(max_iter))
                                    }, X_train[best_features_nn], y_train),
                             pbounds=params_ranges)

results = model_bo.maximize(n_iter=40, init_points=10)
params = model_bo.max['params']

# Creating a model with the best hyperparameters
best_model_nn = model(
    hidden_layer_sizes=(int(round(params['hidden_layer_sizes'])),),
    alpha=params['alpha'],
    learning_rate_init=params['learning_rate_init'],
    max_iter=int(round(params['max_iter']))
)

# Fit the model
best_model_nn.fit(X_train[best_features_nn], y_train)

|   iter    |  target   |   alpha   | hidden... | learni... | max_iter  |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-1.262   [0m | [0m0.01107  [0m | [0m50.59    [0m | [0m0.04931  [0m | [0m157.4    [0m |
| [0m2        [0m | [0m-1.269   [0m | [0m0.0221   [0m | [0m48.37    [0m | [0m0.09224  [0m | [0m789.5    [0m |
| [95m3        [0m | [95m-1.256   [0m | [95m0.009029 [0m | [95m89.67    [0m | [95m0.04827  [0m | [95m970.4    [0m |
| [0m4        [0m | [0m-1.261   [0m | [0m0.04636  [0m | [0m82.75    [0m | [0m0.0349   [0m | [0m620.3    [0m |
| [0m5        [0m | [0m-1.27    [0m | [0m0.0209   [0m | [0m40.9     [0m | [0m0.0962   [0m | [0m441.1    [0m |
| [0m6        [0m | [0m-1.263   [0m | [0m0.003881 [0m | [0m88.92    [0m | [0m0.009816 [0m | [0m820.2    [0m |
| [0m7        [0m | [0m-1.262   [0m | [0m0.06051  [0m | [0m18.15    [0m | [0m0.0201   [0m | [0m599.7   

# plotting the bar chart

**using train dataset**

In [20]:
regression_models = [
    {
        'name': 'Multiple Linear Regression',
        'model': best_model_mlr,
        'features': best_features_mlr
    },
    {
        'name': 'Random Forest',
        'model': best_model_rf,
        'features': best_features_rf
    },
    {
        'name': 'Neural Network',
        'model': best_model_nn,
        'features': best_features_nn
    },
    {
        'name': 'Support Vector Machine',
        'model': best_model_svm,
        'features': best_features_svm
    }
]

regression_stats_val = []

for model in regression_models:
    try:
        # Perform cross-validation and calculate predictions and metrics
        y_pred = cross_val_predict(model['model'], X_train[model['features']], y_train, cv=10)
        y_true = y_train
    except: # cv can't be higher then n_splits in the dt
        # Perform cross-validation and calculate predictions and metrics
        y_pred = cross_val_predict(model['model'], X_train[model['features']], y_train, cv=5)
        y_true = y_train

    # R2 Score
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 Score
    n = len(y_true)  # Number of samples
    p = X_test.shape[1]  # Number of predictors (features) in X
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

    # Save stats
    regression_stats_val.append(
        {
            'model': model['name'],
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'r2': r2,
            'adj_r2': adj_r2
        }
    )

# View stats
regression_stats_val

[{'model': 'Multiple Linear Regression',
  'mae': 0.9325472013200807,
  'mse': 1.5705327731730543,
  'rmse': 1.253208990221924,
  'r2': 0.049020213502097065,
  'adj_r2': -0.0243740368340144},
 {'model': 'Random Forest',
  'mae': 0.9367582101915491,
  'mse': 1.57991811571218,
  'rmse': 1.2569479367548124,
  'r2': 0.04333725597550231,
  'adj_r2': -0.03049559086211051},
 {'model': 'Neural Network',
  'mae': 0.9397737414801397,
  'mse': 1.5912142684164559,
  'rmse': 1.2614334181463784,
  'r2': 0.036497275893293435,
  'adj_r2': -0.03786346356367387},
 {'model': 'Support Vector Machine',
  'mae': 0.9292606022834744,
  'mse': 1.5892102138796138,
  'rmse': 1.2606388118250262,
  'r2': 0.03771075923355249,
  'adj_r2': -0.03655632660288788}]

In [21]:
# Extracting model names and metric scores
model_names = [d['model'] for d in regression_stats_val]
metric_labels = list(regression_stats_val[0].keys())[1:]
metric_scores = {metric: [d[metric] for d in regression_stats_val] for metric in metric_labels}

# Plotting the bar chart
fig = go.Figure()

for metric in metric_labels:
    fig.add_trace(go.Bar(
        x=model_names,
        y=metric_scores[metric],
        name=metric
    ))

# Updating the layout
fig.update_layout(
    title='Evaluation Metrics Comparison',
    xaxis_title='Models',
    yaxis_title='Scores',
    barmode='group'
)

# Display the plot
fig.show()

**Evaluating on test data**

In [22]:
regression_models = [
    {
        'name': 'Multiple Linear Regression',
        'model': best_model_mlr,
        'features': best_features_mlr
    },
    {
        'name': 'Random Forest',
        'model': best_model_rf,
        'features': best_features_rf
    },
    {
        'name': 'Neural Network',
        'model': best_model_nn,
        'features': best_features_nn
    },
    {
        'name': 'Support Vector Machine',
        'model': best_model_svm,
        'features': best_features_svm
    }
]

regression_stats_test = []

for model in regression_models:
    # Make predictions on the test set
    y_pred = model['model'].predict(X_test[model['features']])

    # Defining Actual values
    y_true = y_test.copy()

    # R2 Score
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 Score
    n = len(y_true)  # Number of samples
    p = X_test.shape[1]  # Number of predictors (features) in X
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

    # Save stats
    regression_stats_test.append(
        {
            'model': model['name'],
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'r2': r2,
            'adj_r2': adj_r2
        }
    )

# View stats
regression_stats_test

[{'model': 'Multiple Linear Regression',
  'mae': 0.9768324368749295,
  'mse': 1.7154191705024935,
  'rmse': 1.309740115634584,
  'r2': -0.0006624327837649879,
  'adj_r2': -0.4032277793059693},
 {'model': 'Random Forest',
  'mae': 0.961934393152983,
  'mse': 1.6917660906320193,
  'rmse': 1.3006790882581374,
  'r2': 0.013135214376190696,
  'adj_r2': -0.383879354552928},
 {'model': 'Neural Network',
  'mae': 0.9750456502469834,
  'mse': 1.7245631627083347,
  'rmse': 1.313226242011762,
  'r2': -0.005996434900211289,
  'adj_r2': -0.41070764434282503},
 {'model': 'Support Vector Machine',
  'mae': 0.9679885010940884,
  'mse': 1.7352632673064325,
  'rmse': 1.3172939183441303,
  'r2': -0.012238170379381241,
  'adj_r2': -0.4194604228308565}]

In [23]:
# Extracting model names and metric scores
model_names = [d['model'] for d in regression_stats_test]
metric_labels = list(regression_stats_test[0].keys())[1:]
metric_scores = {metric: [d[metric] for d in regression_stats_test] for metric in metric_labels}

# Plotting the bar chart
fig = go.Figure()

for metric in metric_labels:
    fig.add_trace(go.Bar(
        x=model_names,
        y=metric_scores[metric],
        name=metric
    ))

# Updating the layout
fig.update_layout(
    title='Evaluation Metrics Comparison',
    xaxis_title='Models',
    yaxis_title='Scores',
    barmode='group'
)

# Display the plot
fig.show()

Naive bayes

In [24]:
# Defining Actual values
y_true = y_test.copy()

# Defining Predicted values
y_pred = np.full_like(y_test, y_train.mean())

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Mean Squared Error
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)

# Root Mean Squared Error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# R2 Score
r2 = r2_score(y_true, y_pred)
print("R2 Score:", r2)

# Adjusted R2 Score
n = len(y_true)  # Number of samples
p = X_test.shape[1]  # Number of predictors (features) in X
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print("Adjusted R2 Score:", adj_r2)

Mean Absolute Error: 0.9708538458328116
Mean Squared Error: 1.7233840289528939
Root Mean Squared Error: 1.3127772198483998
R2 Score: -0.005308606016993789
Adjusted R2 Score: -0.40974310269049696


Save best model as Pickle

In [25]:
import pickle

# Save model
file_path = 'Saved Models/TUG.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(best_model_mlr, file)

# Save features
with open('Saved Models/best_features_TUG.txt', 'w') as file:
    file.write(f'{best_features_mlr}')