# General Imports

**Importing all libraries**

In [1]:
import numpy as np
import pandas as pd
import warnings

import plotly.graph_objects as go
from bayes_opt import BayesianOptimization
import statsmodels.api as sm

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance


Disable warnings

In [2]:
# Disable all warnings
warnings.filterwarnings("ignore")

# Enable warnings again
## warnings.filterwarnings("default")

Generate train and test data

In [3]:
# Read Data
df = pd.read_csv("data_finish_prep.csv")

# Select only the usefull part of the df
df.drop(columns=['verschil_Lengte',
               'verschil_6 MWT', 'verschil_TUG',
               'verschil_BMI', 'verschil_Conditie',
               'verschil_Gewicht', 'verschil_Knijpkracht'], inplace=True)

# Drop NaN values
df.dropna(inplace=True)

# Define X and y
X = df.drop(columns=['verschil_Lenigheid'])
y = df['verschil_Lenigheid']

# Split data into train & test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    regressor = model(**params)
    
    # Assuming you have X_train, y_train defined for regression
    scores = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    return -scores.mean()

# MLR

**feature selection**

In [5]:
# Fit the Full Model
X =  sm.add_constant(X_train)  # Add constant term for the intercept
model = sm.OLS(y_train, X_train).fit()  # Fit the MLR model
best_model = model  # Initialize the best model
selected_features = X_train.columns.tolist()
best_features = X_train.columns.tolist()
best_features_rmse = np.sqrt(-cross_val_score(LinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error').mean())

# Iteratively remove one feature at a time based on p-values
while True:
    # Compute p-values
    p_values = model.pvalues[1:]  # Exclude the constant term

    # Identify Insignificant Features
    insignificant_feature = p_values.idxmax()
    max_p_value = p_values.max()

    # Remove Insignificant Feature
    X = X.drop(insignificant_feature, axis=1)
    selected_features.remove(insignificant_feature)

    # When there a no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    model = sm.OLS(y_train, X).fit()
    
    # Evaluate Model Performance
    rmse = np.sqrt(-cross_val_score(LinearRegression(), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error').mean())

    # Check if the model performs better without the insignificant features
    if rmse <= best_features_rmse:
        best_features_rmse = rmse
        best_features = selected_features.copy()
        best_model = model

# Save results
best_model_mlr = LinearRegression().fit(X_train[best_features], y_train)
best_features_mlr = best_features

# Show results
print("Best features:", best_features)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features)))

Best features: ['EigendomOnbekend', 'PersonenautoSOverigeBrandstof', 'AfstandTotHuisartsenpraktijk', 'TotaalDiefstalUitWoningSchuurED', 'AfstandTotConsultatiebureau', 'AfstandTotHotelED']
Dropped features: ['AfstandTotOvDagelLevensmiddelen', 'ZwaarBelasteMantelzorgers', 'AfstandTotCafetariaED', 'SterfteRelatief', 'AfstandTotPodiumkunstenTotaal', 'AfstandTotBioscoop', 'ErvarenGezondheidGoedZeerGoed', 'GeweldsEnSeksueleMisdrijven', 'AfstandTotSemiOpenbaarGroenTotaal', 'kPersonenMetLaagsteInkomen', 'AfstandTotPoppodium', 'AfstandTotDagrecreatiefTerrein', 'AfstandTotVolkstuin', 'GeboorteRelatief', 'AfstandTotApotheek', 'AfstandTotHuisartsenpost', 'AfstandTotRestaurant', 'PersonenautoSPerHuishouden', 'AfstandTotSportterrein', 'AfstandTotBos', 'AfstandTotSchool', 'AfstandTotBegraafplaats', 'UrenMantelzorgPerWeek', 'AfstandTotTreinstationsTotaal', 'AfstandTotBrandweerkazerne', 'NederlandseAntillenEnAruba', 'AfstandTotMuseum', 'PercentageOnbewoond', 'ALandbouwBosbouwEnVisserij', 'AfstandTotVer

*Er zijn geen hyperparameters om te optimaliseren.*

# Support Vector Machines

**Hyperparameter optimalisatie**

In [6]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    classifier = model(**params)
    
    # Calculating accuracy based on cross-validation
    scores = cross_val_score(classifier, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    return scores.mean()

params_ranges = {
    'C': (0.1, 10),
    'kernel_int': (1, 4),
    'gamma': (0.001, 0.1)
}

kernel_mapping = {
    1: 'linear',
    2: 'rbf',
    3: 'poly',
    4: 'sigmoid'
}

# Example usage with SVM
model = SVR
model_bo = BayesianOptimization(f=lambda C, kernel_int, gamma:
                                    bo_params_generic(model, {
                                        'C': C,
                                        'kernel': kernel_mapping[int(kernel_int)],
                                        'gamma': gamma
                                    }, X_train, y_train),
                             pbounds=params_ranges)

results = model_bo.maximize(n_iter=40, init_points=10)
params = model_bo.max['params']

# Creating a model with the best hyperparameters
best_model_svm = model(
    C=params['C'],
    kernel=kernel_mapping[int(params['kernel_int'])],
    gamma=params['gamma']
)

# Fit the model
best_model_svm.fit(X_train, y_train)


|   iter    |  target   |     C     |   gamma   | kernel... |
-------------------------------------------------------------
| [0m1        [0m | [0m-40.25   [0m | [0m6.609    [0m | [0m0.004262 [0m | [0m2.613    [0m |
| [0m2        [0m | [0m-40.27   [0m | [0m3.138    [0m | [0m0.08299  [0m | [0m2.216    [0m |
| [0m3        [0m | [0m-40.28   [0m | [0m9.28     [0m | [0m0.08618  [0m | [0m2.721    [0m |
| [0m4        [0m | [0m-40.3    [0m | [0m6.0      [0m | [0m0.08572  [0m | [0m3.029    [0m |
| [0m5        [0m | [0m-40.28   [0m | [0m4.186    [0m | [0m0.04879  [0m | [0m2.218    [0m |
| [0m6        [0m | [0m-40.27   [0m | [0m3.146    [0m | [0m0.04924  [0m | [0m3.083    [0m |
| [95m7        [0m | [95m-40.24   [0m | [95m7.463    [0m | [95m0.004816 [0m | [95m1.321    [0m |
| [0m8        [0m | [0m-40.25   [0m | [0m0.8075   [0m | [0m0.05861  [0m | [0m2.358    [0m |
| [0m9        [0m | [0m-40.33   [0m | [0m6.765    

**Feature Selection**

In [7]:
# Get the accuracy score of the model with all features
best_accuracy = cross_val_score(best_model_svm, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()
best_model = best_model_svm
selected_features = X_train.columns
best_features = selected_features

while len(selected_features) > 1:
    try:
        # Retrieve the coefficients from the SVM model
        coefficients = best_model_svm.coef_[0]

        # Sort features based on their coefficient magnitudes
        sorted_indices = np.argsort(np.abs(coefficients))
        selected_features = [X_train.columns[i] for i in sorted_indices[::-1]]

        # Remove the least important feature
        selected_features = selected_features[:-1]
        
        # Refit the linear SVM classifier
        best_model_svm_classification.fit(X_train[selected_features], y_train)
        
        # Evaluate Model Performance
        accuracy = cross_val_score(best_model_svm, X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

        # Check if the model performs better without the insignificant features
        if accuracy >= best_accuracy:
            best_accuracy = accuracy
            best_features = selected_features.copy()
            best_model = best_model_svm
    except:
        print("Scince the kernal is not linear, this feature selection is not possible") 
        break

# Save results
best_model_svm = best_model
best_features_svm = best_features

# Show results
print("Best features:", best_features_svm)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_svm)))

Scince the kernal is not linear, this feature selection is not possible
Best features: Index(['NederlandseAntillenEnAruba', 'GeboorteRelatief', 'SterfteRelatief',
       'GemiddeldeWoningwaarde', 'PercentageOnbewoond',
       'InBezitWoningcorporatie', 'InBezitOverigeVerhuurders',
       'EigendomOnbekend', 'BouwjaarVanaf2000', 'kPersonenMetLaagsteInkomen',
       'ALandbouwBosbouwEnVisserij', 'KLFinancieleDienstenOnroerendGoed',
       'PersonenautoSOverigeBrandstof', 'PersonenautoSPerHuishouden',
       'PersonenautoSNaarOppervlakte', 'AfstandTotHuisartsenpraktijk',
       'AfstandTotKinderdagverblijf', 'AfstandTotSchool', 'OppervlakteWater',
       'TotaalDiefstalUitWoningSchuurED',
       'VernielingMisdrijfTegenOpenbareOrde', 'GeweldsEnSeksueleMisdrijven',
       'AfstandTotHuisartsenpost', 'AfstandTotApotheek',
       'AfstandTotConsultatiebureau', 'AfstandTotOvDagelLevensmiddelen',
       'AfstandTotWarenhuis', 'AfstandTotCafeED', 'AfstandTotCafetariaED',
       'AfstandTotResta

# Random Forest

**Feature selection**

In [8]:
# Get the accuracy score of the model with all the features
rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
best_rf = rf
selected_features = X_train.columns.tolist()
best_features = selected_features
best_features_accuracy = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0), X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

while True:
    # Make DataFrame of feature impotances
    result = permutation_importance(
            rf, X_train[selected_features], y_train, n_repeats=10, random_state=42, n_jobs=-1
        )
    df_importances = pd.DataFrame({'feature': X_train[selected_features].columns, 'importance': result.importances_mean, 'std': result.importances_std})
    df_importances = df_importances.sort_values('importance')

    # Remove least important feature
    selected_features.remove(df_importances['feature'].iloc[0])

    # When there are no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train[selected_features], y_train)

    # Evaluate Model Performance
    accuracy = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    # Check if the model performs better without the insignificant features
    if accuracy >= best_features_accuracy:
        best_features_accuracy = accuracy
        best_features = selected_features.copy()
        best_model = rf

# Save results
best_model_rf = best_model
best_features_rf = best_features

# Show results
print("Best features:", best_features_rf)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_rf)))

Best features: ['AfstandTotHotelED', 'AfstandTotOpenNatNatuurlijkTerrein', 'AfstandTotKunstijsbaan']
Dropped features: ['AfstandTotOvDagelLevensmiddelen', 'ZwaarBelasteMantelzorgers', 'AfstandTotCafetariaED', 'SterfteRelatief', 'AfstandTotPodiumkunstenTotaal', 'AfstandTotBioscoop', 'ErvarenGezondheidGoedZeerGoed', 'GeweldsEnSeksueleMisdrijven', 'AfstandTotSemiOpenbaarGroenTotaal', 'AfstandTotHuisartsenpraktijk', 'kPersonenMetLaagsteInkomen', 'AfstandTotPoppodium', 'AfstandTotDagrecreatiefTerrein', 'AfstandTotVolkstuin', 'GeboorteRelatief', 'AfstandTotApotheek', 'AfstandTotHuisartsenpost', 'AfstandTotRestaurant', 'PersonenautoSPerHuishouden', 'AfstandTotSportterrein', 'AfstandTotBos', 'AfstandTotSchool', 'AfstandTotBegraafplaats', 'UrenMantelzorgPerWeek', 'AfstandTotTreinstationsTotaal', 'AfstandTotBrandweerkazerne', 'PersonenautoSOverigeBrandstof', 'TotaalDiefstalUitWoningSchuurED', 'NederlandseAntillenEnAruba', 'AfstandTotMuseum', 'PercentageOnbewoond', 'ALandbouwBosbouwEnVisserij', '

**Hyperparameter optimalisatie**

In [9]:
def bo_params_rf(min_samples_split,max_depth,max_features,n_estimators):
    
    params = {
        'min_samples_split': round(min_samples_split),
        'max_depth': round(max_depth),
        'max_features': max_features,
        'n_estimators':round(n_estimators)
    }    
        
    clf = RandomForestRegressor(min_samples_split=int(params['min_samples_split']), 
                                 max_depth=int(params['max_depth']), 
                                 max_features=params['max_features'],
                                 n_estimators=int(params['n_estimators']), 
                                 bootstrap=False,
                                 n_jobs = -1)
    return cross_val_score(clf, X_train[best_features_rf], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

# Create Bayesian Optimization
rf_bo = BayesianOptimization(f=bo_params_rf, pbounds={
    'n_estimators': (100, 1000),
    'max_depth': (1, 20),
    'max_features': (0.1, 1),
    'min_samples_split': (2, 12)
})

# Bayesian optimization
results = rf_bo.maximize(n_iter=40, init_points=10)

# Getting best hyperparameters from bayesian optimization
params = rf_bo.max['params']
params['n_estimators']= round(params['n_estimators'])
params['min_samples_split']= round(params['min_samples_split'])
params['max_depth']= round(params['max_depth'])

# Making random forest model with the best hyperparameters
# criterion entropy is after the hyperparameter optimization since it's more accurate but increases training time
best_model_rf_classification = RandomForestRegressor(min_samples_split=params['min_samples_split'],
                                                max_depth=params['max_depth'],
                                                max_features=params['max_features'],
                                                n_estimators=params['n_estimators'],
                                                bootstrap=False)

best_model_rf_classification.fit(X_train[best_features_rf], y_train)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-40.01   [0m | [0m1.831    [0m | [0m0.4243   [0m | [0m5.849    [0m | [0m971.8    [0m |
| [0m2        [0m | [0m-40.07   [0m | [0m14.95    [0m | [0m0.3514   [0m | [0m10.61    [0m | [0m374.5    [0m |
| [0m3        [0m | [0m-40.08   [0m | [0m9.579    [0m | [0m0.2602   [0m | [0m9.09     [0m | [0m339.8    [0m |
| [0m4        [0m | [0m-40.17   [0m | [0m5.878    [0m | [0m0.8091   [0m | [0m6.319    [0m | [0m726.2    [0m |
| [0m5        [0m | [0m-40.09   [0m | [0m19.09    [0m | [0m0.5504   [0m | [0m9.906    [0m | [0m266.5    [0m |
| [0m6        [0m | [0m-40.09   [0m | [0m18.19    [0m | [0m0.3616   [0m | [0m2.716    [0m | [0m241.7    [0m |
| [0m7        [0m | [0m-40.08   [0m | [0m18.49    [0m | [0m0.3557   [0m | [0m6.034    [0m | [0m401.5    [0m 

# neural network

**Feature selection**

In [10]:
# Get the accuracy score of the model with all the features
mlp = MLPRegressor(random_state=0).fit(X_train, y_train)
best_model = mlp
selected_features = X_train.columns.tolist()
best_features = selected_features
best_features_accuracy = cross_val_score(MLPRegressor(random_state=0), X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

while True:
    # Retrieve the learned weights
    weights = mlp.coefs_[0]  # Weights connecting input features to the first hidden layer

    # Normalize the weights
    normalized_weights = np.abs(weights) / np.sum(np.abs(weights), axis=0)

    # Calculate feature importance
    feature_importance = np.mean(normalized_weights, axis=1)

    # Sort features based on their importance
    sorted_indices = np.argsort(feature_importance)
    selected_features = [selected_features[i] for i in sorted_indices[:-1]]

    # When there are no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    mlp = MLPRegressor(random_state=0).fit(X_train[selected_features], y_train)

    # Evaluate Model Performance
    accuracy = cross_val_score(MLPRegressor(random_state=0), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    # Check if the model performs better without the insignificant features
    if accuracy >= best_features_accuracy:
        best_features_accuracy = accuracy
        best_features = selected_features.copy()
        best_model = mlp

# Save results
best_model_nn = best_model
best_features_nn = best_features

# Show results
print("Best features:", best_features_nn)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_nn)))

Best features: ['AfstandTotBelangrijkOverstapstation', 'VoldoetAanRichtlijnAlcoholgebruik', 'AfstandTotCafeED', 'NederlandseAntillenEnAruba', 'PersonenautoSPerHuishouden', 'AfstandTotPoppodium', 'SterfteRelatief', 'AfstandTotHuisartsenpraktijk', 'ALandbouwBosbouwEnVisserij', 'Ondergewicht', 'AfstandTotBegraafplaats', 'TotaalDiefstalUitWoningSchuurED', 'ErnstigOvergewichtObesitas', 'InBezitOverigeVerhuurders']
Dropped features: ['AfstandTotHotelED', 'AfstandTotOvDagelLevensmiddelen', 'ZwaarBelasteMantelzorgers', 'AfstandTotCafetariaED', 'AfstandTotPodiumkunstenTotaal', 'AfstandTotBioscoop', 'ErvarenGezondheidGoedZeerGoed', 'GeweldsEnSeksueleMisdrijven', 'AfstandTotSemiOpenbaarGroenTotaal', 'kPersonenMetLaagsteInkomen', 'AfstandTotDagrecreatiefTerrein', 'AfstandTotVolkstuin', 'GeboorteRelatief', 'AfstandTotApotheek', 'AfstandTotHuisartsenpost', 'AfstandTotRestaurant', 'AfstandTotSportterrein', 'AfstandTotBos', 'AfstandTotSchool', 'UrenMantelzorgPerWeek', 'AfstandTotTreinstationsTotaal', 

**Hyperparameter optimalisatie**

In [11]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    regressor = model(**params)
    
    # Calculating rmse based on
    scores = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    return scores.mean()

params_ranges = {
    'hidden_layer_sizes': (10, 100),
    'alpha': (0.0001, 0.1),
    'learning_rate_init': (0.001, 0.1),
    'max_iter': (100, 1000)
}

# Example usage with Neural Network
model = MLPRegressor
model_bo = BayesianOptimization(f=lambda hidden_layer_sizes, alpha, learning_rate_init, max_iter:
                                    bo_params_generic(model, {
                                        'hidden_layer_sizes': (int(round(hidden_layer_sizes)),),
                                        'alpha': alpha,
                                        'learning_rate_init': learning_rate_init,
                                        'max_iter': int(round(max_iter))
                                    }, X_train[best_features_nn], y_train),
                             pbounds=params_ranges)

results = model_bo.maximize(n_iter=40, init_points=10)
params = model_bo.max['params']

# Creating a model with the best hyperparameters
best_model_nn = model(
    hidden_layer_sizes=(int(round(params['hidden_layer_sizes'])),),
    alpha=params['alpha'],
    learning_rate_init=params['learning_rate_init'],
    max_iter=int(round(params['max_iter']))
)

# Fit the model
best_model_nn.fit(X_train[best_features_nn], y_train)

|   iter    |  target   |   alpha   | hidden... | learni... | max_iter  |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-6.326   [0m | [0m0.0384   [0m | [0m33.01    [0m | [0m0.09991  [0m | [0m735.5    [0m |
| [95m2        [0m | [95m-6.322   [0m | [95m0.08634  [0m | [95m38.05    [0m | [95m0.09131  [0m | [95m296.6    [0m |
| [0m3        [0m | [0m-6.329   [0m | [0m0.05689  [0m | [0m43.52    [0m | [0m0.03594  [0m | [0m810.1    [0m |
| [95m4        [0m | [95m-6.319   [0m | [95m0.09579  [0m | [95m56.55    [0m | [95m0.07501  [0m | [95m569.6    [0m |
| [0m5        [0m | [0m-6.326   [0m | [0m0.06148  [0m | [0m75.15    [0m | [0m0.02454  [0m | [0m811.8    [0m |
| [0m6        [0m | [0m-6.332   [0m | [0m0.01993  [0m | [0m75.45    [0m | [0m0.009217 [0m | [0m538.4    [0m |
| [0m7        [0m | [0m-6.32    [0m | [0m0.09086  [0m | [0m88.87    [0m | [0m0.08749  [0m | [0m64

# plotting the bar chart

**using train dataset**

In [12]:
regression_models = [
    {
        'name': 'Multiple Linear Regression',
        'model': best_model_mlr,
        'features': best_features_mlr
    },
    {
        'name': 'Random Forest',
        'model': best_model_rf,
        'features': best_features_rf
    },
    {
        'name': 'Neural Network',
        'model': best_model_nn,
        'features': best_features_nn
    },
    {
        'name': 'Support Vector Machine',
        'model': best_model_svm,
        'features': best_features_svm
    }
]

regression_stats_val = []

for model in regression_models:
    try:
        # Perform cross-validation and calculate predictions and metrics
        y_pred = cross_val_predict(model['model'], X_train[model['features']], y_train, cv=10)
        y_true = y_train
    except: # cv can't be higher then n_splits in the dt
        # Perform cross-validation and calculate predictions and metrics
        y_pred = cross_val_predict(model['model'], X_train[model['features']], y_train, cv=5)
        y_true = y_train

    # R2 Score
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 Score
    n = len(y_true)  # Number of samples
    p = X_test.shape[1]  # Number of predictors (features) in X
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

    # Save stats
    regression_stats_val.append(
        {
            'model': model['name'],
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'r2': r2,
            'adj_r2': adj_r2
        }
    )

# View stats
regression_stats_val

[{'model': 'Multiple Linear Regression',
  'mae': 4.271132898632918,
  'mse': 39.92912894295568,
  'rmse': 6.318949987375725,
  'r2': 0.012762824657256755,
  'adj_r2': -0.06162533710161511},
 {'model': 'Random Forest',
  'mae': 4.322654777866058,
  'mse': 40.11135702458488,
  'rmse': 6.333352747525191,
  'r2': 0.008257283431131857,
  'adj_r2': -0.06647037013164603},
 {'model': 'Neural Network',
  'mae': 4.260345979475965,
  'mse': 40.24957054897028,
  'rmse': 6.344254924651931,
  'r2': 0.0048399905168952095,
  'adj_r2': -0.07014515551520084},
 {'model': 'Support Vector Machine',
  'mae': 4.225914993743921,
  'mse': 40.14149887176524,
  'rmse': 6.335731912870465,
  'r2': 0.007512034214389174,
  'adj_r2': -0.06727177375653937}]

In [13]:
# Extracting model names and metric scores
model_names = [d['model'] for d in regression_stats_val]
metric_labels = list(regression_stats_val[0].keys())[1:]
metric_scores = {metric: [d[metric] for d in regression_stats_val] for metric in metric_labels}

# Plotting the bar chart
fig = go.Figure()

for metric in metric_labels:
    fig.add_trace(go.Bar(
        x=model_names,
        y=metric_scores[metric],
        text=metric_scores[metric],  # Display the values on the bars
        textposition='auto',  # Position the text inside the bars
        name=metric
    ))

# Updating the layout
fig.update_layout(
    title='Evaluation Metrics Comparison',
    xaxis_title='Models',
    yaxis_title='Scores',
    barmode='group'
)

# Display the plot
fig.show()

**Evaluating on test data**

In [14]:
regression_models = [
    {
        'name': 'Multiple Linear Regression',
        'model': best_model_mlr,
        'features': best_features_mlr
    },
    {
        'name': 'Random Forest',
        'model': best_model_rf,
        'features': best_features_rf
    },
    {
        'name': 'Neural Network',
        'model': best_model_nn,
        'features': best_features_nn
    },
    {
        'name': 'Support Vector Machine',
        'model': best_model_svm,
        'features': best_features_svm
    }
]

regression_stats_test = []

for model in regression_models:
    # Make predictions on the test set
    y_pred = model['model'].predict(X_test[model['features']])

    # Defining Actual values
    y_true = y_test.copy()

    # R2 Score
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 Score
    n = len(y_true)  # Number of samples
    p = X_test.shape[1]  # Number of predictors (features) in X
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

    # Save stats
    regression_stats_test.append(
        {
            'model': model['name'],
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'r2': r2,
            'adj_r2': adj_r2
        }
    )

# View stats
regression_stats_test

[{'model': 'Multiple Linear Regression',
  'mae': 3.97781567679382,
  'mse': 34.18902499117986,
  'rmse': 5.84713818813784,
  'r2': -0.013434883588436142,
  'adj_r2': -0.40754844942838364},
 {'model': 'Random Forest',
  'mae': 3.9341030694094976,
  'mse': 33.660782863811036,
  'rmse': 5.801791349558431,
  'r2': 0.002223328337602859,
  'adj_r2': -0.3858009328644405},
 {'model': 'Neural Network',
  'mae': 3.8927329889485485,
  'mse': 33.23141476486775,
  'rmse': 5.764669527810571,
  'r2': 0.014950705309641044,
  'adj_r2': -0.3681240204032763},
 {'model': 'Support Vector Machine',
  'mae': 3.8623681616949863,
  'mse': 33.26738144798301,
  'rmse': 5.7677882631025055,
  'r2': 0.013884576886107891,
  'adj_r2': -0.3696047543248502}]

In [18]:
# Extracting model names and metric scores
model_names = [d['model'] for d in regression_stats_test]
metric_labels = list(regression_stats_test[0].keys())[1:]
metric_scores = {metric: [d[metric] for d in regression_stats_test] for metric in metric_labels}

# Plotting the bar chart
fig = go.Figure()

for metric in metric_labels:
    fig.add_trace(go.Bar(
        x=model_names,
        y=metric_scores[metric],
        text=metric_scores[metric],  # Display the values on the bars
        textposition='auto',  # Position the text inside the bars
        name=metric
    ))

# Updating the layout
fig.update_layout(
    title='Evaluation Metrics Comparison Lenigheid',
    xaxis_title='Models',
    yaxis_title='Scores',
    barmode='group'
)

# Display the plot
fig.show()

Naive bayes

In [16]:
# Defining Actual values
y_true = y_test.copy()

# Defining Predicted values
y_pred = np.full_like(y_test, y_train.mean())

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Mean Squared Error
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)

# Root Mean Squared Error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# R2 Score
r2 = r2_score(y_true, y_pred)
print("R2 Score:", r2)

# Adjusted R2 Score
n = len(y_true)  # Number of samples
p = X_test.shape[1]  # Number of predictors (features) in X
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print("Adjusted R2 Score:", adj_r2)

Mean Absolute Error: 3.901570418326693
Mean Squared Error: 33.74061749653026
Root Mean Squared Error: 5.808667445854536
R2 Score: -0.00014313873003102273
Adjusted R2 Score: -0.3890876926805986


Save best model as Pickle

In [17]:
import pickle

# Save model
file_path = 'Saved Models/Lenigheid.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(best_model_mlr, file)

# Save features
with open('Saved Models/best_features_Lenigheid.txt', 'w') as file:
    file.write(f'{best_features_mlr}')