# General Imports

**Importing all libraries**

In [2]:
import numpy as np
import pandas as pd
import warnings

import plotly.graph_objects as go
from bayes_opt import BayesianOptimization
import statsmodels.api as sm

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance


Disable warnings

In [3]:
# Disable all warnings
warnings.filterwarnings("ignore")

# Enable warnings again
## warnings.filterwarnings("default")

Generate train and test data

In [4]:
# Read Data
df = pd.read_csv("data_finish_prep.csv")

# Select only the usefull part of the df
df.drop(columns=['verschil_Lengte',
               'verschil_6 MWT', 'verschil_TUG',
               'verschil_Gewicht', 'verschil_Conditie',
               'verschil_Lenigheid', 'verschil_Knijpkracht'], inplace=True)

# Drop NaN values
df.dropna(inplace=True)

# Define X and y
X = df.drop(columns=['verschil_BMI'])
y = df['verschil_BMI']

# Split data into train & test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    regressor = model(**params)
    
    # Assuming you have X_train, y_train defined for regression
    scores = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    return -scores.mean()

# MLR

**feature selection**

In [6]:
# Fit the Full Model
X =  sm.add_constant(X_train)  # Add constant term for the intercept
model = sm.OLS(y_train, X_train).fit()  # Fit the MLR model
best_model = model  # Initialize the best model
selected_features = X_train.columns.tolist()
best_features = X_train.columns.tolist()
best_features_rmse = np.sqrt(-cross_val_score(LinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error').mean())

# Iteratively remove one feature at a time based on p-values
while True:
    # Compute p-values
    p_values = model.pvalues[1:]  # Exclude the constant term

    # Identify Insignificant Features
    insignificant_feature = p_values.idxmax()
    max_p_value = p_values.max()

    # Remove Insignificant Feature
    X = X.drop(insignificant_feature, axis=1)
    selected_features.remove(insignificant_feature)

    # When there a no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    model = sm.OLS(y_train, X).fit()
    
    # Evaluate Model Performance
    rmse = np.sqrt(-cross_val_score(LinearRegression(), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error').mean())

    # Check if the model performs better without the insignificant features
    if rmse <= best_features_rmse:
        best_features_rmse = rmse
        best_features = selected_features.copy()
        best_model = model

# Save results
best_model_mlr = LinearRegression().fit(X_train[best_features], y_train)
best_features_mlr = best_features

# Show results
print("Best features:", best_features)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features)))

Best features: ['SterfteRelatief', 'AfstandTotOpenbaarGroenTotaal', 'AfstandTotDagrecreatiefTerrein', 'AfstandTotSauna', 'ErvarenGezondheidGoedZeerGoed']
Dropped features: ['GeboorteRelatief', 'NederlandseAntillenEnAruba', 'PercentageOnbewoond', 'AfstandTotOvDagelLevensmiddelen', 'VernielingMisdrijfTegenOpenbareOrde', 'AfstandTotOpenNatNatuurlijkTerrein', 'AfstandTotHuisartsenpost', 'AfstandTotHotelED', 'InBezitOverigeVerhuurders', 'AfstandTotOpenDroogNatTerrein', 'AfstandTotPodiumkunstenTotaal', 'PersonenautoSNaarOppervlakte', 'ZwaarBelasteMantelzorgers', 'Mantelzorger', 'AfstandTotBos', 'AfstandTotHuisartsenpraktijk', 'kPersonenMetLaagsteInkomen', 'VoldoetAanRichtlijnAlcoholgebruik', 'EenOfMeerLangdurigeAandoeningen', 'KLFinancieleDienstenOnroerendGoed', 'AfstandTotVolkstuin', 'AfstandTotMuseum', 'AfstandTotBioscoop', 'AfstandTotTreinstationsTotaal', 'AfstandTotKunstijsbaan', 'AfstandTotSportterrein', 'AfstandTotZonnebank', 'AfstandTotBegraafplaats', 'MatigOvergewicht', 'UrenMantelzo

*Er zijn geen hyperparameters om te optimaliseren.*

# Support Vector Machines

**Hyperparameter optimalisatie**

In [7]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    classifier = model(**params)
    
    # Calculating accuracy based on cross-validation
    scores = cross_val_score(classifier, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    return scores.mean()

params_ranges = {
    'C': (0.1, 10),
    'kernel_int': (1, 4),
    'gamma': (0.001, 0.1)
}

kernel_mapping = {
    1: 'linear',
    2: 'rbf',
    3: 'poly',
    4: 'sigmoid'
}

# Example usage with SVM
model = SVR
model_bo = BayesianOptimization(f=lambda C, kernel_int, gamma:
                                    bo_params_generic(model, {
                                        'C': C,
                                        'kernel': kernel_mapping[int(kernel_int)],
                                        'gamma': gamma
                                    }, X_train, y_train),
                             pbounds=params_ranges)

results = model_bo.maximize(n_iter=40, init_points=10)
params = model_bo.max['params']

# Creating a model with the best hyperparameters
best_model_svm = model(
    C=params['C'],
    kernel=kernel_mapping[int(params['kernel_int'])],
    gamma=params['gamma']
)

# Fit the model
best_model_svm.fit(X_train, y_train)


|   iter    |  target   |     C     |   gamma   | kernel... |
-------------------------------------------------------------
| [0m1        [0m | [0m-1.464   [0m | [0m8.212    [0m | [0m0.06187  [0m | [0m1.454    [0m |
| [95m2        [0m | [95m-1.462   [0m | [95m6.616    [0m | [95m0.0192   [0m | [95m3.165    [0m |
| [0m3        [0m | [0m-1.464   [0m | [0m8.611    [0m | [0m0.04692  [0m | [0m1.715    [0m |
| [95m4        [0m | [95m-1.459   [0m | [95m6.991    [0m | [95m0.07612  [0m | [95m2.141    [0m |
| [95m5        [0m | [95m-1.456   [0m | [95m7.939    [0m | [95m0.09745  [0m | [95m3.105    [0m |
| [0m6        [0m | [0m-1.46    [0m | [0m9.544    [0m | [0m0.04993  [0m | [0m2.052    [0m |
| [0m7        [0m | [0m-1.462   [0m | [0m7.167    [0m | [0m0.01756  [0m | [0m3.542    [0m |
| [0m8        [0m | [0m-1.459   [0m | [0m3.804    [0m | [0m0.07632  [0m | [0m2.321    [0m |
| [0m9        [0m | [0m-1.464   [0m | [0

**Feature Selection**

In [8]:
# Get the accuracy score of the model with all features
best_accuracy = cross_val_score(best_model_svm, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()
best_model = best_model_svm
selected_features = X_train.columns
best_features = selected_features

while len(selected_features) > 1:
    try:
        # Retrieve the coefficients from the SVM model
        coefficients = best_model_svm.coef_[0]

        # Sort features based on their coefficient magnitudes
        sorted_indices = np.argsort(np.abs(coefficients))
        selected_features = [X_train.columns[i] for i in sorted_indices[::-1]]

        # Remove the least important feature
        selected_features = selected_features[:-1]
        
        # Refit the linear SVM classifier
        best_model_svm_classification.fit(X_train[selected_features], y_train)
        
        # Evaluate Model Performance
        accuracy = cross_val_score(best_model_svm, X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

        # Check if the model performs better without the insignificant features
        if accuracy >= best_accuracy:
            best_accuracy = accuracy
            best_features = selected_features.copy()
            best_model = best_model_svm
    except:
        print("Scince the kernal is not linear, this feature selection is not possible") 
        break

# Save results
best_model_svm = best_model
best_features_svm = best_features

# Show results
print("Best features:", best_features_svm)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_svm)))

Scince the kernal is not linear, this feature selection is not possible
Best features: Index(['NederlandseAntillenEnAruba', 'GeboorteRelatief', 'SterfteRelatief',
       'GemiddeldeWoningwaarde', 'PercentageOnbewoond',
       'InBezitWoningcorporatie', 'InBezitOverigeVerhuurders',
       'EigendomOnbekend', 'BouwjaarVanaf2000', 'kPersonenMetLaagsteInkomen',
       'ALandbouwBosbouwEnVisserij', 'KLFinancieleDienstenOnroerendGoed',
       'PersonenautoSOverigeBrandstof', 'PersonenautoSPerHuishouden',
       'PersonenautoSNaarOppervlakte', 'AfstandTotHuisartsenpraktijk',
       'AfstandTotKinderdagverblijf', 'AfstandTotSchool', 'OppervlakteWater',
       'TotaalDiefstalUitWoningSchuurED',
       'VernielingMisdrijfTegenOpenbareOrde', 'GeweldsEnSeksueleMisdrijven',
       'AfstandTotHuisartsenpost', 'AfstandTotApotheek',
       'AfstandTotConsultatiebureau', 'AfstandTotOvDagelLevensmiddelen',
       'AfstandTotWarenhuis', 'AfstandTotCafeED', 'AfstandTotCafetariaED',
       'AfstandTotResta

# Random Forest

**Feature selection**

In [9]:
# Get the accuracy score of the model with all the features
rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
best_rf = rf
selected_features = X_train.columns.tolist()
best_features = selected_features
best_features_accuracy = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0), X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

while True:
    # Make DataFrame of feature impotances
    result = permutation_importance(
            rf, X_train[selected_features], y_train, n_repeats=10, random_state=42, n_jobs=-1
        )
    df_importances = pd.DataFrame({'feature': X_train[selected_features].columns, 'importance': result.importances_mean, 'std': result.importances_std})
    df_importances = df_importances.sort_values('importance')

    # Remove least important feature
    selected_features.remove(df_importances['feature'].iloc[0])

    # When there are no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train[selected_features], y_train)

    # Evaluate Model Performance
    accuracy = cross_val_score(RandomForestRegressor(n_estimators=100, random_state=0), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    # Check if the model performs better without the insignificant features
    if accuracy >= best_features_accuracy:
        best_features_accuracy = accuracy
        best_features = selected_features.copy()
        best_model = rf

# Save results
best_model_rf = best_model
best_features_rf = best_features

# Show results
print("Best features:", best_features_rf)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_rf)))

Best features: ['AfstandTotZonnebank']
Dropped features: ['GeboorteRelatief', 'NederlandseAntillenEnAruba', 'PercentageOnbewoond', 'AfstandTotOvDagelLevensmiddelen', 'VernielingMisdrijfTegenOpenbareOrde', 'AfstandTotOpenNatNatuurlijkTerrein', 'AfstandTotHuisartsenpost', 'AfstandTotHotelED', 'InBezitOverigeVerhuurders', 'AfstandTotOpenDroogNatTerrein', 'AfstandTotPodiumkunstenTotaal', 'SterfteRelatief', 'PersonenautoSNaarOppervlakte', 'ZwaarBelasteMantelzorgers', 'Mantelzorger', 'AfstandTotBos', 'AfstandTotHuisartsenpraktijk', 'kPersonenMetLaagsteInkomen', 'VoldoetAanRichtlijnAlcoholgebruik', 'EenOfMeerLangdurigeAandoeningen', 'KLFinancieleDienstenOnroerendGoed', 'AfstandTotVolkstuin', 'AfstandTotMuseum', 'AfstandTotBioscoop', 'AfstandTotTreinstationsTotaal', 'AfstandTotKunstijsbaan', 'ErvarenGezondheidGoedZeerGoed', 'AfstandTotSportterrein', 'AfstandTotBegraafplaats', 'MatigOvergewicht', 'UrenMantelzorgPerWeek', 'AfstandTotCafeED', 'BouwjaarVanaf2000', 'PersonenautoSOverigeBrandstof', 

**Hyperparameter optimalisatie**

In [10]:
def bo_params_rf(min_samples_split,max_depth,max_features,n_estimators):
    
    params = {
        'min_samples_split': round(min_samples_split),
        'max_depth': round(max_depth),
        'max_features': max_features,
        'n_estimators':round(n_estimators)
    }    
        
    clf = RandomForestRegressor(min_samples_split=int(params['min_samples_split']), 
                                 max_depth=int(params['max_depth']), 
                                 max_features=params['max_features'],
                                 n_estimators=int(params['n_estimators']), 
                                 bootstrap=False,
                                 n_jobs = -1)
    return cross_val_score(clf, X_train[best_features_rf], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

# Create Bayesian Optimization
rf_bo = BayesianOptimization(f=bo_params_rf, pbounds={
    'n_estimators': (100, 1000),
    'max_depth': (1, 20),
    'max_features': (0.1, 1),
    'min_samples_split': (2, 12)
})

# Bayesian optimization
results = rf_bo.maximize(n_iter=40, init_points=10)

# Getting best hyperparameters from bayesian optimization
params = rf_bo.max['params']
params['n_estimators']= round(params['n_estimators'])
params['min_samples_split']= round(params['min_samples_split'])
params['max_depth']= round(params['max_depth'])

# Making random forest model with the best hyperparameters
# criterion entropy is after the hyperparameter optimization since it's more accurate but increases training time
best_model_rf_classification = RandomForestRegressor(min_samples_split=params['min_samples_split'],
                                                max_depth=params['max_depth'],
                                                max_features=params['max_features'],
                                                n_estimators=params['n_estimators'],
                                                bootstrap=False)

best_model_rf_classification.fit(X_train[best_features_rf], y_train)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-1.431   [0m | [0m16.01    [0m | [0m0.3792   [0m | [0m10.02    [0m | [0m631.9    [0m |
| [95m2        [0m | [95m-1.431   [0m | [95m19.75    [0m | [95m0.9291   [0m | [95m5.092    [0m | [95m956.0    [0m |
| [0m3        [0m | [0m-1.431   [0m | [0m15.49    [0m | [0m0.5633   [0m | [0m10.19    [0m | [0m721.4    [0m |
| [0m4        [0m | [0m-1.431   [0m | [0m6.254    [0m | [0m0.8138   [0m | [0m9.955    [0m | [0m193.0    [0m |
| [0m5        [0m | [0m-1.431   [0m | [0m7.144    [0m | [0m0.3693   [0m | [0m11.74    [0m | [0m210.8    [0m |
| [0m6        [0m | [0m-1.442   [0m | [0m1.867    [0m | [0m0.6937   [0m | [0m10.03    [0m | [0m660.9    [0m |
| [0m7        [0m | [0m-1.431   [0m | [0m10.87    [0m | [0m0.6695   [0m | [0m10.58    [0m | [0m201.2   

# neural network

**Feature selection**

In [11]:
# Get the accuracy score of the model with all the features
mlp = MLPRegressor(random_state=0).fit(X_train, y_train)
best_model = mlp
selected_features = X_train.columns.tolist()
best_features = selected_features
best_features_accuracy = cross_val_score(MLPRegressor(random_state=0), X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

while True:
    # Retrieve the learned weights
    weights = mlp.coefs_[0]  # Weights connecting input features to the first hidden layer

    # Normalize the weights
    normalized_weights = np.abs(weights) / np.sum(np.abs(weights), axis=0)

    # Calculate feature importance
    feature_importance = np.mean(normalized_weights, axis=1)

    # Sort features based on their importance
    sorted_indices = np.argsort(feature_importance)
    selected_features = [selected_features[i] for i in sorted_indices[:-1]]

    # When there are no features left
    if len(selected_features) == 0:
        break
    
    # Refit the Model
    mlp = MLPRegressor(random_state=0).fit(X_train[selected_features], y_train)

    # Evaluate Model Performance
    accuracy = cross_val_score(MLPRegressor(random_state=0), X_train[selected_features], y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    # Check if the model performs better without the insignificant features
    if accuracy >= best_features_accuracy:
        best_features_accuracy = accuracy
        best_features = selected_features.copy()
        best_model = mlp

# Save results
best_model_nn = best_model
best_features_nn = best_features

# Show results
print("Best features:", best_features_nn)
print("Dropped features:", list(set(X_train.columns.tolist()) - set(best_features_nn)))

Best features: ['ErnstigOvergewichtObesitas', 'UrenMantelzorgPerWeek', 'ZwaarBelasteMantelzorgers', 'AfstandTotSchool', 'AfstandTotBos', 'kPersonenMetLaagsteInkomen', 'VoldoetAanBeweegrichtlijn', 'OvermatigDrinker', 'InBezitWoningcorporatie', 'MatigOvergewicht', 'PersonenautoSPerHuishouden', 'AfstandTotWarenhuis', 'ErvarenGezondheidGoedZeerGoed', 'BouwjaarVanaf2000', 'AfstandTotConsultatiebureau', 'NederlandseAntillenEnAruba', 'AfstandTotBibliotheek', 'BeperkingInZien', 'AfstandTotHotelED', 'OppervlakteWater', 'PersonenautoSNaarOppervlakte', 'VernielingMisdrijfTegenOpenbareOrde', 'AfstandTotBrandweerkazerne', 'Ondergewicht', 'GeboorteRelatief', 'AfstandTotTreinstationsTotaal', 'AfstandTotCafeED', 'GeweldsEnSeksueleMisdrijven', 'AfstandTotSauna', 'AfstandTotBegraafplaats', 'AfstandTotZwembad', 'AfstandTotPodiumkunstenTotaal', 'KLFinancieleDienstenOnroerendGoed', 'AfstandTotOpenNatNatuurlijkTerrein', 'VoldoetAanRichtlijnAlcoholgebruik', 'AfstandTotPoppodium', 'AfstandTotBuitenschoolseOpv

**Hyperparameter optimalisatie**

In [12]:
def bo_params_generic(model, params, X_train, y_train):
    # Create the model instance with the specified parameters
    regressor = model(**params)
    
    # Calculating rmse based on
    scores = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    return scores.mean()

params_ranges = {
    'hidden_layer_sizes': (10, 100),
    'alpha': (0.0001, 0.1),
    'learning_rate_init': (0.001, 0.1),
    'max_iter': (100, 1000)
}

# Example usage with Neural Network
model = MLPRegressor
model_bo = BayesianOptimization(f=lambda hidden_layer_sizes, alpha, learning_rate_init, max_iter:
                                    bo_params_generic(model, {
                                        'hidden_layer_sizes': (int(round(hidden_layer_sizes)),),
                                        'alpha': alpha,
                                        'learning_rate_init': learning_rate_init,
                                        'max_iter': int(round(max_iter))
                                    }, X_train[best_features_nn], y_train),
                             pbounds=params_ranges)

results = model_bo.maximize(n_iter=40, init_points=10)
params = model_bo.max['params']

# Creating a model with the best hyperparameters
best_model_nn = model(
    hidden_layer_sizes=(int(round(params['hidden_layer_sizes'])),),
    alpha=params['alpha'],
    learning_rate_init=params['learning_rate_init'],
    max_iter=int(round(params['max_iter']))
)

# Fit the model
best_model_nn.fit(X_train[best_features_nn], y_train)

|   iter    |  target   |   alpha   | hidden... | learni... | max_iter  |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-1.196   [0m | [0m0.01551  [0m | [0m34.05    [0m | [0m0.05803  [0m | [0m339.0    [0m |
| [95m2        [0m | [95m-1.189   [0m | [95m0.05164  [0m | [95m89.73    [0m | [95m0.07927  [0m | [95m794.0    [0m |
| [0m3        [0m | [0m-1.195   [0m | [0m0.02318  [0m | [0m79.38    [0m | [0m0.006472 [0m | [0m335.8    [0m |
| [0m4        [0m | [0m-1.189   [0m | [0m0.005794 [0m | [0m84.41    [0m | [0m0.03631  [0m | [0m437.0    [0m |
| [0m5        [0m | [0m-1.196   [0m | [0m0.06415  [0m | [0m28.57    [0m | [0m0.05     [0m | [0m421.8    [0m |
| [0m6        [0m | [0m-1.193   [0m | [0m0.05351  [0m | [0m27.65    [0m | [0m0.05896  [0m | [0m195.5    [0m |
| [0m7        [0m | [0m-1.195   [0m | [0m0.0496   [0m | [0m91.51    [0m | [0m0.06839  [0m | [0m175.4   

# plotting the bar chart

**using train dataset**

In [13]:
regression_models = [
    {
        'name': 'Multiple Linear Regression',
        'model': best_model_mlr,
        'features': best_features_mlr
    },
    {
        'name': 'Random Forest',
        'model': best_model_rf,
        'features': best_features_rf
    },
    {
        'name': 'Neural Network',
        'model': best_model_nn,
        'features': best_features_nn
    },
    {
        'name': 'Support Vector Machine',
        'model': best_model_svm,
        'features': best_features_svm
    }
]

regression_stats_val = []

for model in regression_models:
    try:
        # Perform cross-validation and calculate predictions and metrics
        y_pred = cross_val_predict(model['model'], X_train[model['features']], y_train, cv=10)
        y_true = y_train
    except: # cv can't be higher then n_splits in the dt
        # Perform cross-validation and calculate predictions and metrics
        y_pred = cross_val_predict(model['model'], X_train[model['features']], y_train, cv=5)
        y_true = y_train

    # R2 Score
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 Score
    n = len(y_true)  # Number of samples
    p = X_test.shape[1]  # Number of predictors (features) in X
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

    # Save stats
    regression_stats_val.append(
        {
            'model': model['name'],
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'r2': r2,
            'adj_r2': adj_r2
        }
    )

# View stats
regression_stats_val

[{'model': 'Multiple Linear Regression',
  'mae': 0.8658034142298338,
  'mse': 1.4329686505901573,
  'rmse': 1.1970666859411623,
  'r2': -0.00024049827615324482,
  'adj_r2': -0.07829717038243067},
 {'model': 'Random Forest',
  'mae': 0.8660175252217047,
  'mse': 1.4303383451797227,
  'rmse': 1.195967535169631,
  'r2': 0.0015955069939328492,
  'adj_r2': -0.07631788710910459},
 {'model': 'Neural Network',
  'mae': 0.8710141324847772,
  'mse': 1.4350780077305418,
  'rmse': 1.1979474144262519,
  'r2': -0.0017128713362837278,
  'adj_r2': -0.07988444435026354},
 {'model': 'Support Vector Machine',
  'mae': 0.8606476198631337,
  'mse': 1.4553841018399163,
  'rmse': 1.2063930130102363,
  'r2': -0.01588692718993978,
  'adj_r2': -0.09516461381568764}]

In [14]:
# Extracting model names and metric scores
model_names = [d['model'] for d in regression_stats_val]
metric_labels = list(regression_stats_val[0].keys())[1:]
metric_scores = {metric: [d[metric] for d in regression_stats_val] for metric in metric_labels}

# Plotting the bar chart
fig = go.Figure()

for metric in metric_labels:
    fig.add_trace(go.Bar(
        x=model_names,
        y=metric_scores[metric],
        name=metric
    ))

# Updating the layout
fig.update_layout(
    title='Evaluation Metrics Comparison BMI',
    xaxis_title='Models',
    yaxis_title='Scores',
    barmode='group'
)

# Display the plot
fig.show()

**Evaluating on test data**

In [15]:
regression_models = [
    {
        'name': 'Multiple Linear Regression',
        'model': best_model_mlr,
        'features': best_features_mlr
    },
    {
        'name': 'Random Forest',
        'model': best_model_rf,
        'features': best_features_rf
    },
    {
        'name': 'Neural Network',
        'model': best_model_nn,
        'features': best_features_nn
    },
    {
        'name': 'Support Vector Machine',
        'model': best_model_svm,
        'features': best_features_svm
    }
]

regression_stats_test = []

for model in regression_models:
    # Make predictions on the test set
    y_pred = model['model'].predict(X_test[model['features']])

    # Defining Actual values
    y_true = y_test.copy()

    # R2 Score
    r2 = r2_score(y_true, y_pred)

    # Adjusted R2 Score
    n = len(y_true)  # Number of samples
    p = X_test.shape[1]  # Number of predictors (features) in X
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

    # Save stats
    regression_stats_test.append(
        {
            'model': model['name'],
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mean_squared_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'r2': r2,
            'adj_r2': adj_r2
        }
    )

# View stats
regression_stats_test

[{'model': 'Multiple Linear Regression',
  'mae': 0.9044952818839904,
  'mse': 1.5831372919145972,
  'rmse': 1.2582278378396328,
  'r2': 0.00014222434620525526,
  'adj_r2': -0.4091562803073949},
 {'model': 'Random Forest',
  'mae': 0.8941572116607733,
  'mse': 1.5706296636202557,
  'rmse': 1.2532476465648184,
  'r2': 0.00804163361977539,
  'adj_r2': -0.3980231947230066},
 {'model': 'Neural Network',
  'mae': 0.9102933583645284,
  'mse': 1.5865271895589421,
  'rmse': 1.2595742096275797,
  'r2': -0.0019987242219821066,
  'adj_r2': -0.4121736405701619},
 {'model': 'Support Vector Machine',
  'mae': 0.8925430540704585,
  'mse': 1.6333271641412361,
  'rmse': 1.2780168872676276,
  'r2': -0.031556058715642044,
  'adj_r2': -0.45383046871619714}]

In [19]:
# Extracting model names and metric scores
model_names = [d['model'] for d in regression_stats_test]
metric_labels = list(regression_stats_test[0].keys())[1:]
metric_scores = {metric: [d[metric] for d in regression_stats_test] for metric in metric_labels}

# Plotting the bar chart
fig = go.Figure()

for metric in metric_labels:
    fig.add_trace(go.Bar(
        x=model_names,
        y=metric_scores[metric],
        text=metric_scores[metric],  # Display the values on the bars
        textposition='auto',  # Position the text inside the bars
        name=metric
    ))

# Updating the layout
fig.update_layout(
    title='Evaluation Metrics Comparison BMI',
    xaxis_title='Models',
    yaxis_title='Scores',
    barmode='group'
)

# Display the plot
fig.show()

Naive bayes

In [17]:
# Defining Actual values
y_true = y_test.copy()

# Defining Predicted values
y_pred = np.full_like(y_test, y_train.mean())

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Mean Squared Error
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)

# Root Mean Squared Error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# R2 Score
r2 = r2_score(y_true, y_pred)
print("R2 Score:", r2)

# Adjusted R2 Score
n = len(y_true)  # Number of samples
p = X_test.shape[1]  # Number of predictors (features) in X
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print("Adjusted R2 Score:", adj_r2)

Mean Absolute Error: 0.9067514130956161
Mean Squared Error: 1.5936473903413322
Root Mean Squared Error: 1.2623974771605544
R2 Score: -0.006495610343513203
Adjusted R2 Score: -0.41851135726775834


Save best model as Pickle

In [18]:
import pickle

# Save model
file_path = 'Saved Models/BMI.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(best_model_mlr, file)

# Save features
with open('Saved Models/best_features_BMI.txt', 'w') as file:
    file.write(f'{best_features_mlr}')