# Bayesian Ridge model on the Goat dataset

In [21]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import seaborn as sns

from sklearn.linear_model import BayesianRidge
from scipy.stats import f_oneway
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


## Load the data and preprocess

In [11]:
#Load in the dataset
df = pd.read_excel("C:/Users/daanm/Documents/Universiteit Utrecht/Scriptie/goat_df.xlsx")
df = df.drop(['Unnamed: 0','advise_vip','purchase_bin','cs_cs_ratio_post_goat','buy_nr_goat','buy_goat', 'ratio_insured_goat', 'n_previd_goat','wave', 'id'], axis=1)
df.head()

Unnamed: 0,afm_language,age_constant,agric_land,amh_language,educ_recoded_constant,eng_language,expend,irrigated_land_bin,cs_cs_diff_post_goat,number_minors,...,activity_child_recoded,household_description,number_adults,main_info_source_recoded,religion_recoded,owns_phone,household_moved,why_not_purchase_recoded,know_vip,trust_vip
0,No,25,Yes,No,Never attended,No,0,1,-120.671875,2,...,Working with Livestock,Fully settled: The whole of the household (all...,2,Interpersonal Sources,Traditional/Wakefata,0,No,Lack of Awareness or Understanding,Yes,Yes
1,Yes,27,Yes,No,Adult Education,No,0,0,-758.621033,2,...,Not working,Fully settled: The whole of the household (all...,2,Interpersonal Sources,Traditional/Wakefata,0,No,"Financial, Practical, and Situational Constraints",Yes,Yes
2,Yes,29,No,No,Elementary,No,1,0,-1180.266846,2,...,Working with Livestock,Fully settled: The whole of the household (all...,2,Professional and Organizational Sources,Christian,0,No,"Financial, Practical, and Situational Constraints",Yes,Yes
3,No,35,Yes,No,Never attended,No,0,0,-53.165897,3,...,Student,Fully settled: The whole of the household (all...,2,Interpersonal Sources,Traditional/Wakefata,0,No,"Financial, Practical, and Situational Constraints",Yes,Yes
4,No,36,Yes,No,Never attended,No,1,1,-513.432312,3,...,Working with Livestock,Fully settled: The whole of the household (all...,2,Interpersonal Sources,Traditional/Wakefata,0,No,Lack of Awareness or Understanding,Yes,Yes


In [12]:
df['cs_diff_log'] = np.log(np.abs(df['cs_cs_diff_post_goat']) + 1)
df[['cs_cs_diff_post_goat', 'cs_diff_log']].head()

Unnamed: 0,cs_cs_diff_post_goat,cs_diff_log
0,-120.671875,4.801328
1,-758.621033,6.63282
2,-1180.266846,7.074343
3,-53.165897,3.992052
4,-513.432312,6.243064


In [13]:
y = df['cs_diff_log']
X = df.drop(columns=['cs_diff_log', 'cs_cs_diff_post_goat'])

In [15]:
categorical = X.select_dtypes(include=['object','category']).columns.tolist()
numerical = X.select_dtypes(include=['int64','float64']).columns.tolist()

In [16]:
X_encoded = X.copy()

# Encode categorical variables
label_encoders = {}
for col in categorical:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

# Scale the numerical variables
scaler = StandardScaler()
X_encoded[numerical] = scaler.fit_transform(X_encoded[numerical])


X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

## Deploy the model

In [17]:
bayes_model = BayesianRidge()
bayes_model.fit(X_train, y_train)

In [20]:
y_pred = bayes_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²-score: {r2:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

R²-score: 0.010
MAE: 1.130
RMSE: 1.506


## Gridsearch with Cross Validation

To find the best model, we will be applying grid search to this method

In [35]:
param_grid = {
    "alpha_1": [1e-2, 1e-3, 1e-4, 1e-5, 1e-1, 1],
    "alpha_2": [1e-2, 1e-3, 1e-4, 1e-5, 1e-1, 1],
    "lambda_1": [1e-2, 1e-3, 1e-4, 1e-5, 1e-1, 1],
    "lambda_2": [1e-2, 1e-3, 1e-4, 1e-5, 1e-1, 1]
}

bayesian_grid = BayesianRidge()

grid_search = GridSearchCV(bayesian_grid, param_grid, cv=5, scoring="r2", n_jobs=-1, return_train_score=True)
grid_search.fit(X_train, y_train)

In [36]:
results_df = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_score')

results_summary = []

#Rerun the gridsearchCV and store all the values of the performance metrics

for i, row in results_df.iterrows():
    params = row['params']
    
    model = BayesianRidge(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    results_summary.append({
        'Parameters': params,
        'R²': round(r2, 3),
        'MAE': round(mae, 3),
        'RMSE': round(rmse, 3)
    })


summary_df = pd.DataFrame(results_summary)

summary_df = summary_df.sort_values(by='R²', ascending=False).reset_index(drop=True)
styled_df = summary_df.style.background_gradient(subset=['R²'], cmap='Greens')

styled_df

Unnamed: 0,Parameters,R²,MAE,RMSE
0,"{'alpha_1': 1, 'alpha_2': 1e-05, 'lambda_1': 1e-05, 'lambda_2': 1}",0.016,1.127,1.502
1,"{'alpha_1': 1, 'alpha_2': 0.0001, 'lambda_1': 1e-05, 'lambda_2': 1}",0.016,1.127,1.502
2,"{'alpha_1': 1, 'alpha_2': 0.001, 'lambda_1': 1e-05, 'lambda_2': 1}",0.016,1.127,1.502
3,"{'alpha_1': 1, 'alpha_2': 0.01, 'lambda_1': 1e-05, 'lambda_2': 1}",0.016,1.127,1.502
4,"{'alpha_1': 1, 'alpha_2': 1e-05, 'lambda_1': 0.0001, 'lambda_2': 1}",0.016,1.127,1.502
5,"{'alpha_1': 1, 'alpha_2': 0.0001, 'lambda_1': 0.0001, 'lambda_2': 1}",0.016,1.127,1.502
6,"{'alpha_1': 1, 'alpha_2': 0.001, 'lambda_1': 0.0001, 'lambda_2': 1}",0.016,1.127,1.502
7,"{'alpha_1': 1, 'alpha_2': 0.01, 'lambda_1': 0.0001, 'lambda_2': 1}",0.016,1.127,1.502
8,"{'alpha_1': 1, 'alpha_2': 0.1, 'lambda_1': 1e-05, 'lambda_2': 1}",0.016,1.127,1.502
9,"{'alpha_1': 1, 'alpha_2': 0.1, 'lambda_1': 0.0001, 'lambda_2': 1}",0.016,1.127,1.502
