In [95]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pygam import GAM, s, f, LinearGAM
import warnings
warnings.filterwarnings('ignore')

### Import Data

In [96]:
# Load the cleaned KORA S3/S4 noise table (path is relative to this notebook)
# and print per-column summary statistics as a quick sanity check.
data_path = '../../Data/KORA_S3_S4/KORA_Noise_CleanData_MiniModel.csv'
data = pd.read_csv(filepath_or_buffer=data_path)
print(data.describe())

               sex          age          bmi      smoking     lden_org  \
count  9116.000000  9116.000000  9116.000000  9116.000000  9116.000000   
mean      0.492979    49.555836    27.115082     2.169263    54.726163   
std       0.499978    14.033439     4.594119     0.810235     6.564306   
min       0.000000    24.000000    15.840000     1.000000    30.000000   
25%       0.000000    37.000000    23.940000     1.000000    50.600000   
50%       0.000000    50.000000    26.580000     2.000000    53.900000   
75%       1.000000    61.000000    29.630000     3.000000    58.500000   
max       1.000000    75.000000    56.930000     3.000000    77.500000   

           bp_syst  
count  9116.000000  
mean    130.545470  
std      19.540302  
min      77.000000  
25%     116.500000  
50%     128.250000  
75%     142.000000  
max     228.000000  


In [97]:
# Target: systolic blood pressure.
Y_SBP = data['bp_syst']

# Features: every remaining column of the table (the target is dropped).
X = data.drop(columns=['bp_syst'])
print(X.head())

# Legend for the integer-coded categorical columns.
print('Data description \n')
print('Sex: Female = 0, Male = 1 \nSmoking: Current = 1, Ex-smoker = 2, Never-smoker =3 \n ')


   sex   age    bmi  smoking  lden_org
0  0.0  31.0  18.94      2.0      41.0
1  1.0  40.0  27.14      2.0      55.2
2  1.0  59.0  30.34      3.0      55.2
3  0.0  62.0  19.46      1.0      46.8
4  0.0  62.0  31.25      2.0      51.2
Data description 

Sex: Female = 0, Male = 1 
Smoking: Current = 1, Ex-smoker = 2, Never-smoker =3 
 


### Train-Test data split

In [99]:
# Hold out 20% of the original (untransformed) data for evaluation.
# The fixed random_state keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y_SBP,
    test_size=0.2,
    random_state=10,
)

# Prediction with Linear Regression (LR)

In [100]:
# Linear regression: fit on the training split, then score on the held-out split.
LR = linear_model.LinearRegression()
LR.fit(X_train, y_train)
y_pred_LR = LR.predict(X_test)

# Report test-set error metrics, rounded to two decimals.
print(f'Mean squared error (MSE) : {mean_squared_error(y_true=y_test, y_pred=y_pred_LR):.2f}')
print(f'Mean absolute error (MAE) : {mean_absolute_error(y_true=y_test, y_pred=y_pred_LR):.2f}')
print(f'R^2: {r2_score(y_true=y_test, y_pred=y_pred_LR):.2f}')

Mean squared error (MSE) : 294.79
Mean absolute error (MAE) : 13.07
R^2: 0.27


# Prediction with GAM

In [101]:
# Build the additive model with pyGAM. Terms are addressed by column position
# in X: factor terms f(.) for the coded categorical columns, spline terms s(.)
# for the continuous ones.
gam_pyGam = GAM(
    f(0)      # sex
    + s(1)    # age
    + s(2)    # bmi
    + f(3)    # smoking
    + s(4)    # lden_org
)
gam_pyGam.fit(X_train, y_train)
# For a term-by-term fit report, call gam_pyGam.summary() here.

y_pred_GAM = gam_pyGam.predict(X_test)

# Report test-set error metrics, rounded to two decimals.
print(f'Mean squared error (MSE) : {mean_squared_error(y_true=y_test, y_pred=y_pred_GAM):.2f}')
print(f'Mean absolute error (MAE) : {mean_absolute_error(y_true=y_test, y_pred=y_pred_GAM):.2f}')
print(f'R^2: {r2_score(y_true=y_test, y_pred=y_pred_GAM):.2f}')

Mean squared error (MSE) : 293.11
Mean absolute error (MAE) : 13.02
R^2: 0.27


# Prediction with Random Forest

In [102]:
# Small random forest: 10 trees, each capped at depth 10, considering
# sqrt(n_features) candidate features per split. The fixed random_state
# makes the fitted forest reproducible.
RF = RandomForestRegressor(
    n_estimators=10,
    max_depth=10,
    max_features='sqrt',
    random_state=0,
)
RF.fit(X_train, y_train)
y_pred_RF = RF.predict(X_test)

# Report test-set error metrics, rounded to two decimals.
print(f'Mean squared error (MSE) : {mean_squared_error(y_true=y_test, y_pred=y_pred_RF):.2f}')
print(f'Mean absolute error (MAE) : {mean_absolute_error(y_true=y_test, y_pred=y_pred_RF):.2f}')
print(f'R^2: {r2_score(y_true=y_test, y_pred=y_pred_RF):.2f}')

Mean squared error (MSE) : 299.10
Mean absolute error (MAE) : 13.20
R^2: 0.26
