In [2]:
import statsmodels.api as sm
import numpy as np
from statsmodels.gam.api import GLMGam, BSplines
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pygam import GAM, s, f, LinearGAM
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown from here on.
# NOTE(review): this also hides deprecation and pandas chained-assignment warnings
# raised by later cells; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Import Data

In [3]:
# Relative path to the CSV that holds the full data set.
data_path = '../../Data/KORA_S3_S4/KORA_Noise_noMissing_median.csv'

# Read the whole file into one DataFrame; every later cell selects from it.
KORA_Noise_noMissing = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Predictor matrix used by the models below. The explicit .copy() gives X_mini
# its own data, so the column assignment further down is an ordinary write and
# not an assignment on a slice of KORA_Noise_noMissing (chained assignment).
X_mini = KORA_Noise_noMissing[['sex', 'age', 'bmi', 'smoking', 'lden_org']].copy()

# noise cut-off: every lden_org value at or below 40 is raised to 40.
# clip() returns a new Series, so nothing is modified through a view.
X_mini_test = X_mini['lden_org'].clip(lower=40)
X_mini['lden_org'] = X_mini_test

# Response variable passed to train_test_split together with X_mini.
Y_SBP = KORA_Noise_noMissing['bp_syst']

# Column groups for the scaling / label-mapping cells that follow.
# NOTE(review): these are selected from the unmodified source frame, so their
# lden_org column does NOT carry the 40 cut-off applied to X_mini above.
Data_numeric = KORA_Noise_noMissing[['age', 'bmi', 'lden_org', 'bp_syst']]
Data_categoric = KORA_Noise_noMissing[['sex', 'smoking']].copy()
Data = KORA_Noise_noMissing[['sex', 'age', 'bmi', 'smoking', 'lden_org', 'bp_syst']]

# Legend for the integer codes of the two categorical predictors.
print('Data description \n')
print('Sex: Female = 0, Male = 1 \nSmoking: Current = 1, Ex-smoker = 2, Never-smoker =3 \n ')
X_mini.describe()

Data description 

Sex: Female = 0, Male = 1 
Smoking: Current = 1, Ex-smoker = 2, Never-smoker =3 
 


Unnamed: 0,sex,age,bmi,smoking,lden_org
count,9116.0,9116.0,9116.0,9116.0,9116.0
mean,0.492979,49.555836,27.115082,2.169263,54.750581
std,0.499978,14.033439,4.594119,0.810235,6.498166
min,0.0,24.0,15.84,1.0,40.0
25%,0.0,37.0,23.94,1.0,50.6
50%,0.0,50.0,26.58,2.0,53.9
75%,1.0,61.0,29.63,3.0,58.5
max,1.0,75.0,56.93,3.0,77.5


### Scaling numeric data and mapping categorical data

In [5]:
# Replace the numeric codes of the categorical columns with readable labels.
# The frame is rebuilt from KORA_Noise_noMissing on every run instead of being
# mapped in place: re-executing the cell therefore gives the same result
# (mapping already-mapped labels a second time would turn them into NaN), and
# no column is assigned on a slice of the source frame.
Data_categoric = KORA_Noise_noMissing[['sex', 'smoking']].assign(
    smoking=lambda frame: frame['smoking'].map(
        {1.0: 'Current', 2.0: 'Ex_smoker', 3.0: 'Never_smoker'}
    ),
    sex=lambda frame: frame['sex'].map({0.0: 'female', 1.0: 'male'}),
)
print(Data_categoric.dtypes)
Data_categoric.head()

sex        object
smoking    object
dtype: object


Unnamed: 0,sex,smoking
0,female,Ex_smoker
1,male,Ex_smoker
2,male,Never_smoker
3,female,Current
4,female,Ex_smoker


In [6]:
# Apply sklearn's scale() (default settings) to the numeric columns and wrap
# the resulting array in a DataFrame that reuses the original row index and
# column names, so it lines up with Data_categoric.
Data_numeric_scaled = pd.DataFrame(
    data=scale(Data_numeric),
    columns=Data_numeric.columns,
    index=Data_numeric.index,
)

# Preview the first rows of the scaled frame.
Data_numeric_scaled.head()

Unnamed: 0,age,bmi,lden_org,bp_syst
0,-1.322331,-1.779564,-2.08314,-1.256215
1,-0.680971,0.005424,0.072704,0.637412
2,0.673013,0.702005,0.072704,1.712172
3,0.886799,-1.66637,-1.202584,-1.102678
4,0.886799,0.900095,-0.534576,-0.897961


In [7]:
# Put numeric and categorical columns side by side (column-wise concat),
# once with the raw numeric values and once with the scaled ones.
Data = pd.concat((Data_numeric, Data_categoric), axis='columns')
Data_scaled = pd.concat((Data_numeric_scaled, Data_categoric), axis='columns')

# Show the first rows of both frames for a quick visual comparison.
print(Data.head())
print(Data_scaled.head())

    age    bmi  lden_org  bp_syst     sex       smoking
0  31.0  18.94      41.0    106.0  female     Ex_smoker
1  40.0  27.14      55.2    143.0    male     Ex_smoker
2  59.0  30.34      55.2    164.0    male  Never_smoker
3  62.0  19.46      46.8    109.0  female       Current
4  62.0  31.25      51.2    113.0  female     Ex_smoker
        age       bmi  lden_org   bp_syst     sex       smoking
0 -1.322331 -1.779564 -2.083140 -1.256215  female     Ex_smoker
1 -0.680971  0.005424  0.072704  0.637412    male     Ex_smoker
2  0.673013  0.702005  0.072704  1.712172    male  Never_smoker
3  0.886799 -1.666370 -1.202584 -1.102678  female       Current
4  0.886799  0.900095 -0.534576 -0.897961  female     Ex_smoker


In [8]:
# Build the model formula: bp_syst on the left-hand side, every other column
# of Data joined with ' + ' on the right-hand side.
variables = Data.drop(columns='bp_syst').columns
formula = 'bp_syst ~ ' + ' + '.join(variables)
print(formula)

bp_syst ~ age + bmi + lden_org + sex + smoking


### Train-Test data split

In [9]:
# Split the original (unscaled) predictors and response: 20% of the rows are
# held out for testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_mini,
    Y_SBP,
    test_size=0.2,
    random_state=10,
)

# Prediction with Linear Regression (LR)

In [10]:
# Ordinary least-squares baseline fitted on the training split.
LR = linear_model.LinearRegression()
LR.fit(X_train, y_train)

# Predict the held-out rows and report the three evaluation metrics.
y_pred_LR = LR.predict(X_test)
print('Mean squared error (MSE) : %.2f' % mean_squared_error(y_true=y_test, y_pred=y_pred_LR))
print('Mean absolute error (MAE) : %.2f' % mean_absolute_error(y_true=y_test, y_pred=y_pred_LR))
print('R^2: %.2f' % r2_score(y_true=y_test, y_pred=y_pred_LR))

Mean squared error (MSE) : 294.79
Mean absolute error (MAE) : 13.07
R^2: 0.27


# Prediction with GAM

In [11]:
# Building the model with pyGAM.
# Term indices refer to column positions in X_train, which keeps the column
# order chosen for X_mini: f(...) is a factor term, s(...) a spline term.
gam_pyGam = GAM(
    f(0)      # sex
    + s(1)    # age
    + s(2)    # bmi
    + f(3)    # smoking
    + s(4)    # lden_org
)
gam_pyGam.fit(X_train, y_train)
# gam_pyGam.summary()  # uncomment to inspect the fitted terms

# Predict the held-out rows and report the three evaluation metrics.
y_pred_GAM = gam_pyGam.predict(X_test)
print('Mean squared error (MSE) : %.2f' % mean_squared_error(y_true=y_test, y_pred=y_pred_GAM))
print('Mean absolute error (MAE) : %.2f' % mean_absolute_error(y_true=y_test, y_pred=y_pred_GAM))
print('R^2: %.2f' % r2_score(y_true=y_test, y_pred=y_pred_GAM))

Mean squared error (MSE) : 293.26
Mean absolute error (MAE) : 13.03
R^2: 0.27


# Prediction with Random Forest

In [12]:
# Small random forest: 10 trees, depth capped at 10, sqrt(n_features)
# candidates per split, fixed seed so the fit is repeatable.
RF = RandomForestRegressor(
    n_estimators=10,
    max_depth=10,
    max_features='sqrt',
    random_state=0,
)
RF.fit(X_train, y_train)

# Predict the held-out rows and report the three evaluation metrics.
# The MSE is kept in a variable and printed at full precision.
y_pred_RF = RF.predict(X_test)
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred_RF)
print(f'Mean squared error (MSE) : {MSE}')
print('Mean absolute error (MAE) : %.2f' % mean_absolute_error(y_true=y_test, y_pred=y_pred_RF))
print('R^2: %.2f' % r2_score(y_true=y_test, y_pred=y_pred_RF))

Mean squared error (MSE) : 299.6310967316614
Mean absolute error (MAE) : 13.23
R^2: 0.26
