In [18]:
'''Predicting Mortality across Germany with different AI Methods
'''

import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import impute
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
import dbf
from pygam import GAM, s, f, LinearGAM
import xgboost
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Load Data

In [19]:
# Set the working directory. Every relative path used further down (the
# 'Prediction/...' inputs and the '../5_results/...' outputs) is resolved
# against it.
# The project share stays the default; set the NOISE2NAKO_WORKDIR environment
# variable to run the notebook against a copy of the data stored elsewhere.
os.chdir(os.environ.get("NOISE2NAKO_WORKDIR",
                        "N:/WG_ENRI/20_projects/Noise2NAKO/04_data/grid_prediction/3_output"))

In [30]:
# Load the training file first, then the test file, both comma-separated and
# located relative to the working directory.
data_training, data_test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'Prediction/Training_data_cvd_noise.csv',
        'Prediction/Test_data_cvd_noise.csv',
    )
)

In [35]:
# Separate the response from the input variables for the training and the test file.
# Columns that never serve as features: the response itself, 'ks_Mortality_17' and 'id'.
# The coordinates 'x_sw', 'y_sw', 'x_mp', 'y_mp' are removed as well when coord is False.

output_variable = 'cvd_mortality_17'
output_path = 'CVD Mortality Rate 2017 incl Noise_XY'

# output_variable = 'ks_Mortality_17'
# output_path = 'Total Mortality Rate 2017 incl XY'

output_path = '../5_results/' + output_path + '/'
coord = True

# One drop list shared by both files, so they always lose the same columns.
non_feature_columns = [output_variable, 'ks_Mortality_17', 'id']
if not coord:
    non_feature_columns += ['x_sw', 'y_sw', 'x_mp', 'y_mp']

y = data_training[output_variable]
X = data_training.drop(non_feature_columns, axis = 1)

pred_y = data_test[output_variable]
pred_X = data_test.drop(non_feature_columns, axis = 1)

# Hold out 20 % of the training file (fixed seed); X_test / y_test are this hold-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [36]:
# Summarise the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 685 entries, 152 to 265
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   x_sw           685 non-null    int64  
 1   y_sw           685 non-null    float64
 2   x_mp           685 non-null    int64  
 3   y_mp           685 non-null    int64  
 4   ks_foreign_17  685 non-null    float64
 5   ks_inc_hh_17   685 non-null    float64
 6   ks_inc_17      685 non-null    float64
 7   ks_nondipl_17  685 non-null    float64
 8   ks_abi_17      685 non-null    float64
 9   ks_arbl_17     685 non-null    float64
 10  ks_GISD_14     685 non-null    float64
 11  geb_abs        685 non-null    float64
 12  hh_abs         685 non-null    float64
 13  pers_abs       685 non-null    float64
 14  imp_pct05_15   685 non-null    int64  
 15  imp_pct25_15   685 non-null    int64  
 16  imp_pct50_15   685 non-null    float64
 17  imp_mean_15    685 non-null    float64
 18  imp_pct7

# Prediction

In [37]:
def Prediction(method, X_train, X_test, y_train, y_test, pred_X, pred_y, output_variable):

    '''Fit the regressor selected by ``method`` and score it on three data sets.

    Parameters
    ----------
    method : str
        One of 'LR', 'LR_Ridge', 'LR_Lasso', 'LR_Elastic', 'GAM', 'RF',
        'AdaB' or 'XGBoost'.
    X_train, y_train
        Features and response the model is fitted on (metrics suffix ``_train``).
    X_test, y_test
        Features and response of the hold-out part (metrics suffix ``_val``).
    pred_X, pred_y
        Features and observed response of the separate prediction data set
        (metrics suffix ``_pred``).
    output_variable : str
        Column name given to the returned predictions.

    Returns
    -------
    (dict, pandas.DataFrame)
        The dict holds 'Method' plus MSE, MAE and R^2 for each of the three
        data sets; the DataFrame holds the predictions for ``pred_X`` in a
        single column named ``output_variable``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. An unknown name is
        rejected instead of silently being fitted as XGBoost.
    '''

    # Estimators are built lazily so that only the requested one is constructed.
    model_factories = {
        'LR': lambda: linear_model.LinearRegression(),
        'LR_Ridge': lambda: linear_model.RidgeCV(cv = 5),
        'LR_Lasso': lambda: linear_model.LassoCV(cv = 5, random_state=0),
        'LR_Elastic': lambda: linear_model.ElasticNetCV(cv = 5, random_state=0),
        # Spline terms are declared for feature indices 0-9 only.
        # NOTE(review): presumably the remaining columns do not enter the GAM - confirm this is intended.
        'GAM': lambda: GAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9)),
        'RF': lambda: RandomForestRegressor(n_estimators= 10, max_depth= 10, random_state= 0, max_features='sqrt'),
        'AdaB': lambda: AdaBoostRegressor(n_estimators= 10, random_state=0),
        'XGBoost': lambda: xgboost.XGBRegressor(),
    }
    if method not in model_factories:
        raise ValueError('Unknown method {!r}; expected one of: {}'.format(method, ', '.join(model_factories)))

    model = model_factories[method]().fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    y_pred = model.predict(pred_X)

    # Same three metrics for every data set; the key suffix identifies the set.
    result = {'Method': method}
    scored_sets = (('train', y_train, y_pred_train),
                   ('val', y_test, y_pred_test),
                   ('pred', pred_y, y_pred))
    for suffix, observed, predicted in scored_sets:
        result['MSE_' + suffix] = mean_squared_error(observed, predicted)
        result['MAE_' + suffix] = mean_absolute_error(observed, predicted)
        result['R_2_' + suffix] = r2_score(observed, predicted)

    return result, pd.DataFrame(y_pred, columns = [output_variable])

### Prediction with different methods

In [38]:
# Fit every method on the same split and collect one row of metrics per method.
metric_columns = ['Method', 'MSE_train', 'MAE_train', 'R_2_train', 'MSE_val', 'MAE_val', 'R_2_val', 'MSE_pred', 'MAE_pred', 'R_2_pred']
methods = ['LR', 'LR_Ridge', 'LR_Lasso', 'LR_Elastic', 'GAM', 'RF', 'AdaB', 'XGBoost']

metric_rows = []
# Predictions are stored per method name so that no fit overwrites the
# predictions of another one (e.g. Ridge/Lasso/ElasticNet vs. plain LR,
# AdaBoost vs. Random Forest).
predictions = {}
for method in methods:
    metric_row, method_prediction = Prediction(method=method, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, pred_X=pred_X, pred_y=pred_y, output_variable = output_variable)
    metric_rows.append(metric_row)
    predictions[method] = method_prediction

# Build the table once from the collected rows (DataFrame.append no longer exists in pandas 2.x).
result = pd.DataFrame(metric_rows, columns=metric_columns)

# Names read by the export cells further down.
y_pred_LR = predictions['LR']
y_pred_GAM = predictions['GAM']
y_pred_RF = predictions['RF']
y_pred_XGBoost = predictions['XGBoost']

result.to_csv(output_path + 'PredictionTask_results_alMethods.csv', index=False, sep=',')
result.round(decimals=3).to_csv(output_path + 'PredictionTask_results_alMethods_round.csv', index=False, sep=',')

result

Unnamed: 0,Method,MSE_train,MAE_train,R_2_train,MSE_val,MAE_val,R_2_val,MSE_pred,MAE_pred,R_2_pred
0,LR,0.109875,0.239581,0.895079,0.098786,0.223477,0.903112,0.506337,0.540181,0.499332
1,LR_Ridge,0.110552,0.240676,0.894433,0.100105,0.227555,0.901819,0.52069,0.551779,0.48514
2,LR_Lasso,0.518194,0.589722,0.505173,0.463448,0.57003,0.545459,0.862006,0.68555,0.147645
3,LR_Elastic,0.518198,0.589725,0.505169,0.463451,0.570032,0.545457,0.862007,0.68555,0.147644
4,GAM,0.019731,0.088177,0.981159,0.029679,0.106747,0.970892,0.4705,0.507509,0.534768
5,RF,0.009992,0.050354,0.990458,0.032427,0.097554,0.968197,0.436365,0.50272,0.568521
6,AdaB,0.048115,0.157629,0.954055,0.063676,0.175032,0.937548,0.478524,0.53168,0.526834
7,XGBoost,2e-06,0.00094,0.999998,0.013894,0.055238,0.986373,0.397873,0.474731,0.606581


### Merge dataframes to make the final output dataset

In [75]:
def OutputDataframe(data_training, data_test, y_pred, method, output_variable, output_path):
    '''Write the training rows followed by the test rows as an 'id' / response CSV.

    The training rows keep their values of ``output_variable``; for the test
    rows that column is replaced by ``y_pred``.

    Parameters
    ----------
    data_training, data_test : pandas.DataFrame
        Both need an 'id' column; ``data_test`` must contain ``output_variable``.
    y_pred : pandas.DataFrame or pandas.Series
        Values for the test rows, named ``output_variable`` and sharing the
        index labels of ``data_test`` (the frames are joined on the index).
    method : str
        Label used in the file name.
    output_variable : str
        Name of the response column.
    output_path : str
        Prefix of the target file; the file is written to
        ``output_path + 'prediction_result_' + method + '.csv'``.
    '''

    # Swap the observed response of the test rows for the supplied values.
    test_part = pd.concat([data_test.drop([output_variable], axis = 1), pd.DataFrame(y_pred)], axis=1)

    # pd.concat stacks the two parts; DataFrame.append was removed in pandas 2.0.
    data_result = pd.concat([data_training, test_part])
    data_result = data_result[['id', output_variable]]
    data_result.to_csv((output_path + 'prediction_result_'+method+'.csv'), sep=',', index=False)

### Make the difference output

In [76]:
def DiffDataframe(data_training, data_test, y_pred, method, output_variable, output_path):
    '''Write observed-minus-predicted differences as an 'id' / response CSV.

    Training rows get the value 0 in ``output_variable``; test rows get
    ``data_test[output_variable] - y_pred[output_variable]``.

    Parameters
    ----------
    data_training, data_test : pandas.DataFrame
        Both need an 'id' column; ``data_test`` must contain ``output_variable``.
        Neither frame is modified.
    y_pred : pandas.DataFrame
        Predictions in a column named ``output_variable``, sharing the index
        labels of ``data_test`` (the frames are joined on the index).
    method : str
        Label used in the file name.
    output_variable : str
        Name of the response column.
    output_path : str
        Prefix of the target file; the file is written to
        ``output_path + 'prediction_result_diff_' + method + '.csv'``.
    '''

    pred_variable = 'pred'+output_variable

    # Work on a copy so the caller's training frame keeps its observed values.
    training_part = data_training.copy()
    training_part[output_variable] = 0

    # Put the predictions next to the observations, then overwrite the
    # response column of this new frame with the difference.
    test_part = pd.concat([data_test, y_pred.rename(columns={output_variable: pred_variable})], axis=1)
    test_part[output_variable] = test_part[output_variable] - test_part[pred_variable]

    # pd.concat stacks the two parts; DataFrame.append was removed in pandas 2.0.
    data_result = pd.concat([training_part, test_part.drop([pred_variable], axis = 1)])
    data_result = data_result[['id', output_variable]]
    data_result.to_csv((output_path + 'prediction_result_diff_'+method+'.csv'), sep=',', index=False)

In [77]:
# Export the XGBoost predictions, then their differences to the observed values.
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_XGBoost,
    method='XGBoost',
    output_variable=output_variable,
    output_path=output_path,
)
DiffDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_XGBoost,
    method='XGBoost',
    output_variable=output_variable,
    output_path=output_path,
)

In [78]:
# Export the predictions held in y_pred_LR under the 'LR' label, then their
# differences to the observed values.
# NOTE(review): confirm y_pred_LR still refers to the plain linear-regression
# fit when this cell runs and was not reassigned by a later Prediction call.
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_LR,
    method='LR',
    output_variable=output_variable,
    output_path=output_path,
)
DiffDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_LR,
    method='LR',
    output_variable=output_variable,
    output_path=output_path,
)

In [79]:
# Export the predictions held in y_pred_RF under the 'RF' label, then their
# differences to the observed values.
# NOTE(review): confirm y_pred_RF still refers to the random-forest fit when
# this cell runs and was not reassigned by a later Prediction call.
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_RF,
    method='RF',
    output_variable=output_variable,
    output_path=output_path,
)
DiffDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_RF,
    method='RF',
    output_variable=output_variable,
    output_path=output_path,
)

In [80]:
# Export the GAM predictions, then their differences to the observed values.
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_GAM,
    method='GAM',
    output_variable=output_variable,
    output_path=output_path,
)
DiffDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_GAM,
    method='GAM',
    output_variable=output_variable,
    output_path=output_path,
)

In [81]:
# Write the observed test values (pred_y) through the same export, labelled
# 'GT' (presumably "ground truth"), as the reference for the model outputs.
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=pred_y,
    method='GT',
    output_variable=output_variable,
    output_path=output_path,
)