In [110]:
'''Predicting Mortality across Germany with different AI Methods
'''

import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import impute
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import dbf
from pygam import GAM, s, f, LinearGAM
import xgboost
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Load Data

In [111]:
# Set the working directory.
# The project share below stays the default; set the NOISE2NAKO_OUTPUT_DIR
# environment variable to run the notebook from a machine where the data lives
# somewhere else, without editing this cell.
os.chdir(os.environ.get(
    "NOISE2NAKO_OUTPUT_DIR",
    "N:/WG_ENRI/20_projects/Noise2NAKO/04_data/grid_prediction/3_output",
))

In [112]:
# Load the two SES tables (comma-separated) from the working directory:
# one used to fit/validate the models, one used for the final prediction.
data_training, data_test = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ('SES_training.csv', 'SES_test.csv')
)

In [125]:
# Separate the response variable from the input variables for both data sets.
# Besides the response itself, the columns 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp'
# are removed because they are not used as features.
target_column = 'ks_Mortality_17'
non_feature_columns = [target_column, 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp']

X = data_training.drop(columns=non_feature_columns)
y = data_training[target_column]

pred_X = data_test.drop(columns=non_feature_columns)
pred_y = data_test[target_column]

# Hold out 20% of the training table for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Prediction

In [159]:
def Prediction(method, X_train, X_test, y_train, y_test, pred_X, pred_y):
    '''Fit one regression model and score it on three data sets.

    Parameters
    ----------
    method : str
        Which model to fit: 'LR' (linear regression), 'GAM' (additive model
        with one spline term per feature column), 'RF' (random forest) or
        'XGBoost' (gradient boosted trees).
    X_train, y_train
        Features and response used to fit the model.
    X_test, y_test
        Held-out features and response, reported as the '*_test' metrics.
    pred_X, pred_y
        Features and response of the prediction data set, reported as the
        '*_pred' metrics.

    Returns
    -------
    result : dict
        'Method' plus MSE, MAE and R^2 for the train, test and pred sets
        (keys 'MSE_train', 'MAE_train', 'R_2_train', ... 'R_2_pred').
    predictions : pandas.DataFrame
        The model's predictions for pred_X in a single column
        'ks_Mortality_17'.

    Raises
    ------
    ValueError
        If method is not one of the supported names. Previously any unknown
        name silently trained XGBoost and was reported under the wrong label.
    '''

    if method == 'LR':
        model = linear_model.LinearRegression()
    elif method == 'GAM':
        # One spline term per feature column, built from the actual number of
        # columns instead of a fixed s(0)..s(9) list.
        n_features = X_train.shape[1]
        terms = s(0)
        for feature_index in range(1, n_features):
            terms += s(feature_index)
        model = GAM(terms)
    elif method == 'RF':
        model = RandomForestRegressor(n_estimators= 10, max_depth= 10, random_state= 0, max_features='sqrt')
    elif method == 'XGBoost':
        model = xgboost.XGBRegressor()
    else:
        raise ValueError(
            "Unknown method %r; expected one of 'LR', 'GAM', 'RF', 'XGBoost'" % (method,)
        )

    model = model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    y_pred = model.predict(pred_X)

    # Same three metrics for each data set; insertion order matches the
    # column order of the results table.
    result = {'Method': method}
    for suffix, y_true, y_hat in (('train', y_train, y_pred_train),
                                  ('test', y_test, y_pred_test),
                                  ('pred', pred_y, y_pred)):
        result['MSE_' + suffix] = mean_squared_error(y_true, y_hat)
        result['MAE_' + suffix] = mean_absolute_error(y_true, y_hat)
        result['R_2_' + suffix] = r2_score(y_true, y_hat)

    return result, pd.DataFrame(y_pred, columns = ['ks_Mortality_17'])

### Prediction with different methods

In [161]:
# Column order of the results table
result_columns = ['Method', 'MSE_train', 'MAE_train', 'R_2_train', 'MSE_test', 'MAE_test', 'R_2_test', 'MSE_pred', 'MAE_pred', 'R_2_pred']

# Every model is fitted and scored on exactly the same data
split_data = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, pred_X=pred_X, pred_y=pred_y)

# Linear Regression
metrics_LR, y_pred_LR = Prediction(method='LR', **split_data)

# GAM
metrics_GAM, y_pred_GAM = Prediction(method='GAM', **split_data)

# Random Forest
metrics_RF, y_pred_RF = Prediction(method='RF', **split_data)

# XGBoost
metrics_XGBoost, y_pred_XGBoost = Prediction(method='XGBoost', **split_data)

# Build the table once from the collected records: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is wasteful anyway.
result = pd.DataFrame(
    [metrics_LR, metrics_GAM, metrics_RF, metrics_XGBoost],
    columns=result_columns,
)

result.to_csv('PredictionTask_results.csv', index=False, sep=',')

result

Unnamed: 0,Method,MSE_train,MAE_train,R_2_train,MSE_test,MAE_test,R_2_test,MSE_pred,MAE_pred,R_2_pred
0,LR,0.384853,0.426614,0.893768,0.361315,0.409386,0.900386,1.567778,0.955292,0.53693
1,GAM,0.091077,0.183962,0.97486,0.125594,0.20867,0.965374,1.67824,0.996681,0.504303
2,RF,0.030275,0.07756,0.991643,0.123404,0.161132,0.965978,1.330406,0.880585,0.607041
3,XGBoost,1.1e-05,0.002114,0.999997,0.098933,0.126608,0.972724,1.323051,0.87243,0.609214


### Merge dataframes to make the final output dataset

In [180]:
def OutputDataframe(data_training, data_test, y_pred, method):
    '''Write the final id -> mortality table for one prediction method.

    The output stacks the rows of data_training (keeping their existing
    'ks_Mortality_17' values) on top of the rows of data_test, where the
    'ks_Mortality_17' column of data_test is replaced by the values in y_pred.
    Only the columns 'id' and 'ks_Mortality_17' are written.

    Parameters
    ----------
    data_training : pandas.DataFrame
        Must contain the columns 'id' and 'ks_Mortality_17'.
    data_test : pandas.DataFrame
        Must contain the columns 'id' and 'ks_Mortality_17'.
    y_pred : pandas.DataFrame
        Predictions for data_test with a column 'ks_Mortality_17'. It is
        joined to data_test on the index, so both must share the same index.
    method : str
        Method name, used as the suffix of the output file name.

    Returns
    -------
    None. The table is written to
    '../5_results/prediction_results/prediction_result_<method>.csv'.
    '''
    # Swap the test set's target column for the model's predictions
    # (column-wise concat aligns the two frames on their index).
    predicted = pd.concat(
        [data_test.drop(['ks_Mortality_17'], axis = 1), pd.DataFrame(y_pred)],
        axis=1,
    )

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_result = pd.concat([data_training, predicted])
    data_result = data_result[['id', 'ks_Mortality_17']]

    output_path = '../5_results/prediction_results/prediction_result_'+method+'.csv'
    # Create the results folder on first use instead of failing in to_csv.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    data_result.to_csv(output_path, sep=',', index=False)

In [181]:
# Export the combined training + predicted table for the XGBoost model
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_XGBoost,
    method='XGBoost',
)

In [171]:
# Preview of the combined table for the linear-regression predictions:
# training rows followed by the test rows with predicted mortality.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data_result = pd.concat([
    data_training,
    pd.concat([data_test.drop(['ks_Mortality_17'], axis = 1), pd.DataFrame(y_pred_LR)], axis=1),
])
data_result = data_result[['id','ks_Mortality_17']]
data_result

Unnamed: 0,id,ks_Mortality_17
0,5kmN2760E4145,9.141840
1,5kmN2760E4155,9.139040
2,5kmN2760E4160,8.290640
3,5kmN2760E4165,9.223600
4,5kmN2765E4145,8.965440
...,...,...
13840,5kmN3540E4210,13.322621
13841,5kmN3540E4215,13.325853
13842,5kmN3545E4215,13.323629
13843,5kmN3545E4220,13.323586


In [165]:
# Put the linear-regression predictions next to the features they were
# computed from (joined column-wise on the index).
lr_predictions = pd.DataFrame(y_pred_LR)
temp = pd.concat([pred_X, lr_predictions], axis=1)
temp

Unnamed: 0,ks_foreign_17,ks_inc_hh_17,ks_inc_17,ks_nondipl_17,ks_abi_17,ks_arbl_17,ks_GISD_14,geb_abs,hh_abs,pers_abs,ks_Mortality_17
0,8.4,2047.0,2978.0,5.0,19.4,2.4,0.501063,184.0,378.0,378.0,10.367672
1,8.4,2047.0,2978.0,5.0,19.4,2.4,0.501063,1.0,2.0,8.0,10.367160
2,8.4,2047.0,2978.0,5.0,19.4,2.4,0.501063,590.0,1232.0,1232.0,10.368682
3,8.4,2047.0,2978.0,5.0,19.4,2.4,0.501063,288.0,727.0,728.0,10.367172
4,8.4,2047.0,2978.0,5.0,19.4,2.4,0.501063,50.0,46.0,108.0,10.367577
...,...,...,...,...,...,...,...,...,...,...,...
13840,6.8,2135.0,2768.0,7.3,26.2,5.5,0.745840,209.0,187.0,190.0,13.322621
13841,6.8,2135.0,2768.0,7.3,26.2,5.5,0.745840,412.0,143.0,160.0,13.325853
13842,6.8,2135.0,2768.0,7.3,26.2,5.5,0.745840,310.0,255.0,355.0,13.323629
13843,6.8,2135.0,2768.0,7.3,26.2,5.5,0.745840,501.0,698.0,1120.0,13.323586


In [147]:
# Display the test table as loaded from 'SES_test.csv'
data_test

Unnamed: 0,id,x_sw,y_sw,x_mp,y_mp,ks_foreign_17,ks_inc_hh_17,ks_inc_17,ks_nondipl_17,ks_abi_17,ks_arbl_17,ks_Mortality_17,ks_GISD_14,geb_abs,hh_abs,pers_abs
0,5kmN2685E4330,4330000,2685000.0,4332500,2687500,8.4,2047.0,2978.0,5.0,19.4,2.4,10.3,0.501063,184.0,378.0,378.0
1,5kmN2685E4340,4340000,2685000.0,4342500,2687500,8.4,2047.0,2978.0,5.0,19.4,2.4,10.3,0.501063,1.0,2.0,8.0
2,5kmN2690E4330,4330000,2690000.0,4332500,2692500,8.4,2047.0,2978.0,5.0,19.4,2.4,10.3,0.501063,590.0,1232.0,1232.0
3,5kmN2690E4335,4335000,2690000.0,4337500,2692500,8.4,2047.0,2978.0,5.0,19.4,2.4,10.3,0.501063,288.0,727.0,728.0
4,5kmN2690E4340,4340000,2690000.0,4342500,2692500,8.4,2047.0,2978.0,5.0,19.4,2.4,10.3,0.501063,50.0,46.0,108.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13840,5kmN3540E4210,4210000,3540000.0,4212500,3542500,6.8,2135.0,2768.0,7.3,26.2,5.5,11.7,0.745840,209.0,187.0,190.0
13841,5kmN3540E4215,4215000,3540000.0,4217500,3542500,6.8,2135.0,2768.0,7.3,26.2,5.5,11.7,0.745840,412.0,143.0,160.0
13842,5kmN3545E4215,4215000,3545000.0,4217500,3547500,6.8,2135.0,2768.0,7.3,26.2,5.5,11.7,0.745840,310.0,255.0,355.0
13843,5kmN3545E4220,4220000,3545000.0,4222500,3547500,6.8,2135.0,2768.0,7.3,26.2,5.5,11.7,0.745840,501.0,698.0,1120.0
