In [1]:
'''Predicting Mortality across Germany with different AI Methods
'''

import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import impute
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import dbf
from pygam import GAM, s, f, LinearGAM
import xgboost
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Load Data

In [2]:
# Set the working directory.
# All later paths ('Prediction/...', '../5_results/...') are relative to this folder.
# The location can be overridden with the NOISE2NAKO_OUTPUT_DIR environment variable so the
# notebook also runs on machines without the N: drive; the original path stays the default.
os.chdir(os.environ.get("NOISE2NAKO_OUTPUT_DIR", "N:/WG_ENRI/20_projects/Noise2NAKO/04_data/grid_prediction/3_output"))

In [3]:
# Load both CSV files (paths are relative to the working directory set above):
# the first becomes the training frame, the second the frame to predict for.
data_training, data_test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('Prediction/Training_data.csv', 'Prediction/Test_data.csv')
)

In [4]:
# Separate the response ('ks_Mortality_17') from the input variables for both frames.
# The columns 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp' are treated as unnecessary features
# and are dropped together with the response.

y = data_training['ks_Mortality_17']
X = data_training.drop(columns=['ks_Mortality_17', 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp'])

pred_y = data_test['ks_Mortality_17']
pred_X = data_test.drop(columns=['ks_Mortality_17', 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp'])

# Hold out 20% of the training frame; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Prediction

In [5]:
def Prediction(method, X_train, X_test, y_train, y_test, pred_X, pred_y):
    '''Fit one regression model and score it on three datasets.

    Parameters
    ----------
    method : str
        Model to fit: 'LR' (linear regression), 'GAM' (generalized additive
        model), 'RF' (random forest) or 'XGBoost' (XGBoost regressor).
    X_train, y_train
        Features and response the model is fitted on.
    X_test, y_test
        Hold-out features and response used for the ``*_test`` metrics.
    pred_X, pred_y
        Features and observed response of the prediction dataset, used for
        the ``*_pred`` metrics and for the returned predictions.

    Returns
    -------
    result : dict
        ``'Method'`` plus MSE, MAE and R^2 for every dataset, keyed
        ``'MSE_<set>'``, ``'MAE_<set>'`` and ``'R_2_<set>'`` with ``<set>``
        being ``train``, ``test`` or ``pred``.
    predictions : pandas.DataFrame
        Predictions for ``pred_X`` in a single column ``'ks_Mortality_17'``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously any
        unknown name silently trained XGBoost and reported its metrics under
        the unknown label.
    '''

    if method == 'LR':
        model = linear_model.LinearRegression().fit(X_train, y_train)
    elif method == 'GAM':
        # One spline term per feature index 0-9.
        # NOTE(review): this hard-codes ten terms - confirm it matches the columns of X_train.
        model = GAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9)).fit(X_train, y_train)
    elif method == 'RF':
        model = RandomForestRegressor(n_estimators= 10, max_depth= 10, random_state= 0, max_features='sqrt').fit(X_train, y_train)
    elif method == 'XGBoost':
        model = xgboost.XGBRegressor().fit(X_train, y_train)
    else:
        raise ValueError(
            "Unknown method %r; expected one of 'LR', 'GAM', 'RF', 'XGBoost'" % (method,)
        )

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    y_pred = model.predict(pred_X)

    # Same three metrics for every dataset; insertion order matches the result table columns.
    result = {'Method': method}
    scored_sets = (
        ('train', y_train, y_pred_train),
        ('test', y_test, y_pred_test),
        ('pred', pred_y, y_pred),
    )
    for suffix, observed, predicted in scored_sets:
        result['MSE_' + suffix] = mean_squared_error(observed, predicted)
        result['MAE_' + suffix] = mean_absolute_error(observed, predicted)
        result['R_2_' + suffix] = r2_score(observed, predicted)

    return result, pd.DataFrame(y_pred, columns = ['ks_Mortality_17'])

### Prediction with different methods

In [6]:
# Every model is fitted and scored on exactly the same data.
prediction_inputs = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, pred_X=pred_X, pred_y=pred_y)

# Linear Regression
metrics_LR, y_pred_LR = Prediction(method='LR', **prediction_inputs)

# GAM
metrics_GAM, y_pred_GAM = Prediction(method='GAM', **prediction_inputs)

# Random Forest
metrics_RF, y_pred_RF = Prediction(method='RF', **prediction_inputs)

# XGBoost
metrics_XGBoost, y_pred_XGBoost = Prediction(method='XGBoost', **prediction_inputs)

# Build the comparison table once from the collected metric rows.
# (DataFrame.append, used here before, was removed in pandas 2.0.)
result = pd.DataFrame(
    [metrics_LR, metrics_GAM, metrics_RF, metrics_XGBoost],
    columns=['Method', 'MSE_train', 'MAE_train', 'R_2_train', 'MSE_test', 'MAE_test', 'R_2_test', 'MSE_pred', 'MAE_pred', 'R_2_pred'],
)

result.to_csv('../5_results/PredictionTask_results.csv', index=False, sep=',')

result

Unnamed: 0,Method,MSE_train,MAE_train,R_2_train,MSE_test,MAE_test,R_2_test,MSE_pred,MAE_pred,R_2_pred
0,LR,0.356645,0.418341,0.901554,0.35757,0.417339,0.901419,1.499346,0.927773,0.557142
1,GAM,0.091077,0.183962,0.97486,0.125594,0.20867,0.965374,1.67824,0.996681,0.504303
2,RF,0.059473,0.128474,0.983583,0.182717,0.23141,0.949626,1.419871,0.906787,0.580616
3,XGBoost,3e-06,0.001231,0.999999,0.105029,0.143332,0.971044,1.28793,0.863227,0.619588


### Merge dataframes to make the final output dataset

In [180]:
def OutputDataframe(data_training, data_test, y_pred, method):
    '''Write observed and predicted 'ks_Mortality_17' values per 'id' to a CSV file.

    The rows of ``data_training`` are kept as they are. For ``data_test`` the
    'ks_Mortality_17' column is replaced by the values in ``y_pred``. Both parts
    are stacked, reduced to the columns 'id' and 'ks_Mortality_17' and written to
    '../5_results/prediction_results/prediction_result_<method>.csv'.

    Parameters
    ----------
    data_training : pandas.DataFrame
        Frame containing at least 'id' and 'ks_Mortality_17'.
    data_test : pandas.DataFrame
        Frame containing at least 'id' and 'ks_Mortality_17'.
    y_pred : pandas.DataFrame
        Predictions with a column named 'ks_Mortality_17', one row per row of
        ``data_test`` and in the same order.
    method : str
        Label inserted into the output file name.

    Returns
    -------
    None
    '''

    # Pair each prediction with its test row by position, not by index label, so a
    # filtered or re-indexed data_test cannot misalign the two frames.
    test_features = data_test.drop(['ks_Mortality_17'], axis = 1).reset_index(drop=True)
    predicted = pd.DataFrame(y_pred).reset_index(drop=True)
    data_predicted = pd.concat([test_features, predicted], axis=1)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_result = pd.concat([data_training, data_predicted])
    data_result = data_result[['id', 'ks_Mortality_17']]
    data_result.to_csv(('../5_results/prediction_results/prediction_result_'+method+'.csv'), sep=',', index=False)

In [181]:
# Export the combined observed/predicted table for the XGBoost model.
OutputDataframe(
    data_training=data_training,
    data_test=data_test,
    y_pred=y_pred_XGBoost,
    method='XGBoost',
)