In [5]:
'''Predicting Mortality across Germany with different AI Methods
'''

import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import impute
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import dbf
from pygam import GAM, s, f, LinearGAM
import xgboost
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Load Data

In [6]:
# Set the working directory; every relative path used below resolves against it.
# The project share stays the default. Set the NOISE2NAKO_OUTPUT_DIR environment
# variable to run the notebook from another machine without editing this cell.
os.chdir(os.environ.get("NOISE2NAKO_OUTPUT_DIR", "N:/WG_ENRI/20_projects/Noise2NAKO/04_data/grid_prediction/3_output"))

In [7]:
# Load the comma-separated training and test tables (paths are relative to the
# working directory) into data_training and data_test, in that order.
data_training, data_test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('Prediction/Training_data_cvd.csv', 'Prediction/Test_data_cvd.csv')
)

In [21]:
# Response column and the folder that receives the exported results.
output_variable = 'cvd_mortality_17'
output_path = '../5_results/CVD Mortality Rate 2017/'

# Columns kept out of the feature matrix: the response itself, 'ks_Mortality_17',
# and the identifier / coordinate columns 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp'.
non_feature_columns = [output_variable, 'ks_Mortality_17', 'id', 'x_sw', 'y_sw', 'x_mp', 'y_mp']

# Training table: input variables and response.
X = data_training.drop(columns=non_feature_columns)
y = data_training[output_variable]

# Test table: same column selection as for the training table.
pred_X = data_test.drop(columns=non_feature_columns)
pred_y = data_test[output_variable]

# Hold out 20 % of the training table (fixed seed) for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Prediction

In [20]:
def Prediction(method, X_train, X_test, y_train, y_test, pred_X, pred_y, output_variable):
    """Fit one regression model and score it on three data sets.

    Parameters
    ----------
    method : str
        'LR' fits a linear regression, 'GAM' a pygam GAM with one spline term
        for each of the feature indices 0 to 9, 'RF' a random forest. Any
        other value fits an XGBoost regressor with default settings.
    X_train, y_train
        Inputs and response the model is fitted on (scored as '*_train').
    X_test, y_test
        Held-out inputs and response (scored as '*_val').
    pred_X, pred_y
        Inputs and response of the separate data set (scored as '*_pred').
    output_variable : str
        Column name given to the returned prediction frame.

    Returns
    -------
    tuple
        A dict with the key 'Method' plus MSE, MAE and R_2 for the train,
        val and pred sets, and a one-column DataFrame holding the model's
        predictions for ``pred_X``.
    """
    # Choose the unfitted estimator first; fitting happens in one place below.
    if method == 'LR':
        estimator = linear_model.LinearRegression()
    elif method == 'GAM':
        # Spline terms for feature indices 0..9, summed left to right.
        # NOTE(review): hard-coded to ten terms - confirm this matches the
        # number of input columns.
        spline_terms = s(0)
        for feature_index in range(1, 10):
            spline_terms = spline_terms + s(feature_index)
        estimator = GAM(spline_terms)
    elif method == 'RF':
        estimator = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=0, max_features='sqrt')
    else:
        estimator = xgboost.XGBRegressor()
    model = estimator.fit(X_train, y_train)

    # (key suffix, observed response, model prediction) for each data set.
    evaluations = (
        ('train', y_train, model.predict(X_train)),
        ('val', y_test, model.predict(X_test)),
        ('pred', pred_y, model.predict(pred_X)),
    )

    result = {'Method': method}
    for suffix, observed, predicted in evaluations:
        result['MSE_' + suffix] = mean_squared_error(observed, predicted)
        result['MAE_' + suffix] = mean_absolute_error(observed, predicted)
        result['R_2_' + suffix] = r2_score(observed, predicted)

    # Only the predictions for pred_X are handed back to the caller.
    y_pred = evaluations[-1][2]
    return result, pd.DataFrame(y_pred, columns=[output_variable])

### Prediction with different methods

In [22]:
# Column order of the summary table.
result_columns = ['Method', 'MSE_train', 'MAE_train', 'R_2_train', 'MSE_val', 'MAE_val', 'R_2_val', 'MSE_pred', 'MAE_pred', 'R_2_pred']

# Fit and score every method on the same data. The metric dicts are collected in
# a list and turned into a DataFrame once: DataFrame.append was removed in
# pandas 2.0, so growing the table row by row no longer works.
metric_rows = []
predictions = {}
for method_name in ('LR', 'GAM', 'RF', 'XGBoost'):
    method_metrics, predictions[method_name] = Prediction(method=method_name, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, pred_X=pred_X, pred_y=pred_y, output_variable=output_variable)
    metric_rows.append(method_metrics)

# Keep the per-method prediction frames under the names the export cells use.
y_pred_LR = predictions['LR']
y_pred_GAM = predictions['GAM']
y_pred_RF = predictions['RF']
y_pred_XGBoost = predictions['XGBoost']

result = pd.DataFrame(metric_rows, columns=result_columns)
result.to_csv(output_path + 'PredictionTask_results.csv', index=False, sep=',')

result

Unnamed: 0,Method,MSE_train,MAE_train,R_2_train,MSE_val,MAE_val,R_2_val,MSE_pred,MAE_pred,R_2_pred
0,LR,0.116732,0.247131,0.888532,0.112786,0.248329,0.889382,0.548719,0.567655,0.457424
1,GAM,0.023427,0.092908,0.97763,0.040478,0.113215,0.9603,0.515077,0.537532,0.490689
2,RF,0.009288,0.054473,0.99113,0.032168,0.10586,0.968451,0.44555,0.511289,0.559438
3,XGBoost,1e-06,0.000772,0.999999,0.024058,0.068355,0.976405,0.465673,0.516687,0.539541


### Merge dataframes to make the final output dataset

In [27]:
def OutputDataframe(data_training, data_test, y_pred, method, output_variable, output_path):
    """Write the training responses followed by the supplied test values to CSV.

    The ``output_variable`` column of ``data_test`` is replaced by ``y_pred``,
    the result is stacked under ``data_training``, and the 'id' and
    ``output_variable`` columns are written to
    ``output_path + 'prediction_result_' + method + '.csv'``.

    Parameters
    ----------
    data_training, data_test : pandas.DataFrame
        Tables containing at least 'id' and ``output_variable``.
    y_pred : pandas.DataFrame or pandas.Series
        Values for the test rows. They are joined to ``data_test`` by index,
        and the resulting column must be named ``output_variable``.
    method : str
        Label used in the output file name.
    output_variable : str
        Name of the response column.
    output_path : str
        Prefix of the output file; it is concatenated as is, so a directory
        needs its trailing separator.

    Returns
    -------
    None
    """
    # Swap the observed test response for the supplied values (index-aligned join).
    test_with_values = pd.concat(
        [data_test.drop(columns=[output_variable]), pd.DataFrame(y_pred)],
        axis=1,
    )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_result = pd.concat([data_training, test_with_values])[['id', output_variable]]
    data_result.to_csv(output_path + 'prediction_result_' + method + '.csv', sep=',', index=False)

### Make the difference output

In [25]:
def DiffDataframe(data_training, data_test, y_pred, method, output_variable, output_path):
    """Write the observed-minus-predicted differences to CSV.

    Training rows get the value 0 in ``output_variable``. Test rows get their
    observed value minus the prediction. The 'id' and ``output_variable``
    columns of the stacked table are written to
    ``output_path + 'prediction_result_diff_' + method + '.csv'``.

    Parameters
    ----------
    data_training, data_test : pandas.DataFrame
        Tables containing at least 'id' and ``output_variable``. Neither
        frame is modified.
    y_pred : pandas.DataFrame
        Predictions for the test rows in a column named ``output_variable``,
        joined to ``data_test`` by index.
    method : str
        Label used in the output file name.
    output_variable : str
        Name of the response column.
    output_path : str
        Prefix of the output file; it is concatenated as is, so a directory
        needs its trailing separator.

    Returns
    -------
    None
    """
    # Work on a copy so the caller's training table keeps its real responses.
    training_part = data_training.copy()
    training_part[output_variable] = 0

    # Attach the predictions under a temporary column name (index-aligned join).
    pred_variable = 'pred' + output_variable
    test_part = pd.concat(
        [data_test, y_pred.rename(columns={output_variable: pred_variable})],
        axis=1,
    )
    test_part[output_variable] = test_part[output_variable] - test_part[pred_variable]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_result = pd.concat([training_part, test_part.drop(columns=[pred_variable])])
    data_result = data_result[['id', output_variable]]
    data_result.to_csv(output_path + 'prediction_result_diff_' + method + '.csv', sep=',', index=False)

In [29]:
# Export the XGBoost predictions and their differences from the observed test values.
OutputDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_XGBoost,
                method='XGBoost', output_variable=output_variable, output_path=output_path)
DiffDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_XGBoost,
              method='XGBoost', output_variable=output_variable, output_path=output_path)

In [30]:
# Export the linear-regression predictions and their differences from the observed test values.
OutputDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_LR,
                method='LR', output_variable=output_variable, output_path=output_path)
DiffDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_LR,
              method='LR', output_variable=output_variable, output_path=output_path)

In [31]:
# Export the random-forest predictions and their differences from the observed test values.
OutputDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_RF,
                method='RF', output_variable=output_variable, output_path=output_path)
DiffDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_RF,
              method='RF', output_variable=output_variable, output_path=output_path)

In [32]:
# Export the GAM predictions and their differences from the observed test values.
OutputDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_GAM,
                method='GAM', output_variable=output_variable, output_path=output_path)
DiffDataframe(data_training=data_training, data_test=data_test, y_pred=y_pred_GAM,
              method='GAM', output_variable=output_variable, output_path=output_path)

In [33]:
# Export the observed test responses (pred_y) in the same file layout as the
# model outputs, labelled 'GT' (presumably "ground truth").
OutputDataframe(data_training=data_training, data_test=data_test, y_pred=pred_y,
                method='GT', output_variable=output_variable, output_path=output_path)