In [2]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pathlib import Path
import pandas as pd
import statsmodels

In [3]:
df = pd.read_csv(r"C:\Users\Sahm9\Work\Github\projects\Who_Project\World_Health_Organisation_Data.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
df.head(1)

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,97,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5


In [5]:
# All features included for model 1

features_model_1 = [
    "Year",
    "Infant_deaths",
    "Under_five_deaths",
    "Adult_mortality",
    "Alcohol_consumption",
    "Hepatitis_B",
    "Measles",
    "BMI",
    "Polio",
    "Diphtheria",
    "Incidents_HIV",
    "GDP_per_capita",
    "Population_mln",
    "Thinness_ten_nineteen_years",
    "Thinness_five_nine_years",
    "Schooling",
    "Economy_status_Developed",
    ]

In [6]:
# Reducing features by taking out sensitive information for model 2

non_sensitive_features_model_2 = [
    "Year",
    "GDP_per_capita",
    "Population_mln",
    "Schooling",
    "Economy_status_Developed",
    ]

In [7]:
def feature_eng_model_1(df):
    df = df.copy()
    df = sm.add_constant(df)
    return df

In [8]:
def feature_eng_model_2(df):
    df = df.copy()
    df = sm.add_constant(df)
    return df

In [9]:
# Model 1 - All features

X_model_1 = df[features_model_1]
y_model_1 = df["Life_expectancy"]

# Modle 2 - All non-sensitive features

X_model_2 = df[non_sensitive_features_model_2]
y_model_2 = df["Life_expectancy"]

In [10]:
# Model 1 - All features

X_train_model_1, X_test_model_1, y_train_model_1, y_test_model_1 = train_test_split(X_model_1, y_model_1, test_size=0.2, random_state= 1)

# Model 2 - All non-sensitive features

X_train_model_2, X_test_model_2, y_train_model_2, y_test_model_2 = train_test_split(X_model_2, y_model_2, test_size=0.2, random_state= 2)

In [None]:
# Generate and fit model 1

X_train_model_1_fe = feature_eng_model_1(X_train_model_1)
lin_reg = sm.OLS(y_train_model_1, X_train_model_1_fe)
lin_reg_model_1 = lin_reg.fit()

# Generate and fit model 2

X_train_model_2_fe = feature_eng_model_2(X_train_model_2)
lin_reg = sm.OLS(y_train_model_2, X_train_model_2_fe)
lin_reg_model_2 = lin_reg.fit()

In [None]:
# Calculate the RMSE for both train and test for model 1 and 2

# Model 1 - All features

print(f"Model 1 - All features:\n")

# train RMSE

y_train_pred_model_1 = lin_reg_model_1.predict(X_train_model_1_fe)
train_rmse_model_1 = statsmodels.tools.eval_measures.rmse(y_train_model_1, y_train_pred_model_1) # check
print(f"Model 1 Train RMSE: {train_rmse_model_1}")

# test RMSE

X_test_model_1_fe = feature_eng_model_1(X_test_model_1)
y_test_pred_model_1 = lin_reg_model_1.predict(X_test_model_1_fe)
test_rmse_model_1 = statsmodels.tools.eval_measures.rmse(y_test_model_1, y_test_pred_model_1)
print(f"Model 1 Test RMSE: {test_rmse_model_1}\n")

print(f"---------------------------------------------\n")

# # Model 2 - All non-sensitive features

print(f"Model 2 - Non-Sensitive features only:\n")

# train RMSE

y_train_pred_model_2 = lin_reg_model_2.predict(X_train_model_2_fe)
train_rmse_model_2 = statsmodels.tools.eval_measures.rmse(y_train_model_2, y_train_pred_model_2) # check
print(f"Model 2 Train RMSE: {train_rmse_model_2}")

# test RMSE

X_test_model_2_fe = feature_eng_model_1(X_test_model_2)
y_test_pred_model_2 = lin_reg_model_2.predict(X_test_model_2_fe)
test_rmse_model_2 = statsmodels.tools.eval_measures.rmse(y_test_model_2, y_test_pred_model_2)
print(f"Model 2 Test RMSE: {test_rmse_model_2}\n")


Model 1 - All features:

Model 1 Train RMSE: 1.3506110570904786
Model 1 Test RMSE: 1.363622067739805

---------------------------------------------

Model 2 - Non-Sensitive features only:

Model 2 Train RMSE: 5.972435484566709
Model 2 Test RMSE: 6.45216646987358



In [None]:
# Parameters for prediction in model 1:

print(lin_reg_model_1.params)

const                          35.143980
Year                            0.024859
Infant_deaths                  -0.047143
Under_five_deaths              -0.056292
Adult_mortality                -0.048447
Alcohol_consumption             0.067761
Hepatitis_B                    -0.009084
Measles                         0.001338
BMI                            -0.155824
Polio                          -0.001738
Diphtheria                      0.004110
Incidents_HIV                   0.092358
GDP_per_capita                  0.000024
Population_mln                 -0.000180
Thinness_ten_nineteen_years    -0.032738
Thinness_five_nine_years       -0.002472
Schooling                       0.090811
Economy_status_Developed        0.645618
dtype: float64


In [None]:
# Parameters for prediction in model 2:

print(lin_reg_model_2.params)

const                      -242.806529
Year                          0.148006
GDP_per_capita                0.000126
Population_mln                0.003676
Schooling                     1.686082
Economy_status_Developed      0.736579
dtype: float64
