<h4>Data and Module Importing</h4>

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

import statsmodels.api as sm
import statsmodels.tools

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Location of the raw life-expectancy CSV.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not load the data anywhere else until it is pointed at a local copy.
LIFE_EXPECTANCY_CSV = "C:\\Users\\Toby\\Documents\\Digital Futures\\Projects\\WHO Project\\Life Expectancy Data.csv"

df = pd.read_csv(LIFE_EXPECTANCY_CSV)

# Last expression of the cell: shows (rows, columns) of the loaded frame.
df.shape

(2864, 21)

<h4>Train-Test Split</h4>

In [None]:
# Every column except the target is used as a candidate predictor.
feature_cols = df.columns.tolist()
feature_cols.remove('Life_expectancy')

y = df['Life_expectancy']
X = df[feature_cols]

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

<h2>Interactive Model Selection and Outputs</h2>

In [126]:
# Ask which model variant to run. The reply is passed straight to int(), so a
# reply that is not a whole number raises ValueError here.
selection_prompt = """Do you want to run the full model (1) or run a censored model to cover sensitive data (2)?
Enter your option here: """
model_choice = int(input(selection_prompt))

if model_choice == 1:

    # Feature engineering. feature_eng() also runs calculate_vif(), which
    # publishes the VIF-filtered column list as the global `optimal_cols`,
    # so this call must come before `optimal_cols` is used below.
    X_train_fe = feature_eng(X_train)
    model_cols = X_train_fe.columns

    # modelling() reads the global `model_state` for its heading, so it is
    # set immediately before each call.
    model_state = "full"
    modelling(model_cols)

    print() # Line Break

    model_state = "VIF optimised"
    modelling(optimal_cols)

Do you want to run the full model (1) or run a censored model to cover sensitive data (2)?
Enter your option here:  1



The following shows the level of success our full model has with predicting life expectancy:


P-Values:

const                                   0.000
Year                                    0.000
Under_five_deaths                       0.000
Adult_mortality                         0.000
Alcohol_consumption                     0.057
Hepatitis_B                             0.002
BMI                                     0.000
Polio                                   0.000
Diphtheria                              0.014
Incidents_HIV                           0.000
Thinness_ten_nineteen_years             0.001
Schooling                               0.000
Economy_status_Developed                0.000
Region_Asia                             0.032
Region_Central America and Caribbean    0.000
Region_European Union                   0.000
Region_Middle East                      0.115
Region_North America                    0.006
Region_Oceania                          0.000
Region_Rest of Euro

<h4>Behind the Scenes Workings</h4>

In [17]:
def feature_eng(data):
    """Turn a raw feature frame into the design matrix used for modelling.

    Steps, in order: drop three columns, one-hot encode 'Region' (first level
    dropped), add 'log_GDP', standardise every column, drop four further
    columns, run calculate_vif() on the result, and prepend a constant column
    via sm.add_constant().

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features. The visible code requires the columns named below to be
        present and every remaining column to be accepted by StandardScaler.

    Returns
    -------
    pandas.DataFrame
        The engineered frame returned by sm.add_constant(). The caller's
        frame is not modified, because all work is done on a copy.

    Side effects
    ------------
    calculate_vif() assigns the module-level global `optimal_cols`.
    """
    # Columns removed up front (the original author describes them as
    # autocorrelated with other columns).
    redundant_cols = ['Country', 'Economy_status_Developing', 'Infant_deaths']
    # Columns the model is not interested in; removed only after scaling.
    excluded_cols = ['Measles', 'GDP_per_capita', 'Population_mln', 'Thinness_five_nine_years']

    features = data.copy().drop(columns = redundant_cols)

    # One hot encoding of the region, as integer 0/1 columns
    features = pd.get_dummies(
        features,
        columns = ['Region'],
        drop_first = True,
        prefix = 'Region',
        dtype=int,
    )

    # Log transform to address the exponential relationship noted by the author
    features['log_GDP'] = np.log(features['GDP_per_capita'])

    # Standardise every column, dummy columns included.
    # NOTE(review): the scaler is fitted on whatever frame is passed in, so
    # calling this on another data set fits a separate scaler for it.
    scaler = StandardScaler()
    features[features.columns] = scaler.fit_transform(features)

    features = features.drop(columns = excluded_cols)

    # VIF screening: the return value is not used here; the call matters for
    # the global `optimal_cols` that calculate_vif() sets.
    calculate_vif(features)

    return sm.add_constant(features)

In [19]:
def calculate_vif(X, thresh = 5.0):
    """Iteratively drop the column with the highest VIF until none exceeds `thresh`.

    On each pass the variance inflation factor of every remaining column is
    computed with `variance_inflation_factor`; if the largest value is above
    `thresh`, that column is removed and the pass is repeated.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors. The frame itself is never modified.
    thresh : float, default 5.0
        A column is dropped only while the largest VIF is strictly greater
        than this value.

    Returns
    -------
    list
        Labels of the surviving columns followed by the label 'const'.
        'const' is appended unconditionally so the list can be used to select
        from a frame to which a constant column has been added; it is not
        checked against `X`.

    Side effects
    ------------
    The same list is stored in the module-level global `optimal_cols`, which
    other cells read (for example to check the condition number of the
    reduced model).
    """
    global optimal_cols

    variables = list(range(X.shape[1]))

    # `while variables` also covers the case of no candidates at all (a frame
    # with zero columns, or every column eliminated); previously max() was
    # called on an empty list in that situation and raised ValueError.
    while variables:
        # Slice the remaining columns once per pass rather than once per column.
        exog = X.iloc[:, variables].values
        vif = [variance_inflation_factor(exog, ix) for ix in range(exog.shape[1])]

        worst = max(vif)
        if worst > thresh:
            # Remove the single worst offender, then re-evaluate the rest.
            del variables[vif.index(worst)]
        else:
            break

    optimal_cols = list(X.columns[variables])
    optimal_cols.append('const')

    return optimal_cols

In [122]:
def modelling(col):
    """Fit an OLS model on the selected training columns and print its metrics.

    Parameters
    ----------
    col : list-like of column labels
        Columns of the global `X_train_fe` to use as the design matrix.

    Reads the globals `X_train_fe`, `y_train` and `model_state` (the latter is
    only used in the printed heading). Prints p-values rounded to 3 decimals,
    R-squared, AIC, BIC, the condition number and the RMSE of the fitted
    values against `y_train`. Returns None.
    """
    # Select the design matrix once; it is used for both fitting and prediction.
    design = X_train_fe[col]
    results = sm.OLS(y_train, design).fit()

    # Report of the fit statistics
    print(f"\nThe following shows the level of success our {model_state} model has with predicting life expectancy:\n")
    print(f"""
P-Values:

{round(results.pvalues,3)}

R-Squared:
    
{results.rsquared}
    
AIC and BIC:
    
{results.aic}
{results.bic}
    
Condition Number:
    
{results.condition_number}
""")

    # RMSE of the in-sample predictions (same rows the model was fitted on)
    fitted_values = results.predict(design)
    rmse = statsmodels.tools.eval_measures.rmse(y_train, fitted_values)
    print(f"RMSE:\n\n{rmse}")