In [23]:
# Import all required libraries
import pandas as pd
import statsmodels.api as sm
import statsmodels.tools
from sklearn.model_selection import train_test_split
from pathlib import Path

In [24]:
# Ask the user where the CSV lives and load it into a DataFrame.
# Pasted paths often arrive wrapped in quotes, so only the *surrounding*
# whitespace and quote characters are stripped. Quote characters inside the
# path (for example an apostrophe in a folder name) are part of the real
# location and must be preserved.
filepath = input("Please enter filepath to CSV: ")
filepath = Path(filepath.strip().strip("\"'").strip())
if not filepath.is_file():
    # Fail early with a readable message (also covers empty input, which would
    # otherwise resolve to the current directory).
    raise FileNotFoundError(f"No CSV file found at: {filepath}")
df = pd.read_csv(filepath)

Please enter filepath to CSV:  C:\Users\Gaming\Downloads\Life Expectancy Data.csv


The Consent Function

In [25]:
# Predictor columns used by the "all" model (every numeric feature)
feature_cols = [
    "Year",
    "Infant_deaths",
    "Under_five_deaths",
    "Adult_mortality",
    "Alcohol_consumption",
    "Hepatitis_B",
    "Measles",
    "BMI",
    "Polio",
    "Diphtheria",
    "Incidents_HIV",
    "GDP_per_capita",
    "Population_mln",
    "Thinness_ten_nineteen_years",
    "Thinness_five_nine_years",
    "Schooling",
    "Economy_status_Developed",
]

In [26]:
# All selected features are numeric, so no one-hot or label encoding is needed;
# the only engineering step is adding the intercept column.
def feature_eng(dh):
    """Return a copy of ``dh`` with a ``const`` intercept column added.

    Parameters
    ----------
    dh : pandas.DataFrame
        Feature matrix. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The features with the constant column added by ``sm.add_constant``.

    Notes
    -----
    ``has_constant="add"`` forces the intercept to be added every time. With
    the default (``"skip"``) statsmodels silently omits it whenever the input
    already contains a constant non-zero column, which is the case for every
    column of a single-row frame. The result would then no longer line up
    with the parameters of a model fitted with an intercept.
    """
    return sm.add_constant(dh.copy(), has_constant="add")

In [27]:
# Separate the target (y) from the predictors (X) ahead of the train/test split
y = df.loc[:, "Life_expectancy"]
X = df.loc[:, feature_cols]

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [29]:
# Build the OLS model on the training features (with intercept) and fit it
X_train_fe = feature_eng(X_train)
lin_reg = sm.OLS(endog=y_train, exog=X_train_fe)
results = lin_reg.fit()

In [30]:
# Compare RMSE on the training rows against the held-out test rows
X_test_fe = feature_eng(X_test)

# Predictions from the fitted model for both splits
y_pred = results.predict(X_train_fe)
y_test_pred = results.predict(X_test_fe)

rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
rmse_test = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)

print(f"Train RMSE is {rmse}")
print(f"Test RMSE is {rmse_test}")

Train RMSE is 1.3539658514350055
Test RMSE is 1.3547123247382367


In [31]:
# Print the fitted coefficients of the "all" model. They are used in another
# notebook by the function that predicts life expectancy from user-entered data.
print(results.params)

const                          28.660577
Year                            0.027768
Infant_deaths                  -0.044838
Under_five_deaths              -0.057968
Adult_mortality                -0.047660
Alcohol_consumption             0.071053
Hepatitis_B                    -0.006812
Measles                         0.002920
BMI                            -0.143999
Polio                           0.004377
Diphtheria                     -0.002598
Incidents_HIV                   0.066033
GDP_per_capita                  0.000027
Population_mln                 -0.000060
Thinness_ten_nineteen_years    -0.038488
Thinness_five_nine_years        0.001688
Schooling                       0.082064
Economy_status_Developed        0.622223
dtype: float64


In [38]:
# Predictor columns used by the ethical model.
# Note: this rebinds feature_cols, replacing the list used for the all model.
feature_cols = [
    "Year",
    "GDP_per_capita",
    "Population_mln",
    "Schooling",
    "Economy_status_Developed",
]

In [39]:
# Separate the target (y1) from the predictors (X1) ahead of the train/test split
y1 = df.loc[:, "Life_expectancy"]
X1 = df.loc[:, feature_cols]

In [40]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=69)

In [41]:
# Build the OLS model on the training features (with intercept) and fit it
X_train_fe = feature_eng(X_train)
lin_reg = sm.OLS(endog=y_train, exog=X_train_fe)
results = lin_reg.fit()

In [42]:
# Compare RMSE on the training rows against the held-out test rows
X_test_fe = feature_eng(X_test)

# Predictions from the fitted model for both splits
y_pred = results.predict(X_train_fe)
y_test_pred = results.predict(X_test_fe)

rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
rmse_test = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)

print(f"Train RMSE is {rmse}")
print(f"Test RMSE is {rmse_test}")

Train RMSE is 6.079398507717432
Test RMSE is 6.0471917039482515


In [43]:
# Print the fitted coefficients of the ethical model. They are used in another
# notebook by the function that predicts life expectancy from user-entered data.
print(results.params)

const                      -243.593138
Year                          0.148155
GDP_per_capita                0.000126
Population_mln                0.003584
Schooling                     1.755860
Economy_status_Developed      0.479281
dtype: float64
