In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [41]:
# Load in data
data = pd.read_csv("Life Expectancy Data Cleaned With Insurance.csv")

# One seed drives both the train/test shuffle and the forest, so that
# "Restart Kernel -> Run All" reproduces the same split, model and scores.
SEED = 0

# Columns removed before fitting: the target itself plus the columns we decided
# not to work with. Any matrix later passed to forest.predict must drop exactly
# the same columns so that the feature order matches.
NON_FEATURE_COLUMNS = [
    "Life_expectancy",
    "Year",
    "Country",
    "Region",
    "Infant_deaths",
    "Under_five_deaths",
    "Adult_mortality",
    "GDP_per_capita",
    "Economy_status_Developed",
    "Economy_status_Developing",
]

# Take out the "Life Expectancy" column as y
ys = data["Life_expectancy"].values
# Compute the median once up front rather than once per row.
median_life_expectancy = np.median(ys)


def is_high(y):
    """Return True where `y` is at or above the median life expectancy.

    `y` may be a single value or a numpy array; the comparison is elementwise,
    and the median is taken over every row of the loaded data set.
    """
    return y >= median_life_expectancy


# Make this column boolean based on whether each value is higher or lower than the median.
y = is_high(ys)
# The rest of the columns we decided to work with become X
X = data.drop(NON_FEATURE_COLUMNS, axis=1).values

# Split testing and training data through a pseudo-shuffle. Keeping the original order for X and y is useful later.
# `mask` is a random permutation of the row positions (not a boolean mask):
# the first 70% of it index the training rows, the remainder the test rows.
n = int(0.7 * len(y))
rng = np.random.default_rng(SEED)
mask = rng.permutation(len(y))
X_train = X[mask[:n]]
y_train = y[mask[:n]]
X_test = X[mask[n:]]
y_test = y[mask[n:]]

# Create and fit random forest
forest = RandomForestClassifier(random_state=SEED)
forest.fit(X_train, y_train)

# Predict test data
y_hat = forest.predict(X_test)
# Sanity check: are about half of the values above the median?
# (np.mean of a boolean array is the fraction of True entries.)
print("Predicted percent above median in test data:",np.mean(y_hat))
# Find the accuracy
A = accuracy_score(y_test, y_hat)
print("Accuracy score:",A)

Predicted percent above median in test data: 0.5034883720930232
Accuracy score: 0.9813953488372092


In [42]:
# Now change all of the "Universal Healthcare" values to be true and pass in the most recent year
mask = data["Year"] == 2015
# Work on an explicit copy of the 2015 rows: the counterfactual edit below then
# cannot touch `data`, and pandas does not raise SettingWithCopyWarning for
# assigning into a slice.
newest_data = data[mask].copy()
newest_data.loc[:, 'Universal_insurance'] = True
# Drop the same columns that were dropped when the forest was fitted, so the
# feature order of X_UH matches the training matrix.
X_UH = newest_data.drop(["Life_expectancy","Year","Country","Region", "Infant_deaths","Under_five_deaths", "Adult_mortality", "GDP_per_capita", "Economy_status_Developed", "Economy_status_Developing"], axis=1).values
y_UH = forest.predict(X_UH)
# np.mean of the boolean predictions is the fraction predicted above the median.
print("Percent of countries above the old median if they all switched to universal healthcare:\n",np.mean(y_UH))

Percent of countries above the old median if they all switched to universal healthcare:
 0.6312849162011173


In [43]:
# Do the same but say they all choose against universal healthcare
# Rebuild the 2015 rows from `data` (as an explicit copy) instead of re-editing
# whatever `newest_data` currently holds. This cell then gives the same answer
# no matter which cells ran before it, and the assignment below does not raise
# SettingWithCopyWarning.
newest_data = data[data["Year"] == 2015].copy()
newest_data.loc[:, 'Universal_insurance'] = False
# Drop the same columns that were dropped when the forest was fitted, so the
# feature order of X_NUH matches the training matrix.
X_NUH = newest_data.drop(["Life_expectancy","Year","Country","Region", "Infant_deaths","Under_five_deaths", "Adult_mortality", "GDP_per_capita", "Economy_status_Developed", "Economy_status_Developing"], axis=1).values
y_NUH = forest.predict(X_NUH)
# np.mean of the boolean predictions is the fraction predicted above the median.
print("Percent of countries above the old median if they all stopped universal healthcare:\n",np.mean(y_NUH))

Percent of countries above the old median if they all stopped universal healthcare:
 0.547486033519553


In [44]:
# More than half of the most recent year's rows sit above the all-years median.
# Check how closely the model's predictions reproduce that share.
mask = data["Year"].eq(2015)
newest_data = data.loc[mask]
X_N = newest_data.drop(
    columns=[
        "Life_expectancy",
        "Year",
        "Country",
        "Region",
        "Infant_deaths",
        "Under_five_deaths",
        "Adult_mortality",
        "GDP_per_capita",
        "Economy_status_Developed",
        "Economy_status_Developing",
    ]
).values
y_N = forest.predict(X_N)
predicted_share = np.mean(y_N)
print("Predicted percent of countries above the old median in the most recent year:\n", predicted_share)
# y itself doesn't get shuffled (only the index permutation was), so it is still
# in the same row order as `data` and the year mask can be applied to it directly.
actual_share = np.mean(y[mask])
print("Actual:\n", actual_share)

Predicted percent of countries above the old median in the most recent year:
 0.5698324022346368
Actual:
 0.5754189944134078
