## Temporal analysis of the data

This section contains the code for the temporal modelling of the data. It is split into 3 parts:
The first part has imports and data preparation. The second part defines the model and runs it and the third part evaluates it.

### Imports and data preparation

In [1]:
# The necessary libraries are imported and the random generator seed is fixed.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
import seaborn as sns
import time

import numpyro
import numpyro.distributions as dist
from numpyro import handlers
from numpyro.infer import MCMC, NUTS

import jax.numpy as jnp
from jax import random, vmap
from jax.scipy.special import logsumexp
from jax import lax
import warnings
warnings.filterwarnings("ignore")

# fix random generator seed (for reproducibility of results)
np.random.seed(42)

# matplotlib style options
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read data and take difference from year to year to introduce temporal aspect.


data = pd.read_csv("Data/Data_processed.csv", sep=';')


for country in data['Country'].unique():
    d = data[data['Country'] == country]
    # sort by year
    d = d.sort_values(by='Year')
    # transform data to look at differences from year to year for all nummerical features (2 first columns are country and continent)
    d.iloc[:,5:] = d.iloc[:,5:].diff()
    # replace invividual nan values with 0
    data[data['Country'] == country] = d

# for life expectancy we don't want to replace the old column as we will use it to calculate the difference for 1,2 and 3 years.
# So instead we define a new column and replace the old one at the end.

data['Life expectancy difference'] = 0
for country in data['Country'].unique():
    d = data[data['Country'] == country]
    # sort by year
    d = d.sort_values(by='Year')
    # get difference in life expectancy
    d['Life expectancy difference'] = d['Life expectancy '].diff()


    # replace old values with new values
    data[data['Country'] == country] = d


In [3]:
# We add two new columns for the difference in life expectancy for 1 and 2 years ago. 
# We shift the life expectancy difference column
# such that we always predict one year ahead. 
# So for a difference in a variable from 2000 to 2001 we predict the difference in 2001 to 2002.

data['Life expectancy difference 1 year ago'] = 0
data['Life expectancy difference 2 years ago'] = 0
# get difference in life expectancy for each country and year
for country in data['Country'].unique():
    d = data[data['Country'] == country]
    # sort by year
    d = d.sort_values(by='Year')
    # get difference in life expectancy
    d['Life expectancy difference'] = d['Life expectancy difference'].shift(-1)
    d['Life expectancy difference 1 year ago'] = d['Life expectancy difference'].shift(1)
    d['Life expectancy difference 2 years ago'] = d['Life expectancy difference'].shift(2)

    # replace old values with new values
    data[data['Country'] == country] = d

# drop nan as the year 2015 (most recent year) has no difference in life expectancy, as 2016 is not in the data to compare.
data = data.dropna()

In [4]:
# Make train and test split. The validation split is implicit in the test_split,
# as we will just use the last 2 years from test data as validation data.
# The reason for this is that the last two values of each split has to be given to the model as input, 
# (since we model it as dependent on the last two years).

train_split = data[data['Year'] < 2009]
test_split = data[data['Year'] >= 2009]

# make list of all columns that need to be standardized
cols = train_split.columns
# standardize all columns except country, continent, year and status (which are categorical and not used in the model)
for col in cols:
    if col not in ['Country', 'continent', 'Year', 'Status']:
        mean = train_split[col].mean()
        std = train_split[col].std()
        train_split[col] = (train_split[col] - mean) / std
        test_split[col] = (test_split[col] - mean) / std

# make numpy array of standardized data, first dimension is time, second is countries, third is features
# This prepares the data for the model.
countries_train = train_split['Country'].unique()
years_train = train_split['Year'].unique()
countries_test = test_split['Country'].unique()
years_test = test_split['Year'].unique()
cols = cols.drop(['Life expectancy ', 'Country', 'Year', 'continent', 'Life expectancy difference 1 year ago', 'Life expectancy difference 2 years ago', 'Life expectancy difference'])
train_split_np_features = np.empty((len(years_train), len(countries_train), len(cols)))
train_split_np_target = np.empty((len(years_train), len(countries_train)))
test_split_np_features = np.empty((len(years_test), len(countries_test), len(cols)))
test_split_np_target = np.empty((len(years_test), len(countries_test)))
target = 'Life expectancy difference'

# Here we fill in the values for the train and test split, both target and features.
for i, country in enumerate(countries_train):
    for j, year in enumerate(years_train):
        # take nummeric data of country and put in numpy array
        train_split_np_features[j, i] = train_split.loc[(train_split['Country'] == country) & (train_split['Year'] == year)][cols].to_numpy()
        # take target of country and put in numpy array
        train_split_np_target[j, i] = train_split.loc[(train_split['Country'] == country) & (train_split['Year'] == year)][target].to_numpy()
for i, country in enumerate(countries_test):
    for j, year in enumerate(years_test):
        # take nummeric data of country and put in numpy array
        test_split_np_features[j, i] = test_split.loc[(test_split['Country'] == country) & (test_split['Year'] == year)][cols].to_numpy()
        # take target of country and put in numpy array
        test_split_np_target[j, i] = test_split.loc[(test_split['Country'] == country) & (test_split['Year'] == year)][target].to_numpy()

# We add the last two years of the train split to the test split, as we need them as input for the model.
# Again, this is because we model the life expectancy difference as dependent on the last two years.
test_split_np_features = np.concatenate((train_split_np_features[-2:], test_split_np_features))
test_split_np_target = np.concatenate((train_split_np_target[-2:], test_split_np_target))


### Defining the model

First, we will be defining the model using numpyro. Here we use the jax backend for numpyro.

In [5]:
# We will now define the model

# This function is the carry function, which is used in the scan function, it models the time dependency to calculate the ht latent variable.

def f(carry, h_external_change):
  beta2, z_prev1, z_prev2 = carry
  z_t_normal = (beta2[0]*z_prev1 + beta2[1]*z_prev2 + h_external_change)
  z_t = z_t_normal
  z_prev2 = z_prev1
  z_prev1 = z_t
  return (beta2, z_prev1, z_prev2), z_t


# Here we define the model. 
# For a more detailed explanation of the model, see the report (PGM or generative story)
def model(obs_x, obs_y):
    time = obs_x.shape[0]
    countries = obs_x.shape[1]
    features_x = obs_x.shape[2]
    dim_h = 2
    
    with numpyro.plate("weights", features_x):
      beta1 = numpyro.sample(name="beta1", fn=dist.Normal(loc=0., scale=1))
    with numpyro.plate("countries", countries):
      z0 = numpyro.sample(name="z0", fn=dist.Normal(loc=0, scale=.1))
      z1 = numpyro.sample(name="z1", fn=dist.Normal(loc=0, scale=.1))
    
    with numpyro.plate("time_x", features_x):
      with numpyro.plate("countries_x", countries):
        with numpyro.plate("features_x", time):
          xs = numpyro.sample(name="xs", fn=dist.Normal(loc=0., scale=1.), obs=obs_x)

    with numpyro.plate("weights2", 2):
      beta2 = numpyro.sample(name="beta2", fn=dist.Normal(loc=0., scale=1))

    tau = numpyro.sample(name="tau", fn=dist.HalfCauchy(scale=.1))
    sigma = numpyro.sample(name="sigma", fn=dist.HalfCauchy(scale=.1))

    with numpyro.plate("countries", countries):
      with numpyro.plate("time", time):
          h_external_change = numpyro.sample(name="h_change", fn=dist.Normal(loc=xs @ beta1, scale=tau))

    carry = (beta2, z0, z1)
    z_collection = [z0, z1]
    carry, zs_exp = lax.scan(f, carry, h_external_change, time)
    z_collection = jnp.concatenate((jnp.array(z_collection), zs_exp), axis=0)

    with numpyro.plate("countries", countries):
      with numpyro.plate("time", time):
        numpyro.sample(name="y_obs", fn=dist.Normal(loc=zs_exp, scale=sigma), obs=obs_y)
    return z_collection


In [6]:

# We will now run the model
t0 = time.time()

rng_key = random.PRNGKey(0)
rng_key, rng_key_ = random.split(rng_key)

nuts_kernel = NUTS(model=model, max_tree_depth=8, step_size=5e-3, adapt_step_size=False)
mcmc = MCMC(nuts_kernel, num_samples=1000, num_warmup=1000, num_chains=1, progress_bar=True)
mcmc.run(rng_key_, obs_x=train_split_np_features, obs_y=train_split_np_target)

t_fin = time.time()

print("Total time: {0:.3f}m".format((t_fin - t0)/60))

sample: 100%|██████████| 2000/2000 [00:43<00:00, 46.21it/s, 255 steps of size 5.00e-03. acc. prob=1.00]


Total time: 0.842m


### Evaluating the model

We will both be printing some values of the beta parameters and looking at the performance.
The beta1 has all the coeffici of the differential features effect on the h_{t} latent variable, while beta2 holds the coefficients of the h_{t-1} and h_{t-2} latent variables on the h_{t} latent variable.

In [7]:
# Print some statistics for the beta's
beta1 = mcmc.get_samples()['beta1'].mean(0)
beta2 = mcmc.get_samples()['beta2'].mean(0)
print(beta1)
print(beta2)


[ 7.0166595e-02 -1.8294312e-01  7.7119460e-03  2.1047480e-04
 -1.9920954e-02  2.7549623e-02  8.2173347e-03  1.0607784e-02
 -3.6333199e-04 -2.1152161e-01  3.1199126e-02  6.1083712e-02
  3.3958375e-02 -2.5482597e-02  7.0272081e-02 -1.2428053e-02]
[-0.13790154  0.13728343]


In [8]:
# Here we calculate the test split for the model and the baseline
errors = []
errors_baseline = []
# the reason for looping from 2 to 6 is because we need the last two years of the train split as input for the model
for i in range(2, 6):
    # Predictions are simply made by using the mean values of the beta's (see report for more details)
    prediction = beta2[0] * test_split_np_target[i-1] + beta2[1] * test_split_np_target[i-2] + test_split_np_features[i] @ beta1
    errors.append(np.mean(((prediction - test_split_np_target[i])*std)**2))
    errors_baseline.append(np.mean((test_split_np_target[i]*std)**2))
print("RMSE for model on test_split: " + str(np.sqrt(np.mean(errors))))
print("RMSE for baseline on test_split: " + str(np.sqrt(np.mean(errors_baseline))))
 

RMSE for model on test_split: 0.52887076
RMSE for baseline on test_split: 0.543183710823336


In [9]:
#Very much the same as before, but in this case for the validation split

errors = []
errors_baseline = []
# we loop from 4 to 8 here, so we include the last 2 values of the test split as input for the model
for i in range(4, 8):
    prediction = beta2[0] * test_split_np_target[i-1] + beta2[1] * test_split_np_target[i-2] + test_split_np_features[i] @ beta1
    errors.append(np.mean(((prediction - test_split_np_target[i])*std)**2))
    errors_baseline.append(np.mean((test_split_np_target[i]*std)**2))
print("RMSE for model on validation_split: " + str(np.sqrt(np.mean(errors))))
print("RMSE for baseline on validation_split: " + str(np.sqrt(np.mean(errors_baseline))))

RMSE for model on validation_split: 0.9060316
RMSE for baseline on validation_split: 0.9252065056485308


In [10]:
# And finally, for the train split
errors = []
errors_baseline = []
# Here we loop from 0 to 7, as we have no values before the train split to use as input for the model
for i in range(7):
    prediction = beta2[0] * train_split_np_target[i-1] + beta2[1] * train_split_np_target[i-2] + train_split_np_features[i] @ beta1
    errors.append(np.mean(((prediction - train_split_np_target[i])*std)**2))
    errors_baseline.append(np.mean((train_split_np_target[i]*std)**2))
print("RMSE for model on train_split: " + str(np.sqrt(np.mean(errors))))
print("RMSE for baseline on train_split: " + str(np.sqrt(np.mean(errors_baseline))))


RMSE for model on train_split: 0.51673096
RMSE for baseline on train_split: 0.5494875377015397


In [11]:
# and finally, we print the summary of the model

mcmc.print_summary()


                     mean       std    median      5.0%     95.0%     n_eff     r_hat
       beta1[0]      0.07      0.07      0.07     -0.04      0.18     97.18      1.01
       beta1[1]     -0.18      0.03     -0.18     -0.24     -0.13     29.94      1.01
       beta1[2]      0.01      0.03      0.01     -0.04      0.05     90.99      1.02
       beta1[3]      0.00      0.03      0.00     -0.05      0.05     42.91      1.05
       beta1[4]     -0.02      0.03     -0.02     -0.07      0.02     87.92      1.00
       beta1[5]      0.03      0.03      0.03     -0.02      0.07    113.50      1.00
       beta1[6]      0.01      0.03      0.01     -0.03      0.05    115.88      1.01
       beta1[7]      0.01      0.03      0.01     -0.03      0.05    173.62      1.00
       beta1[8]     -0.00      0.03     -0.00     -0.04      0.04     77.89      1.04
       beta1[9]     -0.21      0.03     -0.21     -0.26     -0.17     89.46      1.00
      beta1[10]      0.03      0.03      0.03     -0.