In [219]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
import seaborn as sns
import time

import numpyro
import numpyro.distributions as dist
from numpyro import handlers
from numpyro.infer import MCMC, NUTS

import jax.numpy as jnp
from jax import random, vmap
from jax.scipy.special import logsumexp
from jax import lax

# fix random generator seed (for reproducibility of results)
np.random.seed(42)

# matplotlib style options
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)

In [220]:
data = pd.read_csv("Data/Data_processed.csv", sep=';')


for country in data['Country'].unique():
    d = data[data['Country'] == country]
    # sort by year
    d = d.sort_values(by='Year')
    # transform data to look at differences from year to year for all nummerical features (2 first columns are country and continent)
    d.iloc[:,5:] = d.iloc[:,5:].diff()
    # replace invividual nan values with 0
    data[data['Country'] == country] = d

data['Life expectancy difference'] = 0
for country in data['Country'].unique():
    d = data[data['Country'] == country]
    # sort by year
    d = d.sort_values(by='Year')
    # get difference in life expectancy
    d['Life expectancy difference'] = d['Life expectancy '].diff()


    # replace old values with new values
    data[data['Country'] == country] = d


In [221]:
data['Life expectancy difference 1 year ago'] = 0
data['Life expectancy difference 2 years ago'] = 0
# get difference in life expectancy for each country and year
for country in data['Country'].unique():
    d = data[data['Country'] == country]
    # sort by year
    d = d.sort_values(by='Year')
    # get difference in life expectancy
    d['Life expectancy difference'] = d['Life expectancy difference'].shift(-1)
    d['Life expectancy difference 1 year ago'] = d['Life expectancy difference'].shift(1)
    d['Life expectancy difference 2 years ago'] = d['Life expectancy difference'].shift(2)

    # replace old values with new values
    data[data['Country'] == country] = d

# remove year 2001 and 2002
# data = data[data['Year'] != 2001]

# loop over each year and print how many nan values there are
# for year in data['Year'].unique():
#     print(year, data[data['Year'] == year].isna().sum().sum())

# print average over contries life expectancy difference for each year
data = data.dropna()

In [235]:
train_split = data[data['Year'] < 2009]
test_split = data[data['Year'] >= 2009]

# make list of all columns that need to be standardized
cols = train_split.columns
# cols = cols.drop(['Country', 'continent', 'Year', 'Status'])
# standardize all columns except country, continent, year and status
for col in cols:
    if col not in ['Country', 'continent', 'Year', 'Status']:
        mean = train_split[col].mean()
        std = train_split[col].std()
        train_split[col] = (train_split[col] - mean) / std
        test_split[col] = (test_split[col] - mean) / std

# make numpy array of standardized data, first dimension is time, second is countries, third is features
countries_train = train_split['Country'].unique()
years_train = train_split['Year'].unique()
countries_test = test_split['Country'].unique()
years_test = test_split['Year'].unique()
cols = cols.drop(['Life expectancy ', 'Country', 'Year', 'continent', 'Life expectancy difference 1 year ago', 'Life expectancy difference 2 years ago', 'Life expectancy difference'])
print(cols)
train_split_np_features = np.empty((len(years_train), len(countries_train), len(cols)))
train_split_np_target = np.empty((len(years_train), len(countries_train)))
test_split_np_features = np.empty((len(years_test), len(countries_test), len(cols)))
test_split_np_target = np.empty((len(years_test), len(countries_test)))

target = 'Life expectancy difference'
for i, country in enumerate(countries_train):
    for j, year in enumerate(years_train):
        # take nummeric data of country and put in numpy array
        train_split_np_features[j, i] = train_split.loc[(train_split['Country'] == country) & (train_split['Year'] == year)][cols].to_numpy()
        # take target of country and put in numpy array
        train_split_np_target[j, i] = train_split.loc[(train_split['Country'] == country) & (train_split['Year'] == year)][target].to_numpy()
for i, country in enumerate(countries_test):
    for j, year in enumerate(years_test):
        # take nummeric data of country and put in numpy array
        test_split_np_features[j, i] = test_split.loc[(test_split['Country'] == country) & (test_split['Year'] == year)][cols].to_numpy()
        # take target of country and put in numpy array
        test_split_np_target[j, i] = test_split.loc[(test_split['Country'] == country) & (test_split['Year'] == year)][target].to_numpy()

test_split_np_features = np.concatenate((train_split_np_features[-2:], test_split_np_features))
test_split_np_target = np.concatenate((train_split_np_target[-2:], test_split_np_target))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_split[col] = (train_split[col] - mean) / std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_split[col] = (test_split[col] - mean) / std


Index(['Status', 'infant deaths', 'Alcohol', 'Hepatitis B', 'MeaslesPrMillion',
       'OverweightOfAdults%', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', 'Schooling',
       'WaterFacility', 'WomenInParlament'],
      dtype='object')


In [223]:
def f(carry, h_external_change):
  beta2, z_prev1, z_prev2 = carry
  z_t_normal = (beta2[0]*z_prev1 + beta2[1]*z_prev2 + h_external_change)
  z_t = z_t_normal
  z_prev2 = z_prev1
  z_prev1 = z_t
  return (beta2, z_prev1, z_prev2), z_t


def model(obs_x, obs_y):
    time = obs_x.shape[0]
    countries = obs_x.shape[1]
    features_x = obs_x.shape[2]
    dim_h = 2
    
    with numpyro.plate("weights", features_x):
      beta1 = numpyro.sample(name="beta1", fn=dist.Normal(loc=0., scale=1))
    with numpyro.plate("countries", countries):
      z0 = numpyro.sample(name="z0", fn=dist.Normal(loc=0, scale=.1))
      z1 = numpyro.sample(name="z1", fn=dist.Normal(loc=0, scale=.1))
    
    with numpyro.plate("time_x", features_x):
      with numpyro.plate("countries_x", countries):
        with numpyro.plate("features_x", time):
          xs = numpyro.sample(name="xs", fn=dist.Normal(loc=0., scale=1.), obs=obs_x)

    with numpyro.plate("weights2", 2):
      beta2 = numpyro.sample(name="beta2", fn=dist.Normal(loc=0., scale=1))

    tau = numpyro.sample(name="tau", fn=dist.HalfCauchy(scale=.1))
    sigma = numpyro.sample(name="sigma", fn=dist.HalfCauchy(scale=.1))
    # outlier_effect = numpyro.sample(name="outlier_effect", fn=dist.Normal(loc=0., scale=1))

    # outlier_prob = numpyro.sample(name="outlier_prob", fn=dist.Beta(concentration1=100, concentration0=1))
    with numpyro.plate("countries", countries):
      with numpyro.plate("time", time):
          # outliers = numpyro.sample(name="outliers", fn=dist.Categorical(probs=jnp.array([0.98, 0.01, 0.01]))) 
          # print(outliers)
          h_external_change = numpyro.sample(name="h_change", fn=dist.Normal(loc=xs @ beta1, scale=tau))

    carry = (beta2, z0, z1)
    z_collection = [z0, z1]
    carry, zs_exp = lax.scan(f, carry, h_external_change, time)
    z_collection = jnp.concatenate((jnp.array(z_collection), zs_exp), axis=0)

    with numpyro.plate("countries", countries):
      with numpyro.plate("time", time):
        # is_outlier = numpyro.sample(name="is_outlier", fn=dist.Bernoulli(probs=0.001))
        numpyro.sample(name="y_obs", fn=dist.Normal(loc=zs_exp, scale=sigma), obs=obs_y)
    return z_collection


In [224]:
# print(train_split_np_features.shape, test_split_np_features.shape, train_split_np_target.shape, test_split_np_target.shape)

t0 = time.time()

rng_key = random.PRNGKey(0)
rng_key, rng_key_ = random.split(rng_key)

nuts_kernel = NUTS(model=model, max_tree_depth=8, step_size=5e-3, adapt_step_size=False)
mcmc = MCMC(nuts_kernel, num_samples=4000, num_warmup=2000, num_chains=1, progress_bar=True)
mcmc.run(rng_key_, obs_x=train_split_np_features, obs_y=train_split_np_target)

t_fin = time.time()

print("Total time: {0:.3f}m".format((t_fin - t0)/60))

sample: 100%|██████████| 6000/6000 [04:35<00:00, 21.78it/s, 255 steps of size 5.00e-03. acc. prob=1.00]


Total time: 4.619m


In [230]:
beta1 = mcmc.get_samples()['beta1'].mean(0)
beta2 = mcmc.get_samples()['beta2'].mean(0)
print(beta1)
print(beta2)


[ 0.07266977 -0.19789526  0.00534275  0.00196212 -0.02222377  0.0308468
  0.02062435  0.01021231 -0.00353696 -0.21691348  0.03644846  0.06025819
  0.03290103 -0.02614388  0.06606474 -0.00973863]
[-0.13698427  0.0982231 ]


In [231]:
errors = []
errors_baseline = []
for i in range(2, 6):
    prediction = beta2[0] * test_split_np_target[i-1] + beta2[1] * test_split_np_target[i-2] + test_split_np_features[i] @ beta1
    errors.append(np.mean((prediction - test_split_np_target[i])**2))
    errors_baseline.append(np.mean(test_split_np_target[i]**2))
print(np.sqrt(np.mean(errors)))
print(np.sqrt(np.mean(errors_baseline)))

0.9506199
0.9881418898079111


In [232]:
errors = []
errors_baseline = []
for i in range(4, 8):
    prediction = beta2[0] * test_split_np_target[i-1] + beta2[1] * test_split_np_target[i-2] + test_split_np_features[i] @ beta1
    errors.append(np.mean((prediction - test_split_np_target[i])**2))
    errors_baseline.append(np.mean(test_split_np_target[i]**2))
print(np.sqrt(np.mean(errors)))
print(np.sqrt(np.mean(errors_baseline)))

1.6496867
1.6831051571269542


In [233]:
errors = []
errors_baseline = []
for i in range(7):
    prediction = beta2[0] * train_split_np_target[i-1] + beta2[1] * train_split_np_target[i-2] + train_split_np_features[i] @ beta1
    errors.append(np.mean((prediction - train_split_np_target[i])**2))
    errors_baseline.append(np.mean(train_split_np_target[i]**2))
print(np.sqrt(np.mean(errors)))
print(np.sqrt(np.mean(errors_baseline)))

0.9385389
0.9996096037329262


In [234]:
mcmc.print_summary()


                     mean       std    median      5.0%     95.0%     n_eff     r_hat
       beta1[0]      0.07      0.07      0.07     -0.04      0.19    278.75      1.00
       beta1[1]     -0.20      0.03     -0.20     -0.25     -0.14    239.78      1.00
       beta1[2]      0.01      0.03      0.00     -0.04      0.05    505.74      1.00
       beta1[3]      0.00      0.03      0.00     -0.05      0.05    448.56      1.00
       beta1[4]     -0.02      0.03     -0.02     -0.07      0.02    527.31      1.00
       beta1[5]      0.03      0.03      0.03     -0.01      0.07    358.05      1.00
       beta1[6]      0.02      0.03      0.02     -0.02      0.06    454.54      1.00
       beta1[7]      0.01      0.03      0.01     -0.03      0.06    437.46      1.00
       beta1[8]     -0.00      0.03     -0.00     -0.05      0.04    434.99      1.00
       beta1[9]     -0.22      0.03     -0.22     -0.26     -0.17    413.92      1.00
      beta1[10]      0.04      0.03      0.04     -0.