In [1]:
import os

from IPython.display import set_matplotlib_formats
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

from jax import numpy as jnp
from jax import ops, random
from jax.scipy.special import expit

import numpyro
from numpyro import distributions as dist
from numpyro.distributions import constraints
from numpyro.infer import MCMC, NUTS, Predictive

plt.style.use("seaborn")
# if "NUMPYRO_SPHINXBUILD" in os.environ:
#    set_matplotlib_formats("svg")

from icecream import ic

https://num.pyro.ai/en/latest/tutorials/bayesian_imputation.html?highlight=imputation
https://num.pyro.ai/en/latest/utilities.html?highlight=Predictive%20regression#numpyro.infer.util.Predictive
https://num.pyro.ai/en/latest/tutorials/bayesian_regression.html?highlight=Predictive%20regression#Bayesian-Regression-Using-NumPyro (Posterior predictive)

In [2]:
dataset = pd.read_csv("../data/X_station_day.csv")
dataset.head()

Unnamed: 0,station_id,year,month,day,latitude,longitude,altitude,wind_direction,wind_speed,temperature,humidity,dew_point,precipitations,ground_truth
0,14066001,2016,1,1,49.334,-0.431,2,146.5,3.91375,280.33374,88.59167,278.5146,0.2,3.4
1,14066001,2016,1,2,49.334,-0.431,2,205.625,8.04125,282.93668,82.3,279.9975,3.4,11.7
2,14066001,2016,1,3,49.334,-0.431,2,195.25,5.430417,281.10165,86.604164,278.9975,11.7,0.6
3,14066001,2016,1,4,49.334,-0.431,2,212.66667,6.715417,281.055,80.645836,277.90082,0.6,0.4
4,14066001,2016,1,5,49.334,-0.431,2,205.04167,5.957083,281.25583,82.75,278.48416,0.4,3.0


In [None]:
columns = dataset.columns.tolist()
for i in range(len(columns)):
    for j in range(i, len(columns)):
        if i != j:
            spearman_correlation = dataset[columns[i]].corr(dataset[columns[j]], method='spearman')
            pearson_correlation = dataset[columns[i]].corr(dataset[columns[j]], method='pearson')
            if max(np.abs(spearman_correlation), np.abs(pearson_correlation)) > 0.4 :
                print('Correlation between ', columns[i], 'and', columns[j])
                print('Pearson\'s correlation: %.3f' % spearman_correlation)
                print('Spearman\'s correlation: %.3f' % pearson_correlation)

# General functions

In [33]:
def get_normalization_infos(*x_s, columns):
    normalization_infos = pd.DataFrame(data=[[1000 for _ in range(len(columns))],[0 for _ in range(len(columns))]],
                                       index=["min","max"],
                                       columns=columns)
    for x in x_s :
        for col in columns:
            min_value = min(normalization_infos[col]["min"], x[col].min())
            max_value = max(normalization_infos[col]["max"], x[col].max())
            normalization_infos[col] = [min_value, max_value]

    normalization_infos.loc["spread"] = normalization_infos.apply(lambda c : c["max"] - c["min"], axis=0)

    return normalization_infos


def normalize(x : pd.DataFrame, normalization_infos : pd.DataFrame):
    for col in x.columns:
        x[col] = (x[col] - normalization_infos[col]["min"])/normalization_infos[col]["spread"]
    return x

def de_normalize(x : pd.DataFrame, normalization_infos : pd.DataFrame):
    for col in x.columns:
        x[col] = x[col] * normalization_infos[col]["spread"] + normalization_infos[col]["min"]
    return x


def create_nans(dataset, columns, ratio_nan):
    for col in columns:
        random_vec = np.random.random(dataset[col].shape) < 1 - ratio_nan
        dataset[col] = dataset[col].where(random_vec, other=np.nan)
    return dataset

In [83]:
def model_ground_truth(
        latitude, longitude, altitude, wind_direction, wind_speed, temperature, humidity, dew_point, precipitations, mu=None, sigma=None, ground_truth=None,
        nan_columns = None
):
    lat, long, alt, w_d, w_s, temp, hum, d_pt, prec = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    default = numpyro.sample("default", dist.Normal(0.0, 0.2))
    if latitude is not None:
        bayes_latitude = numpyro.sample('bayes_latitude', dist.Normal(0, 1))
        lat = bayes_latitude * latitude
    if longitude is not None:
        bayes_longitude = numpyro.sample('bayes_longitude', dist.Normal(0, 1))
        long = bayes_longitude * longitude
    if altitude is not None:
        bayes_altitude = numpyro.sample('bayes_altitude', dist.Normal(0, 1))
        alt = bayes_altitude * altitude

    if 'wind_direction' in nan_columns:
        wd_mu = numpyro.sample("wd_mu", dist.Normal(mu['wind_direction'], 0.2))
        wd_sigma = numpyro.sample("wd_sigma", dist.Normal(sigma['wind_direction'], 0.2))

        wind_direction_nanidx = np.array(np.isnan(wind_direction).astype(int)).nonzero()[0]
        wd_impute = numpyro.sample("wd_impute", dist.Normal(wd_mu, wd_sigma)
                                            .expand((len(wind_direction_nanidx),))
                                            .mask(False))

        wind_direction = ops.index_update(wind_direction, wind_direction_nanidx, wd_impute)

        numpyro.sample("latent_wind_direction", dist.Normal(mu['wind_direction'], sigma['wind_direction']), obs=wind_direction)
        bayes_wind_direction = numpyro.sample("bayes_wind_direction", dist.Normal(0, 1))
        w_d = bayes_wind_direction * wind_direction
    else:
        if wind_direction is not None:
            bayes_wind_direction = numpyro.sample('bayes_wind_direction', dist.Normal(0, 1))
            w_d = bayes_wind_direction * wind_direction

    if 'wind_speed' in nan_columns:
        ws_mu = numpyro.sample("ws_mu", dist.Normal(mu['wind_speed'], 0.2))
        ws_sigma = numpyro.sample("ws_sigma", dist.Normal(sigma['wind_speed'], 0.2))

        wind_speed_nanidx = np.array(np.isnan(wind_speed).astype(int)).nonzero()[0]
        ws_impute = numpyro.sample("ws_impute", dist.Normal(ws_mu, ws_sigma)
                                   .expand((len(wind_speed_nanidx),))
                                   .mask(False))

        wind_speed = ops.index_update(wind_speed, wind_speed_nanidx, ws_impute)

        numpyro.sample("latent_wind_speed", dist.Normal(mu['wind_speed'], sigma['wind_speed']), obs=wind_speed)
        bayes_wind_speed = numpyro.sample("bayes_wind_speed", dist.Normal(0, 1))
        w_s = bayes_wind_speed * wind_speed
    else:
        if wind_speed is not None:
            bayes_wind_speed = numpyro.sample('bayes_wind_speed', dist.Normal(0, 1))
            w_s = bayes_wind_speed * wind_speed

    if 'temperature' in nan_columns:
        temp_mu = numpyro.sample("temp_mu", dist.Normal(mu['temperature'], 0.2))
        temp_sigma = numpyro.sample("temp_sigma", dist.Normal(sigma['temperature'], 0.2))

        temperature_nanidx = np.array(np.isnan(temperature).astype(int)).nonzero()[0]
        temperature_impute = numpyro.sample("temperature_impute", dist.Normal(temp_mu, temp_sigma)
                                         .expand((len(temperature_nanidx),))
                                         .mask(False))

        temperature = ops.index_update(temperature, temperature_nanidx, temperature_impute)

        numpyro.sample("latent_temperature", dist.Normal(mu['temperature'], sigma['temperature']), obs=temperature)
        bayes_temperature = numpyro.sample("bayes_temperature", dist.Normal(0, 1))
        temp = bayes_temperature * temperature
    else:
        if temperature is not None:
            bayes_temperature = numpyro.sample('bayes_temperature', dist.Normal(0, 1))
            temp = bayes_temperature * temperature

    if 'humidity' in nan_columns:
        hum_mu = numpyro.sample("hum_mu", dist.Normal(mu['humidity'], 0.2))
        hum_sigma = numpyro.sample("hum_sigma", dist.Normal(sigma['humidity'], 0.2))

        humidity_nanidx = np.array(np.isnan(humidity).astype(int)).nonzero()[0]
        humidity_impute = numpyro.sample("humidity_impute", dist.Normal(hum_mu, hum_sigma)
                                          .expand((len(humidity_nanidx),))
                                          .mask(False))

        humidity = ops.index_update(humidity, humidity_nanidx, humidity_impute)

        numpyro.sample("latent_humidity", dist.Normal(mu['humidity'], sigma['humidity']), obs=humidity)
        bayes_humidity = numpyro.sample('bayes_humidity', dist.Normal(0, 1))
        hum = bayes_humidity * humidity
    else:
        if humidity is not None:
            bayes_humidity = numpyro.sample('bayes_humidity', dist.Normal(0, 1))
            hum = bayes_humidity * humidity

    if 'dew_point' in nan_columns:
        dew_point_mu = numpyro.sample("dew_point_mu", dist.Normal(mu['dew_point'], 0.2))
        dew_point_sigma = numpyro.sample("dew_point_sigma", dist.Normal(sigma['dew_point'], 0.2))

        dew_point_nanidx = np.array(np.isnan(dew_point).astype(int)).nonzero()[0]
        dew_point_impute = numpyro.sample("dew_point_impute", dist.Normal(dew_point_mu, dew_point_sigma)
                                               .expand((len(dew_point_nanidx),))

                                               .mask(False))

        dew_point = ops.index_update(dew_point, dew_point_nanidx, dew_point_impute)

        numpyro.sample("latent_dew_point", dist.Normal(mu['dew_point'], sigma['dew_point']), obs=dew_point)
        bayes_dew_point = numpyro.sample('bayes_dew_point', dist.Normal(0, 1))
        d_pt = bayes_dew_point * dew_point
    else:
        if dew_point is not None:
            bayes_dew_point = numpyro.sample('bayes_dew_point', dist.Normal(0, 1))
            d_pt = bayes_dew_point * dew_point
    if 'precipitations' in nan_columns:
        precipitations_mu = numpyro.sample("precipitations_mu", dist.Normal(mu['precipitations'], 0.2))
        precipitations_sigma = numpyro.sample("precipitations_sigma", dist.Normal(sigma['precipitations'], 0.2))

        precipitations_nanidx = np.array(np.isnan(precipitations).astype(int)).nonzero()[0]
        precipitations_impute = numpyro.sample("precipitations_impute", dist.Normal(precipitations_mu, precipitations_sigma)
                                            .expand((len(precipitations_nanidx),))
                                            .mask(False))

        precipitations = ops.index_update(precipitations, precipitations_nanidx, precipitations_impute)

        numpyro.sample("latent_precipitations", dist.Normal(mu['precipitations'], sigma['precipitations']), obs=precipitations)
        bayes_precipitations = numpyro.sample('bayes_precipitations', dist.Normal(0, 1))
        prec = bayes_precipitations * precipitations
    else:
        if precipitations is not None:
            bayes_precipitations = numpyro.sample('bayes_precipitations', dist.Normal(0, 1))
            prec = bayes_precipitations * precipitations

    sigma_model = numpyro.sample("sigma", dist.Exponential(1.0))
    mu_model = default + lat + long + alt + w_d + w_s + temp + hum + d_pt + prec
    # print("sigma", sigma_model, "mu", mu_model)
    numpyro.sample("ground_truth", dist.Normal(mu_model, sigma_model), obs=ground_truth)

# Data fully provided

In [68]:
def get_data(normalization = True):
    dataset = pd.read_csv("../data/X_station_day.csv")
    del dataset['station_id']
    ground_truth = dataset.ground_truth.values
    del dataset['ground_truth']

    columns = dataset.columns.tolist()
    print(columns)

    if normalization:
        normalisation_infos = get_normalization_infos(dataset, columns=columns)
        dataset = normalize(dataset, normalisation_infos)

    data = dict(
        latitude = dataset.latitude.values,
        longitude = dataset.longitude.values,
        altitude = dataset.altitude.values,
        wind_direction = dataset.wind_direction.values,
        wind_speed = dataset.wind_speed.values,
        temperature = dataset.temperature.values,
        humidity = dataset.humidity.values,
        dew_point = dataset.dew_point.values,
        precipitations = dataset.precipitations.values,
    )

    mu_col = dict()
    sigma_col = dict()

    for column in column_to_impute:
        mu_col[column] = dataset[column].mean()
        sigma_col[column] = dataset[column].std()

    print(mu_col)
    print(sigma_col)

    return data, ground_truth, mu_col, sigma_col

### Without normalisation

In [39]:
data, ground_truth, mu_col, sigma_col = get_data(normalization=False)

mcmc = MCMC(NUTS(model_ground_truth), num_warmup=1000, num_samples=1000)
mcmc.run(random.PRNGKey(0), **data, ground_truth=ground_truth, nan_columns = [], mu=mu_col, sigma=sigma_col)
# Print the statistics of posterior samples collected during running this MCMC instance. (documentation)
mcmc.print_summary()

['year', 'month', 'day', 'latitude', 'longitude', 'altitude', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
{'wind_direction': 190.99474981223554, 'wind_speed': 3.5289203841692753}
{'wind_direction': 73.68470392526115, 'wind_speed': 1.9481537723963505}


sample: 100%|██████████| 2000/2000 [25:03<00:00,  1.33it/s, 255 steps of size 7.52e-04. acc. prob=0.95] 



                            mean       std    median      5.0%     95.0%     n_eff     r_hat
        bayes_altitude      0.00      0.00      0.00      0.00      0.00    553.61      1.01
       bayes_dew_point      0.11      0.03      0.11      0.06      0.17     41.35      1.01
        bayes_humidity     -0.01      0.01     -0.01     -0.03     -0.00     41.77      1.01
        bayes_latitude      0.04      0.01      0.05      0.03      0.06     41.87      1.03
       bayes_longitude     -0.03      0.01     -0.03     -0.04     -0.01     34.79      1.02
  bayes_precipitations      0.19      0.00      0.19      0.18      0.20     65.06      1.06
     bayes_temperature     -0.12      0.03     -0.11     -0.18     -0.07     41.35      1.01
  bayes_wind_direction      0.00      0.00      0.00      0.00      0.00    494.13      1.00
      bayes_wind_speed      0.22      0.01      0.22      0.20      0.23     28.59      1.00
               default      0.63      0.27      0.65     -0.00      0

### With normalisation

In [54]:
data, ground_truth, mu_col, sigma_col = get_data(normalization=True)

mcmc = MCMC(NUTS(model_ground_truth), num_warmup=1000, num_samples=1000)
mcmc.run(random.PRNGKey(0), **data, ground_truth=ground_truth, nan_columns = [], mu=mu_col, sigma=sigma_col)
# Print the statistics of posterior samples collected during running this MCMC instance. (documentation)
mcmc.print_summary()

['year', 'month', 'day', 'latitude', 'longitude', 'altitude', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
{'wind_direction': 0.5392158462791505, 'wind_speed': 0.16563164737739422}
{'wind_direction': 0.20802645111422322, 'wind_speed': 0.09143757397135283}


sample: 100%|██████████| 2000/2000 [04:41<00:00,  7.09it/s, 127 steps of size 2.86e-02. acc. prob=0.94]



                            mean       std    median      5.0%     95.0%     n_eff     r_hat
        bayes_altitude      0.46      0.06      0.46      0.35      0.56   1018.32      1.00
       bayes_dew_point      0.68      0.63      0.68     -0.27      1.82    319.50      1.00
        bayes_humidity      0.50      0.23      0.50      0.08      0.85    266.96      1.00
        bayes_latitude      0.08      0.06      0.08     -0.02      0.18   1020.18      1.00
       bayes_longitude     -0.18      0.06     -0.18     -0.27     -0.09    762.92      1.00
  bayes_precipitations     13.49      0.27     13.49     13.04     13.93   1082.97      1.00
     bayes_temperature     -1.13      0.53     -1.13     -1.98     -0.23    296.47      1.00
  bayes_wind_direction      0.71      0.07      0.71      0.59      0.83    769.34      1.00
      bayes_wind_speed      4.71      0.17      4.70      4.43      4.99    597.59      1.00
               default      0.17      0.12      0.17     -0.04      0

In [55]:
from numpyro.infer import Predictive

posterior = mcmc.get_samples()
ground_truth_pred = Predictive(model_ground_truth, posterior)(random.PRNGKey(1), **data,  nan_columns = [])["ground_truth"]
ground_truth_pred = ground_truth_pred.mean(axis=0)

print(len(ground_truth_pred))
print(ground_truth_pred)
print(ground_truth)

82120
[1.7823089 3.3897796 4.3065987 ... 1.7417682 3.3764615 3.7425666]
[ 3.4 11.7  0.6 ...  4.4  5.4  1.2]


In [57]:
from sklearn.metrics import mean_absolute_error

def m_mape(y_true, y_predict):
    n = len(y_true)
    At = np.array(y_true) + 1
    Ft = np.array(y_predict) + 1

    res = ((100/n)*(np.sum(np.abs((Ft-At)/At))))
    return res

print("MAPE : ", m_mape(ground_truth, ground_truth_pred))
print("Mean Absolute Error : ", mean_absolute_error(ground_truth, ground_truth_pred))

MAPE :  115.05463869277702
Mean Absolute Error :  2.4109965171861836


# Imputation part


In [60]:
dataset = pd.read_csv("../data/X_station_day.csv")
del dataset['station_id']
ground_truth = dataset.ground_truth.values
del dataset['ground_truth']

columns = dataset.columns.tolist()
print(columns)

normalisation_infos = get_normalization_infos(dataset, columns=columns)
dataset = normalize(dataset, normalisation_infos)

# column_to_impute = ['wind_direction', 'temperature', 'humidity', 'dew_point', 'precipitations']
column_to_impute = ['wind_direction'] # , 'wind_speed']
dataset = create_nans(dataset, column_to_impute, 0.01)

print(dataset['precipitations'].isna().sum())

"""print(dataset.dew_point.values[0])
dataset.dew_point.values[0] = np.nan
print(dataset.dew_point.values[0])"""

"""dew_point_mu = dataset['dew_point'].mean()
dew_point_sigma = dataset['dew_point'].std()

print(dew_point_mu, dew_point_sigma)"""

data = dict(
    latitude = dataset.latitude.values,
    longitude = dataset.longitude.values,
    altitude = dataset.altitude.values,
    wind_direction = dataset.wind_direction.values,
    wind_speed = dataset.wind_speed.values,
    temperature = dataset.temperature.values,
    humidity = dataset.humidity.values,
    dew_point = dataset.dew_point.values,
    precipitations = dataset.precipitations.values,
)

mu_col = dict()
sigma_col = dict()

for column in column_to_impute:
    mu_col[column] = dataset[column].mean()
    sigma_col[column] = dataset[column].std()

print(mu_col)
print(sigma_col)

# year = dataset.year.values,
# month = dataset.month.values,
# day = dataset.day.values,

['year', 'month', 'day', 'latitude', 'longitude', 'altitude', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
0
{'wind_direction': 0.5392392325063013}
{'wind_direction': 0.20799025990088416}


In [61]:
# WITH NaNS (ratio 0.4)
mcmc = MCMC(NUTS(model_ground_truth), num_warmup=1000, num_samples=1000)
mcmc.run(random.PRNGKey(0), **data, ground_truth=ground_truth, nan_columns = column_to_impute, mu=mu_col, sigma=sigma_col)
# Print the statistics of posterior samples collected during running this MCMC instance. (documentation)
mcmc.print_summary()

sample: 100%|██████████| 2000/2000 [05:39<00:00,  5.88it/s, 127 steps of size 2.77e-02. acc. prob=0.93]



                            mean       std    median      5.0%     95.0%     n_eff     r_hat
        bayes_altitude      0.46      0.06      0.46      0.35      0.56    843.40      1.00
       bayes_dew_point      0.79      0.68      0.80     -0.26      1.98    396.56      1.00
        bayes_humidity      0.45      0.26      0.44      0.04      0.86    355.29      1.00
        bayes_latitude      0.08      0.07      0.08     -0.02      0.19   1052.25      1.00
       bayes_longitude     -0.18      0.05     -0.17     -0.26     -0.08    768.84      1.00
  bayes_precipitations     13.46      0.27     13.47     12.98     13.86   1091.90      1.00
     bayes_temperature     -1.22      0.57     -1.24     -2.16     -0.29    365.83      1.00
  bayes_wind_direction      0.72      0.07      0.72      0.61      0.84   1413.85      1.00
      bayes_wind_speed      4.71      0.17      4.71      4.45      5.01   1137.55      1.00
               default      0.18      0.13      0.19     -0.04      0

In [62]:
from numpyro.infer import Predictive

posterior = mcmc.get_samples()
ground_truth_pred = Predictive(model_ground_truth, posterior)(random.PRNGKey(1), **data,  nan_columns = [])["ground_truth"]
ground_truth_pred = ground_truth_pred.mean(axis=0)
# print("Accuracy:", (survived_pred == survived).sum() / survived.shape[0])

print(len(ground_truth_pred))
print(ground_truth_pred)
print(ground_truth)

82120
[1.7825868 3.3929303 4.3053565 ... 1.7442638       nan 3.7456872]
[ 3.4 11.7  0.6 ...  4.4  5.4  1.2]


In [63]:
from sklearn.metrics import mean_absolute_error

def m_mape(y_true, y_predict):
    n = len(y_true)
    At = np.array(y_true) + 1
    Ft = np.array(y_predict) + 1

    res = ((100/n)*(np.sum(np.abs((Ft-At)/At))))
    return res

print("MAPE : ", m_mape(ground_truth, ground_truth_pred))
print("Mean Absolute Error : ", mean_absolute_error(ground_truth, ground_truth_pred))

MAPE :  nan


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## Imputation by inference

In [69]:
def model_inference_wind_direction(
        latitude, longitude, altitude, wind_speed, temperature, humidity, dew_point, precipitations, mu=None, sigma=None, wind_direction=None,
        nan_columns = None
):
    lat, long, alt, w_s, temp, hum, d_pt, prec = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    default = numpyro.sample("default", dist.Normal(0.0, 0.2))
    if latitude is not None:
        bayes_latitude = numpyro.sample('bayes_latitude', dist.Normal(0, 1))
        lat = bayes_latitude * latitude
    if longitude is not None:
        bayes_longitude = numpyro.sample('bayes_longitude', dist.Normal(0, 1))
        long = bayes_longitude * longitude
    if altitude is not None:
        bayes_altitude = numpyro.sample('bayes_altitude', dist.Normal(0, 1))
        alt = bayes_altitude * altitude
    if wind_speed is not None:
            bayes_wind_speed = numpyro.sample('bayes_wind_speed', dist.Normal(0, 1))
            w_s = bayes_wind_speed * wind_speed
    if temperature is not None:
            bayes_temperature = numpyro.sample('bayes_temperature', dist.Normal(0, 1))
            temp = bayes_temperature * temperature
    if humidity is not None:
            bayes_humidity = numpyro.sample('bayes_humidity', dist.Normal(0, 1))
            hum = bayes_humidity * humidity
    if dew_point is not None:
            bayes_dew_point = numpyro.sample('bayes_dew_point', dist.Normal(0, 1))
            d_pt = bayes_dew_point * dew_point
    if precipitations is not None:
            bayes_precipitations = numpyro.sample('bayes_precipitations', dist.Normal(0, 1))
            prec = bayes_precipitations * precipitations

    sigma_model = numpyro.sample("sigma", dist.Exponential(1.0))
    mu_model = default + lat + long + alt + w_s + temp + hum + d_pt + prec
    # print("sigma", sigma_model, "mu", mu_model)
    numpyro.sample("wind_direction", dist.Normal(mu_model, sigma_model), obs=wind_direction)

#### Get data and simulate some NaNs

In [106]:
import random as rd

dataset = pd.read_csv("../data/X_station_day.csv")
del dataset['station_id']

columns = ['latitude', 'longitude', 'altitude', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
print(columns)

normalisation_infos = get_normalization_infos(dataset, columns=columns)
dataset.loc[:, columns] = normalize(dataset.loc[:, columns], normalisation_infos)

ratio = 0.01
nanidx = rd.sample(range(0, dataset.shape[0]), int(ratio*dataset.shape[0]))

true_wind_direction = dataset.iloc[nanidx]['wind_direction']

dataset.loc[nanidx, 'wind_direction'] = np.nan * np.ones(shape=(int(ratio*dataset.shape[0]),1))

# column_to_impute = ['wind_direction']
# dataset = create_nans(dataset, column_to_impute, 0.2)

print(dataset['wind_direction'].isna().sum())


['latitude', 'longitude', 'altitude', 'wind_direction', 'wind_speed', 'temperature', 'humidity', 'dew_point', 'precipitations']
821


#### Separate the data with NaNs from the others and create the data to use to fit the wind_direction inference model

In [107]:
# nanidx = np.array(np.isnan(dataset['wind_direction']).astype(int)).nonzero()[0]
dataset_to_impute = dataset.iloc[nanidx]

dataset = dataset.drop(index=nanidx)

print(dataset['wind_direction'].isna().sum())


wind_direction = dataset.wind_direction.values

data = dict(
    latitude = dataset.latitude.values,
    longitude = dataset.longitude.values,
    altitude = dataset.altitude.values,
    wind_speed = dataset.wind_speed.values,
    temperature = dataset.temperature.values,
    humidity = dataset.humidity.values,
    dew_point = dataset.dew_point.values,
    precipitations = dataset.precipitations.values,
)

data_to_impute = dict(
    latitude = dataset_to_impute.latitude.values,
    longitude = dataset_to_impute.longitude.values,
    altitude = dataset_to_impute.altitude.values,
    wind_speed = dataset_to_impute.wind_speed.values,
    temperature = dataset_to_impute.temperature.values,
    humidity = dataset_to_impute.humidity.values,
    dew_point = dataset_to_impute.dew_point.values,
    precipitations = dataset_to_impute.precipitations.values,
)

mu_col = dict()
sigma_col = dict()

print('done')

0
done


#### Fit the inference model and predict the NaNs values using MCMC

In [108]:
mcmc = MCMC(NUTS(model_inference_wind_direction), num_warmup=1000, num_samples=1000)
mcmc.run(random.PRNGKey(0), **data, wind_direction=wind_direction, nan_columns = [], mu=mu_col, sigma=sigma_col)
# Print the statistics of posterior samples collected during running this MCMC instance. (documentation)
mcmc.print_summary()

sample: 100%|██████████| 2000/2000 [10:46<00:00,  3.09it/s, 511 steps of size 9.43e-03. acc. prob=0.94] 


                            mean       std    median      5.0%     95.0%     n_eff     r_hat
        bayes_altitude      0.05      0.00      0.05      0.04      0.05    843.54      1.00
       bayes_dew_point      4.29      0.08      4.29      4.17      4.43    209.02      1.00
        bayes_humidity     -1.22      0.03     -1.22     -1.28     -1.18    205.04      1.00
        bayes_latitude      0.01      0.00      0.01      0.01      0.02    777.40      1.00
       bayes_longitude      0.01      0.00      0.01      0.01      0.02    912.83      1.00
  bayes_precipitations      0.27      0.01      0.27      0.25      0.29    514.32      1.00
     bayes_temperature     -3.34      0.07     -3.35     -3.46     -3.24    206.57      1.00
      bayes_wind_speed      0.41      0.01      0.41      0.40      0.42    596.05      1.00
               default      0.55      0.01      0.55      0.53      0.57    219.65      1.00
                 sigma      0.19      0.00      0.19      0.19      0




In [110]:
from numpyro.infer import Predictive

posterior = mcmc.get_samples()
imputed_wind_direction = Predictive(model_inference_wind_direction, posterior)(random.PRNGKey(1), **data_to_impute,  nan_columns = [])["wind_direction"]
imputed_wind_direction = imputed_wind_direction.mean(axis=0)
print(imputed_wind_direction)

[0.5168244  0.5720476  0.46291927 0.53275234 0.52036256 0.58803654
 0.6444264  0.5604491  0.59646636 0.46356136 0.44745475 0.4814365
 0.46347293 0.6227878  0.5474984  0.46606064 0.42729208 0.42836532
 0.56511474 0.56706834 0.5496394  0.56678545 0.5541903  0.5637314
 0.65486413 0.37978706 0.48043746 0.5382263  0.56331927 0.47657967
 0.5330153  0.5557544  0.5605088  0.48071998 0.5136304  0.54831254
 0.7503254  0.48924768 0.56286377 0.5210387  0.48444483 0.5598504
 0.44455767 0.5128112  0.64183354 0.50881094 0.44235817 0.48448855
 0.36878693 0.5361409  0.35595688 0.631098   0.581248   0.64907986
 0.46712467 0.5839535  0.5088763  0.63443595 0.58829314 0.46381167
 0.5940712  0.42956138 0.6038817  0.4795781  0.564923   0.59653765
 0.6537263  0.4132622  0.65263826 0.4647091  0.5498843  0.5468058
 0.50139153 0.5938816  0.4911879  0.6545148  0.6172427  0.44738272
 0.5863588  0.6068583  0.62498045 0.5758318  0.6407848  0.34550214
 0.5602764  0.5603523  0.51498866 0.437353   0.60678655 0.49965245

In [111]:
from sklearn.metrics import mean_absolute_error

def m_mape(y_true, y_predict):
    n = len(y_true)
    At = np.array(y_true) + 1
    Ft = np.array(y_predict) + 1

    res = ((100/n)*(np.sum(np.abs((Ft-At)/At))))
    return res

print(true_wind_direction)
print('______________________')
print(imputed_wind_direction)

print("MAPE : ", m_mape(true_wind_direction, imputed_wind_direction))
print("Mean Absolute Error : ", mean_absolute_error(true_wind_direction, imputed_wind_direction))

22953    0.638160
68189    0.584990
34520    0.565933
3536     0.435478
69060    0.410893
           ...   
57205    0.642513
10219    0.654982
22149    0.520762
57103    0.699800
53691    0.518527
Name: wind_direction, Length: 821, dtype: float64
______________________
[0.5168244  0.5720476  0.46291927 0.53275234 0.52036256 0.58803654
 0.6444264  0.5604491  0.59646636 0.46356136 0.44745475 0.4814365
 0.46347293 0.6227878  0.5474984  0.46606064 0.42729208 0.42836532
 0.56511474 0.56706834 0.5496394  0.56678545 0.5541903  0.5637314
 0.65486413 0.37978706 0.48043746 0.5382263  0.56331927 0.47657967
 0.5330153  0.5557544  0.5605088  0.48071998 0.5136304  0.54831254
 0.7503254  0.48924768 0.56286377 0.5210387  0.48444483 0.5598504
 0.44455767 0.5128112  0.64183354 0.50881094 0.44235817 0.48448855
 0.36878693 0.5361409  0.35595688 0.631098   0.581248   0.64907986
 0.46712467 0.5839535  0.5088763  0.63443595 0.58829314 0.46381167
 0.5940712  0.42956138 0.6038817  0.4795781  0.564923   0.5965

#### Predict ground_truth

In [112]:
dataset_to_impute['wind_direction'] = imputed_wind_direction
dataset = pd.concat([dataset, dataset_to_impute], axis=0)
ground_truth = dataset.ground_truth.values

print(dataset['wind_direction'].isna().sum())

data = dict(
    latitude = dataset.latitude.values,
    longitude = dataset.longitude.values,
    altitude = dataset.altitude.values,
    wind_direction = dataset.wind_direction.values,
    wind_speed = dataset.wind_speed.values,
    temperature = dataset.temperature.values,
    humidity = dataset.humidity.values,
    dew_point = dataset.dew_point.values,
    precipitations = dataset.precipitations.values,
)

mu_col = dict()
sigma_col = dict()

0


In [113]:
mcmc = MCMC(NUTS(model_ground_truth), num_warmup=1000, num_samples=1000)
mcmc.run(random.PRNGKey(0), **data, ground_truth=ground_truth, nan_columns = [], mu=mu_col, sigma=sigma_col)
# Print the statistics of posterior samples collected during running this MCMC instance. (documentation)
mcmc.print_summary()

sample: 100%|██████████| 2000/2000 [04:43<00:00,  7.05it/s, 127 steps of size 2.73e-02. acc. prob=0.94]



                            mean       std    median      5.0%     95.0%     n_eff     r_hat
        bayes_altitude      0.46      0.07      0.46      0.35      0.56    987.55      1.00
       bayes_dew_point      0.68      0.64      0.72     -0.28      1.83    288.92      1.00
        bayes_humidity      0.49      0.23      0.49      0.09      0.87    256.55      1.00
        bayes_latitude      0.08      0.07      0.08     -0.02      0.18   1018.01      1.00
       bayes_longitude     -0.18      0.06     -0.18     -0.27     -0.09    592.50      1.00
  bayes_precipitations     13.48      0.27     13.48     13.04     13.94   1278.73      1.00
     bayes_temperature     -1.13      0.53     -1.15     -2.02     -0.28    280.21      1.00
  bayes_wind_direction      0.71      0.07      0.71      0.59      0.83    633.82      1.00
      bayes_wind_speed      4.71      0.17      4.70      4.45      4.99    537.05      1.00
               default      0.18      0.12      0.18     -0.03      0

In [114]:
from numpyro.infer import Predictive

posterior = mcmc.get_samples()
ground_truth_pred = Predictive(model_ground_truth, posterior)(random.PRNGKey(1), **data,  nan_columns = [])["ground_truth"]
ground_truth_pred = ground_truth_pred.mean(axis=0)

print(len(ground_truth_pred))
print(ground_truth_pred)
print(ground_truth)

82120
[1.7830335 3.3896437 4.306458  ... 1.5338191 2.7733316 1.2954884]
[ 3.4 11.7  0.6 ...  0.2  0.   0.4]


In [115]:
from sklearn.metrics import mean_absolute_error

def m_mape(y_true, y_predict):
    n = len(y_true)
    At = np.array(y_true) + 1
    Ft = np.array(y_predict) + 1

    res = ((100/n)*(np.sum(np.abs((Ft-At)/At))))
    return res

print("MAPE : ", m_mape(ground_truth, ground_truth_pred))
print("Mean Absolute Error : ", mean_absolute_error(ground_truth, ground_truth_pred))

MAPE :  115.1272251514365
Mean Absolute Error :  2.411414615989577
