In [1]:
!date

Sun Feb 27 18:07:25 PST 2022


In [2]:
import altair
import beanmachine.ppl as bm
import numpy as np
import pandas as pd
import torch
import torch.distributions as dist

In [29]:
BLUE = "#4e79a7"
ORANGE = "#f28e2b"
RED = "#e15759"
CYAN = "#76b7b2"
GREEN = "#59a14f"
YELLOW = "#edc948"
PURPLE = "#b07aa1"
PINK = "#ff9da7"
BROWN = "#9c755f"
GRAY = "#bab0ac"

In [3]:
x = torch.tensor(
    [0, 3, 9, 14, 15, 19, 20, 21, 30, 35, 40, 41, 42, 43, 54, 56, 67, 69, 72, 88],
    dtype=torch.float32,
)
y = torch.tensor(
    [33, 68, 34, 34, 37, 71, 37, 44, 48, 49, 53, 49, 50, 48, 56, 60, 61, 63, 44, 71],
    dtype=torch.float32,
)
err = torch.tensor(
    [
        3.6,
        3.9,
        2.6,
        3.4,
        3.8,
        3.8,
        2.2,
        2.1,
        2.3,
        3.8,
        2.2,
        2.8,
        3.9,
        3.1,
        3.4,
        2.6,
        3.4,
        3.7,
        2.0,
        3.5,
    ]
)


In [31]:
error_bars = (
    altair.Chart()
    .mark_errorbar(color=BLUE)
    .encode(altair.X("x"), altair.Y("y"), altair.YError("err"))
)
points = (
    altair.Chart()
    .mark_circle(color=BLUE)
    .encode(x=altair.X("x"), y=altair.Y("y", scale=altair.Scale(zero=False)))
)
altair.layer(points, error_bars, data=pd.DataFrame({"x": x, "y": y, "err": err}))


In [5]:
@bm.random_variable
def beta_0() -> bm.RVIdentifier:
    return dist.Normal(0, 10)


@bm.random_variable
def beta_1() -> bm.RVIdentifier:
    return dist.Normal(0, 10)


@bm.random_variable
def sigma_o() -> bm.RVIdentifier:
    return dist.Gamma(1, 1)


@bm.random_variable
def theta() -> bm.RVIdentifier:
    return dist.Beta(2, 5)


@bm.functional
def likelihood(i: int) -> float:
    mu = beta_0() + beta_1() * x[i]
    ll1 = torch.log(theta()) + dist.Normal(mu, sigma_o()).log_prob(y[i])
    ll2 = torch.log(1 - theta()) + dist.Normal(mu, err[i]).log_prob(y[i])
    return torch.logaddexp(ll1, ll2).exp()


@bm.random_variable
def d(i: int) -> bm.RVIdentifier:
    return dist.Bernoulli(likelihood(i))


```
for (n in 1:N) {
  target += log_sum_exp(log(lambda)
                          + normal_lpdf(y[n] | mu[1], sigma[1]),
                        log1m(lambda)
                          + normal_lpdf(y[n] | mu[2], sigma[2]));
}
```

In [6]:
altair.Chart(
    pd.DataFrame().assign(
        theta=torch.arange(0, 1, 0.01),
        p=lambda df: dist.Beta(2, 5).log_prob(torch.tensor(df["theta"])).exp(),
    )
).mark_line().encode(x="theta", y="p")


In [7]:
queries = [beta_0(), beta_1(), sigma_o(), theta()]
observations = {d(i): torch.tensor(1.0) for i in range(len(x))}

In [8]:
samples = bm.GlobalNoUTurnSampler().infer(
    queries=queries,
    observations=observations,
    num_samples=4000,
    num_adaptive_samples=4000,
    num_chains=1,
)

Samples collected: 100%|██████████| 8000/8000 [53:42<00:00,  2.48it/s]     


In [9]:
bm.Diagnostics(samples).summary()

Unnamed: 0,avg,std,2.5%,50%,97.5%,n_eff
beta_0()[],31.532238,1.700976,28.297164,31.52553,34.900478,1761.174072
beta_1()[],0.46222,0.039588,0.382067,0.462829,0.538872,1697.758179
sigma_o()[],12.869358,2.022978,9.426688,12.718417,17.258504,2320.91748
theta()[],0.270068,0.1069,0.095128,0.260704,0.509922,2611.940674


In [33]:
mu = 31.5 + 0.5 * x
z1 = dist.Normal(mu, err).log_prob(y).exp() * 0.73
z2 = dist.Normal(mu, 12.9).log_prob(y).exp() * 0.27

In [34]:
p_outlier = z2 / (z1 + z2)

In [35]:
p_outlier

tensor([0.1006, 1.0000, 0.0901, 0.1805, 0.1100, 1.0000, 0.3247, 0.0856, 0.0749,
        0.0982, 0.0733, 0.1218, 0.1188, 0.2324, 0.1114, 0.0705, 0.1566, 0.1254,
        1.0000, 0.1775])

In [36]:
error_bars = (
    altair.Chart()
    .mark_errorbar()
    .encode(
        altair.X("x"), altair.Y("y"), altair.YError("err"), altair.Color("is_outlier")
    )
)
points = (
    altair.Chart()
    .mark_circle()
    .encode(
        altair.X("x"),
        altair.Y("y", scale=altair.Scale(zero=False)),
        altair.Color("is_outlier"),
    )
)
altair.layer(
    points,
    error_bars,
    data=pd.DataFrame({"x": x, "y": y, "err": err, "is_outlier": p_outlier > 0.5}),
)
