# Notebook description

This is an example notebook that generates a semi-synthetic dataset based on the IHDP covariates. The individual
treatment effect is linear in one of the covariates (`x_8`) in this example. No noise is added to the outcomes either:
the observed outcomes equal their noiseless counterparts.
The basic structure is taken from:
https://justcause.readthedocs.io/en/latest/usage.html#quick-overview

In [24]:
from sklearn.utils import check_random_state  # ensures usable random state
from justcause.data.utils import generate_data
import numpy as np
from numpy.random import RandomState
from scipy.special import expit
from sklearn.utils import check_random_state
from justcause.data.sets.ihdp import get_ihdp_covariates

# Load the IHDP covariates; later cells pass `covs` to the treatment function.
covs = get_ihdp_covariates()

In [37]:
def outcome(covariates, *, random_state: RandomState, **kwargs):
    """Generate potential outcomes whose treatment effect follows ``x_8``.

    Args:
        covariates: table of covariates; must provide an ``"x_8"`` column
            and support ``len()``.
        random_state: seed or random state, passed through
            ``check_random_state`` before use.
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        Tuple ``(mu_0, mu_1, y_0, y_1)``. No separate noise is added in this
        example, so ``mu_0`` is ``y_0`` and ``mu_1`` is ``y_1``.
    """
    rng = check_random_state(random_state)
    n_units = len(covariates)

    # Treatment effect: drawn around x_8 with scale 0. The call is kept
    # exactly as-is so the sequence of generator calls stays unchanged.
    effect = rng.normal(covariates["x_8"], 0, size=n_units)

    # Untreated outcome is drawn from N(0, 0.2); the treated outcome shifts
    # it by the individual effect.
    untreated = rng.normal(0, 0.2, size=n_units)
    treated = untreated + effect

    # Noiseless and observed outcomes coincide in this example.
    return untreated, treated, untreated, treated

In [41]:
def treatment(covariates, *, random_state: RandomState, **kwargs):
    """Draw a binary treatment indicator for every row of ``covariates``.

    Each unit is treated with probability 0.5; the covariate values
    themselves are not used, only their number.

    Args:
        covariates: table of covariates; only ``len()`` is taken.
        random_state: seed or random state, passed through
            ``check_random_state`` before use.
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        Array of 0/1 draws with one entry per row of ``covariates``.
    """
    n_units = len(covariates)
    rng = check_random_state(random_state)
    assignment = rng.binomial(1, 0.5, size=n_units)
    return assignment

# Sanity check: draw one treatment assignment with a fixed seed and print it
# together with the covariates it was drawn for.
t = treatment(covs, random_state=0)
print(covs)
print(t)

         x_0       x_1       x_2       x_3       x_4       x_5  x_6  x_7  x_8  \
0   1.397395  0.996346 -1.105624 -0.879606  0.308569 -1.023402  1.0  0.0  0.0   
1   0.269033  0.196818  0.383828  0.161703 -0.629189  1.460832  1.0  0.0  1.0   
2   1.051537  1.795874 -1.105624  0.161703 -0.629189  0.963985  1.0  0.0  1.0   
3   0.662446  0.196818 -0.733261 -0.879606  0.371086 -0.692171  1.0  0.0  0.0   
4   0.856992  1.795874  0.011465 -0.879606  0.558638  0.301522  0.0  1.0  1.0   
..       ...       ...       ...       ...       ...       ...  ...  ...  ...   
42  0.381437  0.196818 -1.105624 -0.879606  0.808706 -0.692171  1.0  0.0  1.0   
17 -1.283006 -1.402238  1.128554 -0.879606  0.058500 -1.189018  1.0  0.0  0.0   
52  1.138001  0.996346 -0.733261  0.161703  0.746189  2.123294  0.0  0.0  1.0   
70 -1.853672 -2.201766  1.500917  0.161703 -0.129052 -1.023402  1.0  0.0  0.0   
68  0.381437 -0.202946 -0.733261 -0.879606  0.808706 -1.520249  1.0  0.0  0.0   

    x_9  ...  x_15  x_16  x

In [42]:
# Build the replications from the IHDP covariates using the treatment and
# outcome functions defined in this notebook.
replications = generate_data(
    covs,  # covariates are bound to `covs`; no module-level `covariates` exists
    treatment,
    outcome,
    n_samples=747,  # Optional but 747 is the maximum available with IHDP covariates
    n_replications=100,
    random_state=0,  # Fix random_state for replicability
)

In [43]:
# Inspect the result returned by generate_data.
print(replications)

[            x_0       x_1       x_2       x_3       x_4       x_5  x_6  x_7  \
0      0.251740  0.596582 -0.733261 -0.879606  1.871499 -0.360940  1.0  0.0   
100   -0.115735 -0.602710  0.011465 -0.879606  0.558638 -1.520249  1.0  1.0   
200    1.159618 -0.202946 -0.733261 -0.879606  0.808706 -0.360940  0.0  0.0   
300    0.748911  0.596582 -0.733261 -0.879606 -0.504155  0.798369  0.0  0.0   
400    0.900224  1.396110 -0.360898  2.244320 -2.004568  0.632754  0.0  0.0   
...         ...       ...       ...       ...       ...       ...  ...  ...   
74200  0.316588  0.596582 -0.733261  0.161703 -0.191569 -0.360940  1.0  1.0   
74300  0.513295  0.596582  0.756191  1.203011 -0.066534  2.620141  1.0  0.0   
74400  0.965072  1.396110 -1.105624 -0.879606  0.746189  0.301522  1.0  0.0   
74500 -1.607248 -1.402238  0.756191  0.161703 -1.441913 -1.023402  0.0  0.0   
74600 -0.310280  0.196818  0.756191  2.244320  0.746189  0.798369  0.0  0.0   

       x_8  x_9  ...  x_24  sample_id  t         y