In [1]:
import datetime
import numpy as np
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow_probability as tfp
import tensorflow.keras.layers as tfkl
tfd,tfpl = tfp.distributions,tfp.layers
import tensorflow.keras.backend as tfkb
from tensorflow.keras.callbacks import Callback
from sklearn.preprocessing import StandardScaler
# from networks import fc_net, p_x_z, p_t_z, p_y_tz, q_t_x, q_y_tx, q_z_txy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from tensorflow.keras.optimizers import SGD
from evaluation import *
#################################IHDP Data
# data information 
t_dim = 1
y_dim, default_y_scale = 1,tf.exp(0.)
M = None        # batch size during training
z_dim = 20          # latent z dimension
lamba = 1e-4    # weight decay
nh, h = 3, 200  # number and size of hidden layers
binfeats = [i for i in np.arange(6,25,1)]
numfeats = [i for i in range(6)]
x_bin_dim = len(binfeats)
x_num_dim = len(numfeats)
################################################
activation_global = 'elu'

def fc_net(input_shape, layers, out_layers = [], activation = activation_global, lamba = 1e-4):
    net = tfk.Sequential([tfkl.InputLayer([input_shape])])
    for hidden in layers:
        net.add(tfkl.Dense(
            hidden, 
            activation = activation,
            kernel_regularizer = tf.keras.regularizers.l2(lamba),
            kernel_initializer='RandomNormal',
            )
        )
    if len(out_layers) > 0:
        [outdim, activation_out] = out_layers
        net.add(tfkl.Dense(outdim, activation = activation_out))
    return net

2022-03-08 22:08:22.291167: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#@title First load the data! (Click Play)
import numpy as np
from sklearn.preprocessing import StandardScaler
!wget -nc http://www.fredjo.com/files/ihdp_npci_1-100.train.npz
!wget -nc http://www.fredjo.com/files/ihdp_npci_1-100.test.npz 

def load_IHDP_data(training_data,testing_data,i):
    with open(training_data,'rb') as trf, open(testing_data,'rb') as tef:
        train_data=np.load(trf); test_data=np.load(tef)
        y=np.concatenate(   (train_data['yf'][:,i],   test_data['yf'][:,i])).astype('float32') #most GPUs only compute 32-bit floats
        t=np.concatenate(   (train_data['t'][:,i],    test_data['t'][:,i])).astype('float32')
        x=np.concatenate(   (train_data['x'][:,:,i],  test_data['x'][:,:,i]),axis=0).astype('float32')
        mu_0=np.concatenate((train_data['mu0'][:,i],  test_data['mu0'][:,i])).astype('float32')
        mu_1=np.concatenate((train_data['mu1'][:,i],  test_data['mu1'][:,i])).astype('float32')
        ycf=np.concatenate((train_data['ycf'][:,i],  test_data['ycf'][:,i])).astype('float32')

        data={'x':x,'t':t,'y':y,'t':t,'mu_0':mu_0,'mu_1':mu_1}
        data['t']=data['t'].reshape(-1,1) #we're just padding one dimensional vectors with an additional dimension 
        data['y']=data['y'].reshape(-1,1)
        data['ycf'] = ycf.reshape(-1,1)
        
        #rescaling y between 0 and 1 often makes training of DL regressors easier
        data['y_scaler'] = StandardScaler().fit(data['y'])
        data['ys'] = data['y_scaler'].transform(data['y'])

    return data

ind = 7
rep = 1
data = load_IHDP_data(training_data='./ihdp_npci_1-100.train.npz',testing_data='./ihdp_npci_1-100.test.npz',i = ind)
for key in data:
    if key != 'y_scaler':
        data[key] = np.repeat(data[key],repeats = rep, axis = 0)
data['x'][0,],binfeats,numfeats

文件 “ihdp_npci_1-100.train.npz” 已经存在；不获取。

文件 “ihdp_npci_1-100.test.npz” 已经存在；不获取。



(array([-0.65613806, -1.0024741 , -0.360898  ,  0.16170253,  0.24605164,
        -0.8577868 ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ],
       dtype=float32),
 [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
 [0, 1, 2, 3, 4, 5])

In [3]:
# class CEVAE(tf.keras.Model):
#     def __init__(self):
#         super(CEVAE, self).__init__()
#         ########################################
#         # networks
#         self.activation = 'elu'
#         # CEVAE Model (decoder)
#         self.t_dim = t_bin_dim
#         self.q_y_xt_shared_hqy = fc_net(x_bin_dim + x_num_dim, (nh - 1) * [h], [])
#         self.q_y_xt0_mu = fc_net(h, [h], [y_dim, None])
#         self.q_y_xt1_mu = fc_net(h, [h], [y_dim, None])

#     def call(self, data, training=False):
#         if training:
#             x,t = data[0],data[1]
#             hqy = self.q_y_xt_shared_hqy(x)
#             qy_t0_mu = self.q_y_xt0_mu(hqy)
#             qy_t1_mu = self.q_y_xt1_mu(hqy)
#             # y_loc =  t * qy_t1_mu + (1-t) * qy_t0_mu
#             # return tfd.Normal(
#             #     loc =  y_loc, 
#             #     scale = tf.ones_like(y_loc),
#             #     )
#             y0 = tfd.Normal(
#                 loc =  qy_t0_mu, 
#                 scale = tf.ones_like(qy_t0_mu),
#                 )
#             y1 = tfd.Normal(
#                 loc =  qy_t1_mu, 
#                 scale = tf.ones_like(qy_t1_mu),
#                 )
#             return y0,y1
#         else:
#             x = data
#             hqy = self.q_y_xt_shared_hqy(x)
#             qy_t0_mu = self.q_y_xt0_mu(hqy)
#             qy_t1_mu = self.q_y_xt1_mu(hqy)

#             y0 = tfd.Normal(
#                 loc =  qy_t0_mu, 
#                 scale = tf.ones_like(qy_t0_mu),
#                 )
#             y1 = tfd.Normal(
#                 loc =  qy_t1_mu, 
#                 scale = tf.ones_like(qy_t1_mu),
#                 )
#             return y0,y1


#     def cevae_loss(self, data, pred, training = False):
#         # if training:
#         #     _, t_train, y_train = data[0],data[1],data[2]
#         #     y_pred = pred
#         #     loss = y_pred.log_prob(y_train)
#         #     loss = -tfkb.mean(loss)
#         #     return lossxs
#         # else:
#         #     _, t_train, y_train = data[0],data[1],data[2]
#         #     y0,y1 = pred
#         #     loss = y0.log_prob(y_train)*(1-t_train) + y1.log_prob(y_train)* t_train
#         #     loss = -tfkb.mean(loss)
#         #     return loss
#         _, t_train, y_train = data[0],data[1],data[2]
#         y0,y1 = pred
#         loss = y0.log_prob(y_train)*(1-t_train) + y1.log_prob(y_train)* t_train
#         loss = -tfkb.mean(loss)
#         return loss

#     def train_step(self, data):
#         data = data[0]
#         x,t,_ = data
        
#         with tf.GradientTape() as tape:
#             pred = self([x,t], training=True)  # Forward pass
#             loss = self.cevae_loss(data = data, pred = pred, training = True)
#         # Compute gradients
#         trainable_vars = self.trainable_variables
#         gradients = tape.gradient(loss, trainable_vars)
#         # Update weights
#         self.optimizer.apply_gradients(zip(gradients, trainable_vars))
#         metrics = {
#             "loss": loss,
#         }
#         return metrics

#     def test_step(self, data):
#         # Unpack the data. Its structure depends on your model and
#         # on what you pass to `fit()`.
#         data = data[0]
#         x,t,y = data
#         with tf.GradientTape() as tape:
#             pred = self(x, training=False)  # Forward pass
#             loss = self.cevae_loss(data = data, pred = pred, training = False)
#             y0, y1 = pred[0].sample(),pred[1].sample()
#         metrics = {"loss":loss,"y0": tfkb.mean(y0),"y1": tfkb.mean(y1)}
#         return metrics


In [4]:
class metrics_for_cevae(Callback):
    def __init__(self,data, verbose=0):   
        super(metrics_for_cevae, self).__init__()
        self.data=data #feed the callback the full dataset
        self.verbose=verbose

        #needed for PEHEnn; Called in self.find_ynn
        self.data['o_idx']=tf.range(self.data['t'].shape[0])
        self.data['c_idx']=self.data['o_idx'][self.data['t'].squeeze()==0] #These are the indices of the control units
        self.data['t_idx']=self.data['o_idx'][self.data['t'].squeeze()==1] #These are the indices of the treated units
        # ['x', 't', 'y', 'mu_0', 'mu_1', 'y_scaler', 'ys', 'o_idx', 'c_idx', 't_idx']
        self.y = tf.cast(data['y'],tf.float32)
        self.t = tf.cast(data['t'],tf.float32)
        self.y_cf = tf.cast(data['ycf'],tf.float32)
        self.mu0 = tf.cast(data['mu_0'],tf.float32)
        self.mu1 = tf.cast(data['mu_1'],tf.float32)
        if self.mu0 is not None and self.mu1 is not None:
            self.true_ite = self.mu1 - self.mu0

    def rmse_ite(self, ypred1, ypred0):
        idx1, idx0 = self.t, 1-self.t
        ite1, ite0 = (self.y - ypred0) * idx1, (ypred1 - self.y)*idx0
        pred_ite = ite1 + ite0
        return tf.math.sqrt(tfkb.mean(tf.math.square(self.true_ite - pred_ite)))

    def abs_ate(self, ypred1, ypred0):
        return tf.math.abs(tfkb.mean(ypred1 - ypred0) - tfkb.mean(self.true_ite))

    def pehe(self, ypred1, ypred0):
        return tf.math.sqrt(tfkb.mean(tf.math.square((self.mu1 - self.mu0) - (ypred1 - ypred0))))

    def y_errors(self, y0, y1):
        ypred = (1 - self.t) * y0 + self.t * y1
        ypred_cf = self.t * y0 + (1 - self.t) * y1
        return self.y_errors_pcf(ypred, ypred_cf)

    def y_errors_pcf(self, ypred, ypred_cf):
        rmse_factual = tf.math.sqrt(tfkb.mean(tf.math.square(ypred - self.y)))
        rmse_cfactual = tf.math.sqrt(tfkb.mean(tf.math.square(ypred_cf - self.y_cf)))
        return rmse_factual, rmse_cfactual

    def calc_stats(self, ypred1, ypred0):
        ite = self.rmse_ite(ypred1, ypred0)
        ate = self.abs_ate(ypred1, ypred0)
        pehe = self.pehe(ypred1, ypred0)
        return ite, ate, pehe

    def get_concat_pred(self,pred):
        ypred0, ypred1 = pred
        ypred0 = ypred0.sample()
        ypred1 = ypred1.sample()
        try:
            y_pred0,y_pred1 = self.data['y_scaler'].inverse_transform(ypred0),self.data['y_scaler'].inverse_transform(ypred1)
        except:
            y_pred0 = self.data['y_scaler'].inverse_transform(tf.expand_dims(ypred0,-1))
            y_pred1 = self.data['y_scaler'].inverse_transform(tf.expand_dims(ypred1,-1))
        y_pred0, y_pred1 = tf.squeeze(y_pred0),tf.squeeze(y_pred1)
        return tf.cast(y_pred0,tf.float32), tf.cast(y_pred1,tf.float32)

    def on_epoch_end(self, epoch, logs={}):
        pred = self.model(self.data['x'])
        y_infer = pred[0]
        ypred0, ypred1 = self.get_concat_pred(y_infer)
        ite, ate, pehe = self.calc_stats(ypred1, ypred0)
        tf.summary.scalar("ate", data=tfkb.mean(ypred1 - ypred0), step=epoch)
        tf.summary.scalar("ite_error", data=ite, step=epoch)
        tf.summary.scalar("ate_error", data=ate, step=epoch)
        tf.summary.scalar("pehe_error",data=pehe, step=epoch)
        
        out_str=f' — ite: {ite:.4f}  — ate: {ate:.4f} — pehe: {pehe:.4f} '
        
        if self.verbose > 0: print(out_str)

In [5]:
from networks import *
class CEVAE(tf.keras.Model):
    def __init__(self):
        super(CEVAE, self).__init__()
        ########################################
        # networks
        self.activation = 'elu'
        # CEVAE Model 
        ## (encoder)
        self.q_y_tx = q_y_tx(x_bin_dim, x_num_dim, y_dim, t_dim, nh, h)
        self.q_t_x = q_t_x(x_bin_dim, x_num_dim, t_dim, nh, h)
        self.q_z_txy = q_z_txy(x_bin_dim, x_num_dim, y_dim, t_dim, z_dim, nh,h)
        ## (decoder)
        self.p_x_z = p_x_z(x_bin_dim, x_num_dim, z_dim, nh, h)
        self.p_t_z = p_t_z(t_dim, z_dim, nh, h)
        self.p_y_tz = p_y_tz(y_dim, t_dim, z_dim, nh, h)
        

    def call(self, data, training=False):
        if training:
            x_train,t_train = data
            # encoder
            t_infer = self.q_t_x(x_train)
            t_infer_sample = tf.cast(t_infer.sample(), tf.float32)
            
            y_infer = self.q_y_tx(x_train)
            y0_infer, y1_infer = y_infer
            y_infer_sample = y0_infer.sample() * (1-t_infer_sample) + y1_infer.sample() * t_infer_sample
            
            txy = tf.concat([tf.cast(t_infer_sample,tf.float32), y_infer_sample, x_train],-1)
            z_infer = self.q_z_txy(txy)
            z_infer_sample = z_infer.sample()
            # decoder
            ## p(x|z)
            x_con,x_bin = self.p_x_z(z_infer_sample)
            ## p(t|z)
            t = self.p_t_z(z_infer_sample)
            ## p(y|t,z)
            y = self.p_y_tz(tf.concat([t_train,z_infer_sample],-1) )
            
            return y_infer,t_infer,z_infer,y,t,x_con,x_bin
        else:
            x_train = data
            # encoder
            t_infer = self.q_t_x(x_train)
            t_infer_sample = tf.cast(t_infer.sample(), tf.float32)
            
            y_infer = self.q_y_tx(x_train)
            y0_infer, y1_infer = y_infer
            y_infer_sample = y0_infer.sample() * (1-t_infer_sample) + y1_infer.sample() * t_infer_sample
            
            txy = tf.concat([tf.cast(t_infer_sample,tf.float32), y_infer_sample, x_train],-1)
            z_infer = self.q_z_txy(txy)
            z_infer_sample = z_infer.sample()
            return y_infer,t_infer,z_infer


    def cevae_loss(self, data, pred, training = False):
        x_train, t_train, y_train = data[0],data[1],data[2]
        x_train_num, x_train_bin = x_train[:,:x_num_dim],x_train[:,x_num_dim:]
        y_infer,t_infer,z_infer,y,t,x_con,x_bin = pred
        y0,y1 = y_infer

        # reconstruct loss
        recon_x_num = x_bin.log_prob(x_train_num)
        tf.print(tf.shape(recon_x_num))
        recon_x_num = tfkb.sum(recon_x_num,1)
        tf.print(tf.shape(recon_x_num))
        recon_y = y.log_prob(y_train)
        recon_t = t.log_prob(t_train)

        # kl loss
        z_infer_sample = z_infer.sample()
        z = tfd.Normal(loc = [0] * 20, scale = [1]*20)
        kl_z = tfkb.sum((z.log_prob(z_infer_sample) - z_infer.log_prob(z_infer_sample)), -1)
        
        # aux loss
        aux_y = y0.log_prob(y_train)*(1-t_train) + y1.log_prob(y_train)* t_train
        aux_t = t_infer.log_prob(t_train)
    

        loss = -tfkb.mean( recon_y + recon_t + aux_y + aux_t + kl_z)
        return loss

    def train_step(self, data):
        data = data[0]
        x,t,y = data
        
        with tf.GradientTape() as tape:
            pred = self([x,t], training=True)  # Forward pass
            loss = self.cevae_loss(data = data, pred = pred, training = True)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        metrics = {
            "loss": loss,
        }
        return metrics

    def test_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        data = data[0]
        x,t,y = data
        with tf.GradientTape() as tape:
            pred = self([x,t], training=True)  # Forward pass
        y_infer = pred[0]
        loss = self.cevae_loss(data = data, pred = pred, training = False)
        y0, y1 = y_infer[0].sample(),y_infer[1].sample()
        metrics = {"loss":loss,"y0": tfkb.mean(y0),"y1": tfkb.mean(y1)}
        return metrics


In [6]:
#Colab command to allow us to run Colab in TF2
!rm -rf ./logs/ 
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
file_writer.set_as_default()
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
%reload_ext tensorboard 

model = CEVAE()
### MAIN CODE ####
val_split=0.2
batch_size=64
verbose=True
i = 0
tf.random.set_seed(i)
np.random.seed(i)
 
callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=40, min_delta=0), 
        #40 is Shi's recommendation patience for this dataset, but you should tune for your data 
        ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto',
                          min_delta=0, cooldown=0, min_lr=0),
        #This learning rate scheduling is quite agressive which seems good for this dataset
        metrics_for_cevae(data,verbose),
        tensorboard_callback
    ]
    
#optimizer hyperparameters
learning_rate = 5e-5
model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate = learning_rate, 
        # momentum = momentum, 
        # nesterov=True
        )
    )

model.fit(
    [data['x'],data['t'],data['ys']],
    callbacks=callbacks,
    validation_split=val_split,
    epochs=140,
    batch_size=200,
    verbose=verbose
    )
print("Done!")

Epoch 1/140
[200 6]
[200]
[200]
[197 6]
[197]
[150 6]
[150]
 — ite: 4.2017  — ate: 2.3762 — pehe: 4.7951 
Epoch 2/140
[200 6]
[200]
[200]
[197 6]
[197]
[150 6]
[150]
 — ite: 4.1992  — ate: 2.0923 — pehe: 4.6289 


In [None]:
%tensorboard --logdir logs/fit