In [5]:
import datetime
import numpy as np
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow_probability as tfp
import tensorflow.keras.layers as tfkl
tfd,tfpl = tfp.distributions,tfp.layers
import tensorflow.keras.backend as tfkb
from tensorflow.keras.callbacks import Callback
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from tensorflow.keras.optimizers import SGD
from evaluation import *
from cevae_networks import *
################################################
# Hyper-parameters are declared through argparse so the same definitions can
# be shared with a command-line script; inside the notebook they are only
# ever read back through the `args` namespace.
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--scale_penalize',    type = float, default = 0.001,  help = '')
parser.add_argument('--learning_rate',     type = float, default = 0.001,  help = '')
parser.add_argument('--default_y_scale',   type = float, default = 1.,  help = '')
# Dimensions of treatment, outcome, covariates and latent variable.
# The default x_dim (25) equals x_num_dim (6) + x_bin_dim (19).
parser.add_argument('--t_dim',     type = int, default = 1,  help = '')
parser.add_argument('--y_dim',     type = int, default = 1,  help = '')
parser.add_argument('--x_dim',     type = int, default = 25, help = '')
parser.add_argument('--z_dim',     type = int, default = 20, help = '')
parser.add_argument('--x_num_dim', type = int, default = 6,  help = '')
parser.add_argument('--x_bin_dim', type = int, default = 19, help = '')
parser.add_argument('--nh', type = int, default = 3, help = 'number of hidden layers')
parser.add_argument('--h',  type = int, default = 200, help = 'number of hidden units')
# An explicit empty argv is passed, so the kernel's own command line is
# ignored and every option takes its default value.
args = parser.parse_args([])
################################################
# Fetch the IHDP train/test archives; -nc (no-clobber) skips the download
# when the file is already present in the working directory.
!wget -nc http://www.fredjo.com/files/ihdp_npci_1-100.train.npz
!wget -nc http://www.fredjo.com/files/ihdp_npci_1-100.test.npz 

def load_IHDP_data(training_data,testing_data,i):
    """Load one replication of the IHDP data from two ``.npz`` archives.

    The arrays of both archives are sliced at index ``i`` along their last
    axis and stacked row-wise (rows of ``training_data`` first), then cast to
    float32.

    Args:
        training_data: path of the first ``.npz`` archive.
        testing_data: path of the second ``.npz`` archive.
        i: index along the last axis of every stored array to extract.

    Returns:
        dict with keys
        ``'x'``   covariates (one row per unit),
        ``'t'``, ``'y'``, ``'ycf'``   column vectors of shape (n, 1),
        ``'mu_0'``, ``'mu_1'``   vectors of shape (n,),
        ``'y_scaler'``   a ``StandardScaler`` fitted on ``'y'``,
        ``'ys'``   ``'y'`` transformed by that scaler.
    """
    with open(training_data,'rb') as trf, open(testing_data,'rb') as tef:
        train_data=np.load(trf); test_data=np.load(tef)

        def stacked(key):
            # Archive members are read lazily from the open file handles, so
            # every access has to happen inside this `with` block.
            # float32 because most GPUs only compute 32-bit floats.
            parts = (train_data[key][...,i], test_data[key][...,i])
            return np.concatenate(parts,axis=0).astype('float32')

        data={'x':stacked('x'),'t':stacked('t'),'y':stacked('yf'),
              'mu_0':stacked('mu0'),'mu_1':stacked('mu1')}
        # pad the one dimensional vectors with an additional trailing dimension
        data['t']=data['t'].reshape(-1,1)
        data['y']=data['y'].reshape(-1,1)
        data['ycf']=stacked('ycf').reshape(-1,1)
        # standardising y (zero mean, unit variance) often makes training of
        # DL regressors easier; the fitted scaler is kept so predictions can
        # be mapped back to the original scale
        data['y_scaler'] = StandardScaler().fit(data['y'])
        data['ys'] = data['y_scaler'].transform(data['y'])
    return data
# Index of the IHDP replication to use; forwarded as `i` to load_IHDP_data.
ind = 7
# Earlier experiment, kept for reference: load train+test together and repeat
# every row `rep` times.
# rep = 5
# rep = 1
# data = load_IHDP_data(training_data='./ihdp_npci_1-100.train.npz',testing_data='./ihdp_npci_1-100.test.npz',i = ind)
# for key in data:
#     if key != 'y_scaler':
#         data[key] = np.repeat(data[key],repeats = rep, axis = 0)
# np.shape(data['x'])
# NOTE(review): each call passes the SAME archive as both `training_data` and
# `testing_data`, and load_IHDP_data concatenates the two, so every row ends up
# in the result twice (the shape displayed below counts the doubled rows).
# Confirm this duplication is intentional.
# NOTE(review): each call also fits its own StandardScaler, so
# data_valid['ys'] is scaled with validation statistics rather than the
# training ones - verify that this is what the evaluation expects.
data_train = load_IHDP_data(training_data='./ihdp_npci_1-100.train.npz',testing_data='./ihdp_npci_1-100.train.npz',i = ind)
data_valid = load_IHDP_data(training_data='./ihdp_npci_1-100.test.npz',testing_data='./ihdp_npci_1-100.test.npz',i = ind)
# Last expression of the cell: displays the (rows, features) shape of the training covariates.
np.shape(data_train['x'])

文件 “ihdp_npci_1-100.train.npz” 已经存在；不获取。

文件 “ihdp_npci_1-100.test.npz” 已经存在；不获取。



(1344, 25)

In [6]:
class EpsilonLayer(tfkl.Layer):
    """Layer that owns a single trainable scalar and emits it once per input row."""

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        # One (1, 1) trainable weight shared by every example.
        self.epsilon = self.add_weight(
            name='epsilon',
            shape=(1, 1),
            initializer='RandomNormal',
            trainable=True,
        )
        # Let the base class finish the build once the weight exists.
        super().build(input_shape)

    def call(self, inputs, **kwargs):
        # Only the leading dimension of `inputs` is used: the single epsilon is
        # repeated into a one-column tensor with one entry per input row.
        column_of_ones = tf.ones_like(inputs[:, :1])
        return column_of_ones * self.epsilon

class CEVAE(tf.keras.Model):
    """CEVAE-style model with hand-written Keras train/test steps.

    The inference networks (``q_*``) and generative networks (``p_*``) are not
    defined in this cell; presumably they come from ``from cevae_networks
    import *``. The code below relies on them returning distribution objects
    that expose ``sample()`` and ``log_prob()`` (plus ``loc`` for the y and z
    heads and ``logits`` for the decoder's treatment head).
    All layer sizes are read from the module-level ``args`` namespace.
    """

    def __init__(self):
        super(CEVAE, self).__init__()
        ########################################
        # networks
        self.activation = 'elu'
        # CEVAE Model
        ## (encoder)
        self.q_y_tx = q_y_tx(args.x_bin_dim, args.x_num_dim, args.y_dim, args.t_dim, args.nh, args.h)
        self.q_t_x = q_t_x(args.x_bin_dim, args.x_num_dim, args.t_dim, args.nh, args.h)
        self.q_z_txy = q_z_txy(args.x_bin_dim, args.x_num_dim, args.y_dim, args.t_dim, args.z_dim, args.nh, args.h)
        ## (decoder)
        self.p_x_z = p_x_z(args.x_bin_dim, args.x_num_dim, args.z_dim, args.nh, args.h)
        self.p_t_z = p_t_z(args.t_dim, args.z_dim, args.nh, args.h)
        self.p_y_tz = p_y_tz(args.y_dim, args.t_dim, args.z_dim, args.nh, args.h)
        self.epsilon_layer = EpsilonLayer()
        # weight of the (currently disabled) targeted-regularisation term
        self.beta = 1

    def _encode(self, x_train):
        """Run the inference networks on x.

        Returns ``(y_infer, t_infer, t_infer_sample, z_infer)`` where
        ``y_infer`` is the ``(y0, y1)`` pair produced by ``q_y_tx``.
        """
        t_infer = self.q_t_x(x_train)
        t_infer_sample = tf.cast(t_infer.sample(), tf.float32)

        y_infer = self.q_y_tx(x_train)
        y0_infer, y1_infer = y_infer
        # pick the outcome head that matches the sampled treatment
        y_infer_sample = y0_infer.sample() * (1-t_infer_sample) + y1_infer.sample() * t_infer_sample

        txy = tf.concat([t_infer_sample, y_infer_sample, x_train],-1)
        z_infer = self.q_z_txy(txy)
        return y_infer, t_infer, t_infer_sample, z_infer

    def call(self, data, training=False):
        """Forward pass.

        With ``training=True`` ``data`` is ``[x, t]`` and the result is the
        8-tuple ``(y_infer, t_infer, z_infer, y, t, x_num, x_bin, epsilon)``
        consumed by ``cevae_loss``.
        With ``training=False`` ``data`` is ``x`` alone and the result is
        ``([y0, y1], t_infer, z_infer)``, where ``y0`` / ``y1`` are the decoder
        outputs for t=0 / t=1 evaluated at the mean of q(z|t,y,x).
        """
        if training:
            x_train,t_train = data
            # encoder
            y_infer, t_infer, t_infer_sample, z_infer = self._encode(x_train)
            z_infer_sample = z_infer.sample()
            # decoder
            ## p(x|z)
            x_num,x_bin = self.p_x_z(z_infer_sample)
            ## p(t|z)
            t = self.p_t_z(z_infer_sample)
            ## p(y|t,z) is conditioned on the observed treatment
            y = self.p_y_tz(tf.concat([t_train,z_infer_sample],-1) )
            epsilon = self.epsilon_layer(t_infer_sample)

            return y_infer,t_infer,z_infer,y,t,x_num,x_bin,epsilon
        else:
            x_train = data
            # encoder
            y_infer, t_infer, t_infer_sample, z_infer = self._encode(x_train)
            # use the posterior mean instead of a sample at prediction time
            z_infer_sample = z_infer.loc

            t1z = tf.concat([tf.ones_like(t_infer_sample),z_infer_sample],-1)
            t0z = tf.concat([tf.zeros_like(t_infer_sample),z_infer_sample],-1)
            y0 = self.p_y_tz(t0z)
            y1 = self.p_y_tz(t1z)
            y = [y0,y1]
            return y,t_infer,z_infer

    def cevae_loss(self, data, pred, training = False):
        """Negative single-sample ELBO plus auxiliary log-likelihood terms.

        Args:
            data: ``(x, t, y)``; the first ``args.x_num_dim`` columns of ``x``
                are scored by ``x_num`` and the remaining columns by ``x_bin``.
            pred: the 8-tuple returned by ``call(..., training=True)``.
            training: unused; kept for interface compatibility.

        Returns:
            Scalar loss (mean over the batch).
        """
        x_train, t_train, y_train = data[0],data[1],data[2]
        x_train_num, x_train_bin = x_train[:,:args.x_num_dim],x_train[:,args.x_num_dim:]
        # call() returns eight values; epsilon has to be unpacked here as well
        y_infer,t_infer,z_infer,y,t,x_num,x_bin,epsilon = pred
        y0,y1 = y_infer
        # reconstruct loss
        recon_x_num = tfkb.sum(x_num.log_prob(x_train_num), 1)
        recon_x_bin = tfkb.sum(x_bin.log_prob(x_train_bin), 1)
        recon_y = tfkb.sum(y.log_prob(y_train), 1)
        recon_t = tfkb.sum(t.log_prob(t_train), 1)
        # kl loss: Monte-Carlo estimate of log p(z) - log q(z|.), i.e. minus the KL
        z_infer_sample = z_infer.sample()
        # standard-normal prior sized from the configured latent dimension
        z = tfd.Normal(loc = tf.zeros(args.z_dim), scale = tf.ones(args.z_dim))
        kl_z = tfkb.sum((z.log_prob(z_infer_sample) - z_infer.log_prob(z_infer_sample)), -1)
        # aux loss: fit the inference heads to the observed t and y
        aux_y = tfkb.sum(y0.log_prob(y_train)*(1-t_train) + y1.log_prob(y_train)* t_train, 1)
        aux_t = tfkb.sum(t_infer.log_prob(t_train), 1)
        loss = -tfkb.mean(recon_x_bin + recon_x_num + recon_y + recon_t + aux_y + aux_t + kl_z)
        # target regularization (computed but currently NOT added to the loss)
        y_pred = y0.loc * (1-t_train) + y1.loc * t_train
        t_pred = tf.math.sigmoid(t.logits)
        cc = t_train/t_pred - (1-t_train) / (1-t_pred)
        t_reg = tf.math.square(y_pred + epsilon * cc - y_train)
        # loss += tfkb.mean(t_reg) * self.beta
        return loss

    def train_step(self, data):
        """One optimisation step; ``data[0]`` is the ``[x, t, y]`` list given to ``fit``."""
        data = data[0]
        x,t,y = data
        with tf.GradientTape() as tape:
            pred = self([x,t], training=True)  # Forward pass
            loss = self.cevae_loss(data = data, pred = pred, training = True)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        metrics = {"loss": loss}
        return metrics

    def test_step(self, data):
        """Validation step: loss plus sample-based y0 / y1 / ATE summaries (on the scaled outcome)."""
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        data = data[0]
        x,t,y = data
        # training=True is needed here only to obtain the full output tuple
        # that cevae_loss expects; no gradients are taken, so no tape is opened.
        # NOTE(review): Keras presumably propagates this flag to nested layers,
        # which would then run in training mode - confirm that is acceptable.
        pred = self([x,t], training=True)  # Forward pass
        y_infer = pred[0]
        loss = self.cevae_loss(data = data, pred = pred, training = False)
        y0, y1 = y_infer[0].sample(),y_infer[1].sample()
        ate = tfkb.mean(y1) - tfkb.mean(y0)
        metrics = {"loss":loss,"y0": tfkb.mean(y0),"y1": tfkb.mean(y1),'ate_afte_scaled': ate}
        return metrics


In [7]:
#Colab command to allow us to run Colab in TF2
!rm -rf ./logs/ 
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
file_writer.set_as_default()
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
%reload_ext tensorboard 

model = CEVAE()
### MAIN CODE ####
val_split=0.2
batch_size=64
verbose=True
i = 0
tf.random.set_seed(i)
np.random.seed(i)
 
callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='var_loss', patience=40, min_delta=0), 
        #40 is Shi's recommendation patience for this dataset, but you should tune for your data 
        ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=verbose, mode='auto',
                          min_delta=0, cooldown=0, min_lr=0),
        #This learning rate scheduling is quite agressive which seems good for this dataset
        metrics_for_cevae(data_train,'train',verbose),
        metrics_for_cevae(data_valid,'valid',verbose),
        tensorboard_callback
    ]
    
#optimizer hyperparameters
learning_rate = 5e-5
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = learning_rate))

model.fit(
    [data_train['x'],data_train['t'],data_train['ys']],
    callbacks=callbacks,
    validation_data=[[data_valid['x'],data_valid['t'],data_valid['ys']]],
    epochs=180,
    batch_size=200,
    verbose=verbose
    )
print("Done!")

Epoch 1/180
 — ite: 5.4625  — ate: 3.7916 — pehe: 5.4625 
 — ite: 5.3001  — ate: 3.0135 — pehe: 5.3001 
Epoch 2/180
 — ite: 5.2954  — ate: 3.5478 — pehe: 5.2954 
 — ite: 5.6729  — ate: 3.4777 — pehe: 5.6729 
Epoch 3/180
 — ite: 5.3299  — ate: 3.6891 — pehe: 5.3299 
 — ite: 5.8791  — ate: 3.4839 — pehe: 5.8791 
Epoch 4/180
 — ite: 5.1054  — ate: 3.3641 — pehe: 5.1054 
 — ite: 5.5572  — ate: 2.6706 — pehe: 5.5572 
Epoch 5/180
 — ite: 5.2716  — ate: 3.5804 — pehe: 5.2716 
 — ite: 5.7522  — ate: 3.0596 — pehe: 5.7522 
Epoch 6/180
 — ite: 5.3037  — ate: 3.4423 — pehe: 5.3037 
 — ite: 5.8085  — ate: 3.1676 — pehe: 5.8085 
Epoch 7/180
 — ite: 5.1549  — ate: 3.3192 — pehe: 5.1549 
 — ite: 5.6572  — ate: 2.8054 — pehe: 5.6572 
Epoch 8/180
 — ite: 4.9609  — ate: 3.0710 — pehe: 4.9609 
 — ite: 5.3555  — ate: 2.0377 — pehe: 5.3555 
Epoch 9/180
 — ite: 4.8803  — ate: 2.8854 — pehe: 4.8803 
 — ite: 5.4052  — ate: 2.5566 — pehe: 5.4052 
Epoch 10/180
 — ite: 4.7231  — ate: 2.7125 — pehe: 4.7231 
 — it

In [8]:
# Open the TensorBoard UI inline, reading the run directories under logs/fit.
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6007 (pid 45269), started 2 days, 2:04:06 ago. (Use '!kill 45269' to kill it.)