added code for running experiments
annatrella committed Jun 1, 2022
1 parent 25e49a9 commit 80ccb78
Showing 5 changed files with 1,173 additions and 0 deletions.
Binary file added code/.DS_Store
Binary file not shown.
251 changes: 251 additions & 0 deletions code/rl_algorithm_candidates.py
@@ -0,0 +1,251 @@
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from scipy.stats import bernoulli

"""## RL Algorithms
---
"""

## CLIPPING VALUES ##
MIN_CLIP_VALUE = 0.35
MAX_CLIP_VALUE = 0.75
# Advantage Time Feature Dimensions
D_advantage = 3
# Baseline Time Feature Dimensions
D_baseline = 4
# Number of Posterior Draws
NUM_POSTERIOR_SAMPLES = 5000

## HELPERS ##
def sigmoid(x):
return 1 / (1 + np.exp(-x))

def process_alg_state(env_state, env_type):
if env_type == 'stat':
baseline_state = np.array([env_state[0], env_state[1], env_state[3], 1])
advantage_state = np.delete(baseline_state, 2)
else:
baseline_state = np.array([env_state[0], env_state[1], env_state[4], 1])
advantage_state = np.delete(baseline_state, 2)

return advantage_state, baseline_state
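
# For example, with a 5-dimensional stationary environment state (toy numbers):
#   process_alg_state(np.array([0.2, 1.0, 7.0, 3.0, 0.0]), 'stat')
# returns advantage_state = [0.2, 1.0, 1.0] and baseline_state = [0.2, 1.0, 3.0, 1.0]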

class RLAlgorithmCandidate():
def __init__(self, alg_type, cluster_size, update_cadence):
self.alg_type = alg_type
self.cluster_size = cluster_size
self.update_cadence = update_cadence
# process_alg_state is a global function
self.process_alg_state_func = process_alg_state

def action_selection(self, advantage_state, baseline_state):
return 0

def update(self, advantage_states, baseline_states, actions, pis, rewards):
return 0

def get_cluster_size(self):
return self.cluster_size

def get_update_cadence(self):
return self.update_cadence

"""### Bayesian Linear Regression Thompson Sampler
---
#### Helper Functions
---
"""

## POSTERIOR HELPERS ##
# create the joint feature matrix from baseline states, advantage states, actions,
# and action-selection probabilities
def create_big_phi(advantage_states, baseline_states, actions, probs):
big_phi = np.hstack((baseline_states, np.multiply(advantage_states.T, probs).T, \
np.multiply(advantage_states.T, (actions - probs)).T,))
return big_phi

def compute_posterior_var(Phi, sigma_n_squared, prior_sigma):
return np.linalg.inv(1/sigma_n_squared * Phi.T @ Phi + np.linalg.inv(prior_sigma))

def compute_posterior_mean(Phi, R, sigma_n_squared, prior_mu, prior_sigma):
# return np.linalg.inv(1/sigma_n_squared * X.T @ X + np.linalg.inv(prior_sigma)) \
# @ (1/sigma_n_squared * X.T @ y + (prior_mu @ np.linalg.inv(prior_sigma)).T)
return compute_posterior_var(Phi, sigma_n_squared, prior_sigma) \
@ (1/sigma_n_squared * Phi.T @ R + np.linalg.inv(prior_sigma) @ prior_mu)

# update posterior distribution
def update_posterior_w(Phi, R, sigma_n_squared, prior_mu, prior_sigma):
mean = compute_posterior_mean(Phi, R, sigma_n_squared, prior_mu, prior_sigma)
var = compute_posterior_var(Phi, sigma_n_squared, prior_sigma)

return mean, var
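
r"""The helpers above implement the standard conjugate Bayesian linear regression update.
With feature matrix $\Phi$, rewards $R$, noise variance $\sigma_n^2$, and prior
$w \sim \mathcal{N}(\mu_{prior}, \Sigma_{prior})$, the posterior is $\mathcal{N}(\mu_{post}, \Sigma_{post})$ where

$$\Sigma_{post} = \Big(\tfrac{1}{\sigma_n^2}\Phi^\top\Phi + \Sigma_{prior}^{-1}\Big)^{-1}, \qquad
\mu_{post} = \Sigma_{post}\Big(\tfrac{1}{\sigma_n^2}\Phi^\top R + \Sigma_{prior}^{-1}\mu_{prior}\Big)$$
---
"""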

def get_beta_posterior_draws(posterior_mean, posterior_var):
    # grab the last D_advantage entries of the mean vector
    beta_post_mean = posterior_mean[-D_advantage:]
    # grab the bottom-right D_advantage x D_advantage submatrix
    beta_post_var = posterior_var[-D_advantage:,-D_advantage:]

return np.random.multivariate_normal(beta_post_mean, beta_post_var, NUM_POSTERIOR_SAMPLES)

## ACTION SELECTION ##
# compute the posterior probability P(R_1 > R_0), clip it to [MIN_CLIP_VALUE, MAX_CLIP_VALUE],
# and draw the action from a Bernoulli with the clipped probability
def bayes_lr_action_selector(beta_posterior_draws, advantage_state):
num_positive_preds = len(np.where(beta_posterior_draws @ advantage_state > 0)[0])
posterior_prob = num_positive_preds / len(beta_posterior_draws)
clipped_prob = max(min(MAX_CLIP_VALUE, posterior_prob), MIN_CLIP_VALUE)
return bernoulli.rvs(clipped_prob), clipped_prob
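
# A minimal sketch of one action-selection step (illustrative only: the draws below come
# from the BLR prior rather than from data, and the advantage state is a toy value):
def _demo_bayes_lr_action_selection():
    prior_draws = np.random.multivariate_normal(
        np.zeros(D_advantage), 5 * np.eye(D_advantage), NUM_POSTERIOR_SAMPLES)
    # returns (action in {0, 1}, selection probability clipped to [MIN_CLIP_VALUE, MAX_CLIP_VALUE])
    return bayes_lr_action_selector(prior_draws, np.array([1.0, 0.5, 1.0]))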

""" #### BLR Algorithm Object
---
"""

class BayesianLinearRegression(RLAlgorithmCandidate):
def __init__(self, cluster_size, update_cadence):
super(BayesianLinearRegression, self).__init__('blr', cluster_size, update_cadence)

self.PRIOR_MU = np.zeros(D_baseline + D_advantage + D_advantage)
self.PRIOR_SIGMA = 5 * np.eye(len(self.PRIOR_MU))
self.SIGMA_N_2 = 3526.747
# initial draws are from the prior
self.beta_posterior_draws = get_beta_posterior_draws(self.PRIOR_MU, self.PRIOR_SIGMA)

def action_selection(self, advantage_state, baseline_state):
return bayes_lr_action_selector(self.beta_posterior_draws, advantage_state)

def update(self, advantage_states, baseline_states, actions, pis, rewards):
Phi = create_big_phi(advantage_states, baseline_states, actions, pis)
posterior_mean, posterior_var = update_posterior_w(Phi, rewards, self.SIGMA_N_2, self.PRIOR_MU, self.PRIOR_SIGMA)
self.beta_posterior_draws = get_beta_posterior_draws(posterior_mean, posterior_var)
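
# Example usage of the BLR candidate on synthetic data (a sketch only: the cluster size,
# update cadence, and data dimensions below are illustrative, not trial settings):
def _demo_blr_candidate():
    blr = BayesianLinearRegression(cluster_size=4, update_cadence=14)
    advantage_states = np.random.randn(20, D_advantage)
    baseline_states = np.random.randn(20, D_baseline)
    actions = bernoulli.rvs(0.5, size=20)
    pis = np.full(20, 0.5)
    rewards = np.random.poisson(30, size=20).astype(float)
    blr.update(advantage_states, baseline_states, actions, pis, rewards)
    return blr.action_selection(advantage_states[0], baseline_states[0])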

"""### Zero-Inflated Poisson with Metropolis-Hasting (MH) Sampler
---
"""

from scipy.stats import uniform
from scipy.stats import multivariate_normal
import math

def prior_density(w_b_0, w_p_0, w_b_1, w_p_1):
log_prior_w_b_0_density = multivariate_normal.logpdf(w_b_0, mean=np.zeros(D_baseline), cov=np.ones(D_baseline))
log_prior_w_b_1_density = multivariate_normal.logpdf(w_b_1, mean=np.zeros(D_advantage), cov=np.ones(D_advantage))
log_prior_w_p_0_density = multivariate_normal.logpdf(w_p_0, mean=np.zeros(D_baseline), cov=np.ones(D_baseline))
log_prior_w_p_1_density = multivariate_normal.logpdf(w_p_1, mean=np.zeros(D_advantage), cov=np.ones(D_advantage))

return log_prior_w_b_0_density + log_prior_w_b_1_density + \
log_prior_w_p_0_density + log_prior_w_p_1_density

## Potential Problem: What if the reward is not an integer?
def llkhd_density(advantage_state, baseline_state, action, \
w_b_0, w_p_0, w_b_1, w_p_1, obs):
bern_p = 1 - sigmoid(baseline_state @ w_b_0 + \
action * advantage_state @ w_b_1)
x = baseline_state @ w_p_0 + action * advantage_state @ w_p_1
lam = np.exp(x)
# ref: https://discourse.pymc.io/t/zero-inflated-poisson-log-lik/2664
# density of a 0-inflated poisson
if obs == 0:
return np.log((1 - bern_p) + bern_p * np.exp(-lam))
else:
        return np.log(bern_p) + (-lam) + obs * np.log(lam) - math.log(math.factorial(int(obs)))
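
r"""The log-likelihood above is the density of a zero-inflated Poisson with mixing probability
$p$ (the probability of drawing from the Poisson component) and rate $\lambda$:

$$P(Y = 0) = (1 - p) + p\,e^{-\lambda}, \qquad P(Y = k) = p\,\frac{\lambda^{k} e^{-\lambda}}{k!} \quad (k \geq 1)$$

where $p = 1 - \mathrm{sigmoid}(s_b^\top w_{b,0} + a\, s_a^\top w_{b,1})$ and
$\lambda = \exp(s_b^\top w_{p,0} + a\, s_a^\top w_{p,1})$, with $s_b$ the baseline state,
$s_a$ the advantage state, and $a$ the action.
---
"""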

def log_posterior_density(advantage_states, baseline_states, actions, \
w_b_0, w_p_0, w_b_1, w_p_1, obs):
log_prior = prior_density(w_b_0, w_p_0, w_b_1, w_p_1)
log_llkhd = np.sum([llkhd_density(advantage_states[i], baseline_states[i], actions[i], \
w_b_0, w_p_0, w_b_1, w_p_1, obs[i]) for i in range(len(advantage_states))])

return log_prior + log_llkhd

## target an acceptance rate of roughly 25-50% for MH;
## if the printed acceptance rate falls outside that range, tune step_size
# ref for choosing step size: https://stackoverflow.com/questions/28686900/how-to-decide-the-step-size-when-using-metropolis-hastings-algorithm
def metropolis_hastings(advantage_states, baseline_states, actions, Y, \
step_size=0.01, num_steps=10000):
w_b_0_old, w_p_0_old = np.random.randn(D_baseline) * 0.5, np.random.randn(D_baseline) * 0.5
w_b_1_old, w_p_1_old = np.random.randn(D_advantage) * 0.5, np.random.randn(D_advantage) * 0.5
log_post_dist = lambda w_b_0, w_p_0, w_b_1, w_p_1 : log_posterior_density(advantage_states, baseline_states, actions, \
w_b_0, w_p_0, w_b_1, w_p_1, Y)
old_logp_val = log_post_dist(w_b_0_old, w_p_0_old, w_b_1_old, w_p_1_old)
accepted_samples = []
num_accepts = 0
for step in range(num_steps):
if step > 0 and step % 1000 == 0:
print("ITERATION: {}, Acceptance Rate for MH is {}".format(step, num_accepts / step))
# propose a new sample
w_b_0_prop = multivariate_normal(mean=w_b_0_old, cov=step_size**2 * np.eye(D_baseline)).rvs()
w_p_0_prop = multivariate_normal(mean=w_p_0_old, cov=step_size**2 * np.eye(D_baseline)).rvs()
w_b_1_prop = multivariate_normal(mean=w_b_1_old, cov=step_size**2 * np.eye(D_advantage)).rvs()
w_p_1_prop = multivariate_normal(mean=w_p_1_old, cov=step_size**2 * np.eye(D_advantage)).rvs()
# accept or reject
U = uniform.rvs()
prop_logp_val = log_post_dist(w_b_0_prop, w_p_0_prop, w_b_1_prop, w_p_1_prop)
accept_prob = prop_logp_val - old_logp_val
if np.log(U) < accept_prob:
accepted_samples.append(np.concatenate((w_b_0_prop, w_p_0_prop, w_b_1_prop, w_p_1_prop), axis=0))
old_logp_val = prop_logp_val
w_b_0_old = w_b_0_prop
w_p_0_old = w_p_0_prop
w_b_1_old = w_b_1_prop
w_p_1_old = w_p_1_prop
num_accepts += 1
else:
accepted_samples.append(np.concatenate((w_b_0_old, w_p_0_old, w_b_1_old, w_p_1_old), axis=0))

return accepted_samples

# define burn-in fraction
BURN_IN = .1
# define thinning
THIN = 2

def burn_and_thin(samples):
N = len(samples)
burn_thin_samples = samples[int(BURN_IN * N)::THIN]

return burn_thin_samples
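
# Sketch of drawing approximate posterior samples with MH and then burning/thinning
# (synthetic data and a deliberately short chain; real updates use far more steps):
def _demo_metropolis_hastings():
    advantage_states = np.random.randn(20, D_advantage)
    baseline_states = np.random.randn(20, D_baseline)
    actions = bernoulli.rvs(0.5, size=20)
    Y = np.random.poisson(5, size=20)
    raw_samples = metropolis_hastings(advantage_states, baseline_states, actions, Y,
                                      step_size=0.01, num_steps=2000)
    # drop the first 10% of the chain and keep every 2nd remaining sample
    return burn_and_thin(raw_samples)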

def bayes_zero_inflated_pred(advantage_state, baseline_state, sample, action):
w_b_0, w_p_0 = sample[:D_baseline], sample[D_baseline:2 * D_baseline]
w_b_1, w_p_1 = sample[2 * D_baseline:2 * D_baseline + D_advantage], sample[2 * D_baseline + D_advantage:]
bern_p = 1 - sigmoid(baseline_state @ w_b_0 + action * advantage_state @ w_b_1)
# poisson component
x = baseline_state @ w_p_0 + action * advantage_state @ w_p_1
poisson_lam = np.exp(x)

return bern_p * poisson_lam
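
r"""The prediction above is the mean of the zero-inflated Poisson model,
$\mathbb{E}[Y \mid s, a] = p \cdot \lambda$, evaluated at a single posterior sample of the
weights; the action selector below compares this mean under $a = 1$ versus $a = 0$ across
all posterior samples.
---
"""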

def bayes_zero_inflated_action_selector(advantage_state, baseline_state, samples):
posterior_preds_0 = np.apply_along_axis(lambda sample: bayes_zero_inflated_pred(advantage_state, baseline_state, sample, 0), axis=1, arr=samples)
posterior_preds_1 = np.apply_along_axis(lambda sample: bayes_zero_inflated_pred(advantage_state, baseline_state, sample, 1), axis=1, arr=samples)
diff = posterior_preds_1 - posterior_preds_0
posterior_prob = np.count_nonzero(diff > 0) / len(samples)
clipped_prob = max(min(MAX_CLIP_VALUE, posterior_prob), MIN_CLIP_VALUE)

return bernoulli.rvs(clipped_prob), clipped_prob

""" #### ZIP Algorithm Object
---
"""

class BayesianZeroInflatedPoisson(RLAlgorithmCandidate):
def __init__(self, cluster_size, update_cadence):
super(BayesianZeroInflatedPoisson, self).__init__('zero_infl', cluster_size, update_cadence)

self.PRIOR_MU = np.zeros(2 * D_baseline + 2 * D_advantage)
# ANNA TODO: may need to change this later to make it more uninformative, but remember to change it
# in the ZIP prior density function too
self.PRIOR_SIGMA = np.eye(len(self.PRIOR_MU))
# initial draws are from the prior
self.theta_posterior_draws = \
np.array([multivariate_normal(mean=self.PRIOR_MU, cov=self.PRIOR_SIGMA).rvs() for i in range(NUM_POSTERIOR_SAMPLES)])

def action_selection(self, advantage_state, baseline_state):
return bayes_zero_inflated_action_selector(advantage_state, baseline_state, self.theta_posterior_draws)

def update(self, advantage_states, baseline_states, actions, pis, rewards):
mh_samples = metropolis_hastings(advantage_states, baseline_states, actions, \
rewards, num_steps=int((THIN*NUM_POSTERIOR_SAMPLES) / (1 - BURN_IN)))
self.theta_posterior_draws = burn_and_thin(mh_samples)
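
# Example usage of the ZIP candidate (a sketch only: synthetic data and illustrative
# cluster size / update cadence; update() runs a full MH chain and is slow):
def _demo_zip_candidate():
    zip_alg = BayesianZeroInflatedPoisson(cluster_size=4, update_cadence=14)
    advantage_states = np.random.randn(20, D_advantage)
    baseline_states = np.random.randn(20, D_baseline)
    actions = bernoulli.rvs(0.5, size=20)
    pis = np.full(20, 0.5)
    rewards = np.random.poisson(5, size=20)
    zip_alg.update(advantage_states, baseline_states, actions, pis, rewards)
    return zip_alg.action_selection(advantage_states[0], baseline_states[0])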
141 changes: 141 additions & 0 deletions code/rl_experiments.py
@@ -0,0 +1,141 @@
from simulation_environments import *
from rl_algorithm_candidates import *

## GLOBAL VALUES ##
RECRUITMENT_RATE = 4
TRIAL_LENGTH_IN_WEEKS = 10

## REWARD ENGINEERING ##
def truncated_reward(outcome):
return min(outcome, 180)

# runs the experiment where action selection and updates are done per cluster of users
def run_experiment(alg_candidate, sim_env):
env_users = sim_env.get_users()
cluster_size = alg_candidate.get_cluster_size()
update_cadence = alg_candidate.get_update_cadence()
init_dict = {"baseline_states": np.empty((0, D_baseline), float), "advantage_states": np.empty((0, D_advantage), float), \
"env_states": np.empty((0, 5 if sim_env.get_env_type() == 'stat' else 6), float), "actions": np.empty((0, 1), float), "probs": np.empty((0, 1), float), "rewards": np.empty((0, 1), float)}
result = [(user_id, init_dict.copy()) for user_id in env_users]
for i in range(int(len(result) / cluster_size)):
print("CLUSTER: ", i)
# saves state, action, prob, reward values for entire cluster for update step
total_results = init_dict.copy()
cluster_user_ids = env_users[i * cluster_size: (i + 1) * cluster_size]
print(cluster_user_ids)
for j in range(2 * NUM_DAYS):
print("SESSION NUMBER: ", j)
for user_idx, user_id in enumerate(cluster_user_ids):
print("USER_ID: ", user_id)
## PROCESS STATE ##
session = sim_env.get_states_for_user(user_id)[j]
env_state = sim_env.process_env_state(session, j, result[(cluster_size * i) + user_idx][1]["rewards"])
advantage_state, baseline_state = alg_candidate.process_alg_state_func(env_state, sim_env.get_env_type())
## SAVE STATE VALUES ##
result[(cluster_size * i) + user_idx][1]["env_states"] = \
np.append(result[(cluster_size * i) + user_idx][1]["env_states"], env_state.reshape(1, -1), axis=0)
result[(cluster_size * i) + user_idx][1]["advantage_states"] = \
np.append(result[(cluster_size * i) + user_idx][1]["advantage_states"], advantage_state.reshape(1, -1), axis=0)
total_results["advantage_states"] = np.append(total_results["advantage_states"], advantage_state.reshape(1, -1), axis=0)
result[(cluster_size * i) + user_idx][1]["baseline_states"] = \
np.append(result[(cluster_size * i) + user_idx][1]["baseline_states"], baseline_state.reshape(1, -1), axis=0)
total_results["baseline_states"] = np.append(total_results["baseline_states"], baseline_state.reshape(1, -1), axis=0)
## ACTION SELECTION ##
action, action_prob = alg_candidate.action_selection(advantage_state, baseline_state)
## SAVE ACTION VALUES ##
result[(cluster_size * i) + user_idx][1]["actions"] = \
np.append(result[(cluster_size * i) + user_idx][1]["actions"], action)
total_results["actions"] = np.append(total_results["actions"], action)
result[(cluster_size * i) + user_idx][1]["probs"] = \
np.append(result[(cluster_size * i) + user_idx][1]["probs"], action_prob)
total_results["probs"] = np.append(total_results["probs"], action_prob)
## REWARD GENERATION ##
reward = truncated_reward(sim_env.generate_rewards(user_id, env_state, action))
## SAVE REWARD VALUES ##
result[(cluster_size * i) + user_idx][1]["rewards"] = \
np.append(result[(cluster_size * i) + user_idx][1]["rewards"], reward)
total_results["rewards"] = np.append(total_results["rewards"], reward)

if (j % update_cadence == 0 and j >= update_cadence - 1):
print("UPDATE TIME.")
alg_candidate.update(total_results["advantage_states"], total_results["baseline_states"], \
total_results["actions"], total_results["probs"], total_results["rewards"])

return result


# returns an array with one row per user: (user index, week of entry, user id),
# where users enter the study RECRUITMENT_RATE at a time, one group per week
def pre_process_users(total_trial_users):
results = []
for j, user in enumerate(total_trial_users):
results.append((j, int(j // RECRUITMENT_RATE), user))

return np.array(results)
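
# For example, with RECRUITMENT_RATE = 4 and six users ['u0', ..., 'u5'], the returned rows
# are (user index, week of entry, user id):
#   [['0', '0', 'u0'], ['1', '0', 'u1'], ['2', '0', 'u2'], ['3', '0', 'u3'],
#    ['4', '1', 'u4'], ['5', '1', 'u5']]
# (with string user ids, np.array casts every entry to a string, which is why the indices
# are converted back with int() in the experiment loop below)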

### the result data structure is a list of tuples (user_id, {rewards, actions, probs, states})
### so it is easy to process and to calculate regret for
### runs the experiment with full pooling and incremental recruitment
def run_incremental_recruitment_exp(user_groups, alg_candidate, sim_env):
    # user_groups is the output of pre_process_users: one row per user of the form
    # (user index, week of entry, user id)
env_users = sim_env.get_users()
cluster_size = alg_candidate.get_cluster_size()
update_cadence = alg_candidate.get_update_cadence()
init_dict = {"baseline_states": np.empty((0, D_baseline), float), "advantage_states": np.empty((0, D_advantage), float), \
"env_states": np.empty((0, 5 if sim_env.get_env_type() == 'stat' else 6), float), "actions": np.empty((0, 1), float), "probs": np.empty((0, 1), float), "rewards": np.empty((0, 1), float)}
result = [(user_id, init_dict.copy()) for user_id in env_users]
total_results = init_dict.copy()
    current_groups = user_groups[:RECRUITMENT_RATE]
week = 0
while (len(current_groups) > 0):
print("Week: ", week)
for user_tuple in current_groups:
user_idx, user_entry_date, user_id = int(user_tuple[0]), int(user_tuple[1]), user_tuple[2]
user_states = sim_env.get_states_for_user(user_id)
# do action selection for 14 decision times (7 days)
for decision_idx in range(14):
## PROCESS STATE ##
j = (week - user_entry_date) * 14 + decision_idx
session = user_states[j]
env_state = sim_env.process_env_state(session, j, result[user_idx][1]["rewards"])
advantage_state, baseline_state = alg_candidate.process_alg_state_func(env_state, sim_env.get_env_type())
## SAVE STATE VALUES ##
result[user_idx][1]["env_states"] = \
np.append(result[user_idx][1]["env_states"], env_state.reshape(1, -1), axis=0)
result[user_idx][1]["advantage_states"] = \
np.append(result[user_idx][1]["advantage_states"], advantage_state.reshape(1, -1), axis=0)
total_results["advantage_states"] = np.append(total_results["advantage_states"], advantage_state.reshape(1, -1), axis=0)
result[user_idx][1]["baseline_states"] = \
np.append(result[user_idx][1]["baseline_states"], baseline_state.reshape(1, -1), axis=0)
total_results["baseline_states"] = np.append(total_results["baseline_states"], baseline_state.reshape(1, -1), axis=0)
## ACTION SELECTION ##
action, action_prob = alg_candidate.action_selection(advantage_state, baseline_state)
## SAVE ACTION VALUES ##
result[user_idx][1]["actions"] = \
np.append(result[user_idx][1]["actions"], action)
total_results["actions"] = np.append(total_results["actions"], action)
result[user_idx][1]["probs"] = \
np.append(result[user_idx][1]["probs"], action_prob)
total_results["probs"] = np.append(total_results["probs"], action_prob)
## REWARD GENERATION ##
reward = truncated_reward(sim_env.generate_rewards(user_id, env_state, action))
## SAVE REWARD VALUES ##
result[user_idx][1]["rewards"] = \
np.append(result[user_idx][1]["rewards"], reward)
total_results["rewards"] = np.append(total_results["rewards"], reward)

# update time at the end of each week
print("UPDATE TIME.")
alg_candidate.update(total_results["advantage_states"], total_results["baseline_states"], \
total_results["actions"], total_results["probs"], total_results["rewards"])
# handle adding or removing user groups
week += 1
if (week < len(user_groups)):
# add more users
current_groups = np.concatenate((current_groups, user_groups[RECRUITMENT_RATE * week: RECRUITMENT_RATE * week + RECRUITMENT_RATE]), axis=0)
# check if some user group finished the study
if (week > TRIAL_LENGTH_IN_WEEKS - 1):
current_groups = current_groups[RECRUITMENT_RATE:]

return result
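
# A minimal sketch of wiring the pieces together (illustrative only: `make_sim_env` is a
# placeholder for however simulation_environments.py constructs an environment object):
# sim_env = make_sim_env(...)   # hypothetical constructor
# alg_candidate = BayesianLinearRegression(cluster_size=4, update_cadence=14)
# user_groups = pre_process_users(sim_env.get_users())
# results = run_incremental_recruitment_exp(user_groups, alg_candidate, sim_env)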
