Commit
commit 80ccb78 (1 parent: 25e49a9)
Showing 5 changed files with 1,173 additions and 0 deletions.
Binary file not shown.
@@ -0,0 +1,251 @@
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from scipy.stats import bernoulli

"""## RL Algorithms
---
"""

## CLIPPING VALUES ##
MIN_CLIP_VALUE = 0.35
MAX_CLIP_VALUE = 0.75
# Advantage Time Feature Dimensions
D_advantage = 3
# Baseline Time Feature Dimensions
D_baseline = 4
# Number of Posterior Draws
NUM_POSTERIOR_SAMPLES = 5000

## HELPERS ##
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def process_alg_state(env_state, env_type):
    if env_type == 'stat':
        baseline_state = np.array([env_state[0], env_state[1], env_state[3], 1])
        advantage_state = np.delete(baseline_state, 2)
    else:
        baseline_state = np.array([env_state[0], env_state[1], env_state[4], 1])
        advantage_state = np.delete(baseline_state, 2)

    return advantage_state, baseline_state
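
# Illustrative sketch (not called above): for a 'stat' environment state of
# length 5, the baseline state keeps entries 0, 1, and 3 plus an intercept,
# and the advantage state drops the entry at index 2. The state values
# below are arbitrary.
def _demo_process_alg_state():
    env_state = np.array([1.0, 0.5, 2.0, 3.0, 0.0])
    advantage_state, baseline_state = process_alg_state(env_state, 'stat')
    assert baseline_state.shape == (D_baseline,)    # (4,)
    assert advantage_state.shape == (D_advantage,)  # (3,)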

class RLAlgorithmCandidate():
    def __init__(self, alg_type, cluster_size, update_cadence):
        self.alg_type = alg_type
        self.cluster_size = cluster_size
        self.update_cadence = update_cadence
        # process_alg_state is a global function
        self.process_alg_state_func = process_alg_state

    def action_selection(self, advantage_state, baseline_state):
        return 0

    def update(self, advantage_states, baseline_states, actions, pis, rewards):
        return 0

    def get_cluster_size(self):
        return self.cluster_size

    def get_update_cadence(self):
        return self.update_cadence

"""### Bayesian Linear Regression Thompson Sampler
---
#### Helper Functions
---
"""

## POSTERIOR HELPERS ##
# create the feature vector given state, action, and action selection probability
def create_big_phi(advantage_states, baseline_states, actions, probs):
    big_phi = np.hstack((baseline_states, np.multiply(advantage_states.T, probs).T, \
                         np.multiply(advantage_states.T, (actions - probs)).T,))
    return big_phi
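
# Illustrative sketch (not called above): the joint feature matrix stacks
# baseline features, probability-weighted advantage features, and
# action-centered advantage features, so Phi has
# D_baseline + 2 * D_advantage columns. The toy batch below is random.
def _demo_big_phi_shape():
    T = 10
    adv = np.random.randn(T, D_advantage)
    base = np.random.randn(T, D_baseline)
    actions = bernoulli.rvs(0.5, size=T)
    probs = np.full(T, 0.5)
    big_phi = create_big_phi(adv, base, actions, probs)
    assert big_phi.shape == (T, D_baseline + 2 * D_advantage)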

def compute_posterior_var(Phi, sigma_n_squared, prior_sigma):
    return np.linalg.inv(1/sigma_n_squared * Phi.T @ Phi + np.linalg.inv(prior_sigma))

# standard conjugate normal-normal update:
# Sigma_post = (Phi^T Phi / sigma_n^2 + Sigma_prior^{-1})^{-1}
# mu_post = Sigma_post (Phi^T R / sigma_n^2 + Sigma_prior^{-1} mu_prior)
def compute_posterior_mean(Phi, R, sigma_n_squared, prior_mu, prior_sigma):
    return compute_posterior_var(Phi, sigma_n_squared, prior_sigma) \
           @ (1/sigma_n_squared * Phi.T @ R + np.linalg.inv(prior_sigma) @ prior_mu)

# update posterior distribution
def update_posterior_w(Phi, R, sigma_n_squared, prior_mu, prior_sigma):
    mean = compute_posterior_mean(Phi, R, sigma_n_squared, prior_mu, prior_sigma)
    var = compute_posterior_var(Phi, sigma_n_squared, prior_sigma)

    return mean, var
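
# Illustrative sketch (not called above): one conjugate update on synthetic
# data; the noise variance and data below are arbitrary choices for
# demonstration, not tuned values.
def _demo_posterior_update():
    T, D = 20, D_baseline + 2 * D_advantage
    Phi = np.random.randn(T, D)
    R = np.random.randn(T)
    prior_mu = np.zeros(D)
    prior_sigma = 5 * np.eye(D)
    mean, var = update_posterior_w(Phi, R, 1.0, prior_mu, prior_sigma)
    assert mean.shape == (D,) and var.shape == (D, D)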

def get_beta_posterior_draws(posterior_mean, posterior_var):
    # grab the last D_advantage entries of the mean vector
    beta_post_mean = posterior_mean[-D_advantage:]
    # grab the bottom-right D_advantage x D_advantage submatrix
    beta_post_var = posterior_var[-D_advantage:, -D_advantage:]

    return np.random.multivariate_normal(beta_post_mean, beta_post_var, NUM_POSTERIOR_SAMPLES)

## ACTION SELECTION ##
# we compute the posterior probability P(R_1 > R_0), clip it, and then
# draw the action from a Bernoulli with the clipped probability
def bayes_lr_action_selector(beta_posterior_draws, advantage_state):
    num_positive_preds = len(np.where(beta_posterior_draws @ advantage_state > 0)[0])
    posterior_prob = num_positive_preds / len(beta_posterior_draws)
    clipped_prob = max(min(MAX_CLIP_VALUE, posterior_prob), MIN_CLIP_VALUE)
    return bernoulli.rvs(clipped_prob), clipped_prob
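
# Illustrative sketch (not called above): selecting an action from prior
# draws. With a zero-mean prior, P(advantage > 0) is near 0.5, and the
# returned probability always lies in [MIN_CLIP_VALUE, MAX_CLIP_VALUE].
# The advantage state below is arbitrary.
def _demo_blr_action_selection():
    draws = np.random.multivariate_normal(
        np.zeros(D_advantage), 5 * np.eye(D_advantage), NUM_POSTERIOR_SAMPLES)
    action, prob = bayes_lr_action_selector(draws, np.array([1.0, 0.5, 1.0]))
    assert action in (0, 1)
    assert MIN_CLIP_VALUE <= prob <= MAX_CLIP_VALUE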

""" #### BLR Algorithm Object
---
"""

class BayesianLinearRegression(RLAlgorithmCandidate):
    def __init__(self, cluster_size, update_cadence):
        super(BayesianLinearRegression, self).__init__('blr', cluster_size, update_cadence)

        self.PRIOR_MU = np.zeros(D_baseline + D_advantage + D_advantage)
        self.PRIOR_SIGMA = 5 * np.eye(len(self.PRIOR_MU))
        self.SIGMA_N_2 = 3526.747
        # initial draws are from the prior
        self.beta_posterior_draws = get_beta_posterior_draws(self.PRIOR_MU, self.PRIOR_SIGMA)

    def action_selection(self, advantage_state, baseline_state):
        return bayes_lr_action_selector(self.beta_posterior_draws, advantage_state)

    def update(self, advantage_states, baseline_states, actions, pis, rewards):
        Phi = create_big_phi(advantage_states, baseline_states, actions, pis)
        posterior_mean, posterior_var = update_posterior_w(Phi, rewards, self.SIGMA_N_2, self.PRIOR_MU, self.PRIOR_SIGMA)
        self.beta_posterior_draws = get_beta_posterior_draws(posterior_mean, posterior_var)
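
# Illustrative sketch (not called above): instantiating the BLR candidate
# and selecting one action before any update; the cluster size, update
# cadence, and states below are arbitrary.
def _demo_blr_candidate():
    blr = BayesianLinearRegression(cluster_size=4, update_cadence=14)
    advantage_state = np.array([1.0, 0.5, 1.0])
    baseline_state = np.array([1.0, 0.5, 2.0, 1.0])
    return blr.action_selection(advantage_state, baseline_state)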

"""### Zero-Inflated Poisson with Metropolis-Hastings (MH) Sampler
---
"""

from scipy.stats import uniform
from scipy.stats import multivariate_normal
import math

def prior_density(w_b_0, w_p_0, w_b_1, w_p_1):
    # independent standard-normal priors on all four weight vectors
    log_prior_w_b_0_density = multivariate_normal.logpdf(w_b_0, mean=np.zeros(D_baseline), cov=np.ones(D_baseline))
    log_prior_w_b_1_density = multivariate_normal.logpdf(w_b_1, mean=np.zeros(D_advantage), cov=np.ones(D_advantage))
    log_prior_w_p_0_density = multivariate_normal.logpdf(w_p_0, mean=np.zeros(D_baseline), cov=np.ones(D_baseline))
    log_prior_w_p_1_density = multivariate_normal.logpdf(w_p_1, mean=np.zeros(D_advantage), cov=np.ones(D_advantage))

    return log_prior_w_b_0_density + log_prior_w_b_1_density + \
           log_prior_w_p_0_density + log_prior_w_p_1_density

## Potential Problem: What if the reward is not an integer?
def llkhd_density(advantage_state, baseline_state, action, \
                  w_b_0, w_p_0, w_b_1, w_p_1, obs):
    # bern_p is the probability of the Poisson (non-zero) component
    bern_p = 1 - sigmoid(baseline_state @ w_b_0 + \
                         action * advantage_state @ w_b_1)
    x = baseline_state @ w_p_0 + action * advantage_state @ w_p_1
    lam = np.exp(x)
    # ref: https://discourse.pymc.io/t/zero-inflated-poisson-log-lik/2664
    # density of a 0-inflated poisson
    if obs == 0:
        return np.log((1 - bern_p) + bern_p * np.exp(-lam))
    else:
        # math.lgamma(obs + 1) equals log(obs!) and avoids the overflow-prone
        # explicit factorial (np.math.factorial is removed in NumPy 2.0)
        return np.log(bern_p) + (-lam) + obs * np.log(lam) - math.lgamma(obs + 1)
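
# Illustrative sketch (not called above): a numeric sanity check of the
# zero-inflated Poisson log-density. With all-zero weights, bern_p = 0.5
# and lam = 1, so both branches can be verified by hand.
def _demo_zip_density():
    adv, base = np.ones(D_advantage), np.ones(D_baseline)
    w_b_0, w_b_1 = np.zeros(D_baseline), np.zeros(D_advantage)
    w_p_0, w_p_1 = np.zeros(D_baseline), np.zeros(D_advantage)
    logp0 = llkhd_density(adv, base, 1, w_b_0, w_p_0, w_b_1, w_p_1, 0)
    assert np.isclose(logp0, np.log(0.5 + 0.5 * np.exp(-1)))
    logp2 = llkhd_density(adv, base, 1, w_b_0, w_p_0, w_b_1, w_p_1, 2)
    assert np.isclose(logp2, np.log(0.5) - 1 - math.lgamma(3))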

def log_posterior_density(advantage_states, baseline_states, actions, \
                          w_b_0, w_p_0, w_b_1, w_p_1, obs):
    log_prior = prior_density(w_b_0, w_p_0, w_b_1, w_p_1)
    log_llkhd = np.sum([llkhd_density(advantage_states[i], baseline_states[i], actions[i], \
                                      w_b_0, w_p_0, w_b_1, w_p_1, obs[i]) for i in range(len(advantage_states))])

    return log_prior + log_llkhd

## we want an acceptance rate of 25-50% for MH;
## if not, tune step_size
# ref for choosing step size: https://stackoverflow.com/questions/28686900/how-to-decide-the-step-size-when-using-metropolis-hastings-algorithm
def metropolis_hastings(advantage_states, baseline_states, actions, Y, \
                        step_size=0.01, num_steps=10000):
    w_b_0_old, w_p_0_old = np.random.randn(D_baseline) * 0.5, np.random.randn(D_baseline) * 0.5
    w_b_1_old, w_p_1_old = np.random.randn(D_advantage) * 0.5, np.random.randn(D_advantage) * 0.5
    log_post_dist = lambda w_b_0, w_p_0, w_b_1, w_p_1: log_posterior_density(advantage_states, baseline_states, actions, \
                                                                             w_b_0, w_p_0, w_b_1, w_p_1, Y)
    old_logp_val = log_post_dist(w_b_0_old, w_p_0_old, w_b_1_old, w_p_1_old)
    accepted_samples = []
    num_accepts = 0
    for step in range(num_steps):
        if step > 0 and step % 1000 == 0:
            print("ITERATION: {}, Acceptance Rate for MH is {}".format(step, num_accepts / step))
        # propose a new sample with a Gaussian random walk
        w_b_0_prop = multivariate_normal(mean=w_b_0_old, cov=step_size**2 * np.eye(D_baseline)).rvs()
        w_p_0_prop = multivariate_normal(mean=w_p_0_old, cov=step_size**2 * np.eye(D_baseline)).rvs()
        w_b_1_prop = multivariate_normal(mean=w_b_1_old, cov=step_size**2 * np.eye(D_advantage)).rvs()
        w_p_1_prop = multivariate_normal(mean=w_p_1_old, cov=step_size**2 * np.eye(D_advantage)).rvs()
        # accept or reject; the proposal is symmetric, so the log acceptance
        # ratio is just the difference in log posterior densities
        U = uniform.rvs()
        prop_logp_val = log_post_dist(w_b_0_prop, w_p_0_prop, w_b_1_prop, w_p_1_prop)
        accept_prob = prop_logp_val - old_logp_val
        if np.log(U) < accept_prob:
            accepted_samples.append(np.concatenate((w_b_0_prop, w_p_0_prop, w_b_1_prop, w_p_1_prop), axis=0))
            old_logp_val = prop_logp_val
            w_b_0_old = w_b_0_prop
            w_p_0_old = w_p_0_prop
            w_b_1_old = w_b_1_prop
            w_p_1_old = w_p_1_prop
            num_accepts += 1
        else:
            accepted_samples.append(np.concatenate((w_b_0_old, w_p_0_old, w_b_1_old, w_p_1_old), axis=0))

    return accepted_samples
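
# Illustrative sketch (not called above): running the sampler on a tiny
# synthetic batch. Sizes and step count are deliberately small; real runs
# use ~10,000 steps with step_size tuned so the printed acceptance rate
# lands in roughly 25-50%.
def _demo_metropolis_hastings():
    T = 5
    adv = np.random.randn(T, D_advantage)
    base = np.random.randn(T, D_baseline)
    actions = bernoulli.rvs(0.5, size=T)
    Y = np.random.poisson(1.0, size=T)
    samples = metropolis_hastings(adv, base, actions, Y, num_steps=200)
    assert len(samples) == 200  # one (possibly repeated) sample per step
    assert samples[0].shape == (2 * D_baseline + 2 * D_advantage,)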

# define burn-in fraction
BURN_IN = .1
# define thinning
THIN = 2

def burn_and_thin(samples):
    N = len(samples)
    burn_thin_samples = samples[int(BURN_IN * N)::THIN]

    return burn_thin_samples
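
# Illustrative sketch (not called above): with 10,000 MH steps, a 10%
# burn-in drops the first 1,000 samples and thinning by 2 keeps every
# other remaining one, leaving 4,500 draws.
def _demo_burn_and_thin():
    assert len(burn_and_thin(list(range(10000)))) == 4500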

def bayes_zero_inflated_pred(advantage_state, baseline_state, sample, action):
    # unpack the flat sample into the four weight vectors
    w_b_0, w_p_0 = sample[:D_baseline], sample[D_baseline:2 * D_baseline]
    w_b_1, w_p_1 = sample[2 * D_baseline:2 * D_baseline + D_advantage], sample[2 * D_baseline + D_advantage:]
    bern_p = 1 - sigmoid(baseline_state @ w_b_0 + action * advantage_state @ w_b_1)
    # poisson component
    x = baseline_state @ w_p_0 + action * advantage_state @ w_p_1
    poisson_lam = np.exp(x)

    # mean of the zero-inflated Poisson
    return bern_p * poisson_lam

def bayes_zero_inflated_action_selector(advantage_state, baseline_state, samples):
    posterior_preds_0 = np.apply_along_axis(lambda sample: bayes_zero_inflated_pred(advantage_state, baseline_state, sample, 0), axis=1, arr=samples)
    posterior_preds_1 = np.apply_along_axis(lambda sample: bayes_zero_inflated_pred(advantage_state, baseline_state, sample, 1), axis=1, arr=samples)
    diff = posterior_preds_1 - posterior_preds_0
    posterior_prob = np.count_nonzero(diff > 0) / len(samples)
    clipped_prob = max(min(MAX_CLIP_VALUE, posterior_prob), MIN_CLIP_VALUE)

    return bernoulli.rvs(clipped_prob), clipped_prob

""" #### ZIP Algorithm Object
---
"""

class BayesianZeroInflatedPoisson(RLAlgorithmCandidate):
    def __init__(self, cluster_size, update_cadence):
        super(BayesianZeroInflatedPoisson, self).__init__('zero_infl', cluster_size, update_cadence)

        self.PRIOR_MU = np.zeros(2 * D_baseline + 2 * D_advantage)
        # ANNA TODO: may need to change this later to make it more uninformative, but remember to change it
        # in the ZIP prior density function too
        self.PRIOR_SIGMA = np.eye(len(self.PRIOR_MU))
        # initial draws are from the prior
        self.theta_posterior_draws = \
            np.array([multivariate_normal(mean=self.PRIOR_MU, cov=self.PRIOR_SIGMA).rvs() for i in range(NUM_POSTERIOR_SAMPLES)])

    def action_selection(self, advantage_state, baseline_state):
        return bayes_zero_inflated_action_selector(advantage_state, baseline_state, self.theta_posterior_draws)

    def update(self, advantage_states, baseline_states, actions, pis, rewards):
        # run enough MH steps so that, after burn-in and thinning,
        # roughly NUM_POSTERIOR_SAMPLES draws remain
        mh_samples = metropolis_hastings(advantage_states, baseline_states, actions, \
                                         rewards, num_steps=int((THIN * NUM_POSTERIOR_SAMPLES) / (1 - BURN_IN)))
        self.theta_posterior_draws = burn_and_thin(mh_samples)
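
# Illustrative sketch (not called above): instantiating the ZIP candidate
# and selecting one action from its prior draws; the constructor arguments
# and states below are arbitrary.
def _demo_zip_candidate():
    zip_alg = BayesianZeroInflatedPoisson(cluster_size=4, update_cadence=14)
    advantage_state = np.array([1.0, 0.5, 1.0])
    baseline_state = np.array([1.0, 0.5, 2.0, 1.0])
    return zip_alg.action_selection(advantage_state, baseline_state)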
@@ -0,0 +1,141 @@
from simulation_environments import *
from rl_algorithm_candidates import *

## GLOBAL VALUES ##
RECRUITMENT_RATE = 4
TRIAL_LENGTH_IN_WEEKS = 10

## REWARD ENGINEERING ##
def truncated_reward(outcome):
    return min(outcome, 180)

# handles clustering
def run_experiment(alg_candidate, sim_env):
    env_users = sim_env.get_users()
    cluster_size = alg_candidate.get_cluster_size()
    update_cadence = alg_candidate.get_update_cadence()
    init_dict = {"baseline_states": np.empty((0, D_baseline), float), "advantage_states": np.empty((0, D_advantage), float), \
                 "env_states": np.empty((0, 5 if sim_env.get_env_type() == 'stat' else 6), float), "actions": np.empty((0, 1), float), "probs": np.empty((0, 1), float), "rewards": np.empty((0, 1), float)}
    # a shallow copy per user is enough here: np.append returns new arrays
    # that are reassigned into each user's dict
    result = [(user_id, init_dict.copy()) for user_id in env_users]
    for i in range(int(len(result) / cluster_size)):
        print("CLUSTER: ", i)
        # saves state, action, prob, and reward values for the entire cluster for the update step
        total_results = init_dict.copy()
        cluster_user_ids = env_users[i * cluster_size: (i + 1) * cluster_size]
        print(cluster_user_ids)
        for j in range(2 * NUM_DAYS):
            print("SESSION NUMBER: ", j)
            for user_idx, user_id in enumerate(cluster_user_ids):
                print("USER_ID: ", user_id)
                ## PROCESS STATE ##
                session = sim_env.get_states_for_user(user_id)[j]
                env_state = sim_env.process_env_state(session, j, result[(cluster_size * i) + user_idx][1]["rewards"])
                advantage_state, baseline_state = alg_candidate.process_alg_state_func(env_state, sim_env.get_env_type())
                ## SAVE STATE VALUES ##
                result[(cluster_size * i) + user_idx][1]["env_states"] = \
                    np.append(result[(cluster_size * i) + user_idx][1]["env_states"], env_state.reshape(1, -1), axis=0)
                result[(cluster_size * i) + user_idx][1]["advantage_states"] = \
                    np.append(result[(cluster_size * i) + user_idx][1]["advantage_states"], advantage_state.reshape(1, -1), axis=0)
                total_results["advantage_states"] = np.append(total_results["advantage_states"], advantage_state.reshape(1, -1), axis=0)
                result[(cluster_size * i) + user_idx][1]["baseline_states"] = \
                    np.append(result[(cluster_size * i) + user_idx][1]["baseline_states"], baseline_state.reshape(1, -1), axis=0)
                total_results["baseline_states"] = np.append(total_results["baseline_states"], baseline_state.reshape(1, -1), axis=0)
                ## ACTION SELECTION ##
                action, action_prob = alg_candidate.action_selection(advantage_state, baseline_state)
                ## SAVE ACTION VALUES ##
                result[(cluster_size * i) + user_idx][1]["actions"] = \
                    np.append(result[(cluster_size * i) + user_idx][1]["actions"], action)
                total_results["actions"] = np.append(total_results["actions"], action)
                result[(cluster_size * i) + user_idx][1]["probs"] = \
                    np.append(result[(cluster_size * i) + user_idx][1]["probs"], action_prob)
                total_results["probs"] = np.append(total_results["probs"], action_prob)
                ## REWARD GENERATION ##
                reward = truncated_reward(sim_env.generate_rewards(user_id, env_state, action))
                ## SAVE REWARD VALUES ##
                result[(cluster_size * i) + user_idx][1]["rewards"] = \
                    np.append(result[(cluster_size * i) + user_idx][1]["rewards"], reward)
                total_results["rewards"] = np.append(total_results["rewards"], reward)

            if (j % update_cadence == 0 and j >= update_cadence - 1):
                print("UPDATE TIME.")
                alg_candidate.update(total_results["advantage_states"], total_results["baseline_states"], \
                                     total_results["actions"], total_results["probs"], total_results["rewards"])

    return result
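
# Illustrative sketch (not called above): driving run_experiment. `sim_env`
# stands in for any environment object from simulation_environments that
# exposes the methods used above (get_users, get_env_type,
# get_states_for_user, process_env_state, generate_rewards); the candidate
# settings are arbitrary.
def _demo_run_experiment(sim_env):
    alg_candidate = BayesianLinearRegression(cluster_size=4, update_cadence=14)
    result = run_experiment(alg_candidate, sim_env)
    user_id, user_data = result[0]
    return user_data["rewards"]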

# returns an array of (user index, entry week, user id) rows;
# the entry week is the week the user enters the study, with
# RECRUITMENT_RATE users entering per week
def pre_process_users(total_trial_users):
    results = []
    for j, user in enumerate(total_trial_users):
        results.append((j, int(j // RECRUITMENT_RATE), user))

    return np.array(results)
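
# Illustrative sketch (not called above): with RECRUITMENT_RATE = 4, eight
# user ids map to entry weeks 0 and 1. Note that mixing ints and strings
# makes np.array store every entry as a string, which is why the experiment
# code casts entries back with int(...).
def _demo_pre_process_users():
    table = pre_process_users(["u{}".format(k) for k in range(8)])
    assert list(table[:, 1]) == ['0', '0', '0', '0', '1', '1', '1', '1']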

### data structure is a list of tuples (user_id, {rewards, actions, probs, states}) so it's easier to process
# and calculate regret for
### runs the experiment with full pooling and incremental recruitment
def run_incremental_recruitment_exp(user_groups, alg_candidate, sim_env):
    # user_groups is an array of (user index, week of entry, user id) rows,
    # as produced by pre_process_users
    env_users = sim_env.get_users()
    cluster_size = alg_candidate.get_cluster_size()
    update_cadence = alg_candidate.get_update_cadence()
    init_dict = {"baseline_states": np.empty((0, D_baseline), float), "advantage_states": np.empty((0, D_advantage), float), \
                 "env_states": np.empty((0, 5 if sim_env.get_env_type() == 'stat' else 6), float), "actions": np.empty((0, 1), float), "probs": np.empty((0, 1), float), "rewards": np.empty((0, 1), float)}
    result = [(user_id, init_dict.copy()) for user_id in env_users]
    total_results = init_dict.copy()
    current_groups = user_groups[:RECRUITMENT_RATE]
    week = 0
    while (len(current_groups) > 0):
        print("Week: ", week)
        for user_tuple in current_groups:
            user_idx, user_entry_date, user_id = int(user_tuple[0]), int(user_tuple[1]), user_tuple[2]
            user_states = sim_env.get_states_for_user(user_id)
            # do action selection for 14 decision times (2 per day for 7 days)
            for decision_idx in range(14):
                ## PROCESS STATE ##
                j = (week - user_entry_date) * 14 + decision_idx
                session = user_states[j]
                env_state = sim_env.process_env_state(session, j, result[user_idx][1]["rewards"])
                advantage_state, baseline_state = alg_candidate.process_alg_state_func(env_state, sim_env.get_env_type())
                ## SAVE STATE VALUES ##
                result[user_idx][1]["env_states"] = \
                    np.append(result[user_idx][1]["env_states"], env_state.reshape(1, -1), axis=0)
                result[user_idx][1]["advantage_states"] = \
                    np.append(result[user_idx][1]["advantage_states"], advantage_state.reshape(1, -1), axis=0)
                total_results["advantage_states"] = np.append(total_results["advantage_states"], advantage_state.reshape(1, -1), axis=0)
                result[user_idx][1]["baseline_states"] = \
                    np.append(result[user_idx][1]["baseline_states"], baseline_state.reshape(1, -1), axis=0)
                total_results["baseline_states"] = np.append(total_results["baseline_states"], baseline_state.reshape(1, -1), axis=0)
                ## ACTION SELECTION ##
                action, action_prob = alg_candidate.action_selection(advantage_state, baseline_state)
                ## SAVE ACTION VALUES ##
                result[user_idx][1]["actions"] = \
                    np.append(result[user_idx][1]["actions"], action)
                total_results["actions"] = np.append(total_results["actions"], action)
                result[user_idx][1]["probs"] = \
                    np.append(result[user_idx][1]["probs"], action_prob)
                total_results["probs"] = np.append(total_results["probs"], action_prob)
                ## REWARD GENERATION ##
                reward = truncated_reward(sim_env.generate_rewards(user_id, env_state, action))
                ## SAVE REWARD VALUES ##
                result[user_idx][1]["rewards"] = \
                    np.append(result[user_idx][1]["rewards"], reward)
                total_results["rewards"] = np.append(total_results["rewards"], reward)

        # update time at the end of each week
        print("UPDATE TIME.")
        alg_candidate.update(total_results["advantage_states"], total_results["baseline_states"], \
                             total_results["actions"], total_results["probs"], total_results["rewards"])
        # handle adding or removing user groups
        week += 1
        if (week < len(user_groups)):
            # add more users
            current_groups = np.concatenate((current_groups, user_groups[RECRUITMENT_RATE * week: RECRUITMENT_RATE * week + RECRUITMENT_RATE]), axis=0)
        # check if some user group finished the study
        if (week > TRIAL_LENGTH_IN_WEEKS - 1):
            current_groups = current_groups[RECRUITMENT_RATE:]

    return result
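
# Illustrative sketch (not called above): wiring the pieces together.
# `sim_env` stands in for any environment object from
# simulation_environments with the methods used above; the candidate
# settings are arbitrary.
def _demo_incremental_recruitment(sim_env):
    alg_candidate = BayesianZeroInflatedPoisson(cluster_size=4, update_cadence=14)
    user_groups = pre_process_users(sim_env.get_users())
    return run_incremental_recruitment_exp(user_groups, alg_candidate, sim_env)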