In [1]:
import gym
import numpy as np

In [2]:
# create deterministic version of Frozen Lake
from gym.envs.registration import register

# Registering an id that already exists raises gym.error.Error on gym
# releases that forbid re-registration, so re-running this cell in a live
# kernel would fail. Tolerate only that case; any other registration
# error is re-raised.
try:
    register(
        id='FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False}
    )
except gym.error.Error as err:
    if 'Cannot re-register' not in str(err):
        raise

### Cross-Entropy Method

In [3]:
def run_sample(env, policy, action_size):
    """Play one episode, sampling every action from `policy`.

    Args:
        env: environment exposing `reset()` and a `step(action)` that returns
            `(state, reward, done, info)`.
        policy: array indexed as `policy[state]`, giving the action
            probabilities for that state.
        action_size: number of actions to sample from.

    Returns:
        (total_reward, action_count): the summed episode reward and an array
        shaped like `policy` counting how often each (state, action) pair
        was taken.
    """
    visits = np.zeros(np.shape(policy))
    episode_return = 0
    current_state = env.reset()
    finished = False
    while not finished:
        chosen = np.random.choice(action_size, p=policy[current_state])
        # record the pair before stepping, so the count is attributed to the
        # state the action was chosen in
        visits[current_state, chosen] += 1
        current_state, step_reward, finished, _ = env.step(chosen)
        episode_return += step_reward

    return episode_return, visits

def update_policy(policy, s_list, learning_rate, keep):
    """Move `policy` toward the action frequencies of the best samples.

    Neither `policy` nor `s_list` is modified; a new array is returned.

    Args:
        policy: (states, actions) array of action probabilities.
        s_list: sequence of samples; element 0 of each sample is its reward
            and element 1 is an action-count array shaped like `policy`.
        learning_rate: weight given to the elite action frequencies; the
            previous policy keeps weight `1 - learning_rate`.
        keep: number of highest-reward samples used for the update.

    Returns:
        The updated policy, with every row normalised to sum to 1.
    """
    # sorted() builds a new list, so the caller's sample list keeps its order
    elite = sorted(s_list, key=lambda sample: sample[0], reverse=True)[:keep]

    # work on a float copy so the caller's policy array is left untouched
    policy = np.array(policy, dtype=float)

    # accumulate the action counts of the elite samples in one array
    elite_counts = np.zeros(policy.shape)
    for sample in elite:
        elite_counts += sample[1]

    # only states visited by an elite sample are updated; dividing by a
    # zero visit count for the others is avoided by the mask
    visits = elite_counts.sum(axis=1)
    visited = visits > 0
    policy[visited] = ((elite_counts[visited] / visits[visited, None]) * learning_rate
                       + policy[visited] * (1. - learning_rate))

    # normalize the probabilities of the policy so that they sum to 1
    return policy / policy.sum(axis=1, keepdims=True)

### FrozenLake Not Slippery

In [4]:
# create environment
env = gym.make('FrozenLakeNotSlippery-v0')
# Read the table dimensions from the public Space API rather than the
# toy-text-specific nS/nA attributes.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

# hyperparameters
samples = 100                  # episodes sampled per trial
learning_rate = 0.1            # weight of the elite action frequencies in each update
trial = 100                    # number of sample-then-update iterations
keep_best = int(0.2*samples)   # how many top-reward episodes feed each update

In [5]:
# start from a uniform policy: every action equally likely in every state
policy_array = np.full((state_size, action_size), 1.0/action_size)

# each trial samples a batch of episodes under the current policy and then
# updates the policy from the best of them
for t in range(trial):
    sample_list = [run_sample(env, policy_array, action_size)
                   for s in range(samples)]
    policy_array = update_policy(policy_array, sample_list, learning_rate, keep_best)
    

In [6]:
# evaluate the agent using the found policy
episodes = 100
episode_reward_list, episode_len_list = [], []

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    done = False
    # act greedily (most probable action) until the episode ends
    while not done:
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 1.0 Average Length: 6.0


### FrozenLake Slippery

In [7]:
# create environment
env = gym.make('FrozenLake-v0')
# Read the table dimensions from the public Space API rather than the
# toy-text-specific nS/nA attributes.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

# hyperparameters
samples = 100                  # episodes sampled per trial
learning_rate = 0.1            # weight of the elite action frequencies in each update
trial = 1000                   # number of sample-then-update iterations
keep_best = int(0.2*samples)   # how many top-reward episodes feed each update

In [8]:
# start from a uniform policy: every action equally likely in every state
policy_array = np.full((state_size, action_size), 1.0/action_size)

# each trial samples a batch of episodes under the current policy and then
# updates the policy from the best of them
for t in range(trial):
    sample_list = [run_sample(env, policy_array, action_size)
                   for s in range(samples)]
    policy_array = update_policy(policy_array, sample_list, learning_rate, keep_best)

In [9]:
# evaluate the agent using the found policy
episodes = 100
episode_reward_list, episode_len_list = [], []

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    done = False
    # act greedily (most probable action) until the episode ends
    while not done:
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 0.73 Average Length: 40.6


### Taxi Environment

In [10]:
# create environment
env = gym.make('Taxi-v2')
# Read the table dimensions from the public Space API rather than the
# toy-text-specific nS/nA attributes.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

# hyperparameters
samples = 100                  # episodes sampled per trial
learning_rate = 0.1            # weight of the elite action frequencies in each update
trial = 1000                   # number of sample-then-update iterations
keep_best = int(0.5*samples)   # how many top-reward episodes feed each update

In [11]:
# start from a uniform policy: every action equally likely in every state
policy_array = np.full((state_size, action_size), 1.0/action_size)

# each trial samples a batch of episodes under the current policy and then
# updates the policy from the best of them
for t in range(trial):
    sample_list = [run_sample(env, policy_array, action_size)
                   for s in range(samples)]
    policy_array = update_policy(policy_array, sample_list, learning_rate, keep_best)

In [12]:
# evaluate the agent using the found policy
episodes = 100
episode_reward_list, episode_len_list = [], []

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    done = False
    while not done:
        # greedy evaluation: take the most probable action in this state
        # (sampling from policy_array[state] with np.random.choice would
        # evaluate the stochastic policy instead)
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 8.32 Average Length: 12.68


## Cross-Entropy Method using PyTorch

In [13]:
import torch

# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
# create environment
env = gym.make('FrozenLakeNotSlippery-v0')
# Read the table dimensions from the public Space API rather than the
# toy-text-specific nS/nA attributes.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

# hyperparameters
samples = 100                  # episodes sampled per trial
learning_rate = 0.1            # weight of the elite action frequencies in each update
smoothing_factor = 1           # passed to update_policy_tensor, which does not use it
trial = 200                    # number of sample-then-update iterations
keep_best = int(0.2*samples)   # how many top-reward episodes feed each update

In [25]:
def run_sample_tensor(env, policy, state_size, action_size, device):
    """Play one episode under `policy`, returning the results as tensors.

    Args:
        env: environment exposing `reset()` and a `step(action)` that returns
            `(state, reward, done, info)`.
        policy: array indexed as `policy[state]`, giving the action
            probabilities passed to `np.random.choice`.
        state_size: number of rows of the action-count tensor.
        action_size: number of actions (columns of the action-count tensor).
        device: accepted for interface symmetry but not used; both returned
            tensors are created without a device argument.

    Returns:
        (total_reward, action_count): a 1-element tensor holding the summed
        episode reward and a (state_size, action_size) tensor counting how
        often each (state, action) pair was taken.
    """
    visits = torch.zeros((state_size, action_size))
    episode_return = torch.zeros((1))
    current_state = env.reset()
    finished = False
    while not finished:
        chosen = np.random.choice(action_size, p=policy[current_state])
        # count the pair before stepping, so it is attributed to the state
        # the action was chosen in
        visits[current_state, chosen] += 1
        current_state, step_reward, finished, _ = env.step(chosen)
        episode_return += step_reward

    return episode_return, visits

def update_policy_tensor(policy, samp_tensor, rew_tensor, learn_rate,
                         smooth_factor, action_size, keep, device):
    """Move `policy` toward the action frequencies of the best samples.

    The input `policy` tensor is not modified; a new tensor is returned.

    Args:
        policy: (states, actions) tensor of action probabilities.
        samp_tensor: (samples, states, actions) tensor of per-episode
            action counts.
        rew_tensor: (samples,) tensor of episode rewards.
        learn_rate: weight given to the elite action frequencies; the
            previous policy keeps weight `1 - learn_rate`.
        smooth_factor: unused; kept so existing callers keep working.
        action_size: unused; kept so existing callers keep working.
        keep: number of highest-reward samples used for the update.
        device: unused; kept so existing callers keep working.

    Returns:
        The updated policy, with every row normalised to sum to 1.
    """
    # indices of the `keep` highest-reward samples
    elite_index = rew_tensor.sort(descending=True)[1][:keep]
    # action counts per (state, action), summed over the elite samples
    elite_counts = samp_tensor.index_select(0, elite_index).sum(dim=0)
    # total elite visits per state
    visits = elite_counts.sum(dim=1)
    # counts are whole numbers, so >= 0.5 means "visited at least once"
    visited = visits.ge(0.5)
    # clone first: the masked assignment below would otherwise write
    # un-normalised rows into the caller's tensor
    policy = policy.clone()
    policy[visited] = (elite_counts[visited]/visits[visited, None]*learn_rate
                       + policy[visited]*(1.-learn_rate))
    # normalize policy so that each row sums to 1
    return policy / policy.sum(dim=1)[:, None]

In [26]:
# start from a uniform policy: every action equally likely in every state
policy_tensor = torch.ones(state_size, action_size, device=device)/action_size

for t in range(trial):
    # per-trial buffers: one action-count table and one reward per episode
    sample_tensor = torch.zeros(samples, state_size, action_size, device=device)
    reward_tensor = torch.zeros(samples, device=device)
    # episodes are sampled with NumPy, so hand the policy over as an array
    policy_array = policy_tensor.cpu().numpy()

    for s in range(samples):
        reward_tensor[s], sample_tensor[s] = run_sample_tensor(
            env, policy_array, state_size, action_size, device)

    policy_tensor = update_policy_tensor(
        policy_tensor, sample_tensor, reward_tensor,
        learning_rate, smoothing_factor, action_size, keep_best, device)

In [27]:
# evaluate the agent using the found policy
episodes = 100
episode_reward_list, episode_len_list = [], []
policy_array = policy_tensor.cpu().numpy()

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    done = False
    while not done:
        # greedy evaluation: take the most probable action in this state
        # (sampling from policy_array[state] with np.random.choice would
        # evaluate the stochastic policy instead)
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 1.0 Average Length: 6.0


### FrozenLake Slippery using PyTorch

In [28]:
# create environment
env = gym.make('FrozenLake-v0')
# Read the table dimensions from the public Space API rather than the
# toy-text-specific nS/nA attributes.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

# hyperparameters
samples = 100                  # episodes sampled per trial
learning_rate = 0.1            # weight of the elite action frequencies in each update
smoothing_factor = 1           # passed to update_policy_tensor, which does not use it
trial = 1000                   # number of sample-then-update iterations
keep_best = int(0.2*samples)   # how many top-reward episodes feed each update

# start from a uniform policy: every action equally likely in every state
policy_tensor = torch.ones((state_size,action_size)).to(device)/action_size

for t in range(trial):
    # per-trial buffers: one action-count table and one reward per episode
    sample_tensor = torch.zeros((samples, state_size, action_size)).to(device)
    reward_tensor = torch.zeros((samples)).to(device)
    # episodes are sampled with NumPy, so hand the policy over as an array
    policy_array = policy_tensor.cpu().numpy()

    for s in range(samples):
        reward, action_table_tensor = run_sample_tensor(env, policy_array,
                                           state_size, action_size, device)
        reward_tensor[s] = reward
        sample_tensor[s] = action_table_tensor
    policy_tensor = update_policy_tensor(policy_tensor, sample_tensor, reward_tensor,
                         learning_rate, smoothing_factor, action_size, keep_best, device)

In [29]:
# evaluate the agent using the found policy
episodes = 100
episode_reward_list, episode_len_list = [], []
policy_array = policy_tensor.cpu().numpy()

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    done = False
    while not done:
        # greedy evaluation: take the most probable action in this state
        # (sampling from policy_array[state] with np.random.choice would
        # evaluate the stochastic policy instead)
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 0.59 Average Length: 36.88


### Taxi using PyTorch

In [30]:
# create environment
env = gym.make('Taxi-v2')
# Read the table dimensions from the public Space API rather than the
# toy-text-specific nS/nA attributes.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

# hyperparameters
samples = 100                  # episodes sampled per trial
learning_rate = 0.1            # weight of the elite action frequencies in each update
smoothing_factor = 1           # passed to update_policy_tensor, which does not use it
trial = 1000                   # number of sample-then-update iterations
keep_best = int(0.5*samples)   # how many top-reward episodes feed each update

# start from a uniform policy: every action equally likely in every state
policy_tensor = torch.ones((state_size,action_size)).to(device)/action_size

for t in range(trial):
    # per-trial buffers: one action-count table and one reward per episode
    sample_tensor = torch.zeros((samples, state_size, action_size)).to(device)
    reward_tensor = torch.zeros((samples)).to(device)
    # episodes are sampled with NumPy, so hand the policy over as an array
    policy_array = policy_tensor.cpu().numpy()

    for s in range(samples):
        reward, action_table_tensor = run_sample_tensor(env, policy_array,
                                           state_size, action_size, device)
        reward_tensor[s] = reward
        sample_tensor[s] = action_table_tensor
    policy_tensor = update_policy_tensor(policy_tensor, sample_tensor, reward_tensor,
                         learning_rate, smoothing_factor, action_size, keep_best, device)

In [31]:
# evaluate the agent using the found policy
episodes = 100
episode_reward_list, episode_len_list = [], []
policy_array = policy_tensor.cpu().numpy()

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    done = False
    while not done:
        # greedy evaluation: take the most probable action in this state
        # (sampling from policy_array[state] with np.random.choice would
        # evaluate the stochastic policy instead)
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 8.23 Average Length: 12.77
