# Main functions used in experiments

In [3]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm as _tqdm

def tqdm(*args, **kwargs):
    """
    Thin wrapper around ``tqdm.tqdm`` that throttles progress-bar refreshes.

    Args:
        *args: Positional arguments forwarded unchanged to ``tqdm.tqdm``.
        **kwargs: Keyword arguments forwarded to ``tqdm.tqdm``. If ``mininterval``
            is not given it defaults to 1 (second); an explicit value wins.

    Returns:
        Whatever ``tqdm.tqdm`` returns for the given arguments.
    """
    # setdefault (rather than a hard-coded keyword) so that a caller who passes
    # ``mininterval`` does not trigger "got multiple values for keyword argument".
    kwargs.setdefault("mininterval", 1)  # Safety, do not overflow buffer
    return _tqdm(*args, **kwargs)
%matplotlib inline
import matplotlib.pyplot as plt
import sys

import random
import time
assert sys.version_info[:3] >= (3, 6, 0), "Make sure you have Python 3.6 installed!"

## Environment: Windy gridworld
Gives a reward of -1 for each step taken until the final state is reached.

In [4]:
from windy_gridworld import WindyGridworldEnv
env = WindyGridworldEnv()
env??

## Policy

### Target policy (choose greedy vs non-greedy)
Greedy policy 

In [5]:
class GreedyPolicy(object):
    """
    A deterministic greedy policy with respect to a Q-value table.

    In every state the policy selects ``np.argmax(Q[state])``; ties are therefore
    resolved in favour of the lowest action index.
    """
    def __init__(self, Q):
        # Stored by reference (no copy): in-place updates made to Q by the caller
        # are seen by this policy.
        self.Q = Q

    def get_probs(self, states, actions):
        """
        This method takes a list of states and a list of actions and returns a numpy array that contains
        a probability of perfoming action in given state for every corresponding state action pair.

        Args:
            states: a list of states.
            actions: a list of actions.

        Returns:
            Numpy float array (same length as states and actions) holding 1.0 where the
            action is the greedy action of the paired state and 0.0 everywhere else.
        """
        # Inefficient but kept same structure as the other policies if we change policy later
        return np.array(
            [1.0 if a == np.argmax(self.Q[s]) else 0.0 for s, a in zip(states, actions)],
            dtype=float,
        )

    def sample_action(self, obs):
        """
        This method takes a state as input and returns the greedy action for it.

        Args:
            obs: current state

        Returns:
            An action (int).
        """
        # np.argmax yields a NumPy integer; convert so the documented ``int`` holds.
        return int(np.argmax(self.Q[obs]))

In [6]:
class EpsilonGreedyPolicy(object):
    """
    A simple epsilon greedy policy.

    With probability ``1 - epsilon`` the greedy action ``np.argmax(Q[state])`` is taken,
    otherwise an action is drawn uniformly from all actions (greedy one included).
    """
    def __init__(self, Q, epsilon):
        # Q is stored by reference, so in-place updates by the caller are visible here.
        self.Q = Q
        self.epsilon = epsilon

    def _action_distribution(self, state):
        """Return the full action-probability vector of this policy for one state."""
        q_values = self.Q[state]

        # every action receives an equal share of the exploration mass epsilon
        probs = np.full(q_values.shape, self.epsilon / q_values.size)

        # the greedy action additionally receives the remaining (1 - epsilon)
        probs[np.argmax(q_values)] += 1 - self.epsilon

        return probs

    def get_probs(self, obs, actions=None):
        """
        Return action probabilities under this policy.

        Two call forms are supported:

        * ``get_probs(state)`` returns the probability of every action in that
          single state (original behaviour).
        * ``get_probs(states, actions)`` returns, for every corresponding
          state-action pair, the probability of taking that action in that state.
          This matches the signature of the other policies' ``get_probs``.

        Args:
            obs: a single state, or a list of states when ``actions`` is given.
            actions: optional list of actions, paired element-wise with ``obs``.

        Returns:
            Numpy array with one probability per action (single-state form) or one
            probability per state-action pair (paired form).
        """
        if actions is None:
            return self._action_distribution(obs)

        return np.array(
            [self._action_distribution(s)[a] for s, a in zip(obs, actions)],
            dtype=float,
        )

    def sample_action(self, obs):
        """
        This method takes a state as input and returns an action sampled from this policy.

        Args:
            obs: current state

        Returns:
            An action (integer index into ``Q[obs]``).
        """
        probs = self.get_probs(obs)

        # an integer first argument makes np.random.choice sample from np.arange(size)
        return np.random.choice(probs.size, p=probs)

### Behavioural policy
Uniform random policy, as in the blackjack lab.
TODO: experiment with behavioural policies to check which yield interesting results

In [7]:
class BehaviouralPolicy(object):
    """
    A behavioural policy that selects every action with equal probability in every state.
    """
    def __init__(self, nS, nA):
        # (nS, nA) table of action probabilities; each row is uniform over the nA actions
        self.probs = np.full((nS, nA), 1 / nA)

    def get_probs(self, states, actions):
        """
        This method takes a list of states and a list of actions and returns a numpy array that contains
        a probability of perfoming action in given state for every corresponding state action pair.

        Args:
            states: a list of states (integer row indices into the probability table).
            actions: a list of actions (integer column indices), same length as states.

        Returns:
            Numpy array filled with probabilities (same length as states and actions)
        """
        # dtype=int keeps empty inputs usable as index arrays (an empty list would
        # otherwise become a float array, which cannot be used for indexing)
        state_idx = np.asarray(states, dtype=int)
        action_idx = np.asarray(actions, dtype=int)

        # one vectorised lookup instead of a Python-level loop over long episodes
        return self.probs[state_idx, action_idx]

    def sample_action(self, state):
        """
        This method takes a state as input and returns an action sampled from this policy.

        Args:
            state: current state

        Returns:
            An action (integer index in ``range(nA)``).
        """
        # an integer first argument makes np.random.choice sample from np.arange(nA)
        return np.random.choice(self.probs.shape[1], p=self.probs[state])

In [8]:
# Uniform-random behaviour policy sized to the environment's state and action counts.
bp = BehaviouralPolicy(env.nS, env.nA)

## Monte Carlo

## Sampling function given an env and policy
Function to sample an episode from the env.

In [9]:
def sample_episode(env, policy):
    """
    Roll out a single episode in ``env`` by following ``policy``.

    The environment is reset, then actions drawn from ``policy.sample_action`` are fed to
    ``env.step`` until the environment reports that the episode is done.

    Args:
        env: OpenAI gym environment.
        policy: A policy which allows us to sample actions with its sample_action method.

    Returns:
        Tuple of lists (states, actions, rewards, dones), all of the same length.
        Entry ``t`` holds the state the action was taken in, the action, the reward
        received for it and the done flag returned by that step. The state reached
        after termination is not included in the list of states.
    """
    states, actions, rewards, dones = [], [], [], []

    state = env.reset()
    done = False

    # keep stepping until the episode ends; the terminal state itself is never stored
    while not done:
        states.append(state)
        action = policy.sample_action(state)
        state, reward, done, _ = env.step(action)

        # record the outcome of this transition
        actions.append(action)
        rewards.append(reward)
        dones.append(done)

    return states, actions, rewards, dones

In [10]:
# Sample ten episodes under the behaviour policy `bp` and report how long each one was.
for episode in range(10):
    trajectory_data = sample_episode(env, bp)
#     print("Episode {}:\nStates {}\nActions {}\nRewards {}\nDones {}\n".format(episode,*trajectory_data))
    # trajectory_data[0] is the list of visited states, so its length is the number of steps taken
    print(f"length of episode {episode}: {len(trajectory_data[0])}")

length of episode 0: 11938
length of episode 1: 3126
length of episode 2: 5623
length of episode 3: 10277
length of episode 4: 5629
length of episode 5: 1338
length of episode 6: 8800
length of episode 7: 230
length of episode 8: 9465
length of episode 9: 49732


### TO-DO: MC Ordinary Importance Sampling (make it work for windy gridworld)
Status: copied from MC_lab, not adapted to windy gridworld.
TODO: 
- make it work for Q values instead of V.
- update target policy's q values to make sure it learns something

In [11]:
## TODO
def mc_ordinary_importance_sampling(env, behavior_policy, target_policy, num_episodes, discount_factor=1.0,
                           sampling_function=sample_episode):
    """
    Monte Carlo prediction algorithm. Calculates the value function
    for a given target policy using behavior policy and ordinary importance sampling
    (every-visit, incremental mean of the importance-weighted returns).

    Args:
        env: OpenAI gym environment.
        behavior_policy: A policy used to collect the data. Must provide
            ``get_probs(states, actions)`` returning one probability per pair.
        target_policy: A policy which value function we want to estimate. Must provide
            ``get_probs(states, actions)`` returning one probability per pair.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        sampling_function: Function that generates data from one episode.

    Returns:
        A dictionary (defaultdict) that maps from state -> estimated value.
    """

    # Keeps track of current V and count of returns for each state
    # to calculate an update.
    V = defaultdict(float)
    returns_count = defaultdict(float)

    # sample episodes
    for _ in tqdm(range(num_episodes)):
        states, actions, rewards, _dones = sampling_function(env, behavior_policy)

        # extract target and behavioral probabilities
        target_probs = target_policy.get_probs(states, actions)
        behavioral_probs = behavior_policy.get_probs(states, actions)

        G = 0
        # Importance ratio for the tail of the episode starting at the current timestep.
        # Maintained as a running product while walking backwards, which replaces the
        # per-step product over the whole tail (quadratic in the episode length).
        ratio = 1.0

        # loop backwards over the trajectory
        for timestep in range(len(states) - 1, -1, -1):
            s = states[timestep]
            G = discount_factor * G + rewards[timestep]

            returns_count[s] += 1

            # extend the ratio so that it covers timestep..T-1 (current step included)
            ratio *= target_probs[timestep] / behavioral_probs[timestep]

            # use every visit incremental method; a zero ratio still counts as a
            # sample (it pulls the estimate towards 0), so there is no early exit
            V[s] += 1/returns_count[s] * (ratio * G - V[s])

    return V

In [None]:
# All-zero Q table: np.argmax breaks ties towards index 0, so the greedy target
# policy built on it selects action 0 in every state.
Q = np.zeros((env.nS, env.nA))
bp = BehaviouralPolicy(env.nS, env.nA)
gp = GreedyPolicy(Q)
# NOTE(review): the variable name says 10k, but only 10 episodes are sampled here.
V_10k = mc_ordinary_importance_sampling(env, bp, gp, num_episodes=10)

### MC: Weighted Importance Sampling

### TO-DO: same as above but weighted importance sampling
##### Eventually: merge the two functions into one with a weighted flag

In [21]:
def mc_weighted_importance_sampling(env, behavior_policy, target_policy, num_episodes, discount_factor=1.0,
                           sampling_function=sample_episode):
    """
    Monte Carlo prediction algorithm. Estimates the action-value function
    of a target policy from episodes generated by a behavior policy, using
    weighted importance sampling with incremental updates.

    Args:
        env: OpenAI gym environment.
        behavior_policy: A policy used to collect the data. Its
            ``get_probs(states, actions)`` must return one probability per pair.
        target_policy: A policy which value function we want to estimate. Its
            ``get_probs(state)`` must return the probabilities of all actions in that state.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        sampling_function: Function that generates data from one episode.

    Returns:
        A nested defaultdict such that ``Q[state][action]`` is the estimated
        action value (float). Only visited state-action pairs are present.
    """

    # nested defaultdicts: action-value estimates and cumulative importance weights
    q_values = defaultdict(lambda: defaultdict(float))
    cumulative_weights = defaultdict(lambda: defaultdict(float))

    for _ in tqdm(range(num_episodes)):
        states, actions, rewards, _ = sampling_function(env, behavior_policy)

        # probability of each taken action under the data-collecting policy
        behavioral_probs = behavior_policy.get_probs(states, actions)

        episode_return = 0
        weight = 1

        # walk the trajectory from its last step back to its first
        for t in reversed(range(len(states))):
            state, action = states[t], actions[t]
            episode_return = discount_factor * episode_return + rewards[t]

            cumulative_weights[state][action] += weight

            # incremental weighted-average update of the action value
            step_size = weight / cumulative_weights[state][action]
            q_values[state][action] += step_size * (episode_return - q_values[state][action])

            # fold this step's probability ratio into the importance weight
            weight *= target_policy.get_probs(state)[action] / behavioral_probs[t]

            # a zero weight stays zero for all earlier steps, so they cannot change anything
            if weight == 0:
                break

    return q_values

In [22]:
Q = np.zeros((env.nS, env.nA))
bp = BehaviouralPolicy(env.nS, env.nA)
# Despite the name `gp`, the target policy here is epsilon-greedy (epsilon=0.05), not purely greedy.
gp = EpsilonGreedyPolicy(Q, epsilon=0.05)
# NOTE(review): the variable name says 10k, but 200 episodes are sampled here.
Q_10k = mc_weighted_importance_sampling(env, bp, gp, num_episodes=200)




  0%|          | 0/200 [00:00<?, ?it/s][A[A[A


  1%|          | 2/200 [00:01<02:06,  1.56it/s][A[A[A


  2%|▎         | 5/200 [00:02<01:53,  1.72it/s][A[A[A


  4%|▎         | 7/200 [00:04<02:01,  1.59it/s][A[A[A


  4%|▍         | 9/200 [00:05<02:03,  1.55it/s][A[A[A


  7%|▋         | 14/200 [00:06<01:39,  1.86it/s][A[A[A


  8%|▊         | 16/200 [00:08<01:45,  1.74it/s][A[A[A


 10%|▉         | 19/200 [00:09<01:33,  1.94it/s][A[A[A


 11%|█         | 22/200 [00:10<01:28,  2.00it/s][A[A[A


 12%|█▎        | 25/200 [00:12<01:30,  1.94it/s][A[A[A


 15%|█▌        | 30/200 [00:13<01:16,  2.23it/s][A[A[A


 16%|█▋        | 33/200 [00:16<01:37,  1.71it/s][A[A[A


 18%|█▊        | 36/200 [00:17<01:29,  1.83it/s][A[A[A


 20%|█▉        | 39/200 [00:19<01:28,  1.82it/s][A[A[A


 21%|██        | 42/200 [00:23<01:58,  1.33it/s][A[A[A


 22%|██▏       | 44/200 [00:25<02:03,  1.27it/s][A[A[A


 24%|██▍       | 48/200 [00:26<01:37,  1.56it/s][

In [21]:
print(Q_10k)

defaultdict(<function mc_weighted_importance_sampling.<locals>.<lambda> at 0x1a1986f8c8>, {48: defaultdict(<class 'float'>, {3: -1.0}), 47: defaultdict(<class 'float'>, {2: -1.0})})


## Temporal Difference

### TO-DO: TD Ordinary Importance Sampling (make it work for gridworld)
Copied from TD_lab. Currently on-policy, needs to be off-policy.

Confused: do we need value functions instead of q-values? Do we even use importance weights in off-policy TD? Are there more off-policy TD methods besides SARSA?

In [12]:
def sarsa(env, policy, Q, num_episodes, discount_factor=1.0, alpha=0.5):
    """
    SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        policy: A policy which allows us to sample actions with its sample_action method.
            Its ``Q`` attribute is pointed at the ``Q`` passed in here, so the policy
            always acts on the table being learned.
        Q: Q value function, numpy array Q[s,a] -> state-action value. Updated in place.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.

    Returns:
        A tuple (Q, (episode_lengths, episode_returns)).
        Q is a numpy array Q[s,a] -> state-action value.
        episode_lengths and episode_returns are tuples with one entry per episode
        (both empty when num_episodes is 0).
    """

    # Keeps track of useful statistics: one (length, return) pair per episode
    stats = []

    # Q is only ever modified in place below, so binding it to the policy once is
    # enough; doing it up front also makes the very first action follow Q.
    policy.Q = Q

    for _ in tqdm(range(num_episodes)):
        steps = 0
        episode_return = 0

        # initial state is 3,0 in the grid (according to source code)
        s = env.reset()
        a = policy.sample_action(s)

        while True:
            s_prime, r, final_state, _ = env.step(a)

            # keep track of stats
            episode_return += r
            steps += 1

            if final_state:
                # The value of a terminal state is 0 by definition: do not bootstrap
                # from Q[s_prime] (which is never updated and may be initialised to
                # non-zero values) and do not sample an action that is never taken.
                Q[s][a] += alpha * (r - Q[s][a])
                break

            # sample the action that will actually be taken in s_prime (on-policy)
            a_prime = policy.sample_action(s_prime)

            # update Q towards the one-step SARSA target
            Q[s][a] += alpha * (r + discount_factor * Q[s_prime][a_prime] - Q[s][a])

            # update current s and a for next iteration
            s, a = s_prime, a_prime

        stats.append((steps, episode_return))

    # zip(*[]) yields nothing to unpack, so handle the zero-episode case explicitly
    if not stats:
        return Q, ((), ())

    episode_lengths, episode_returns = zip(*stats)
    return Q, (episode_lengths, episode_returns)

### TO-DO: TD Weighted Importance Sampling (same as above but weighted)

In [None]:
## TD weighted importance sampling

## Experiments