# Main functions used in experiments

In [38]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm as _tqdm

def tqdm(*args, **kwargs):
    return _tqdm(*args, **kwargs, mininterval=1)  # Safety, do not overflow buffer
%matplotlib inline
import matplotlib.pyplot as plt
import sys

import random
import time
assert sys.version_info[:3] >= (3, 6, 0), "Make sure you have Python 3.6 installed!"

## Environment: Windy gridworld

In [47]:
from windy_gridworld import WindyGridworldEnv
env = WindyGridworldEnv()
env??

## Policy

### Target policy (choose greedy vs non-greedy)
Greedy policy 

In [89]:
class GreedyPolicy(object):
    """
    A simple epsilon greedy policy.
    """
    def __init__(self, Q):
        self.Q = Q
    
    def get_probs(self, states, actions):
        """
        This method takes a list of states and a list of actions and returns a numpy array that contains 
        a probability of perfoming action in given state for every corresponding state action pair. 

        Args:
            states: a list of states.
            actions: a list of actions.

        Returns:
            Numpy array filled with probabilities (same length as states and actions)
        """   
        
        # Inefficient but kept same structure as below if we change policy later
        probs = [1 if a == np.argmax(self.Q[s]) else 0 for s,a in zip(states, actions)]
        
        return probs
        
    def sample_action(self, obs):
        """
        This method takes a state as input and returns an action sampled from this policy.  

        Args:
            obs: current state

        Returns:
            An action (int).
        """

        # find out what the max action is
        best_action = np.argmax(self.Q[obs])
        
        return best_action

### Behavioural policy
Random policy in blackjack lab. 
TODO: experiment with behavioural policies to check which yield interesting results

In [82]:
class BehaviouralPolicy(object):
    """
    A behavioural policy
    """
    def __init__(self, nS, nA):
        self.probs = np.ones((nS, nA)) * 1/len(actions)
        
    def get_probs(self, states, actions):
        """
        This method takes a list of states and a list of actions and returns a numpy array that contains 
        a probability of perfoming action in given state for every corresponding state action pair. 

        Args:
            states: a list of states.
            actions: a list of actions.

        Returns:
            Numpy array filled with probabilities (same length as states and actions)
        """        
        probs = [self.probs[s,a] for s,a in zip(states, actions)]
        
        return probs

    
    def sample_action(self, state):
        """
        This method takes a state as input and returns an action sampled from this policy.  

        Args:
            state: current state

        Returns:
            An action (int).
        """
        p_s = self.probs[state]
        
        return np.random.choice(range(0,self.probs.shape[1]), p=p_s)

In [59]:
bp = BehaviouralPolicy(env.nS, env.nA)

## Monte Carlo

### TO-DO: Sampling function (check if suitable for env)
Function to sample an episode from the env.

In [52]:
def sample_episode(env, policy):
    """
    A sampling routine. Given environment and a policy samples one episode and returns states, actions, rewards
    and dones from environment's step function and policy's sample_action function as lists.

    Args:
        env: OpenAI gym environment.
        policy: A policy which allows us to sample actions with its sample_action method.

    Returns:
        Tuple of lists (states, actions, rewards, dones). All lists should have same length. 
        Hint: Do not include the state after the termination in the list of states.
    """
    # initialize
    states = []
    actions = []
    rewards = []
    dones = []
    
    # get a starting state
    s = env.reset()
    d = False
    
    # keep looping until done, don's save the terminal state
    while not d:
        states.append(s)
        a = policy.sample_action(s)
        s, r, d, _ = env.step(a)
        
        # save                
        actions.append(a)
        rewards.append(r)
        dones.append(d)
        

    return states, actions, rewards, dones

In [53]:
for episode in range(1):
    trajectory_data = sample_episode(env, bp)
    print("Episode {}:\nStates {}\nActions {}\nRewards {}\nDones {}\n".format(episode,*trajectory_data))

Episode 0:
States [30, 31, 30, 40, 40, 30, 20, 30, 31, 30, 20, 21, 22, 23, 3, 4, 4, 5, 5, 5, 6, 7, 6, 7, 6, 6, 6, 7, 7, 6, 7, 7, 7, 6, 7, 6, 6, 5, 5, 5, 5, 6, 7, 7, 7, 8, 8, 7, 6, 7, 7, 6, 7, 6, 6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 6, 6, 7, 6, 7, 7, 7, 7, 6, 5, 4, 4, 3, 2, 2, 1, 0, 1, 1, 2, 12, 11, 12, 22, 23, 23, 14, 3, 3, 3, 4, 3, 3, 3, 3, 2, 2, 2, 3, 3, 2, 2, 12, 13, 2, 12, 2, 12, 13, 4, 4, 4, 5, 4, 4, 3, 3, 3, 4, 3, 3, 2, 1, 2, 3, 3, 4, 4, 5, 4, 4, 4, 5, 6, 5, 4, 4, 5, 5, 5, 5, 4, 4, 4, 5, 4, 4, 5, 4, 5, 5, 5, 4, 4, 4, 4, 4, 4, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 7, 6, 7, 7, 7, 7, 7, 7, 8, 7, 8, 8, 7, 6, 5, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 7, 6, 6, 5, 4, 4, 5, 4, 5, 5, 6, 6, 6, 5, 5, 5, 5, 4, 5, 5, 6, 6, 6, 5, 4, 4, 3, 3, 4, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 4, 5, 4, 5, 4, 5, 5, 5, 5, 5, 6, 5, 4, 5, 5, 6, 6, 7, 7, 6, 6, 6, 5, 4, 3, 3, 3, 4, 3, 3, 3, 2, 2, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 6, 6, 5, 4, 4, 4, 4, 5, 6, 7, 7, 7, 8, 7, 6, 7

### TO-DO: MC Ordinary Importance Sampling (make it work for windy gridworld)
Status: copied from MC_lab, not adapted to windy gridworld.
TODO: 
- make it work for Q values instead of V.
- update target policy's q values to make sure it learns something

In [93]:
def mc_ordinary_importance_sampling(env, behavior_policy, target_policy, num_episodes, discount_factor=1.0,
                           sampling_function=sample_episode):
    """
    Monte Carlo prediction algorithm. Calculates the value function
    for a given target policy using behavior policy and ordinary importance sampling.
    
    Args:
        env: OpenAI gym environment.
        behavior_policy: A policy used to collect the data.
        target_policy: A policy which value function we want to estimate.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        sampling_function: Function that generates data from one episode.
    
    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float.
    """

    # Keeps track of current V and count of returns for each state
    # to calculate an update.
    V = defaultdict(float)
    returns_count = defaultdict(float)
    
    # sample episodes
    for i in tqdm(range(num_episodes)):
        states, actions, rewards, dones = sampling_function(env, behavior_policy)
        
        # extract target and behavioral probabilities
        target_probs = target_policy.get_probs(states, actions)
        behavioral_probs = behavior_policy.get_probs(states, actions)

        G = 0        
        
        # loop backwards over the trajectory
        for timestep in range(len(states)-1, -1, -1):
            s = states[timestep]
            r = rewards[timestep]
            G = discount_factor * G + r
            
            returns_count[s] += 1 

            # compute the ratio using the two probability lists
            ratio = np.prod([t/b for t, b in zip(target_probs[timestep:], behavioral_probs[timestep:])])

            # use every visit incremental method
            V[s] += 1/returns_count[s] * (ratio * G - V[s])
        
    return V

In [95]:
Q = np.zeros((env.nS, env.nA))
bp = BehaviouralPolicy(env.nS, env.nA)
gp = GreedyPolicy(Q)
V_10k = mc_ordinary_importance_sampling(env, bp, gp, num_episodes=10)



  0%|          | 0/10 [00:00<?, ?it/s][A[A

 10%|█         | 1/10 [00:04<00:37,  4.22s/it][A[A

 20%|██        | 2/10 [00:15<00:50,  6.37s/it][A[A

KeyboardInterrupt: 

### MC: Weighted Importance Sampling

### TO-DO: same as above but weighted importance sampling

In [None]:
## TODO

## Temporal Difference

### TO-DO: TD Ordinary Importance Sampling (make it work for gridworld)
Copied from TD_lab. Currently on-policy, needs to be off-policy.

Confused: do we need value functions instead of q-values? Do we even use importance weights in off-policy TD? Are there more off-policy TD methods besides SARSA?

In [12]:
def sarsa(env, policy, Q, num_episodes, discount_factor=1.0, alpha=0.5):
    """
    SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        policy: A policy which allows us to sample actions with its sample_action method.
        Q: Q value function, numpy array Q[s,a] -> state-action value.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        
    Returns:
        A tuple (Q, stats).
        Q is a numpy array Q[s,a] -> state-action value.
        stats is a list of tuples giving the episode lengths and returns.
    """
    
    # Keeps track of useful statistics
    stats = []
    
    for i_episode in tqdm(range(num_episodes)):
        i = 0
        R = 0
        
        # initial state is 3,0 in the grid (according to source code)
        s = env.reset()
        a = policy.sample_action(s)
        final_state_reached = False
        
        while True:
            # new actions
            s_prime, r, final_state, _ = env.step(a)
            
            # keep track of stats
            R += r
            i += 1    
            
            # sample action at state s_prime
            a_prime = policy.sample_action(s_prime)

            # update Q 
            Q[s][a] += alpha * (r + discount_factor * Q[s_prime][a_prime] - Q[s][a])    
    
            # update policy
            policy.Q = Q
            
            # if final state, terminate loop
            if final_state:
                break
        
            # update current s and a for next iteration
            s = s_prime
            a = a_prime
            
        stats.append((i, R))
        
    episode_lengths, episode_returns = zip(*stats)
    return Q, (episode_lengths, episode_returns)

### TO-DO: TD Weighted Importance Sampling (same as above but weighted)

In [None]:
## TD weighted importance sampling

## Experiments