In [1]:
# Usual imports
import numpy as np
import matplotlib.pyplot as plt

# emphatic library imports
from TD import EmphaticTD, OffTD

from models import Model # Class to store model characteristic
from policies import Policy, LeftRightPolicy, RandomPolicy # Class to store and generate markov chain matrix
from utils import comparatorTD # Tool to evaluate the TD algorithms on a model

In [2]:
# Select the interactive "notebook" matplotlib backend for the figures below
%matplotlib notebook

In [3]:
# Algorithms under comparison; both are built with alpha = 0.001 and lambdas = 0
emphatic_td = EmphaticTD(alpha=0.001, lambdas=0)
off_td = OffTD(alpha=0.001, lambdas=0)
algos = [emphatic_td, off_td]

# Helper used to run and plot all algorithms at once.
# `colors` and `names` are listed in the same order as `algos`.
comparator = comparatorTD(
    algos,
    colors=["red", "blue"],
    names=["EmphaticTD", "OffTD"],
)

# Two-state model

### Definition

In [4]:
# Target and behavior policies on the two-state chain
pi = LeftRightPolicy(n=2, p_right=1)  # target: always go right
mu = LeftRightPolicy(n=2)             # behavior: default probabilities (uniform)

# Two-state model with scalar features and zero rewards
model = Model(
    # --- Model definition ---
    features=[1, 2],       # feature function
    R=np.zeros((2, 2)),    # transition rewards (all zero)
    pi=pi,                 # target policy
    mu=mu,                 # behavior policy
    # --- Parameters for TD ---
    I=[1, 0],              # interest function
    discounts=[0.9, 0.9],  # discount function
    # --- Initialisation ---
    S0=0,
    theta0=1,
)

### Running particles

In [5]:
# Simulation budget: T steps for each of the N particles
T = 5000
N = 200
comparator.run(model, T, N)

emphatic TD has been computed for 5000 steps and 200 particles.
offTD has been computed for 5000 steps and 200 particles.


### Theta estimates

In [6]:
# Optional: uncomment to enlarge the font size for figures exported to LaTeX
# plt.rcParams.update({'font.size': 13})

In [7]:
# Plot the theta estimates from the most recent comparator.run call
comparator.plot_theta()

<IPython.core.display.Javascript object>

## With other rewards

### Definition

In [6]:
# Target and behavior policies on the two-state chain
pi = LeftRightPolicy(n=2, p_right=1)  # target: always go right
mu = LeftRightPolicy(n=2)             # behavior: default probabilities (uniform)

# Two-state model with identity features and non-zero transition rewards
model = Model(
    # --- Model definition ---
    features=np.identity(2),  # feature function (one row per state)
    R=[[0, 3], [0, 10]],      # transition rewards
    pi=pi,                    # target policy
    mu=mu,                    # behavior policy
    v_pi=[93, 100],           # exact state values under pi
    # --- Parameters for TD ---
    I=[1, 0],                 # interest function
    discounts=[0.9, 0.9],     # discount function
    # --- Initialisation ---
    S0=0,
    theta0=1,
)

### Running particles

In [7]:
# Simulation budget: T steps for each of the N particles
T = 25_000
N = 200
comparator.run(model, T, N)

emphatic TD has been computed for 25000 steps and 200 particles.
offTD has been computed for 25000 steps and 200 particles.


### Theta estimates

In [23]:
# Plot the theta estimates; the argument 0 presumably selects which
# component of theta is shown — TODO confirm against utils.comparatorTD
comparator.plot_theta(0)

<IPython.core.display.Javascript object>

### MSVE

In [8]:
# Plot the MSVE of each algorithm (presumably measured against the v_pi
# given to the model — TODO confirm against utils.comparatorTD)
comparator.plot_msve()

<IPython.core.display.Javascript object>

## Bad behavior policy

In [11]:
# Target policy as before; the behavior policy is now a RandomPolicy
pi = LeftRightPolicy(n=2, p_right=1)  # target: always go right
mu = RandomPolicy(n=2)                # behavior: random policy

# Same two-state model with rewards, driven by the random behavior policy
model = Model(
    # --- Model definition ---
    features=np.identity(2),  # feature function (one row per state)
    R=[[0, 3], [0, 10]],      # transition rewards
    pi=pi,                    # target policy
    mu=mu,                    # behavior policy
    v_pi=[93, 100],           # exact state values under pi
    # --- Parameters for TD ---
    I=[1, 0],                 # interest function
    discounts=[0.9, 0.9],     # discount function
    # --- Initialisation ---
    S0=0,
    theta0=1,
)

### Running particles

In [13]:
# 25000 steps, 50 particles
comparator.run(model, 25_000, 50)

emphatic TD has been computed for 25000 steps and 50 particles.
offTD has been computed for 25000 steps and 50 particles.


### Theta estimates

In [15]:
# Plot the theta estimates; the argument 0 presumably selects which
# component of theta is shown — TODO confirm against utils.comparatorTD
comparator.plot_theta(0)

<IPython.core.display.Javascript object>

###  MSVE

In [14]:
# Plot the MSVE of each algorithm (presumably measured against the v_pi
# given to the model — TODO confirm against utils.comparatorTD)
comparator.plot_msve()

<IPython.core.display.Javascript object>

# Five-state model

In [16]:
# Target and behavior policies on the five-state chain
pi = LeftRightPolicy(n=5, p_right=1)    # target: always go right
mu = LeftRightPolicy(n=5, p_left=2/3)   # behavior: built with p_left = 2/3 (not uniform)

# One feature row per state, three features in total
state_features = [
    [1, 0, 0],
    [1, 1, 0],
    [0, 1, 0],
    [0, 1, 1],
    [0, 0, 1],
]

# Five-state model with unit rewards on every transition
model = Model(
    # --- Model definition ---
    features=state_features,      # feature function
    R=np.ones((5, 5)),            # transition rewards (all one)
    pi=pi,                        # target policy
    mu=mu,                        # behavior policy
    v_pi=[4, 3, 2, 1, 1],         # state values under pi
    # --- Parameters for TD ---
    I=np.ones(5),                 # interest function
    discounts=[0, 1, 1, 1, 0],    # discount function
    # --- Initialisation ---
    S0=0,
    theta0=0,
)

### Running particles

In [18]:
# 50000 steps, 200 particles
comparator.run(model, 50_000, 200)

emphatic TD has been computed for 50000 steps and 200 particles.
offTD has been computed for 50000 steps and 200 particles.


### MSVE

In [20]:
# Plot the MSVE of each algorithm; ylim presumably sets the y-axis
# range of the figure — TODO confirm against utils.comparatorTD
comparator.plot_msve(ylim = (0, 6))

<IPython.core.display.Javascript object>

### Theta estimates

In [21]:
# Plot the theta estimates; the argument 1 presumably selects which
# component of theta is shown — TODO confirm against utils.comparatorTD
comparator.plot_theta(1)

<IPython.core.display.Javascript object>