<a href="https://colab.research.google.com/github/Tinynja/Sarsa-phi-EB/blob/main/rl_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
!pip install torch torchvision pyvirtualdisplay matplotlib seaborn pandas numpy pathlib gym
!sudo apt-get install xvfb

# Run this cell

# type hinting 
from typing import Sequence, Tuple, Dict, Any, Optional

import numpy as np

# torch stuff
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch import optim

# env
import gym
from gym.wrappers import Monitor

# data manipulation, colab display, and plotting
import pandas as pd
import matplotlib.pyplot as plt


# misc util
import random, glob, base64, itertools
from pathlib import Path
from pprint import pprint

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-2.2-py3-none-any.whl (15 kB)
Collecting EasyProcess
  Downloading EasyProcess-0.3-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-0.3 pyvirtualdisplay-2.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,270 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.9 [784 kB]
Fetched 784 kB in 1s (643 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Rea

In [None]:
#######################################################################
# Copyright (C)                                                       #
# 2017-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from math import floor
from tqdm import tqdm

#######################################################################
# Following are some utilities for tile coding from Rich.
# To make each file self-contained, I copied them from
# http://incompleteideas.net/tiles/tiles3.py-remove
# with some naming convention changes
#
# Tile coding starts
class IHT:
    """Index hash table: hands out consecutive integer indices for hashable keys.

    Up to ``size_val`` distinct keys receive the indices 0, 1, 2, ... in the
    order they are first seen. Once the table is full, unseen keys are no
    longer stored; they are mapped to ``hash(key) % size`` instead, so
    collisions with existing indices become possible.
    """

    def __init__(self, size_val):
        self.size = size_val
        self.overfull_count = 0
        self.dictionary = {}

    def count(self):
        """Return how many keys have been stored so far."""
        return len(self.dictionary)

    def full(self):
        """Return True once the number of stored keys has reached ``size``."""
        return self.count() >= self.size

    def get_index(self, obj, read_only=False):
        """Return the index for ``obj``, assigning a new one if necessary.

        With ``read_only`` set, an unknown key is not stored and ``None`` is
        returned for it.
        """
        table = self.dictionary
        try:
            return table[obj]
        except KeyError:
            pass

        if read_only:
            return None

        next_index = self.count()
        if next_index < self.size:
            table[obj] = next_index
            return next_index

        # Table is full: announce it once, then fall back to plain hashing.
        if self.overfull_count == 0:
            print('IHT full, starting to allow collisions')
        self.overfull_count += 1
        return hash(obj) % self.size

def hash_coords(coordinates, m, read_only=False):
    """Map a coordinate sequence to a tile index according to the kind of ``m``.

    * ``m`` is an ``IHT``: look the coordinates up in (or add them to) the table.
    * ``m`` is an ``int``: hash the coordinates modulo ``m``.
    * ``m`` is ``None``: return the coordinates unchanged.

    Any other ``m`` yields ``None``.
    """
    if isinstance(m, IHT):
        return m.get_index(tuple(coordinates), read_only)
    if isinstance(m, int):
        return hash(tuple(coordinates)) % m
    return coordinates if m is None else None

def tiles(iht_or_size, num_tilings, floats, ints=None, read_only=False):
    """returns num-tilings tile indices corresponding to the floats and ints

    Each tiling contributes one index. Its coordinate list is the tiling
    number, one quantised and offset coordinate per entry of ``floats``, and
    then the entries of ``ints`` unchanged; that list is resolved to an index
    by ``hash_coords``.
    """
    extra_coords = [] if ints is None else ints
    quantized = [floor(value * num_tilings) for value in floats]

    indices = []
    for tiling in range(num_tilings):
        coords = [tiling]
        # The offset for dimension k is tiling * (1 + 2k): it starts at
        # `tiling` and grows by 2 * tiling for each further dimension.
        coords.extend(
            (q + tiling + dim * 2 * tiling) // num_tilings
            for dim, q in enumerate(quantized)
        )
        coords.extend(extra_coords)
        indices.append(hash_coords(coords, iht_or_size, read_only))
    return indices
# Tile coding ends
#######################################################################

# environment instance shared by the experiment code below
# NOTE(review): BreakoutEnv and Breakout are not defined in this cell — confirm
# an earlier cell provides them before this one runs.
env = BreakoutEnv(Breakout)
# all possible actions (the integer ids 0, 1, 2, 3)
ACTIONS = range(4)

# bound for position and velocity
# NOTE(review): presumably left over from the Mountain Car example this cell was
# adapted from; in this cell they only feed the scale factors computed in
# Sarsa.__init__ — confirm whether they are still needed.
POSITION_MIN = -1.2
POSITION_MAX = 0.5
VELOCITY_MIN = -0.07
VELOCITY_MAX = 0.07

# discount is always 1.0 in these experiments
DISCOUNT = 1.0

# use optimistic initial value, so it's ok to set epsilon to 0
# (with EPSILON = 0 the random branch of get_action is never taken)
EPSILON = 0

# maximum steps per episode; play() stops an episode once this many steps are taken
STEP_LIMIT = 5000


# get action for @observation based on epsilon greedy policy and @valueFunction
def get_action(observation, valueFunction):
    """Choose an action for ``observation`` with an epsilon-greedy policy.

    With probability ``EPSILON`` a uniformly random member of ``ACTIONS`` is
    returned. Otherwise ``valueFunction.value(observation)`` is evaluated once
    per action and the action at the position of the largest estimate is
    returned.

    The greedy result is looked up in ``ACTIONS`` so that it is always a valid
    action. The previous ``argmax - 1`` produced values from -1 to 2, which does
    not match ``ACTIONS = range(4)``.

    NOTE(review): ``valueFunction.value`` receives only the observation, so all
    actions get the same estimate and the greedy branch always resolves to the
    first action. An action-dependent value function is needed for the greedy
    choice to be meaningful.
    """
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(ACTIONS)
    values = [valueFunction.value(observation) for _ in ACTIONS]
    return ACTIONS[int(np.argmax(values))]



# replacing trace update rule
# @trace: old trace (will be modified)
# @activeTiles: current active tile indices
# @lam: lambda
# @return: new trace for convenience
def replacing_trace(trace, activeTiles, lam):
    """Apply the replacing-trace rule to ``trace`` in place and return it.

    Entries whose index appears in ``activeTiles`` are set to 1; every other
    entry is multiplied by ``lam * DISCOUNT``.
    """
    is_active = np.isin(np.arange(len(trace)), activeTiles)
    decay = lam * DISCOUNT
    trace[is_active] = 1
    trace[~is_active] *= decay
    return trace



# wrapper class for Sarsa(lambda)
class Sarsa:
    """Linear Sarsa(lambda) learner with one weight and one trace entry per index.

    ``value`` and ``learn`` treat the positions of the non-zero entries of an
    observation as the active indices. ``get_active_tiles`` instead derives
    indices from a (position, velocity, action) triple through the tile coder;
    for background on that mapping see
    http://incompleteideas.net/sutton/tiles/tiles3.html
    """

    # @step_size: overall step size, divided equally between the tilings
    # @lam: lambda, the trace decay parameter
    # @trace_update: function used to update the eligibility trace
    # @num_of_tilings: number of tilings handed to the tile coder
    # @max_size: the maximum # of indices
    def __init__(self, step_size, lam, trace_update=replacing_trace, num_of_tilings=8, max_size=2048):
        self.lam = lam
        self.trace_update = trace_update
        self.num_of_tilings = num_of_tilings
        self.max_size = max_size

        # every tiling receives an equal share of the step size
        self.step_size = step_size / num_of_tilings

        # index table used by get_active_tiles
        self.hash_table = IHT(max_size)

        # weight and eligibility trace per index, both starting at zero
        self.weights = np.zeros(max_size)
        self.trace = np.zeros(max_size)

        # factors that rescale position and velocity for the tile software
        position_range = POSITION_MAX - POSITION_MIN
        velocity_range = VELOCITY_MAX - VELOCITY_MIN
        self.position_scale = self.num_of_tilings / position_range
        self.velocity_scale = self.num_of_tilings / velocity_range

    # get indices of active tiles for given state and action
    def get_active_tiles(self, position, velocity, action):
        # Only the scale factor is applied; the constant offset
        # scale * POSITION_MIN is left out because it does not matter here.
        scaled_state = [self.position_scale * position, self.velocity_scale * velocity]
        return tiles(self.hash_table, self.num_of_tilings, scaled_state, [action])

    # estimate the value of the given observation
    def value(self, observation):
        """Return the sum of the weights at the observation's non-zero positions."""
        selected = self.weights[np.nonzero(observation)]
        return np.sum(selected)

    # learn with given observation and target
    def learn(self, observation, target):
        """Move the weights towards ``target`` along the eligibility trace.

        Raises ``Exception`` when ``trace_update`` is not ``replacing_trace``.
        """
        active_tiles = np.nonzero(observation)
        delta = target - np.sum(self.weights[active_tiles])
        if self.trace_update != replacing_trace:
            raise Exception('Unexpected Trace Type')
        self.trace_update(self.trace, active_tiles, self.lam)
        self.weights += self.step_size * delta * self.trace


# play one episode in @env, learning with the given method @evaluator
# @return: total steps in this episode
def play(evaluator, env):
    """Run a single episode and update ``evaluator`` after every step.

    The first action is drawn uniformly from ``ACTIONS``; later actions come
    from ``get_action``. After each step the evaluator learns from the previous
    observation towards ``reward + DISCOUNT * value(next_observation)``. The
    episode ends when the environment reports ``done`` or after ``STEP_LIMIT``
    steps.

    Returns the number of steps taken.
    """
    # The starting observation must exist before the first learn() call;
    # previously `observation` was read before it was ever assigned.
    # NOTE(review): assumes a gym-style env whose reset() returns the initial
    # observation — confirm against the environment class in use.
    observation = env.reset()
    action = random.choice(ACTIONS)
    steps = 0
    while True:
        next_observation, reward, done, info = env.step(action)
        next_action = get_action(next_observation, evaluator)
        steps += 1
        target = reward + DISCOUNT * evaluator.value(next_observation)
        evaluator.learn(observation, target)
        observation = next_observation
        action = next_action
        if done:
            break
        if steps >= STEP_LIMIT:
            print('Step Limit Exceeded!')
            break
    return steps

# figure 12.10, effect of the lambda and alpha on early performance of Sarsa(lambda)
def figure_12_10():
    """Sweep lambda and alpha, then plot the mean episode length per setting.

    For each (lambda, alpha) pair, 30 independent runs of 50 episodes are
    played with a fresh ``Sarsa`` learner per run. Step counts are averaged
    over episodes and then over runs, and one curve per lambda is saved to
    ``figure_12_10.png``.
    """
    runs = 30
    episodes = 50
    alphas = np.arange(1, 8) / 4.0
    lams = [0.99, 0.95, 0.5, 0]

    steps = np.zeros((len(lams), len(alphas), runs, episodes))
    for lam_idx, lam in enumerate(lams):
        for alpha_idx, alpha in enumerate(alphas):
            for run in tqdm(range(runs)):
                learner = Sarsa(alpha, lam, replacing_trace)
                steps[lam_idx, alpha_idx, run, :] = [
                    play(learner, env) for _ in range(episodes)
                ]

    # average over episodes (axis 3) first, then over runs (axis 2)
    mean_steps = steps.mean(axis=3).mean(axis=2)

    for lam, curve in zip(lams, mean_steps):
        plt.plot(alphas, curve, label='lambda = %s' % (str(lam)))
    plt.xlabel('alpha * # of tilings (8)')
    plt.ylabel('averaged steps per episode')
    plt.ylim([180, 300])
    plt.legend()

    plt.savefig('figure_12_10.png')
    plt.close()


# Entry point: run the experiments when this code is executed as the main module.
if __name__ == '__main__':
    figure_12_10()
    # NOTE(review): figure_12_11 is not defined in this cell; unless another
    # cell provides it, this call raises NameError after figure_12_10 returns.
    figure_12_11()


In [None]:
class BaseAgent:
  """Agent that wraps a value learner with the phi-EB exploration bonus.

  One visit-density estimate (rho_i) is kept per feature. PHI_EB turns these
  into a pseudo-count for each observed feature vector and adds
  beta / sqrt(pseudo-count) to the environment reward before learning.
  """
  def __init__(self):
    self.gamma = 1  # discount applied to the bootstrapped value in PHI_EB
    self.features = 3  # M: number of components of a feature vector phi
    self.rhos = np.ones(self.features) #stores the rho_i values


  def takeAction(self, t):
    """Return the hard-coded dummy feature vector for timestep t (0 to 3)."""
    phis = [[0,1,0],[0,1,0],[0,1,0],[1,0,1]]
    return phis[t]


  def updateRho_i(self, counts, t):
    """Add the newest observation to counts and refresh every rho_i.

    counts: per-feature match counts, incremented in place.
    t: timestep; each rho_i becomes (counts[i] + 0.5) / (t + 1).
    Always returns 0.
    """
    M = self.features
    for i in range(M): #M is the number of features of phi
      counts[i] += 1 #since we add phi to the seen states, all the counts are increased by one for t+1
      self.rhos[i] = (counts[i]+0.5)/(t+1)
    return 0


  def PHI_EB(self, beta, t_end, evaluator, env):
    """Run up to t_end steps in env, learning from bonus-augmented rewards.

    beta: scale of the exploration bonus.
    t_end: maximum number of steps.
    evaluator: learner providing value(observation) and learn(observation, target).
    env: environment providing observe() and step(action); step must return
      (phi, reward, done, info).
    Returns theta_end, currently always the placeholder 0.

    Assumes every phi has exactly self.features components — TODO confirm
    against the environment in use.
    """
    M = self.features #number of features
    states = np.zeros((t_end,M)) #stores the previous phis for all timesteps
    # defined before the loop so the return below is valid even when the loop
    # body never completes (t_end == 0, or done on the very first step)
    theta_end = 0

    action = np.random.choice(range(4))
    old_phi = env.observe()

    t = 0
    while t < t_end:
      #observe phi(s), reward
      phi, reward, done, info = env.step(action)
      # NOTE(review): leaving here means the terminal transition is never
      # learned from — confirm this is intended.
      if done:
        break
      next_action = get_action(phi, evaluator)

      #compute rho_t(phi) (feature visit-density)
      # counts[i] = number of earlier steps whose i-th feature equals phi[i];
      # with no history (t == 0) this is all zeros, so rho_t == 0.5**M
      counts = np.sum(states[:t] == phi, axis=0).astype(float)
      rho_t = np.prod((counts+0.5)/(t+1))
      print('rho_t: '+ str(rho_t))

      #update all rho_i with observed phi
      states[t] = phi
      self.updateRho_i(counts, t+1)

      #compute rho_t+1(phi)
      new_rho_t = np.prod(self.rhos)

      #compute Nhat_t(s)
      Nhat_t = rho_t*(1-new_rho_t)/(new_rho_t-rho_t)

      #compute R(s,a) (empirical reward)
      explorationBonus = beta/np.sqrt(Nhat_t)

      reward = reward + explorationBonus
      print(explorationBonus)
      #pass phi(s) and reward to RL algo to update theta_t
      target = reward + self.gamma * evaluator.value(phi)
      evaluator.learn(old_phi, target)

      old_phi = phi
      action = next_action

      t += 1

    return theta_end



In [None]:
import torch


In [None]:
# Optimized by Amine
class SarsaPhiEB:
    """phi-EB exploration-bonus bookkeeping around an environment (work in progress).

    The per-step bonus follows the same formulas as BaseAgent.PHI_EB. No
    learner is wired in yet, so the augmented reward is computed but not
    consumed, and generate_action is still a stub.
    """

    def __init__(self, env, gamma=1, beta=1):
        self.env = env
        self.gamma = gamma
        self.beta = beta

        # Both are sized by learn_episode from the first observation.
        self.features = 0          # M: number of components of phi
        self.rhos = np.ones(0)     # stores the rho_i values

    def updateRho_i(self, counts, t):
        """Add the newest observation to counts and refresh every rho_i.

        counts: per-feature match counts, incremented in place.
        t: timestep; each rho_i becomes (counts[i] + 0.5) / (t + 1).
        Always returns 0.
        """
        M = self.features
        for i in range(M): #M is the number of features of phi
            counts[i] += 1 #since we add phi to the seen states, all the counts are increased by one for t+1
            self.rhos[i] = (counts[i]+0.5)/(t+1)
        return 0

    def generate_action(self, observation):
        """Stub: returns None until an action-selection rule is implemented."""
        pass

    def learn_episode(self):
        """Step the environment until it reports done, computing the bonus each step.

        Returns theta_end, currently always the placeholder 0.

        NOTE(review): the action passed to env.step comes from generate_action,
        which is still a stub returning None — implement it before running
        against a real environment.
        NOTE(review): assumes env._observe() and env.step(action)[0] can be
        converted with np.asarray and always have the same number of entries
        — confirm against the environment class.
        """
        observation = self.env._observe()
        self.features = np.asarray(observation).size
        self.rhos = np.ones(self.features)
        M = self.features

        history = []   # every phi observed so far in this episode
        theta_end = 0
        t = 0
        done = False
        while not done:
            action = self.generate_action(observation)
            #observe phi(s), reward
            observation, reward, done, info = self.env.step(action)
            phi = np.asarray(observation).ravel()

            #compute rho_t(phi) (feature visit-density)
            # counts[i] = number of earlier steps whose i-th feature equals phi[i]
            if history:
                counts = np.sum(np.stack(history) == phi, axis=0).astype(float)
            else:
                counts = np.zeros(M)
            rho_t = np.prod((counts+0.5)/(t+1))

            #update all rho_i with observed phi
            history.append(phi)
            self.updateRho_i(counts, t+1)

            #compute rho_t+1(phi)
            new_rho_t = np.prod(self.rhos)

            #compute Nhat_t(s)
            Nhat_t = rho_t*(1-new_rho_t)/(new_rho_t-rho_t)

            #compute R(s,a) (empirical reward)
            explorationBonus = self.beta/np.sqrt(Nhat_t)

            reward = reward + explorationBonus
            # TODO: pass phi(s) and reward to RL algo to update theta_t

            t += 1

        return theta_end



In [None]:
#testing
# NOTE(review): PHI_EB is declared as PHI_EB(self, beta, t_end, evaluator, env);
# this call supplies only beta=1 and t_end=4, so an evaluator and an env must be
# passed as well before it can run against the current class.
testAgent = BaseAgent()
testAgent.PHI_EB(1,4)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[0, 1, 0]
rho_t: 0.125
2.026846838838127
[[0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[0, 1, 0]
rho_t: 0.421875
0.9393491802183486
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[0, 1, 0]
rho_t: 0.5787037037037038
0.6910415772863893
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
[1, 0, 1]
rho_t: 0.001953125
3.630407155555409


0

In [None]:
# Prints a fixed greeting string.
print('Hello Bob!')