In [5]:
from __future__ import print_function, division
from builtins import range
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import Counter
import math

In [2]:
def print_values(V,g):
    """Print the values in V as a text grid, one printed row per first index.

    V maps (i, j) tuples to numbers; cells missing from V are shown as 0.
    g supplies the loop bounds: i runs over range(g.width) and j over
    range(g.height). Non-negative values get a leading space so they line up
    with negative ones.
    """
    separator = '------------------------------------------------------------'
    for row in range(g.width):
        print(separator)
        cells = []
        for col in range(g.height):
            value = V.get((row, col), 0)
            pad = " " if value >= 0 else ""
            cells.append(pad + "%.2f|" % value)
        print("".join(cells))

def print_policy(P,g):
    """Print the policy P as a text grid of action labels.

    P maps (i, j) tuples to an action label; cells missing from P are shown
    as a blank. i runs over range(g.width) and j over range(g.height).
    """
    separator = '-------------------------------------------------------------'
    for row in range(g.width):
        print(separator)
        print("".join("  %s  |" % P.get((row, col), ' ') for col in range(g.height)))
        
def max_dict(d):
    """Return the (key, value) pair with the largest value in dict d.

    Ties go to the key encountered first. For an empty dict the result is
    (None, float('-inf')).
    """
    max_key=None
    max_val=float('-inf')
    # was `d.itmes()`, which raised AttributeError on every call
    for k,v in d.items():
        if v>max_val:
            max_val=v
            max_key=k
    return max_key, max_val

In [3]:
class Grid: # Environment
  """Grid world in which the chosen action may be replaced by a perpendicular one.

  States are (i, j) tuples: i is the vertical (row) index, j the horizontal
  (column) index. `width` and `height` are only stored here; the printing
  helpers in this file loop i over range(width) and j over range(height).
  A state counts as terminal when it has no entry in `actions`.
  """

  def __init__(self, width, height, start):
    # i is vertical axis, j is horizontal
    self.width = width
    self.height = height
    self.i = start[0]
    self.j = start[1]
    self.start=start

  def set(self, rewards, actions, obey_prob):
    """Attach the reward table, the action table and the obey probability.

    rewards:   dict (i, j) -> reward received on arriving at that cell
    actions:   dict (i, j) -> collection of actions allowed in that cell
    obey_prob: probability that the requested action is the one executed
    """
    self.rewards = rewards
    self.actions = actions
    self.obey_prob = obey_prob

  def non_terminal_states(self):
    """Return the states that have at least an entry in the action table."""
    return self.actions.keys()

  def terminal_states(self):
    """Return the rewarded states that have no entry in the action table."""
    return [x for x in self.rewards.keys() if x not in self.actions.keys()]

  def set_state(self, s):
    """Place the agent at state s = (i, j)."""
    self.i = s[0]
    self.j = s[1]

  def current_state(self):
    """Return the agent's position as an (i, j) tuple."""
    return (self.i, self.j)

  def is_terminal(self, s):
    """Return True when s has no entry in the action table."""
    return s not in self.actions

  def stochastic_move(self, action):
    """Return the action that is actually executed when `action` is requested.

    With probability obey_prob the requested action is returned unchanged.
    Otherwise one of the two perpendicular actions is drawn uniformly
    ('L'/'R' for 'U'/'D', and 'U'/'D' for 'L'/'R'). For any other action
    label the disobey branch falls through and returns None.
    """
    p = np.random.random()
    if p <= self.obey_prob:
      return action
    if action == 'U' or action == 'D':
      return np.random.choice(['L', 'R'])
    elif action == 'L' or action == 'R':
      return np.random.choice(['U', 'D'])

  def move(self, action):
    """Execute `action` (subject to noise), update the agent's position and
    return the reward of the cell it ends up in (0 if that cell has none)."""
    actual_action = self.stochastic_move(action)
    # check_move leaves the position unchanged when the action is not allowed here
    self.i, self.j = self.check_move(actual_action)
    return self.rewards.get((self.i, self.j), 0)

  def step(self,action):
    """Sample the outcome of `action` from the current position.

    Returns (new_state, reward, done). The agent's stored position is NOT
    changed; the caller is expected to call set_state(new_state) itself.
    """
    actual_action=self.stochastic_move(action)
    new_state=self.check_move(actual_action)
    # Cells without an explicit reward yield 0, consistent with move();
    # direct indexing raised KeyError when no step cost had been configured.
    reward=self.rewards.get(new_state, 0)
    done=self.is_terminal(new_state)
    return new_state,reward,done

  def check_move(self, action):
    """Return the (i, j) reached by deterministically applying `action` from
    the current position, or the current position if the action is not allowed
    there. The agent's stored position is not modified."""
    i = self.i
    j = self.j
    # check if legal move first
    if action in self.actions[(self.i, self.j)]:
      if action == 'U':
        i -= 1
      elif action == 'D':
        i += 1
      elif action == 'R':
        j += 1
      elif action == 'L':
        j -= 1
    return (i,j)

  def get_transition_probs(self, action):
    """Return a list of (probability, next_state) tuples for `action` taken
    from the current position.

    The first entry is the obeyed move; when obey_prob < 1 the two
    perpendicular moves follow, each with half of the remaining probability.
    Entries are not merged, so the same next_state may appear more than once.
    """
    probs = []
    state = self.check_move(action)
    probs.append((self.obey_prob, state))
    disobey_prob = 1 - self.obey_prob
    if not (disobey_prob > 0.0):
      return probs
    if action == 'U' or action == 'D':
      state = self.check_move('L')
      probs.append((disobey_prob / 2, state))
      state = self.check_move('R')
      probs.append((disobey_prob / 2, state))
    elif action == 'L' or action == 'R':
      state = self.check_move('U')
      probs.append((disobey_prob / 2, state))
      state = self.check_move('D')
      probs.append((disobey_prob / 2, state))
    return probs

  def game_over(self):
    # returns true if game is over, else false
    # true if we are in a state where no actions are possible
    return (self.i, self.j) not in self.actions

  def all_states(self):
    # possibly buggy but simple way to get all states
    # either a position that has possible next actions
    # or a position that yields a reward
    return set(self.actions.keys()) | set(self.rewards.keys())


def standard_grid(obey_prob=1.0, step_cost=None):
  """Build the 3-row by 4-column example grid.

  Layout (x = cell with no entry in either table, s = start, number = reward
  on arrival):

      .  .  .  1
      .  x  . -1
      s  .  .  .

  obey_prob (float): probability that the requested action is executed
  step_cost (float or None): if given, the reward assigned to every cell that
      has actions, e.g. -0.1 to penalise long paths
  """
  g = Grid(3, 4, (2, 0))
  rewards = {(0, 3): 1, (1, 3): -1}
  actions = {
    (0, 0): ('D', 'R'),
    (0, 1): ('L', 'R'),
    (0, 2): ('L', 'D', 'R'),
    (1, 0): ('U', 'D'),
    (1, 2): ('U', 'D', 'R'),
    (2, 0): ('U', 'R'),
    (2, 1): ('L', 'R'),
    (2, 2): ('L', 'R', 'U'),
    (2, 3): ('L', 'U'),
  }
  g.set(rewards, actions, obey_prob)
  if step_cost is not None:
    # the cells that carry the step cost are exactly the cells that have actions
    g.rewards.update({cell: step_cost for cell in actions})
  return g

In [11]:
# Training budget: number of episodes and the step cap applied to each one.
num_episodes = 10000
max_steps_per_episode = 1000

# Step size and discount factor used in the Q-value update.
learning_rate = 0.1
discount_rate = 0.9

In [12]:
# Environment: the small example grid, 80% obey probability, -0.1 per step.
grid=standard_grid(obey_prob=0.8, step_cost=-0.1)

# Q(s, a) starts at 0 for every allowed action of every state that has actions.
q_table={s: {a: 0 for a in allowed} for s, allowed in grid.actions.items()}

# Selection counts start at 1 so the exploration bonus never divides by zero.
N_table={s: {a: 1 for a in allowed} for s, allowed in grid.actions.items()}

rewards_all_episodes=[]
Steps=[]
for episode in range(num_episodes):
    grid.set_state(grid.start)
    state=grid.current_state()
    done=False
    rewards_current_episode=0
    for step in range(max_steps_per_episode):
        q_values=q_table[state]
        counts=N_table[state]
        # UCB-style selection: Q-value plus sqrt(2*ln(total count)/count);
        # max() keeps the first action on ties, as a strict ">" scan would.
        exploration=2*math.log(sum(counts.values()))
        action=max(q_values, key=lambda a: q_values[a]+math.sqrt(exploration/counts[a]))

        counts[action]+=1
        new_state,reward,done=grid.step(action)

        # No bootstrapping from a terminal successor.
        if done:
            target=reward
        else:
            target=reward+discount_rate*max(q_table[new_state].values())
        q_values[action]=q_values[action]*(1-learning_rate)+learning_rate*target

        # grid.step does not move the agent, so sync the environment by hand.
        state=new_state
        grid.set_state(state)
        rewards_current_episode+=reward

        if done:
            break
    # index of the last step taken in this episode (0-based)
    Steps.append(step)
    rewards_all_episodes.append(rewards_current_episode)

In [13]:
# Average undiscounted return over consecutive blocks of 1000 episodes.
# The section count is derived from the recorded rewards with integer division:
# np.split expects an int, and this no longer depends on the current value of
# the global num_episodes.
rewards_per_thousand_episodes=np.split(np.array(rewards_all_episodes),len(rewards_all_episodes)//1000)
count=1000
rewards_thousand_episodes=[]
for r in rewards_per_thousand_episodes:
    block_average=sum(r/1000)  # computed once, reused for storing and printing
    rewards_thousand_episodes.append(block_average)
    print(count,": ",str(block_average))
    count+=1000

1000 :  0.08650000000000062
2000 :  0.40739999999999954
3000 :  0.3387999999999997
4000 :  0.4158999999999989
5000 :  0.40149999999999886
6000 :  0.4089999999999986
7000 :  0.42469999999999875
8000 :  0.40519999999999873
9000 :  0.40519999999999906
10000 :  0.4121999999999991


In [14]:
# Greedy policy: for each state keep the action with the largest Q-value
# (the first such action when several are tied).
policy={
    state: max(action_values, key=action_values.get)
    for state, action_values in q_table.items()
    if action_values
}

In [15]:
    # Render the policy as a text grid; cells without a policy entry print blank.
    print('Policy (Q LEARNING Results):')
    print_policy(policy,grid)

Policy (Q LEARNING Results):
-------------------------------------------------------------
  R  |  R  |  R  |     |
-------------------------------------------------------------
  U  |     |  U  |     |
-------------------------------------------------------------
  U  |  R  |  U  |  L  |


In [18]:
def huge_grid(obey_prob=1.0, step_cost=None):
  """Build a 10x10 grid starting at (9, 0).

  Rewards on arrival: +1 at (1, 9) and -1 at (2, 9), neither of which has
  actions; -0.2 at (4, 2) and +0.2 at (5, 5), both of which do have actions.
  Cells (6, 5) and (7, 2) are removed from the action table, and each of their
  four neighbours lacks the action that would lead into them.

  obey_prob (float): probability that the requested action is executed
  step_cost (float or None): if given, the reward assigned to every cell of
      the 10x10 grid that has no reward yet
  """
  g = Grid(10, 10, (9, 0))
  rewards = {(1, 9): 1, (2, 9): -1, (4, 2): -0.2, (5, 5): 0.2}

  # corners
  actions = {
    (0, 0): ('R', 'D'),
    (0, 9): ('L', 'D'),
    (9, 0): ('R', 'U'),
    (9, 9): ('L', 'U'),
  }
  # top and bottom edges
  for col in range(1, 9):
    actions[(0, col)] = ('L', 'R', 'D')
    actions[(9, col)] = ('L', 'R', 'U')
  # left edge
  for row in range(1, 9):
    actions[(row, 0)] = ('R', 'U', 'D')
  # right edge, below the two rewarded cells (1, 9) and (2, 9)
  for row in range(3, 9):
    actions[(row, 9)] = ('L', 'U', 'D')
  # interior cells may move in all four directions
  for row in range(1, 9):
    for col in range(1, 9):
      actions[(row, col)] = ('L', 'R', 'U', 'D')

  # neighbours of the two removed cells lose the action pointing at them
  restricted = (
    ((6, 4), ('L', 'U', 'D')),
    ((5, 5), ('L', 'R', 'U')),
    ((6, 6), ('R', 'U', 'D')),
    ((7, 5), ('L', 'R', 'D')),
    ((7, 1), ('L', 'U', 'D')),
    ((6, 2), ('L', 'R', 'U')),
    ((7, 3), ('R', 'U', 'D')),
    ((8, 2), ('L', 'R', 'D')),
  )
  for cell, allowed in restricted:
    actions[cell] = allowed
  for removed in ((6, 5), (7, 2)):
    del actions[removed]

  g.set(rewards, actions, obey_prob)
  if step_cost is not None:
    # fill in the step cost wherever no reward has been defined
    for row in range(10):
      for col in range(10):
        g.rewards.setdefault((row, col), step_cost)
  return g

In [19]:
# Environment: the 10x10 grid, 80% obey probability, -0.03 per step.
grid=huge_grid(obey_prob=0.8, step_cost=-0.03)

# Q(s, a) starts at 0 for every allowed action of every state that has actions.
q_table={s: {a: 0 for a in allowed} for s, allowed in grid.actions.items()}

# Selection counts start at 1 so the exploration bonus never divides by zero.
N_table={s: {a: 1 for a in allowed} for s, allowed in grid.actions.items()}

rewards_all_episodes=[]
Steps=[]
for episode in range(num_episodes):
    grid.set_state(grid.start)
    state=grid.current_state()
    done=False
    rewards_current_episode=0
    for step in range(max_steps_per_episode):
        q_values=q_table[state]
        counts=N_table[state]
        # UCB-style selection: Q-value plus sqrt(2*ln(total count)/count);
        # max() keeps the first action on ties, as a strict ">" scan would.
        exploration=2*math.log(sum(counts.values()))
        action=max(q_values, key=lambda a: q_values[a]+math.sqrt(exploration/counts[a]))

        counts[action]+=1
        new_state,reward,done=grid.step(action)

        # No bootstrapping from a terminal successor.
        if done:
            target=reward
        else:
            target=reward+discount_rate*max(q_table[new_state].values())
        q_values[action]=q_values[action]*(1-learning_rate)+learning_rate*target

        # grid.step does not move the agent, so sync the environment by hand.
        state=new_state
        grid.set_state(state)
        rewards_current_episode+=reward

        if done:
            break
    # index of the last step taken in this episode (0-based)
    Steps.append(step)
    rewards_all_episodes.append(rewards_current_episode)

In [20]:
# Average undiscounted return over consecutive blocks of 1000 episodes.
# The section count is derived from the recorded rewards with integer division:
# np.split expects an int, and this no longer depends on the current value of
# the global num_episodes.
rewards_per_thousand_episodes=np.split(np.array(rewards_all_episodes),len(rewards_all_episodes)//1000)
count=1000
rewards_thousand_episodes=[]
for r in rewards_per_thousand_episodes:
    block_average=sum(r/1000)  # computed once, reused for storing and printing
    rewards_thousand_episodes.append(block_average)
    print(count,": ",str(block_average))
    count+=1000

1000 :  22.37921999999997
2000 :  28.451859999999968
3000 :  54.64745999999996
4000 :  58.033100000000026
5000 :  60.142599999999916
6000 :  61.58046000000001
7000 :  62.31444999999991
8000 :  40.12473999999998
9000 :  39.89719000000001
10000 :  50.188720000000025


In [45]:
# Two zero-initialised tables with the same (state -> action -> number) layout
# as the grid's action table.
Value_table={s: {a: 0 for a in allowed} for s, allowed in grid.actions.items()}

N2_table={s: {a: 0 for a in allowed} for s, allowed in grid.actions.items()}

In [46]:
# For every state-action pair store the exploration bonus sqrt(2*ln(Nsum)/N)
# in N2_table and the Q-value plus that bonus in Value_table.
for s in grid.actions:
    exploration=2*math.log(sum(N_table[s].values()))
    for a, q in q_table[s].items():
        bonus=math.sqrt(exploration/N_table[s][a])
        N2_table[s][a]=bonus
        Value_table[s][a]=q+bonus

In [31]:
# Policy taken from Value_table: for each state keep the action with the
# largest entry (the first such action when several are tied).
policy={
    state: max(action_values, key=action_values.get)
    for state, action_values in Value_table.items()
    if action_values
}

In [32]:
    # Render the policy as a text grid; cells without a policy entry print blank.
    print('Policy (Q LEARNING Results):')
    print_policy(policy,grid)

Policy (Q LEARNING Results):
-------------------------------------------------------------
  R  |  R  |  D  |  R  |  L  |  D  |  R  |  R  |  R  |  D  |
-------------------------------------------------------------
  R  |  D  |  R  |  U  |  D  |  U  |  U  |  U  |  R  |     |
-------------------------------------------------------------
  U  |  R  |  R  |  R  |  D  |  L  |  L  |  L  |  L  |     |
-------------------------------------------------------------
  R  |  U  |  R  |  R  |  D  |  R  |  R  |  D  |  U  |  D  |
-------------------------------------------------------------
  D  |  U  |  U  |  U  |  R  |  D  |  D  |  L  |  D  |  D  |
-------------------------------------------------------------
  R  |  D  |  D  |  R  |  U  |  R  |  L  |  L  |  L  |  L  |
-------------------------------------------------------------
  U  |  R  |  R  |  U  |  L  |     |  U  |  L  |  D  |  U  |
-------------------------------------------------------------
  D  |  U  |     |  U  |  L  |  R  |  D  |  U  |

In [39]:
# Roll out the fixed policy and record each episode's undiscounted return and
# the state it ended in. The episode count gets its own name so this cell does
# not overwrite the training hyperparameter num_episodes.
num_eval_episodes=100
max_steps=1000
rewards=[]
final_states=[]
for episode in range(num_eval_episodes):
    grid.set_state(grid.start)
    state=(grid.i,grid.j)
    done=False
    rewards_current_episode=0
    for step in range(max_steps):
        action=policy[state]
        new_state,reward,done=grid.step(action)
        # grid.step does not move the agent, so sync the environment by hand.
        state=new_state
        grid.set_state(state)
        rewards_current_episode+=reward
        if done:
            break
    rewards.append(rewards_current_episode)
    # may be a non-terminal state when the step cap was reached
    final_states.append(state)

In [40]:
# Mean undiscounted return per evaluation episode; divide by the number of
# recorded episodes instead of a hard-coded 100.
sum(rewards)/len(rewards)

61.35719999999999

In [41]:
# Tally the state in which each evaluation episode ended.
Counter(final_states)

Counter({(5, 6): 48,
         (5, 5): 33,
         (6, 6): 5,
         (4, 6): 4,
         (1, 9): 5,
         (4, 5): 4,
         (3, 5): 1})

In [47]:
# Inspect the exploration bonus stored for each state-action pair.
N2_table

{(0, 0): {'R': 0.2743992044910143, 'D': 0.300791209205391},
 (0, 9): {'L': 0.30897201137466057, 'D': 0.12328518753846567},
 (9, 0): {'R': 0.049558650946276324, 'U': 0.05347067536743711},
 (9, 9): {'L': 0.30812523233983025, 'U': 0.31367748497331227},
 (0, 1): {'L': 0.3204477940241092,
  'R': 0.25010063602285443,
  'D': 0.2801548098352456},
 (9, 1): {'L': 0.11073092040680475,
  'R': 0.05929513156977885,
  'U': 0.07295906267258954},
 (0, 2): {'L': 0.305857100369865,
  'R': 0.22163489538844178,
  'D': 0.2629535931443656},
 (9, 2): {'L': 0.12558908935853233,
  'R': 0.070080573797403,
  'U': 0.08188675190008972},
 (0, 3): {'L': 0.3021590818360086,
  'R': 0.20167457698173946,
  'D': 0.24360843987511357},
 (9, 3): {'L': 0.145151382705672,
  'R': 0.09111732542603786,
  'U': 0.0887496828668596},
 (0, 4): {'L': 0.2945821853246959,
  'R': 0.19020378844773847,
  'D': 0.23326913656790393},
 (9, 4): {'L': 0.1649269564488605,
  'R': 0.12866954644638356,
  'U': 0.10237000046110326},
 (0, 5): {'L': 0.28

In [48]:
# Inspect the learned Q-value of each state-action pair.
q_table

{(0, 0): {'R': 0.05417087910981031, 'D': 0.027096675222580233},
 (0, 9): {'L': 0.6963483245604483, 'D': 0.947945175781983},
 (9, 0): {'R': -0.038152931520906284, 'U': -0.0513160556573109},
 (9, 9): {'L': 0.0353847973323622, 'U': 0.029512987779240076},
 (0, 1): {'L': 0.03013187301898334,
  'R': 0.10263327861639043,
  'D': 0.06845402292048793},
 (9, 1): {'L': -0.06515996161991684,
  'R': -0.014599772860087436,
  'U': -0.007387968137452013},
 (0, 2): {'L': 0.08068876303027302,
  'R': 0.16555228048400308,
  'D': 0.1259774679716674},
 (9, 2): {'L': -0.03199403293169745,
  'R': 0.02213421889483858,
  'U': 0.021063605416552868},
 (0, 3): {'L': 0.13287072730482877,
  'R': 0.23727949994375408,
  'D': 0.19247879982857444},
 (9, 3): {'L': 0.0049975039979768,
  'R': 0.05939673891001094,
  'U': 0.06069682978908575},
 (0, 4): {'L': 0.19292568104999547,
  'R': 0.2949637169548241,
  'D': 0.253989754994271},
 (9, 4): {'L': 0.04091923773761056,
  'R': 0.07906910785089565,
  'U': 0.10091531117475237},
 (

In [49]:
# Inspect Q-value plus exploration bonus for each state-action pair.
Value_table

{(0, 0): {'R': 0.3285700836008246, 'D': 0.32788788442797123},
 (0, 9): {'L': 1.0053203359351088, 'D': 1.0712303633204487},
 (9, 0): {'R': 0.01140571942537004, 'U': 0.0021546197101262113},
 (9, 9): {'L': 0.34351002967219246, 'U': 0.3431904727525523},
 (0, 1): {'L': 0.35057966704309257,
  'R': 0.35273391463924486,
  'D': 0.3486088327557335},
 (9, 1): {'L': 0.04557095878688791,
  'R': 0.044695358709691414,
  'U': 0.06557109453513753},
 (0, 2): {'L': 0.386545863400138,
  'R': 0.3871871758724449,
  'D': 0.38893106111603304},
 (9, 2): {'L': 0.09359505642683488,
  'R': 0.09221479269224157,
  'U': 0.1029503573166426},
 (0, 3): {'L': 0.43502980914083733,
  'R': 0.43895407692549354,
  'D': 0.436087239703688},
 (9, 3): {'L': 0.1501488867036488,
  'R': 0.1505140643360488,
  'U': 0.14944651265594533},
 (0, 4): {'L': 0.48750786637469135,
  'R': 0.4851675054025626,
  'D': 0.48725889156217495},
 (9, 4): {'L': 0.20584619418647104,
  'R': 0.20773865429727922,
  'U': 0.20328531163585561},
 (0, 5): {'L': 