In [48]:
import numpy as np
import matplotlib.pyplot as plt
import gym
from tempfile import TemporaryFile
import random

In [49]:
env=gym.make('MountainCar-v0')

In [24]:
env.reset()
for _ in range(200):
  env.render(mode='rgb_array')
  action = env.action_space.sample()
  env.step(action)
env.close()


In [25]:
basehash = hash

class IHT:
    "Structure to handle collisions"
    def __init__(self, sizeval):
        self.size = sizeval                        
        self.overfullCount = 0
        self.dictionary = {}

    def __str__(self):
        "Prepares a string for printing whenever this object is printed"
        return "Collision table:" + \
               " size:" + str(self.size) + \
               " overfullCount:" + str(self.overfullCount) + \
               " dictionary:" + str(len(self.dictionary)) + " items"

    def count (self):
        return len(self.dictionary)
    
    def fullp (self):
        return len(self.dictionary) >= self.size
    
    def getindex (self, obj, readonly=False):
        d = self.dictionary
        if obj in d: return d[obj]
        elif readonly: return None
        size = self.size
        count = self.count()
        if count >= size:
            if self.overfullCount==0: print('IHT full, starting to allow collisions')
            self.overfullCount += 1
            return basehash(obj) % self.size
        else:
            d[obj] = count
            return count

def hashcoords(coordinates, m, readonly=False):
    if type(m)==IHT: return m.getindex(tuple(coordinates), readonly)
    if type(m)==int: return basehash(tuple(coordinates)) % m
    if m==None: return coordinates

In [26]:
from math import floor, log
from itertools import zip_longest

def tiles (ihtORsize, numtilings, floats, ints=[], readonly=False):
    """returns num-tilings tile indices corresponding to the floats and ints"""
    qfloats = [floor(f*numtilings) for f in floats]
    Tiles = []
    for tiling in range(numtilings):
        tilingX2 = tiling*2
        coords = [tiling]
        b = tiling
        for q in qfloats:
            coords.append( (q + b) // numtilings )
            b += tilingX2
        coords.extend(ints)
        Tiles.append(hashcoords(coords, ihtORsize, readonly))
    return Tiles

def tileswrap (ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False):
    """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats"""
    qfloats = [floor(f*numtilings) for f in floats]
    Tiles = []
    for tiling in range(numtilings):
        tilingX2 = tiling*2
        coords = [tiling]
        b = tiling
        for q, width in zip_longest(qfloats, wrapwidths):
            c = (q + b%numtilings) // numtilings
            coords.append(c%width if width else c)
            b += tilingX2
        coords.extend(ints)
        Tiles.append(hashcoords(coords, ihtORsize, readonly))
    return Tiles

In [50]:
def sstate(state):
  x,y=state
  #x=8*x/(0.6+1.2)
  #xdot=8*xdot/(0.07+0.07)
  x=x+1.2
#[8*x/(0.5+1.2),8*xdot/(0.07+0.07)]
  scaleFactor1 = 16/(0.6+1.2)
  scaleFactor2 = 16/(0.07+0.07)
  
  p=x*scaleFactor1,y*scaleFactor2
  return p


In [51]:
def Q(state,action,weights):
  state=sstate(state)
  L=tiles(iht, 8, state,[action])
  K=0
  for tile in L:
      K=weights[tile]+K
  return K    

In [52]:
#choose action
def policy(state,weights):
  m=0
  a1=Q(state,1,weights)
  a2=Q(state,2,weights)
  a3=Q(state,0,weights)
  K=[0,1,2]
  L=max(a1,a2,a3)
  if L==a1:
    m=1
  if L==a2:
    m=2
  if L==a3:
    m=0
  return m  

In [53]:
maxSize = 2048
iht=maxSize
num_episode=11000
num_tilings=8

Gama=0.7
alpha=0.5/num_tilings
step=[]
H=0
#weights=np.random.randn(maxSize)
weights = [0]*maxSize


In [31]:
#Episodic semi_gradient Sarsa 
for i in range (num_episode):
  state  = env.reset()
  action=policy(state,weights)
  step.append(H)
  H=0
  done=False
  while not done:
    H=H+1
 
    state1, reward, done, info = env.step(action)
    if done:
      s=sstate(state)
      X=tiles(iht, 8, s,[action])
      m=alpha*(reward-Q(state,action,weights))#X(state,action)
      for tile in X:
       weights[tile]=weights[tile]+m
      break 
      
  #Take action A1 greedy from approximate q(state1,.,w)
    action1=policy(state1,weights)
    s=sstate(state)
    X=tiles(iht, 8, s,[action])
    m=alpha*(reward+(Gama*Q(state1,action1,weights))-Q(state,action,weights))
    for tile in X:
      weights[tile]=weights[tile]+m
    state=state1
    action=action1  

In [None]:
print(weights)

In [59]:
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make('MountainCar-v0')
state=env.reset()
for _ in range(2000):
    env.render(mode='rgb_array')
    action = policy(state,weights)
    state, reward, done, info=env.step(action)
env.close()  

In [35]:
import Box2D
from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener)

import gym
from gym import spaces
from gym.utils import seeding, EzPickle
#env = gym.make("LunarLander-v2")

In [36]:
env = gym.make("LunarLander-v2")
#State has 8 components:horizontal and vertical position, horizontal and vertical velocity, angle and angular velocity, and left and right leg contact

Each simulation episode finishes if the lander crashes or comes to rest,receiving additional-100or+100points.Each leg ground contactis+10.Firingmainengineis-0.3pointseachframe.Firing sideengineis-0.03pointseachframe.Solvedis200points.

In [37]:
state=env.reset()
print(state)

[-7.8392026e-05  1.4152385e+00 -7.9551321e-03  1.9192407e-01
  9.7609729e-05  1.8019220e-03  0.0000000e+00  0.0000000e+00]


In [39]:
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline


env.reset()
for _ in range(700):
    
    env.render(mode='rgb_array')
    action = env.action_space.sample()
    env.step(action)
env.close()    

In [40]:
def Q(state,action,weights):
  #state=sstate(state)
  L=tiles(iht, 8, state,[action])
  K=0
  for tile in L:
      K=weights[tile]+K
  return K    

In [41]:
def policy(state,weights):
  m=0
  a1=Q(state,1,weights)
  a2=Q(state,2,weights)
  a3=Q(state,0,weights)
  a4=Q(state,3,weights)
  K=[0,1,2,3]
  L=max(a1,a2,a3)
  if L==a1:
    m=1
  if L==a2:
    m=2
  if L==a3:
    m=0
  if L==a4:
    m=3
  H=np.random.random()
  if H<=0.8:
    return m
  if H>0.8:
    K.remove(m)
    Z=random.choice(K)
    return Z

In [42]:
maxSize = 2048
iht=maxSize
num_episode=10000
num_tilings=8

Gama=0.7
alpha=0.5/num_tilings
step=[]
H=0
weights = [0]*maxSize

In [43]:
#Episodic semi_gradient Sarsa 
for i in range (num_episode):
  state  = env.reset()
  action=policy(state,weights)
  step.append(H)
  H=0
  done=False
  while not done:
    H=H+1
 
    state1, reward, done, info = env.step(action)
    if done:
    
      X=tiles(iht, 8, state,[action])
      m=alpha*(reward-Q(state,action,weights))
      for tile in X:
       weights[tile]=weights[tile]+m
      break 
      
 
    action1=policy(state1,weights)
    #s=sstate(state)
    X=tiles(iht, 8, state,[action])
    m=alpha*(reward+(Gama*Q(state1,action1,weights))-Q(state,action,weights))
    for tile in X:
      weights[tile]=weights[tile]+m
    state=state1
    action=action1  

In [44]:
print(weights)

[1.3389361459346423, -6.0989732149000595, -0.3740662442838705, 0.38121666451911285, -2.2523676746721053, 1.4362925126571866, -10.232578578721446, -1.6935557968282497, -2.7300955908815365, -1.870105039543281, 3.7047681304823006, -6.133926686119379, -9.268487900703374, -0.3749145218303531, -15.92587458760508, -8.003402215300092, -20.127890741013516, 7.941728309586182, 0.9005235387703198, -16.926504306656398, -9.406062434596244, -3.456412392037553, -12.28891746053587, -2.0343156928951878, -2.0612747698420586, -11.19986364157934, -11.585903571368556, -5.581382995160226, -9.97241443994683, -0.4109647579544754, -9.377480887810998, -7.328446885243762, 2.8272875746366877, -9.51314173816052, -1.190846947604427, -2.1309143012835134, -6.207201303608501, 0.22533812358900418, -0.1265802504659428, 2.1150143366311545, -4.350359980738795, -4.084862034064458, -9.136146484528242, -9.603961953096388, 0.22183768232651435, -17.104426778623953, -2.1676920226721217, 2.189820864614893, -12.328160845112372, -3

In [47]:
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make("LunarLander-v2")
state=env.reset()
for _ in range(600):
    
    env.render(mode='rgb_array')
    action = policy(state,weights)
    state, reward, done, info=env.step(action)
env.close()  