In [1]:
import gym
import numpy as np

In [2]:
env = gym.make("BipedalWalker-v3")



In [3]:
env.reset()
env.render()
while True:
    env.step(env.action_space.sample())
    env.render()


KeyboardInterrupt: 

In [5]:
SZ_ACTION_SPACE = env.action_space.n
SZ_OBS_SPACE = env.observation_space.n

In [14]:
class QLearner:
    def __init__(self,env,alpha=0.1 ,  gamma=0.99,epsilon=1,epsilon_decay_dec = 0.001,min_epsilon = 0.01):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay_dec
        self.Q = np.zeros((env.observation_space.n,env.action_space.n))
        self.min_epsilon = min_epsilon
    def run(self,episodes):
        state = self.env.reset()
        success_rate = []
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            total_reward = 0
            while not done:
                if np.random.uniform(0,1)<self.epsilon:
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(self.Q[state,:])
                next_state, reward, done, info = self.env.step(action)
                total_reward += reward
                self.Q[state,action]= (1-self.alpha)*self.Q[state,action] + self.alpha*(reward + self.gamma*np.max(self.Q[next_state,:]))
                state = next_state
            if(episode%100==0):
                total_reward = self.evaluate(100)
                print("Episode: {}/{}, Success Rate: {}".format(episode,episodes,total_reward))
            self.epsilon = max(self.min_epsilon, np.exp(-self.epsilon_decay*episode))
            
            
    def evaluate(self,episodes,render=False):
        total_reward = 0
        
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            while not done:
                if render:
                    self.env.render()
                action = np.argmax(self.Q[state,:])
                next_state, reward, done, info = self.env.step(action)
                
                total_reward += reward
                state = next_state
        return total_reward/episodes


In [15]:
env_name = "FrozenLake-v0"
env = gym.make(env_name)
x = QLearner(env)

In [8]:
x.run(10000)

Episode: 0/10000, Success Rate: 0.0
Episode: 100/10000, Success Rate: 0.0
Episode: 200/10000, Success Rate: 0.08
Episode: 300/10000, Success Rate: 0.07
Episode: 400/10000, Success Rate: 0.07
Episode: 500/10000, Success Rate: 0.38
Episode: 600/10000, Success Rate: 0.33
Episode: 700/10000, Success Rate: 0.56
Episode: 800/10000, Success Rate: 0.6
Episode: 900/10000, Success Rate: 0.48
Episode: 1000/10000, Success Rate: 0.58
Episode: 1100/10000, Success Rate: 0.7
Episode: 1200/10000, Success Rate: 0.66
Episode: 1300/10000, Success Rate: 0.64
Episode: 1400/10000, Success Rate: 0.6
Episode: 1500/10000, Success Rate: 0.73
Episode: 1600/10000, Success Rate: 0.75
Episode: 1700/10000, Success Rate: 0.78
Episode: 1800/10000, Success Rate: 0.81
Episode: 1900/10000, Success Rate: 0.76
Episode: 2000/10000, Success Rate: 0.73
Episode: 2100/10000, Success Rate: 0.67
Episode: 2200/10000, Success Rate: 0.71
Episode: 2300/10000, Success Rate: 0.78
Episode: 2400/10000, Success Rate: 0.79
Episode: 2500/100

In [89]:
x.Q

array([[0.47353034, 0.45957649, 0.46133681, 0.45795379],
       [0.31451843, 0.36923785, 0.33471545, 0.42261253],
       [0.40177954, 0.39759038, 0.37314446, 0.40593967],
       [0.30250821, 0.23689135, 0.24378612, 0.39862575],
       [0.49552109, 0.34896256, 0.25260687, 0.27780119],
       [0.        , 0.        , 0.        , 0.        ],
       [0.34614926, 0.19814225, 0.18092307, 0.16099492],
       [0.        , 0.        , 0.        , 0.        ],
       [0.35592092, 0.43026885, 0.38395919, 0.53596223],
       [0.4986949 , 0.59041604, 0.51573299, 0.47652002],
       [0.64148871, 0.38534917, 0.34256716, 0.39987162],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.40979281, 0.57443168, 0.66077835, 0.56945183],
       [0.7274544 , 0.82849   , 0.73206156, 0.72575282],
       [0.        , 0.        , 0.        , 0.        ]])

In [10]:
x.evaluate(100)

0.74

In [16]:
env_name = "Taxi-v3"
env = gym.make(env_name)
x = QLearner(env)

In [17]:
x.run(100000)

Episode: 0/100000, Success Rate: -200.0
Episode: 100/100000, Success Rate: -217.91
Episode: 200/100000, Success Rate: -200.0
Episode: 300/100000, Success Rate: -200.0
Episode: 400/100000, Success Rate: -189.43
Episode: 500/100000, Success Rate: -187.23
Episode: 600/100000, Success Rate: -157.84
Episode: 700/100000, Success Rate: -153.61
Episode: 800/100000, Success Rate: -168.4
Episode: 900/100000, Success Rate: -119.81
Episode: 1000/100000, Success Rate: -113.92
Episode: 1100/100000, Success Rate: -78.58
Episode: 1200/100000, Success Rate: -78.51
Episode: 1300/100000, Success Rate: -93.42
Episode: 1400/100000, Success Rate: -66.03
Episode: 1500/100000, Success Rate: -53.71
Episode: 1600/100000, Success Rate: -45.61
Episode: 1700/100000, Success Rate: -37.29
Episode: 1800/100000, Success Rate: -24.75
Episode: 1900/100000, Success Rate: -12.29
Episode: 2000/100000, Success Rate: -22.72
Episode: 2100/100000, Success Rate: -33.39
Episode: 2200/100000, Success Rate: -12.32
Episode: 2300/10

In [None]:
x.evaluate(1,)