In [4]:
import gym
import numpy as np

In [5]:
class SARSA:
    """Tabular SARSA agent for environments with discrete states and actions.

    The agent keeps a Q-table with one row per state and one column per
    action, explores with an epsilon-greedy policy, and anneals epsilon
    exponentially with the episode index.

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` returning a state index, and
        ``step(action)`` returning ``(next_state, reward, done, info)``.
    alpha : float
        Step size of the Q-value update.
    gamma : float
        Discount factor applied to the next state-action value.
    epsilon : float
        Initial probability of taking a random action.
    epsilon_decay_dec : float
        Rate of the exponential epsilon decay (per episode).
    min_epsilon : float
        Lower bound that epsilon never drops below.

    Attributes
    ----------
    Q : numpy.ndarray
        Action-value table, initialised to zeros.
    success_history : list
        Results of the periodic greedy evaluations made by the most recent
        ``run`` call, in chronological order.
    """

    def __init__(self, env, alpha=0.1, gamma=0.99, epsilon=1,
                 epsilon_decay_dec=0.001, min_epsilon=0.01):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        # Keep the starting value so the decay schedule can be scaled by it;
        # ``self.epsilon`` itself is overwritten after every episode.
        self.initial_epsilon = epsilon
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay_dec
        self.Q = np.zeros((env.observation_space.n, env.action_space.n))
        self.min_epsilon = min_epsilon
        self.success_history = []

    def _select_action(self, state):
        """Return an epsilon-greedy action for ``state`` using the current Q-table."""
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state, :])

    def run(self, episodes):
        """Train the agent for ``episodes`` episodes with the SARSA update.

        Every 100 episodes (starting with episode 0) the greedy policy is
        evaluated over 100 episodes; the result is printed and appended to
        ``self.success_history``. Returns ``None``.
        """
        self.success_history = []

        for episode in range(episodes):
            state = self.env.reset()
            action = self._select_action(state)
            done = False

            while not done:
                next_state, reward, done, info = self.env.step(action)
                # SARSA is on-policy: the bootstrap target uses the action
                # that will actually be executed on the next step.
                next_action = self._select_action(next_state)
                td_target = reward + self.gamma * self.Q[next_state, next_action]
                self.Q[state, action] += self.alpha * (td_target - self.Q[state, action])
                state, action = next_state, next_action

            if episode % 100 == 0:
                success_rate = self.evaluate(100)
                self.success_history.append(success_rate)
                print("Episode: {}/{}, Success Rate: {}".format(episode, episodes, success_rate))

            # Exponential decay scaled by the initial epsilon, so a custom
            # starting value is honoured instead of being reset to 1.
            self.epsilon = max(
                self.min_epsilon,
                self.initial_epsilon * np.exp(-self.epsilon_decay * episode),
            )

    def evaluate(self, episodes, render=False):
        """Return the mean total reward per episode under the greedy policy.

        Parameters
        ----------
        episodes : int
            Number of evaluation episodes. For ``episodes <= 0`` nothing is
            run and ``0.0`` is returned.
        render : bool
            When true, ``env.render()`` is called before every step.
        """
        if episodes <= 0:
            return 0.0

        total_reward = 0
        for _ in range(episodes):
            state = self.env.reset()
            done = False
            while not done:
                if render:
                    self.env.render()
                action = np.argmax(self.Q[state, :])
                state, reward, done, info = self.env.step(action)
                total_reward += reward
        return total_reward / episodes


In [6]:
# Build the FrozenLake environment and a SARSA agent that uses the default hyperparameters.
env_name = "FrozenLake-v0"
env = gym.make(env_name)
x = SARSA(env)

In [7]:
# Echo the agent; SARSA defines no __repr__, so only the default object repr is shown.
x

<__main__.SARSA at 0x258ec36c2c8>

In [8]:
# Train for 10,000 episodes; run() prints a greedy-policy evaluation every 100 episodes.
x.run(10000)

Episode: 0/10000, Success Rate: 0.0
Episode: 100/10000, Success Rate: 0.0
Episode: 200/10000, Success Rate: 0.0
Episode: 300/10000, Success Rate: 0.11
Episode: 400/10000, Success Rate: 0.16
Episode: 500/10000, Success Rate: 0.05
Episode: 600/10000, Success Rate: 0.3
Episode: 700/10000, Success Rate: 0.13
Episode: 800/10000, Success Rate: 0.37
Episode: 900/10000, Success Rate: 0.4
Episode: 1000/10000, Success Rate: 0.43
Episode: 1100/10000, Success Rate: 0.12
Episode: 1200/10000, Success Rate: 0.61
Episode: 1300/10000, Success Rate: 0.66
Episode: 1400/10000, Success Rate: 0.28
Episode: 1500/10000, Success Rate: 0.69
Episode: 1600/10000, Success Rate: 0.76
Episode: 1700/10000, Success Rate: 0.52
Episode: 1800/10000, Success Rate: 0.67
Episode: 1900/10000, Success Rate: 0.45
Episode: 2000/10000, Success Rate: 0.72
Episode: 2100/10000, Success Rate: 0.6
Episode: 2200/10000, Success Rate: 0.66
Episode: 2300/10000, Success Rate: 0.73
Episode: 2400/10000, Success Rate: 0.7
Episode: 2500/10000