In [9]:
import gym
import numpy as np
from collections import defaultdict

In [10]:
# Instantiate the Gym 'FrozenLake-v0' environment.
env = gym.make("FrozenLake-v0")


In [11]:
class MonteCarlo:
    """Tabular Monte Carlo control with an epsilon-greedy behaviour policy.

    The environment must expose ``observation_space.n``, ``action_space.n``,
    ``action_space.sample()``, ``reset()`` returning the initial state and
    ``step(action)`` returning ``(next_state, reward, done, info)``.

    Attributes:
        Q: action-value table, shape (n_states, n_actions), randomly initialised.
        N: per (state, action) update counter used for sample-average steps.
        epsilon: current exploration probability.
    """

    def __init__(self,env,epsilon=0.5,decay_epsilon=0.999,mode='every-visit',alpha = 0.1,gamma=1.0):
        """Store the hyper-parameters and initialise the Q and N tables.

        Args:
            env: environment with discrete state and action spaces.
            epsilon: initial probability of taking a random action.
            decay_epsilon: factor epsilon is multiplied by every 100 episodes.
            mode: 'single-visit' (alias 'first-visit') updates each
                (state, action) pair at most once per episode using a
                sample-average step; 'alpha' updates on every visit with a
                constant step size; any other value updates on every visit
                with a sample-average step.
            alpha: constant step size used when ``mode == 'alpha'``.
            gamma: discount factor applied when accumulating the return.
                The default of 1.0 gives the undiscounted return.
        """
        self.mode = mode
        # Always stored so that switching ``mode`` to 'alpha' later cannot
        # fail with an AttributeError.
        self.alpha = alpha
        self.gamma = gamma
        self.env = env
        self.true_epsilon = epsilon
        self.epsilon = epsilon
        self.decay_epsilon = decay_epsilon
        self.reset()

    def reset(self):
        """Re-initialise Q randomly, zero N, and restore the initial epsilon."""
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        self.Q = np.random.rand(n_states, n_actions)
        self.N = np.zeros([n_states, n_actions])
        self.epsilon = self.true_epsilon

    def _update_from_episode(self, trajectory):
        """Update Q (and N) from one finished episode.

        Args:
            trajectory: list of ``(state, action, reward)`` tuples in the order
                they occurred, where ``reward`` is the reward received after
                taking ``action`` in ``state``.
        """
        # Return following each time step, accumulated backwards:
        # G_t = r_t + gamma * G_{t+1}.
        returns = [0.0] * len(trajectory)
        G = 0.0
        for t in range(len(trajectory) - 1, -1, -1):
            G = trajectory[t][2] + self.gamma * G
            returns[t] = G

        first_visit_only = self.mode in ('single-visit', 'first-visit')
        seen = set()
        for (state, action, _), G in zip(trajectory, returns):
            if first_visit_only:
                if (state, action) in seen:
                    continue
                seen.add((state, action))
            if self.mode == 'alpha':
                step = self.alpha
            else:
                # Sample-average step size: 1 / (number of updates so far).
                self.N[state, action] += 1
                step = 1.0 / self.N[state, action]
            self.Q[state, action] = self.Q[state, action] + step * (G - self.Q[state, action])

    def run(self,iterations):
        """Run ``iterations`` episodes, learning Q after each one.

        Epsilon is multiplied by ``decay_epsilon`` every 100 episodes (except
        the very first). Every 10000 episodes the mean total episode reward
        since the previous report is printed together with the current epsilon.

        Args:
            iterations: number of episodes to run.
        """
        rewardList = []
        for i in range(iterations):
            state = self.env.reset()
            done = False
            trajectory = []
            episode_reward = 0
            if (i%100 == 0 and i!=0):
                self.epsilon *= self.decay_epsilon
            while not done:
                # Epsilon-greedy action selection.
                if np.random.rand() < self.epsilon:
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(self.Q[state,:])
                next_state, reward, done, info = self.env.step(action)
                # Keep the per-step reward so the return can be computed per
                # time step instead of reusing the last step's reward.
                trajectory.append((state, action, reward))
                state = next_state
                episode_reward += reward
            rewardList.append(episode_reward)
            self._update_from_episode(trajectory)
            if i % 10000 == 0:
                print("Success Rate = ",np.mean(rewardList)," Epsilon = ",self.epsilon)
                rewardList= []


In [4]:
# Every-visit Monte Carlo (the default mode) on a fresh FrozenLake environment.
env = gym.make("FrozenLake-v0")
mc = MonteCarlo(env, epsilon=0.5, decay_epsilon=0.9995)
mc.run(iterations=1000000)


Success Rate =  0.0  Epsilon =  0.5
Success Rate =  0.0512  Epsilon =  0.475608765121167
Success Rate =  0.0719  Epsilon =  0.45240739492016346
Success Rate =  0.078  Epsilon =  0.4303378448593264
Success Rate =  0.088  Epsilon =  0.40934490195689754
Success Rate =  0.0914  Epsilon =  0.38937604665673076
Success Rate =  0.0931  Epsilon =  0.3703813214363394
Success Rate =  0.1018  Epsilon =  0.3523132058245669
Success Rate =  0.1128  Epsilon =  0.3351264975162037
Success Rate =  0.1163  Epsilon =  0.318778199286127
Success Rate =  0.1202  Epsilon =  0.3032274114200485
Success Rate =  0.1214  Epsilon =  0.2884352293927547
Success Rate =  0.1376  Epsilon =  0.2743646465378575
Success Rate =  0.1521  Epsilon =  0.2609804614655517
Success Rate =  0.1595  Epsilon =  0.24824918999676682
Success Rate =  0.1515  Epsilon =  0.23613898139338454
Success Rate =  0.1605  Epsilon =  0.22461953867495577
Success Rate =  0.1728  Epsilon =  0.21366204282256376
Success Rate =  0.1777  Epsilon =  0.203239

In [15]:
# Same hyper-parameters, but each (state, action) pair is updated at most once per episode.
env = gym.make("FrozenLake-v0")
mc = MonteCarlo(env, epsilon=0.5, decay_epsilon=0.9995, mode="single-visit")

In [16]:
# Train for one million episodes; progress is printed every 10000 episodes.
mc.run(iterations=1000000)

Success Rate =  0.0  Epsilon =  0.5
Success Rate =  0.0625  Epsilon =  0.475608765121167
Success Rate =  0.0764  Epsilon =  0.45240739492016346
Success Rate =  0.0899  Epsilon =  0.4303378448593264
Success Rate =  0.0903  Epsilon =  0.40934490195689754
Success Rate =  0.1074  Epsilon =  0.38937604665673076
Success Rate =  0.1154  Epsilon =  0.3703813214363394
Success Rate =  0.124  Epsilon =  0.3523132058245669
Success Rate =  0.1343  Epsilon =  0.3351264975162037
Success Rate =  0.1364  Epsilon =  0.318778199286127
Success Rate =  0.1534  Epsilon =  0.3032274114200485
Success Rate =  0.1601  Epsilon =  0.2884352293927547
Success Rate =  0.1718  Epsilon =  0.2743646465378575
Success Rate =  0.1856  Epsilon =  0.2609804614655517
Success Rate =  0.19  Epsilon =  0.24824918999676682
Success Rate =  0.2021  Epsilon =  0.23613898139338454
Success Rate =  0.2098  Epsilon =  0.22461953867495577
Success Rate =  0.2252  Epsilon =  0.21366204282256376
Success Rate =  0.2411  Epsilon =  0.2032390