In [1]:
import gym
import numpy as np

In [2]:
# Create the "FrozenLake-v0" environment. Note that later cells rebind the
# global name `env` to freshly created environments.
env = gym.make("FrozenLake-v0")

In [3]:
# Sizes (`.n`) of the environment's action and observation spaces.
SZ_ACTION_SPACE, SZ_OBS_SPACE = env.action_space.n, env.observation_space.n


In [6]:
class ValueIterations:
    """Tabular value iteration over an environment with a known model.

    The environment must expose ``action_space.n``, ``observation_space.n``
    and a transition table ``P`` such that ``P[s][a]`` is an iterable of
    4-tuples.  The first three items of each tuple are read as
    ``(p, s_next, r)`` -- a weight, the index of the successor state and a
    reward; the fourth item is ignored.

    Attributes:
        v:  array of length ``SZ_OBS_SPACE`` holding the state values.
        pi: array of length ``SZ_OBS_SPACE`` holding, per state, the index
            of the greedy action (stored as floats; cast with ``int``).
    """

    def __init__(self,env,gamma=1.0):
        """Store the environment and discount, and zero-initialise v and pi.

        Args:
            env: environment object as described in the class docstring.
            gamma: discount factor applied to successor-state values.
        """
        self.env = env
        self.gamma = gamma
        self.SZ_ACTION_SPACE = env.action_space.n
        self.SZ_OBS_SPACE = env.observation_space.n
        self.v = np.zeros(self.SZ_OBS_SPACE)
        self.pi = np.zeros(self.SZ_OBS_SPACE)

    def _action_values(self, s, values):
        """Return the array of one-step look-ahead values Q(s, a) for all a.

        ``values`` is the state-value table used for bootstrapping.
        """
        q_sa = np.zeros(self.SZ_ACTION_SPACE)
        for a in range(self.SZ_ACTION_SPACE):
            for p, s_, r, _ in self.env.P[s][a]:
                q_sa[a] += p * (r + self.gamma * values[s_])
        return q_sa

    def run(self, iterations=100000,epsilon = 1e-20):
        """Run value iteration, then derive the greedy policy into ``self.pi``.

        Each sweep backs up every state from a snapshot of the previous
        sweep's values.  Sweeping stops early once the summed absolute change
        of the value table is at most ``epsilon``.

        Args:
            iterations: maximum number of sweeps over the state space.
            epsilon: convergence threshold on the L1 change of ``self.v``.
        """
        for i in range(iterations):
            prev_v = np.copy(self.v)
            for s in range(self.SZ_OBS_SPACE):
                # The discount must be applied here as well as in the policy
                # extraction below; otherwise v is undiscounted whenever
                # gamma != 1 and the two steps disagree.
                self.v[s] = max(self._action_values(s, prev_v))
            if (np.sum(np.fabs(prev_v - self.v)) <= epsilon):
                print(f"Value Iteration Converged at iteration {i+1}")
                break
        for s in range(self.SZ_OBS_SPACE):
            self.pi[s] = np.argmax(self._action_values(s, self.v))

    def evaluate(self,episodes,render=False,verbose=False):
        """Roll out the current policy and return the mean episode reward.

        Episodes are run on ``self.env`` (the environment this instance was
        built with), each until the environment reports ``done``.

        Args:
            episodes: number of episodes to run; must be non-zero.
            render: if true, call ``self.env.render()`` before every step.
            verbose: if true, print the step count of every episode.

        Returns:
            Sum of all episode rewards divided by ``episodes``.

        Raises:
            ZeroDivisionError: if ``episodes`` is 0.
        """
        total_reward = 0
        for _ in range(episodes):
            obs = self.env.reset()
            done = False
            step = 0
            ep_reward = 0
            while not done:
                if render:
                    self.env.render()
                # pi is a float array, so cast before handing it to the env.
                obs, reward, done, _ = self.env.step(int(self.pi[obs]))
                ep_reward += reward
                step += 1
            if verbose:
                print(f"Took {step} steps")
            total_reward += ep_reward
        return total_reward/episodes




In [10]:
# Solve 'FrozenLake-v0' with value iteration, then print the mean episode
# reward obtained by the resulting policy over 10000 evaluation episodes.
env_name, gamma = 'FrozenLake-v0', 1.0
env = gym.make(env_name)
vi = ValueIterations(env, gamma)
vi.run(iterations=100000)
print(vi.evaluate(10000, render=False, verbose=False))

Value Iteration Converged at iteration 1373
0.7336


In [13]:
# Solve 'FrozenLake8x8-v0' with value iteration, then print the mean episode
# reward obtained by the resulting policy over 10000 evaluation episodes.
env_name, gamma = 'FrozenLake8x8-v0', 1.0
env = gym.make(env_name)
vi = ValueIterations(env, gamma)
vi.run(iterations=100000)
print(vi.evaluate(10000, render=False, verbose=False))

Value Iteration Converged at iteration 2357
0.8764
