In [None]:
import sys
import os 
# Make the parent of the current working directory importable; presumably this
# is the repository root that holds the project packages imported afterwards.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
from environments.grid_world import GridworldEnv
from algorithms.policy_iteration import policy_iteration
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Cell 3: build the 4x4 grid world and show its initial state.
# The start cell is (0, 0); both (3, 3) and (0, 0) are passed as goals, and the
# step reward and goal reward are both -1.
env = GridworldEnv(
    width=4,
    height=4,
    start=(0,0),
    goals=[(3,3), (0, 0)],
    step_reward=-1,
    goal_reward=-1,
    seed=123,
)

# Reset, then report the start state and the goals together with their indices
init_state = env.reset()
print("Initial state (row,col):", init_state)
print("Initial state index:", env.state_to_index(init_state))
print("Goal:", env.goals, "Goal index:", list(map(env.state_to_index, env.goals)))

# Draw the grid for a visual sanity check
env.render()

In [None]:
# Run policy iteration on the grid world and unpack its three results.
# Later cells index Vs as a 2-D grid (Vs[r, c]) and treat Vs_history as a
# sequence of per-iteration value snapshots.
# NOTE(review): presumably `policy` is the final improved policy and `Vs` its
# value function -- confirm against algorithms.policy_iteration.
policy, Vs, Vs_history = policy_iteration(env)

In [None]:
# Heat map of the value grid `Vs`, one cell per grid position, with each
# cell's numeric value printed on top of it.
plt.figure(figsize=(5,4))
im = plt.imshow(Vs, origin='upper', interpolation='nearest')
plt.colorbar(im, label='State value V(s)')
# Vs comes from policy_iteration, so the title names that rather than a
# uniform-random-policy evaluation.
plt.title("State-value function V(s) after policy iteration")
# Annotate values. The text colour is chosen relative to the midpoint of the
# actual [min, max] range: the environment's rewards are all -1, so values are
# non-positive and Vs.max()/2 would not split the range in half. With
# matplotlib's default colormap the low half renders dark, hence white text there.
value_mid = (Vs.min() + Vs.max()) / 2
for r in range(env.height):
    for c in range(env.width):
        value = Vs[r, c]
        text_color = 'white' if value < value_mid else 'black'
        plt.text(c, r, f"{value:.2f}", ha='center', va='center', fontsize=8, color=text_color)
plt.tight_layout()
plt.show()

In [None]:
# Stack into a 2D array: shape (iterations, states).
# Each snapshot is flattened first so the shape holds whether the history
# stores flat vectors or 2-D grids; stacking 2-D grids directly would
# interleave grid rows with iterations. For flat snapshots the result is
# identical to np.vstack(Vs_history).
# NOTE(review): flattening is row-major; confirm this matches the env's
# state-index ordering if the history holds grids.
Vs_array = np.vstack([np.ravel(snapshot) for snapshot in Vs_history])

num_states = Vs_array.shape[1]
iterations = np.arange(Vs_array.shape[0])

# Plot each state's value across iterations on a fresh figure, so the curves
# never land on a figure left open by an earlier cell.
plt.figure()
for s in range(num_states):
    plt.plot(iterations, Vs_array[:, s], marker='o', label=f"State {s}")

plt.xlabel("Iteration")
plt.ylabel("State Value")
plt.title("State Value Convergence Across Iterations")
# One legend entry per state; anchor it outside the axes on the right so it
# does not cover the curves.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True)
# Single layout pass once every artist (including the legend) exists.
plt.tight_layout()
plt.show()

In [None]:
# Track convergence via the largest absolute change in V between each pair of
# consecutive snapshots in the history (one entry per transition).
deltas = [
    np.max(np.abs(current - previous))
    for previous, current in zip(Vs_history[:-1], Vs_history[1:])
]

In [None]:
# Max change in V per step, numbered from 1, drawn on a logarithmic y-axis
# (a log scale often makes convergence clearer).
fig, ax = plt.subplots(figsize=(6,4))
steps = range(1, len(deltas)+1)
ax.plot(steps, deltas, marker='o')
ax.set_yscale('log')
ax.set_xlabel('Policy Iteration Step')
ax.set_ylabel('Max ΔV')
ax.set_title('Policy Iteration Convergence (GridWorld)')
ax.grid(True)
plt.show()


In [None]:
# Estimate a linear convergence ratio as the mean of the successive ratios
# deltas[k+1] / deltas[k].
# Pairs whose earlier delta is not strictly positive are skipped: the ratio is
# undefined there and dividing by zero would turn the mean into inf/nan.
rates = [later / earlier for earlier, later in zip(deltas, deltas[1:]) if earlier > 0]
if rates:
    mean_rate = np.mean(rates)
    print(f"Approximate mean linear convergence ratio: {mean_rate:.4f}")
else:
    # Fewer than two usable deltas: keep mean_rate defined, but say why there
    # is no estimate instead of printing nan.
    mean_rate = float('nan')
    print("Not enough non-zero value changes to estimate a convergence ratio.")