In [29]:
from mdp import *
from utils import print_table
from numpy import random

def MDP_agent(location, pi):
    slip = random.choice([1, 0, -1], p=[0.1, 0.8, 0.1])
    action = pi[location]

    if action is None:
        return None

    if slip != 0:
        if action[0] != 0:
            action = (0, slip)
        else:
            action = (slip, 0)
    
    x = location[0] + action[0]
    x = min(3,max(0,x))
    y = location[1] + action[1]
    y = min(3,max(0,y))

    new_pos = (x, y)
    
    if new_pos == (1, 2):
        return location
    else:
        return new_pos
    
def output_agent_log(start_pos, pi):
    loc = start_pos

    while loc is not None:
        print(loc)
        loc = MDP_agent(loc, pi)

def get_GridMDP(step_cost, end_reward, gamma=0.9):
    return GridMDP([[step_cost, step_cost, step_cost, step_cost],
                    [step_cost, None, step_cost, step_cost],
                    [step_cost, step_cost, step_cost, step_cost],
                    [step_cost, -1, step_cost, end_reward]],
                    terminals=[(3, 0), (1, 0)], gamma=gamma)

def run_agent_simulation(step_cost, gamma=0.9):
    env = get_GridMDP(step_cost, 10, gamma)
    pi = best_policy(env, value_iteration(env, .001))
    print_table(env.to_arrows(pi))
    output_agent_log((3,3), pi)
    

In [30]:
def experiment_with_rewards(step_cost):
    for reward in [-1, 1, 5, 10]:  # Adjust reward values
        print(f"\nTesting with reward {reward} at (3, 0)")
        env = get_GridMDP(step_cost, reward)
        pi = best_policy(env, value_iteration(env, .001))
        print_table(env.to_arrows(pi))
        #output_agent_log((3,3), pi)

def experiment_with_gamma(step_cost, battery_life):
    for gamma in [0.1, 0.5, 0.9, 0.99]:  # Test different gamma values
        print(f"\nTesting with gamma = {gamma}")
        env = get_GridMDP(step_cost, gamma)
        pi = best_policy(env, value_iteration(env, .001))
        print_table(env.to_arrows(pi))
        output_agent_log_with_battery((3, 3), pi, battery_life)

def output_agent_log_with_battery(start_pos, pi, battery_life):
    loc = start_pos
    moves = 0
    
    while loc is not None and moves < battery_life:
        print(f"Step {moves}: {loc}")
        loc = MDP_agent(loc, pi)
        moves += 1
    
    if moves >= battery_life:
        print("Battery died.")
    else:
        print("Vacuum reached the charger.")

In [31]:
experiment_with_rewards(-0.5)


Testing with reward -1 at (3, 0)
v   >      v   v
v   None   v   v
v   v      v   v
>   .      >   .

Testing with reward 1 at (3, 0)
>   >      v   v
v   None   v   v
>   >      >   v
>   .      >   .

Testing with reward 5 at (3, 0)
>   >      v   v
v   None   v   v
>   >      >   v
^   .      >   .

Testing with reward 10 at (3, 0)
>   >      v   v
v   None   v   v
>   >      >   v
^   .      >   .


In [32]:
experiment_with_gamma(-0.5, 5)


Testing with gamma = 0.1
v   >      v   v
v   None   v   v
v   v      v   v
>   .      >   .
Step 0: (3, 3)
Step 1: (3, 2)
Step 2: (3, 1)
Step 3: (3, 1)
Step 4: (3, 0)
Battery died.

Testing with gamma = 0.5
>   >      v   v
v   None   v   v
>   >      >   v
>   .      >   .
Step 0: (3, 3)
Step 1: (2, 3)
Step 2: (2, 2)
Step 3: (2, 1)
Step 4: (2, 0)
Battery died.

Testing with gamma = 0.9
>   >      v   v
v   None   v   v
>   >      >   v
>   .      >   .
Step 0: (3, 3)
Step 1: (3, 2)
Step 2: (3, 1)
Step 3: (3, 0)
Vacuum reached the charger.

Testing with gamma = 0.99
>   >      v   v
v   None   v   v
>   >      >   v
>   .      >   .
Step 0: (3, 3)
Step 1: (3, 2)
Step 2: (3, 1)
Step 3: (3, 0)
Vacuum reached the charger.
