In [2]:
%load_ext autoreload
%autoreload 2

from IQ import train_agents, test_agents, IQLearningAgent
from myenv_5_sparse_reward import MyGridWorld

  from pkg_resources import resource_stream, resource_exists


In [3]:
from pathlib import Path
import sys
import os

# Locate the repository root by walking upwards from the current working
# directory until a directory named "distributed_project" is found, then make
# it the working directory.
current_path = Path.cwd()
project_root = None
for parent in [current_path] + list(current_path.parents):
    if parent.name == "distributed_project":
        project_root = parent
        break

if project_root is None:
    # Without this guard os.chdir(None) fails with an opaque TypeError that
    # does not say what was being searched for.
    raise FileNotFoundError(
        f"No 'distributed_project' directory found at or above {current_path}"
    )

os.chdir(project_root)
print(f"Set /distributed_project as working directory: {project_root}")

Set /distributed_project as working directory: /home/terra/Desktop/unimore/distributed_project


## Train in a gridworld 9x9 (actually 7x3)
Number of possible states: (7*3)* (7*3-1)*2 = 840

In [28]:
# --- Experiment configuration -------------------------------------------
GRID_SIZE = 9
NUM_EPISODES = 10000
# The logged epsilon values are consistent with one multiplicative decay
# per episode (0.9995 ** 100 ~= 0.951).
EPSILON_DECAY_RATE = 0.9995
MIN_EPSILON = 0.01  # presumably the lower bound for epsilon; verify in IQ.train_agents

# --- Environment ---------------------------------------------------------
env = MyGridWorld(grid_size=GRID_SIZE)

# The spaces are read from "agent1" and reused for every agent below.
action_size = env.action_space("agent1").n
obs_shape = env.observation_space("agent1").shape[0]

summary = f"Agnets: {env.possible_agents}\nObservation space: {obs_shape}\nAction space: {action_size}"
print(summary)

# --- Agents --------------------------------------------------------------
# Every agent is built with the same hyperparameters; only the id differs.
shared_kwargs = dict(
    action_space_size=action_size,
    obs_space_shape=obs_shape,
    learning_rate=0.1,
    discount_factor=0.85,
    epsilon=1,  # High initial exploration
)
agents = {}
for agent_id in env.possible_agents:
    agents[agent_id] = IQLearningAgent(agent_id=agent_id, **shared_kwargs)

# --- Training ------------------------------------------------------------
trained_agents = train_agents(env, agents, NUM_EPISODES, EPSILON_DECAY_RATE, MIN_EPSILON)


Agnets: ['agent1', 'agent2']
Observation space: 7
Action space: 5
IQ TRAINING PHASE
Num training episodes: 10000
Espilon decaying rate: 0.9995
Episode 0/10000, Epsilon: 1.000, Episode joint_total_reward: -392
Episode 100/10000, Epsilon: 0.951, Episode joint_total_reward: -386
Episode 200/10000, Epsilon: 0.904, Episode joint_total_reward: -378
Episode 300/10000, Epsilon: 0.860, Episode joint_total_reward: 198
Episode 400/10000, Epsilon: 0.818, Episode joint_total_reward: -356
Episode 500/10000, Epsilon: 0.778, Episode joint_total_reward: -344
Episode 600/10000, Epsilon: 0.740, Episode joint_total_reward: 110
Episode 700/10000, Epsilon: 0.704, Episode joint_total_reward: -326
Episode 800/10000, Epsilon: 0.670, Episode joint_total_reward: -314
Episode 900/10000, Epsilon: 0.637, Episode joint_total_reward: -314
Episode 1000/10000, Epsilon: 0.606, Episode joint_total_reward: -274
Episode 1100/10000, Epsilon: 0.577, Episode joint_total_reward: -300
Episode 1200/10000, Epsilon: 0.548, Episode

In [29]:
# TESTING
# Evaluate the trained agents on a freshly constructed environment of the
# same grid size as the one used for training.
env_test = MyGridWorld(grid_size=GRID_SIZE)
try:
    test_agents(env_test, trained_agents, num_test_episodes=100)
finally:
    # Close both environments even if evaluation raises or is interrupted,
    # so a failed run does not leave them open.
    env.close()
    env_test.close()



IQ TEST PHASE
Num testing episodes: 100
Test Ep. 1/100: Status=SUCCESS, Reward=196.00, Steps=4
Test Ep. 21/100: Status=SUCCESS, Reward=198.00, Steps=5
Test Ep. 41/100: Status=SUCCESS, Reward=196.00, Steps=4
Test Ep. 61/100: Status=SUCCESS, Reward=194.00, Steps=7
Test Ep. 81/100: Status=SUCCESS, Reward=200.00, Steps=5
END OF IQ TESTING PHASE

TEST METRICS
Success rate: 100.00%
Avg reward per episode: 196.70
Avg steps per episode: 4.99


## Train in a gridworld 10x10 (actually 8x4)
Number of possible states: (8*4)* (8*4-1)*2 = 1984

In [37]:
# --- Experiment configuration -------------------------------------------
GRID_SIZE = 10
NUM_EPISODES = 10000
# The logged epsilon values are consistent with one multiplicative decay
# per episode (0.9995 ** 100 ~= 0.951).
EPSILON_DECAY_RATE = 0.9995
MIN_EPSILON = 0.01  # presumably the lower bound for epsilon; verify in IQ.train_agents

# --- Environment ---------------------------------------------------------
env = MyGridWorld(grid_size=GRID_SIZE)

# The spaces are read from "agent1" and reused for every agent below.
action_size = env.action_space("agent1").n
obs_shape = env.observation_space("agent1").shape[0]

summary = f"Agnets: {env.possible_agents}\nObservation space: {obs_shape}\nAction space: {action_size}"
print(summary)

# --- Agents --------------------------------------------------------------
# Every agent is built with the same hyperparameters; only the id differs.
shared_kwargs = dict(
    action_space_size=action_size,
    obs_space_shape=obs_shape,
    learning_rate=0.1,
    discount_factor=0.85,
    epsilon=1,  # High initial exploration
)
agents = {}
for agent_id in env.possible_agents:
    agents[agent_id] = IQLearningAgent(agent_id=agent_id, **shared_kwargs)

# --- Training ------------------------------------------------------------
trained_agents = train_agents(env, agents, NUM_EPISODES, EPSILON_DECAY_RATE, MIN_EPSILON)


Agnets: ['agent1', 'agent2']
Observation space: 7
Action space: 5
IQ TRAINING PHASE
Num training episodes: 10000
Espilon decaying rate: 0.9995
Episode 0/10000, Epsilon: 1.000, Episode joint_total_reward: -384
Episode 100/10000, Epsilon: 0.951, Episode joint_total_reward: -390
Episode 200/10000, Epsilon: 0.904, Episode joint_total_reward: -396
Episode 300/10000, Epsilon: 0.860, Episode joint_total_reward: -398
Episode 400/10000, Epsilon: 0.818, Episode joint_total_reward: -376
Episode 500/10000, Epsilon: 0.778, Episode joint_total_reward: -358
Episode 600/10000, Epsilon: 0.740, Episode joint_total_reward: -306
Episode 700/10000, Epsilon: 0.704, Episode joint_total_reward: 194
Episode 800/10000, Epsilon: 0.670, Episode joint_total_reward: -350
Episode 900/10000, Epsilon: 0.637, Episode joint_total_reward: -332
Episode 1000/10000, Epsilon: 0.606, Episode joint_total_reward: 126
Episode 1100/10000, Epsilon: 0.577, Episode joint_total_reward: -348
Episode 1200/10000, Epsilon: 0.548, Episode

In [38]:
# TESTING
# Evaluate the trained agents on a freshly constructed environment of the
# same grid size as the one used for training.
env_test = MyGridWorld(grid_size=GRID_SIZE)
try:
    test_agents(env_test, trained_agents, num_test_episodes=100)
finally:
    # Close both environments even if evaluation raises or is interrupted,
    # so a failed run does not leave them open.
    env.close()
    env_test.close()



IQ TEST PHASE
Num testing episodes: 100
Test Ep. 1/100: Status=SUCCESS, Reward=156.00, Steps=24
Test Ep. 21/100: Status=TRUNCATED, Reward=-398.00, Steps=100
Test Ep. 41/100: Status=SUCCESS, Reward=164.00, Steps=20
Test Ep. 61/100: Status=SUCCESS, Reward=136.00, Steps=42
Test Ep. 81/100: Status=TRUNCATED, Reward=-398.00, Steps=100
END OF IQ TESTING PHASE

TEST METRICS
Success rate: 88.00%
Avg reward per episode: 92.50
Avg steps per episode: 33.27


## Train in a gridworld 11x11 (actually 9x5)

In [35]:
# --- Experiment configuration -------------------------------------------
GRID_SIZE = 11
NUM_EPISODES = 10000
# The logged epsilon values are consistent with one multiplicative decay
# per episode (0.9995 ** 100 ~= 0.951).
EPSILON_DECAY_RATE = 0.9995
MIN_EPSILON = 0.01  # presumably the lower bound for epsilon; verify in IQ.train_agents

# --- Environment ---------------------------------------------------------
env = MyGridWorld(grid_size=GRID_SIZE)

# The spaces are read from "agent1" and reused for every agent below.
action_size = env.action_space("agent1").n
obs_shape = env.observation_space("agent1").shape[0]

summary = f"Agnets: {env.possible_agents}\nObservation space: {obs_shape}\nAction space: {action_size}"
print(summary)

# --- Agents --------------------------------------------------------------
# Every agent is built with the same hyperparameters; only the id differs.
shared_kwargs = dict(
    action_space_size=action_size,
    obs_space_shape=obs_shape,
    learning_rate=0.1,
    discount_factor=0.85,
    epsilon=1,  # High initial exploration
)
agents = {}
for agent_id in env.possible_agents:
    agents[agent_id] = IQLearningAgent(agent_id=agent_id, **shared_kwargs)

# --- Training ------------------------------------------------------------
trained_agents = train_agents(env, agents, NUM_EPISODES, EPSILON_DECAY_RATE, MIN_EPSILON)


Agnets: ['agent1', 'agent2']
Observation space: 7
Action space: 5
IQ TRAINING PHASE
Num training episodes: 10000
Espilon decaying rate: 0.9995
Episode 0/10000, Epsilon: 1.000, Episode joint_total_reward: -384
Episode 100/10000, Epsilon: 0.951, Episode joint_total_reward: -388
Episode 200/10000, Epsilon: 0.904, Episode joint_total_reward: -394
Episode 300/10000, Epsilon: 0.860, Episode joint_total_reward: -366
Episode 400/10000, Epsilon: 0.818, Episode joint_total_reward: -386
Episode 500/10000, Epsilon: 0.778, Episode joint_total_reward: -398
Episode 600/10000, Epsilon: 0.740, Episode joint_total_reward: -398
Episode 700/10000, Epsilon: 0.704, Episode joint_total_reward: -338
Episode 800/10000, Epsilon: 0.670, Episode joint_total_reward: -370
Episode 900/10000, Epsilon: 0.637, Episode joint_total_reward: -366
Episode 1000/10000, Epsilon: 0.606, Episode joint_total_reward: -332
Episode 1100/10000, Epsilon: 0.577, Episode joint_total_reward: -398
Episode 1200/10000, Epsilon: 0.548, Episo

In [36]:
# TESTING
# Evaluate the trained agents on a freshly constructed environment of the
# same grid size as the one used for training.
env_test = MyGridWorld(grid_size=GRID_SIZE)
try:
    test_agents(env_test, trained_agents, num_test_episodes=100)
finally:
    # Close both environments even if evaluation raises or is interrupted,
    # so a failed run does not leave them open.
    env.close()
    env_test.close()



IQ TEST PHASE
Num testing episodes: 100
Test Ep. 1/100: Status=TRUNCATED, Reward=-384.00, Steps=100
Test Ep. 21/100: Status=TRUNCATED, Reward=-394.00, Steps=100
Test Ep. 41/100: Status=TRUNCATED, Reward=-398.00, Steps=100
Test Ep. 61/100: Status=TRUNCATED, Reward=-398.00, Steps=100
Test Ep. 81/100: Status=TRUNCATED, Reward=-398.00, Steps=100
END OF IQ TESTING PHASE

TEST METRICS
Success rate: 0.00%
Avg reward per episode: -387.16
Avg steps per episode: 100.00


# Try with new reward

In [47]:
from pathlib import Path
import sys
import os

# Locate the repository root by walking upwards from the current working
# directory until a directory named "distributed_project" is found, then make
# its "src" subdirectory the working directory.
current_path = Path.cwd()
project_root = None
for parent in [current_path] + list(current_path.parents):
    if parent.name == "distributed_project":
        project_root = parent
        break

if project_root is None:
    # Without this guard the f-string path would become the literal
    # "None/src" and os.chdir would fail with a misleading message.
    raise FileNotFoundError(
        f"No 'distributed_project' directory found at or above {current_path}"
    )

src_dir = project_root / "src"
os.chdir(src_dir)
# Report the directory that was actually entered (the "src" subdirectory),
# not just the project root.
print(f"Set /distributed_project/src as working directory: {src_dir}")

Set /distributed_project as working directory: /home/terra/Desktop/unimore/distributed_project


In [1]:
%load_ext autoreload
%autoreload 2

from IQ import train_agents, test_agents, IQLearningAgent
from myenv_5_dense_reward2 import MyGridWorld

In [2]:
# --- Experiment configuration -------------------------------------------
GRID_SIZE = 13
NUM_EPISODES = 10000
# The logged epsilon values are consistent with one multiplicative decay
# per episode (0.9997 ** 100 ~= 0.970).
EPSILON_DECAY_RATE = 0.9997
MIN_EPSILON = 0.01  # presumably the lower bound for epsilon; verify in IQ.train_agents

# --- Environment ---------------------------------------------------------
env = MyGridWorld(grid_size=GRID_SIZE)

# The spaces are read from "agent1" and reused for every agent below.
action_size = env.action_space("agent1").n
obs_shape = env.observation_space("agent1").shape[0]

summary = f"Agnets: {env.possible_agents}\nObservation space: {obs_shape}\nAction space: {action_size}"
print(summary)

# --- Agents --------------------------------------------------------------
# Every agent is built with the same hyperparameters; only the id differs.
shared_kwargs = dict(
    action_space_size=action_size,
    obs_space_shape=obs_shape,
    learning_rate=0.1,
    discount_factor=0.9,
    epsilon=1,  # High initial exploration
)
agents = {}
for agent_id in env.possible_agents:
    agents[agent_id] = IQLearningAgent(agent_id=agent_id, **shared_kwargs)

# --- Training ------------------------------------------------------------
trained_agents = train_agents(env, agents, NUM_EPISODES, EPSILON_DECAY_RATE, MIN_EPSILON)


Agnets: ['agent1', 'agent2']
Observation space: 7
Action space: 5
IQ TRAINING PHASE
Num training episodes: 10000
Espilon decaying rate: 0.9997
Episode 0/10000, Epsilon: 1.000, Episode joint_total_reward: -64.50957531163668
Episode 100/10000, Epsilon: 0.970, Episode joint_total_reward: -57.6741827843523
Episode 200/10000, Epsilon: 0.941, Episode joint_total_reward: -52.43306132842995
Episode 300/10000, Epsilon: 0.914, Episode joint_total_reward: -68.36473152464491
Episode 400/10000, Epsilon: 0.887, Episode joint_total_reward: -64.14360778170705
Episode 500/10000, Epsilon: 0.860, Episode joint_total_reward: -43.67446721602155
Episode 600/10000, Epsilon: 0.835, Episode joint_total_reward: -46.12421729062535
Episode 700/10000, Epsilon: 0.810, Episode joint_total_reward: -51.022547228726886
Episode 800/10000, Epsilon: 0.786, Episode joint_total_reward: -44.13517763705544
Episode 900/10000, Epsilon: 0.763, Episode joint_total_reward: -67.1688518254363
Episode 1000/10000, Epsilon: 0.741, Epis

In [None]:
# TESTING
# Evaluate the trained agents on a freshly constructed environment of the
# same grid size, created with render_mode="human".
env_test = MyGridWorld(grid_size=GRID_SIZE, render_mode="human")
try:
    test_agents(env_test, trained_agents, num_test_episodes=100)
finally:
    # Close both environments even if evaluation raises or is interrupted,
    # so a failed run does not leave them open.
    env.close()
    env_test.close()



IQ TEST PHASE
Num testing episodes: 100
Test Ep. 1/100: Status=SUCCESS, Reward=197.75, Steps=13
Test Ep. 21/100: Status=SUCCESS, Reward=199.56, Steps=6
Test Ep. 41/100: Status=SUCCESS, Reward=196.32, Steps=15
