In [None]:
import sys

# Make modules located two directories above the working directory importable
# (presumably where the training/environment/utils/agent packages live — confirm).
# Guarded so that re-running this cell does not append the same entry again.
if "../.." not in sys.path:
    sys.path.append("../..")

In [None]:
import nest_asyncio
nest_asyncio.apply()

import tensorflow as tf
# Turn on memory growth for the GPUs TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Memory growth has to be configured identically on every GPU, so apply
    # it to all detected devices rather than only the first one.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Raised when the GPUs were already initialized (e.g. this cell is re-run
    # after TensorFlow has started using the devices).
    print(f"Could not enable GPU memory growth: {err}")

In [None]:
from training.dqnetwork import DQNetwork

In [None]:
from environment.deepqlearning.phototaxis_env import PhototaxisEnv
from utils.reader import get_yaml_path, read_file

## Connect to Simulator

In [None]:
# Address ("host:port") given to the environment; points at this machine.
server_address = "localhost:50051"
# Name this notebook hands to the environment as its client name.
client_name = "PhototaxisDQNClient"
env = PhototaxisEnv(server_address, client_name)
# Presumably opens the connection to the simulator and needs the server to be
# running already — TODO confirm against PhototaxisEnv.
env.connect_to_client()

## Load Configurations


In [None]:
import glob
import os

# Collect every generated phototaxis environment configuration, sorted by file name
config_dir = os.path.join("..", "..", "scripts", "resources", "generated", "phototaxis", "conf")
config_files = sorted(glob.glob(os.path.join(config_dir, "environment_*.yml")))

# Fail early with an actionable message instead of an IndexError on configs[0]
if not config_files:
    raise FileNotFoundError(
        f"No 'environment_*.yml' files found in {os.path.abspath(config_dir)}"
    )

# Parse each file with the project's reader helper
configs = [read_file(config_file) for config_file in config_files]

print(f"Loaded {len(configs)} configuration files")

# Initialize with the first config
env.init(configs[0])

## Network Architecture

In [None]:
# Hidden-layer sizes passed to both DQNetwork instances (action and target model).
neuron_count_per_hidden_layer = [128, 64]

## Hyperparameters

In [None]:
# Training length: passed to both the DQAgent and the DQLearning trainer.
episode_count = 1000  # Total number of training episodes
episode_max_steps = 2000  # Maximum number of steps per episode

# Experience replay settings.
replay_memory_max_size = 100000  # Maximum number of transitions in replay memory
replay_memory_init_size = 10000  # Initial replay memory size before training starts
batch_size = 512  # Mini-batch size for training

# Update cadence of the two networks, measured in environment steps.
step_per_update = 4  # Number of steps between action model updates
step_per_update_target_model = 1000  # Number of steps between target model updates

# Exploration schedule.
# NOTE(review): epsilon_decay is defined here but is not passed to DQAgent in
# the agent-creation cell of this notebook — confirm whether the agent applies
# its own decay rate or whether this value should be wired through.
max_epsilon = 1.0  # Initial exploration probability
min_epsilon = 0.01  # Minimum exploration probability
epsilon_decay = 0.0002  # Decay rate for exploration probability

gamma = 0.99  # Discount factor for future rewards

# Reward smoothing; the window size is also reused by the reward plot.
moving_avg_window_size = 20  # Window size for moving average of rewards
moving_avg_stop_thr = 100  # Threshold for early stopping based on moving average

## Create Agent

In [None]:
from agent.scala_dqagent import DQAgent

def _build_q_network(show_summary):
    """Create a DQNetwork sized from the environment's observation and action spaces."""
    return DQNetwork(
        env.observation_space.shape,
        neuron_count_per_hidden_layer,
        env.action_space.n,
        summary=show_summary,
    )


# Both networks share one architecture; only the action model is built with summary=True.
online_network = _build_q_network(True)
frozen_network = _build_q_network(False)

# Single learning agent wired to the shared environment and the hyperparameters above.
agent1 = DQAgent(
    env,
    agent_id="00000000-0000-0000-0000-000000000001",
    action_model=online_network,
    target_model=frozen_network,
    epsilon_max=max_epsilon,
    epsilon_min=min_epsilon,
    gamma=gamma,
    replay_memory_max_size=replay_memory_max_size,
    replay_memory_init_size=replay_memory_init_size,
    batch_size=batch_size,
    step_per_update=step_per_update,
    step_per_update_target_model=step_per_update_target_model,
    moving_avg_window_size=moving_avg_window_size,
    moving_avg_stop_thr=moving_avg_stop_thr,
    episode_max_steps=episode_max_steps,
    episodes=episode_count,
)

# The trainer takes a list of agents, even though only one is used here.
agents = [agent1]

## Training

In [None]:
import time
from training.multi_agent_dqlearning import DQLearning

# Wall-clock timestamp taken right before training begins
train_start_time = time.time()

trainer = DQLearning(
    env,
    agents,
    configs,
    episode_count=episode_count,
    episode_max_steps=episode_max_steps,
)
train_rewards = trainer.simple_dqn_training()

train_finish_time = time.time()
train_elapsed_time = train_finish_time - train_start_time

# Training can end before episode_count is reached (early stopping on the
# moving average), so average over the episodes that actually ran. Fall back
# to the planned count when no rewards were recorded.
# NOTE(review): assumes train_rewards holds one entry per completed episode — confirm.
episodes_run = len(train_rewards) or episode_count
train_avg_episode_time = train_elapsed_time / episodes_run

print(f"Train time: {train_elapsed_time / 60.0:.1f}m [{train_avg_episode_time:.1f}s]")

Training DQN:  64%|██████▍   | 640/1000 [52:58<12:19,  2.05s/ep]

## Evaluation with Visualization

Watch the trained agent perform phototaxis in real time.

**Keyboard Controls:**
- `ESC/Q`: Quit
- `SPACE`: Pause/Resume
- `↑/↓`: Adjust FPS

In [None]:
# Play back 5 episodes with the trainer's pygame playback, starting at fps=90.
trainer.play_with_pygame(episodes=5, fps=90)

## Save Trained Models

In [None]:
import os

# Ensure the output folder is present before writing any checkpoint
os.makedirs("checkpoints", exist_ok=True)

# Write both networks of every agent, one .keras file per network
for i, agent in enumerate(agents):
    action_path = f"checkpoints/phototaxis_dqn_agent{i}_action_model.keras"
    target_path = f"checkpoints/phototaxis_dqn_agent{i}_target_model.keras"
    agent.action_model.save(action_path)
    agent.target_model.save(target_path)
    print(f"Agent {i} models saved successfully")

## Load Pre-trained Models (Optional)

In [None]:
# To load pre-trained models, uncomment and run:
# for i, agent in enumerate(agents):
#     agent.action_model = tf.keras.models.load_model(f"checkpoints/phototaxis_dqn_agent{i}_action_model.keras")
#     agent.target_model = tf.keras.models.load_model(f"checkpoints/phototaxis_dqn_agent{i}_target_model.keras")
#     print(f"Agent {i} models loaded successfully")

## Plot Training Rewards

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Draw the raw per-episode rewards on a dedicated figure
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train_rewards, alpha=0.6, label='Episode Reward')

# Overlay the smoothed curve only when there are more episodes than one window
episode_total = len(train_rewards)
if episode_total > moving_avg_window_size:
    averaging_kernel = np.ones(moving_avg_window_size) / moving_avg_window_size
    moving_avg = np.convolve(train_rewards, averaging_kernel, mode='valid')
    # A 'valid' convolution produces its first value at index window - 1
    smoothed_x = range(moving_avg_window_size - 1, episode_total)
    ax.plot(
        smoothed_x,
        moving_avg,
        'r-',
        linewidth=2,
        label=f'Moving Average (window={moving_avg_window_size})',
    )

ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('DQN Phototaxis Training Progress')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Summarise the tail of the run with the same window used for smoothing
recent_rewards = train_rewards[-moving_avg_window_size:]
print(f"Average reward over last {moving_avg_window_size} episodes: {np.mean(recent_rewards):.2f}")