# Sailors

### Requirements

In [7]:
import sys
import os
import numpy as np

# Add both the src directory and its parent directory to the import path
sys.path.append(os.path.abspath('../src'))
sys.path.append(os.path.abspath('..'))

# Import the BaseAgent class
from src.agents.base_agent import BaseAgent

from src.env_sailing import SailingEnv
from src.initial_windfields import get_initial_windfield

# Display the BaseAgent class documentation
#help(BaseAgent)

### Training function

In [None]:
def train_agent(
        agent: "BaseAgent",
        seed: int = 42,
        env_version: str = "simple_static",
        num_episodes: int = 100,
        max_steps: int = 1000,
        verbose: bool = True,
        ) -> None:
    """Train ``agent`` in place with an episodic Q-learning loop.

    Besides ``seed()`` and ``act()``, the agent must expose
    ``discretize_state(observation)``, ``learn(state, action, reward,
    next_state)``, an ``exploration_rate`` attribute and a ``q_table``
    container, because this loop uses all of them.

    Args:
        agent: The agent to train; it is mutated (Q-table and exploration rate).
        seed: Seed forwarded to ``agent.seed``.
        env_version: Name of the initial windfield. Only ``"simple_static"``
            is accepted.
        num_episodes: Number of training episodes.
        max_steps: Upper bound on the number of steps per episode.
        verbose: When true, print progress every 10 episodes and a summary.

    Raises:
        ValueError: If ``env_version`` is not a supported windfield name.
    """
    import time

    supported_versions = ("simple_static",)
    # The exploration rate is multiplied by `decay` after each episode and
    # never drops below `min_exploration`.
    decay = 0.98
    min_exploration = 0.05
    report_every = 10

    # Seed the agent with the caller's seed (the default keeps the old value 42).
    agent.seed(seed)

    if env_version not in supported_versions:
        raise ValueError(
            f"Unsupported env_version {env_version!r}; "
            f"expected one of {supported_versions}"
        )
    env = SailingEnv(**get_initial_windfield(env_version))

    # Progress tracking
    rewards_history = []
    steps_history = []
    success_history = []

    if verbose:
        print(f"Starting full training with {num_episodes} episodes...")
    start_time = time.perf_counter()

    for episode in range(num_episodes):
        # Each episode uses its own environment seed.
        observation, info = env.reset(seed=episode)
        state = agent.discretize_state(observation)

        total_reward = 0
        steps_taken = 0
        done = False

        for _ in range(max_steps):
            # Select action and take step
            action = agent.act(observation)
            next_observation, reward, done, truncated, info = env.step(action)
            next_state = agent.discretize_state(next_observation)

            # Update Q-table
            agent.learn(state, action, reward, next_state)

            state = next_state
            observation = next_observation
            total_reward += reward
            steps_taken += 1

            if done or truncated:
                break

        # Record metrics; an episode counts as a success only when `done` is set.
        rewards_history.append(total_reward)
        steps_history.append(steps_taken)
        success_history.append(bool(done))

        # Decrease exploration over time.
        agent.exploration_rate = max(min_exploration, agent.exploration_rate * decay)

        if verbose and (episode + 1) % report_every == 0:
            recent = success_history[-report_every:]
            success_rate = sum(recent) / len(recent) * 100
            print(
                f"Episode {episode+1}/{num_episodes}: "
                f"Success rate (last {report_every}): {success_rate:.1f}%"
            )

    training_time = time.perf_counter() - start_time

    if not verbose:
        return
    if not success_history:
        # Nothing to average when no episode was run.
        print("\nNo training episodes were run.")
        return

    success_rate = sum(success_history) / len(success_history) * 100

    print(f"\nTraining completed in {training_time:.1f} seconds!")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"Average reward: {np.mean(rewards_history):.2f}")
    print(f"Average steps: {np.mean(steps_history):.1f}")
    print(f"Q-table size: {len(agent.q_table)} states")

## I. Mousse_1

### Defining Agent

In [5]:
class Mousse_1(BaseAgent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Observations are reduced to a discrete state
    ``(x_bin, y_bin, velocity_bin, wind_bin)`` and one row of
    ``NUM_ACTIONS`` Q-values is stored per visited state in ``q_table``.
    """

    # Number of discrete actions; one Q-value is stored per action.
    NUM_ACTIONS = 9
    # Side length assumed for the square grid when binning positions.
    GRID_SIZE = 32
    # Speeds below this threshold are treated as "not moving" (velocity bin 0).
    MIN_SPEED = 0.1

    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1):
        """Create an agent with an empty Q-table.

        Args:
            learning_rate: Step size applied to the TD error in ``learn``.
            discount_factor: Weight of the best next-state value in the TD target.
            exploration_rate: Probability of picking a random action in ``act``.
        """
        super().__init__()
        self.np_random = np.random.default_rng()

        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

        self.position_bins = 8     # Discretize the grid into 8x8
        self.velocity_bins = 4     # Bin 0 = (almost) stationary, the rest = heading
        self.wind_bins = 8         # Discretize wind directions into 8 bins

        # Maps a discrete state tuple to an array of NUM_ACTIONS Q-values.
        self.q_table = {}

    def _q_values(self, state):
        """Return the Q-value row for ``state``, creating a zero row if missing."""
        if state not in self.q_table:
            self.q_table[state] = np.zeros(self.NUM_ACTIONS)
        return self.q_table[state]

    def discretize_state(self, observation):
        """Convert a continuous observation to a discrete state tuple.

        Reads ``observation[0:2]`` as position, ``[2:4]`` as velocity and
        ``[4:6]`` as wind, and returns ``(x_bin, y_bin, v_bin, wind_bin)``.
        """
        x, y = observation[0], observation[1]
        vx, vy = observation[2], observation[3]
        wx, wy = observation[4], observation[5]

        # Position: scale onto the bin range and clamp the upper edge.
        last_position_bin = self.position_bins - 1
        x_bin = min(int(x / self.GRID_SIZE * self.position_bins), last_position_bin)
        y_bin = min(int(y / self.GRID_SIZE * self.position_bins), last_position_bin)

        # Velocity: bin 0 is reserved for "not moving"; the remaining bins
        # encode the heading only (magnitude is ignored).
        direction_bins = self.velocity_bins - 1
        if direction_bins < 1 or np.sqrt(vx**2 + vy**2) < self.MIN_SPEED:
            v_bin = 0
        else:
            # arctan2 returns values in [-pi, pi]; map them onto [0, 1].
            heading_fraction = (np.arctan2(vy, vx) + np.pi) / (2 * np.pi)
            # Wrap inside the heading bins so that a heading of exactly +pi
            # shares a bin with -pi instead of spilling into bin 0.
            v_bin = 1 + int(heading_fraction * direction_bins) % direction_bins

        # Wind: same mapping, wrapping +pi back onto the first bin.
        wind_fraction = (np.arctan2(wy, wx) + np.pi) / (2 * np.pi)
        wind_bin = int(wind_fraction * self.wind_bins) % self.wind_bins

        return (x_bin, y_bin, v_bin, wind_bin)

    def act(self, observation: np.ndarray) -> int:
        """Choose an action with an epsilon-greedy policy.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest Q-value for the
        discretized state (ties resolve to the lowest index). The result is
        a plain ``int``.
        """
        state = self.discretize_state(observation)

        if self.np_random.random() < self.exploration_rate:
            # Explore
            return int(self.np_random.integers(0, self.NUM_ACTIONS))

        # Exploit; an unseen state gets a zero row, so action 0 is chosen.
        return int(np.argmax(self._q_values(state)))

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        NOTE(review): there is no terminal flag, so the target always
        bootstraps from ``next_state``, even on the last step of an episode.
        """
        q_state = self._q_values(state)
        q_next = self._q_values(next_state)

        td_target = reward + self.discount_factor * np.max(q_next)
        td_error = td_target - q_state[action]
        q_state[action] += self.learning_rate * td_error

    def reset(self) -> None:
        """Reset the agent (no per-episode state is kept, so this is a no-op)."""
        pass

    def seed(self, seed: int = None) -> None:
        """Replace the agent's random generator with one seeded by ``seed``."""
        self.np_random = np.random.default_rng(seed)


### Training

In [10]:
import time

mousse_1 = Mousse_1(learning_rate=0.1, discount_factor=0.99, exploration_rate=0.2)

# Set fixed seed for reproducibility
np.random.seed(42)
mousse_1.seed(42)

# Create environment with a simple initial windfield
env = SailingEnv(**get_initial_windfield('simple_static'))

# Training parameters
num_episodes = 200
max_steps = 1000

# Progress tracking
rewards_history = []
steps_history = []
success_history = []

# Training loop
print(f"Starting full training with {num_episodes} episodes...")
start_time = time.time()

for episode in range(num_episodes):
    # Reset environment and get initial state (a different seed each episode)
    observation, info = env.reset(seed=episode)
    state = mousse_1.discretize_state(observation)

    total_reward = 0

    for step in range(max_steps):
        # Select action and take step
        action = mousse_1.act(observation)
        next_observation, reward, done, truncated, info = env.step(action)
        next_state = mousse_1.discretize_state(next_observation)

        # Update Q-table
        mousse_1.learn(state, action, reward, next_state)

        # Update state and total reward
        state = next_state
        observation = next_observation
        total_reward += reward

        # Break if episode is done
        if done or truncated:
            break

    # Record metrics; an episode counts as a success only when `done` is set
    rewards_history.append(total_reward)
    steps_history.append(step+1)
    success_history.append(done)

    # Update exploration rate (decrease over time, never below 0.05)
    mousse_1.exploration_rate = max(0.05, mousse_1.exploration_rate * 0.98)

    # Print progress every 10 episodes
    if (episode + 1) % 10 == 0:
        success_rate = sum(success_history[-10:]) / 10 * 100
        print(f"Episode {episode+1}/{num_episodes}: Success rate (last 10): {success_rate:.1f}%")

training_time = time.time() - start_time

# Calculate overall success rate
success_rate = sum(success_history) / len(success_history) * 100

print(f"\nTraining completed in {training_time:.1f} seconds!")
print(f"Success rate: {success_rate:.1f}%")
print(f"Average reward: {np.mean(rewards_history):.2f}")
print(f"Average steps: {np.mean(steps_history):.1f}")
print(f"Q-table size: {len(mousse_1.q_table)} states")

Starting training with 10 episodes (debug run)...
Starting full training with 100 episodes...
Episode 10/100: Success rate (last 10): 100.0%
Episode 20/100: Success rate (last 10): 100.0%
Episode 30/100: Success rate (last 10): 100.0%
Episode 40/100: Success rate (last 10): 100.0%
Episode 50/100: Success rate (last 10): 100.0%
Episode 60/100: Success rate (last 10): 100.0%
Episode 70/100: Success rate (last 10): 100.0%
Episode 80/100: Success rate (last 10): 100.0%
Episode 90/100: Success rate (last 10): 100.0%
Episode 100/100: Success rate (last 10): 100.0%
Episode 110/100: Success rate (last 10): 100.0%
Episode 120/100: Success rate (last 10): 100.0%
Episode 130/100: Success rate (last 10): 100.0%
Episode 140/100: Success rate (last 10): 100.0%
Episode 150/100: Success rate (last 10): 100.0%
Episode 160/100: Success rate (last 10): 100.0%
Episode 170/100: Success rate (last 10): 100.0%
Episode 180/100: Success rate (last 10): 100.0%
Episode 190/100: Success rate (last 10): 100.0%
Epi

### Testing

In [11]:
# Evaluate greedily: a zero exploration rate disables random actions
mousse_1.exploration_rate = 0

# Evaluation runs on a separate environment built from the 'training_1' windfield
test_env = SailingEnv(**get_initial_windfield('training_1'))

# Evaluation budget
num_test_episodes = 5
max_steps = 1000

print("Testing the trained agent on 5 new episodes...")
for episode in range(num_test_episodes):
    # Seeds start at 1000 so they differ from the ones used for training
    observation, info = test_env.reset(seed=episode + 1000)

    total_reward = 0

    for step in range(max_steps):
        # Follow the learned policy for one step
        action = mousse_1.act(observation)
        observation, reward, done, truncated, info = test_env.step(action)

        total_reward += reward

        # Keep stepping until the environment reports an end of episode
        if not done and not truncated:
            continue
        break

    summary = (
        f"Test Episode {episode+1}: Steps={step+1}, Reward={total_reward}, "
        f"Position={info['position']}, Goal reached={done}"
    )
    print(summary)

Testing the trained agent on 5 new episodes...
Test Episode 1: Steps=297, Reward=100.0, Position=[15 31], Goal reached=True
Test Episode 2: Steps=328, Reward=100.0, Position=[15 30], Goal reached=True
Test Episode 3: Steps=97, Reward=100.0, Position=[15 30], Goal reached=True
Test Episode 4: Steps=141, Reward=100.0, Position=[15 30], Goal reached=True
Test Episode 5: Steps=145, Reward=100.0, Position=[15 30], Goal reached=True
