In [1]:
%load_ext autoreload
%autoreload 2
import gym
import gym_battleship
from typing import List
import numpy as np
from agents.deterministic import deterministic_policy
from tqdm import tqdm

# Scenario: find a single size-3 ship on a 5x5 board.
board_size = (5, 5)
ship_sizes = {3: 1}

# Per-episode step limit handed to the environment; way more than should
# ever be needed for this scenario.
episode_steps = 100

# Reward for each shot outcome:
#   win           - sinking all ships
#   missed        - missing a shot
#   hit           - hitting a ship
#   repeat_missed - shooting at an already missed cell
#   repeat_hit    - shooting at an already hit cell
reward_dictionary = {
    'win': 100,
    'missed': -0.25,
    'hit': 1,
    'repeat_missed': -5,
    'repeat_hit': -5,
}




In [2]:
def ship_dict_to_list(ship_dict: dict[int, int]) -> List[int]:
    """Flatten a ``{size: count}`` mapping into a list of ship sizes.

    Each size appears in the result once per counted ship, in the mapping's
    iteration order.

    Args:
        ship_dict: Mapping from ship size to the number of ships of that size.

    Returns:
        A new list with every size repeated ``count`` times; empty when the
        mapping is empty or every count is zero.
    """
    return [size for size, count in ship_dict.items() for _ in range(count)]

In [3]:

# Evaluation run: play `episodes` games with the deterministic policy and
# record, for every game, the number of steps taken and the summed reward.
episodes = 1000
step_history, reward_history = [], []
env = gym.make(
    'Battleship-v0',
    ship_sizes=ship_sizes,
    board_size=board_size,
    episode_steps=episode_steps,
    reward_dictionary=reward_dictionary,
)


for e in tqdm(range(episodes)):
    # Start a new game; the ship list is rebuilt from the configured fleet.
    state = env.reset()
    ships = ship_dict_to_list(ship_sizes)
    steps, rewards = 0, 0

    while True:
        action = deterministic_policy(state, ships)
        # The fourth value returned by step() is fed to ship_dict_to_list, so
        # it is presumably a {size: count} mapping of ships still afloat
        # (TODO confirm against the environment implementation).
        state, reward, done, remaining_ships = env.step(action)
        ships = ship_dict_to_list(remaining_ships)
        rewards += reward
        steps += 1
        env.render()
        if done:
            break

    step_history.append(steps)
    reward_history.append(rewards)

# Summary statistics over all recorded episodes.
print(f'Episodes: {episodes}')
print(
    f'Reward: min {np.min(reward_history)}, max {np.max(reward_history)}, mean {np.mean(reward_history)}, median: {np.median(reward_history)}'
)
print(
    f'Steps: min {np.min(step_history)}, max {np.max(step_history)}, mean {np.mean(step_history)}, median: {np.median(step_history)}'
)


100%|██████████| 1000/1000 [00:04<00:00, 217.68it/s]

Episodes: 1000
Reward: min 99.25, max 102.0, mean 100.721, median: 100.75
Steps: min 3, max 14, mean 8.116, median: 8.0



