In [12]:
import battleship
import threading
import time
import gym
import numpy as np
import os
import random
import stable_baselines3

In [13]:
# Create the single-environment instance used by the interactive cells below.
# NOTE(review): the 'Battleship-v0' id is presumably registered as a side
# effect of `import battleship` — confirm against that package.
env = gym.make('Battleship-v0')

In [11]:
# check the environment
# Runs Stable-Baselines3's environment checker on `env`. Presumably it warns
# or raises when the env deviates from the Gym interface SB3 expects — see
# the SB3 env_checker documentation for the exact checks performed.
from stable_baselines3.common.env_checker import check_env
check_env(env)


In [None]:
# Explicitly close the current `env` (presumably to release rendering
# resources held by the environment — confirm against its close() method).
env.close()

In [None]:
# Step through every discrete action index once, in order, rendering the
# environment after each step so the effect of each action can be watched.
env.reset()
try:
    for action in range(env.action_space.n):
        # Classic Gym step API: (observation, reward, done, info).
        _obs, _reward, done, _info = env.step(action)
        env.render()
        time.sleep(0.4)  # slow the loop down so each frame is visible
        if done:
            # Calling step() on a finished episode is undefined behaviour in
            # the Gym API, so stop once the environment reports termination
            # (after the final state has been rendered).
            break
finally:
    # Always close the env, even if the cell is interrupted mid-loop.
    env.close()

In [None]:
# play a random game: up to 100 randomly sampled actions, rendered one by one
env.reset()
try:
    env.render()  # show the initial state before any action is taken
    for _ in range(100):
        # Classic Gym step API: (observation, reward, done, info).
        _obs, _reward, done, _info = env.step(env.action_space.sample())
        env.render()
        time.sleep(1)  # slow the loop down so each frame is visible
        if done:
            # Stepping past the end of an episode is undefined behaviour in
            # the Gym API; stop once the environment reports termination.
            break
finally:
    # Always close the env, even if the cell is interrupted mid-game.
    env.close()

In [None]:
# Estimate the average score obtained by an agent that only plays randomly
# sampled actions, over 100 independent episodes.
scores = []
for i in range(100):
    print(i)  # progress: index of the episode about to be played
    env.reset()
    score, done = 0, False
    while True:
        if done:
            break
        action = env.action_space.sample()
        _, reward, done, _ = env.step(action)
        score = score + reward

    # NOTE(review): the value recorded is the environment's own `env.score`
    # attribute, not the `score` accumulated from rewards above — confirm
    # which of the two metrics is actually intended here.
    scores.append(env.score)

print(np.mean(scores))

In [14]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import PPO, A2C

# Number of parallel worker environments. os.cpu_count() is documented to
# return None when the count cannot be determined, so fall back to one worker.
N_ENVS = os.cpu_count() or 1

# TensorBoard / training logs go to ./logs under the current working directory.
log_path = os.path.join(os.getcwd(), 'logs')

# Stand-alone instance kept for the single-environment cells of this notebook;
# the vectorised environment below builds its own instances.
env = gym.make('Battleship-v0')

vec_env = make_vec_env(
    # make_vec_env expects an env id or a *factory*: it is invoked once per
    # worker and must return a fresh environment each time. Closing over one
    # pre-built instance only works by accident of the 'fork' start method
    # (each child gets a copy) and would make all workers share a single env
    # with DummyVecEnv, or fail to transfer with other start methods.
    lambda: gym.make('Battleship-v0'),
    n_envs=N_ENVS,
    vec_env_cls=SubprocVecEnv,
    vec_env_kwargs=dict(start_method='fork'),
)

# from stable_baselines3.common.vec_env import VecMonitor
# vec_env = VecMonitor(vec_env, log_path)

In [15]:
# Build a PPO agent with the 'MlpPolicy' policy on the vectorised environment,
# logging training metrics for TensorBoard under `log_path`.
model = stable_baselines3.PPO(
    policy='MlpPolicy',
    env=vec_env,
    learning_rate=0.001,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [16]:
# Train for a budget of 200M environment steps. The run recorded in the output
# below ended with a KeyboardInterrupt rather than completing the budget.
model.learn(total_timesteps=200_000_000)

Logging to /home/user/Desktop/RL/ALL-BATTLESHIP/logs/PPO_7
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 382       |
|    ep_rew_mean     | -2.87e+03 |
| time/              |           |
|    fps             | 10021     |
|    iterations      | 1         |
|    time_elapsed    | 1         |
|    total_timesteps | 16384     |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 365         |
|    ep_rew_mean          | -2.71e+03   |
| time/                   |             |
|    fps                  | 3709        |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.021954263 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.59       |
|  

KeyboardInterrupt: 

In [7]:
# Attach the vectorised environment to `model` (needed, for instance, when the
# model was restored with PPO.load() without an env, as done in this notebook).
model.set_env(vec_env)

In [None]:
# Persist the current model under the name "10x10-1200reward".
# NOTE(review): the name presumably encodes board size and achieved reward — confirm.
model.save("10x10-1200reward")

In [5]:
# Restore a previously saved PPO model by name. No env is passed to load()
# here, so one has to be attached separately before further training.
model = PPO.load("10x10-1200reward")

In [17]:
# get the architecture of the policy network
# Printing the policy module shows its layer structure (feature extractor,
# policy/value MLPs and the final action/value heads), as in the output below.
print(model.policy)

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=100, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
# test the trained agent: play one rendered episode with the current `model`
# and print the total reward collected.
env = gym.make('Battleship-v0')
score = 0
done = False
episode = 0  # despite its name, this counts the steps taken in the episode
try:
    obs = env.reset()
    while not done:
        # predict() is called with its default settings, as in the original cell.
        action, _states = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        score += reward
        env.render()
        time.sleep(2)  # slow the loop down so each move is visible
        episode += 1
    print(score)
finally:
    # Close the env even if the cell is interrupted or a step raises, so no
    # manual clean-up cell is needed afterwards.
    env.close()


In [None]:
# Manual clean-up: close whichever `env` is currently bound (useful after an
# interrupted cell that never reached its own close() call).
env.close()