In [1]:
from Connect4Env import Connect4
from stable_baselines3 import PPO, A2C, DQN
from Connect4Board import pretty_board
from stable_baselines3.common.logger import configure

from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env

from tqdm import tqdm

In [None]:
# Build the Connect4 environment with its default arguments, then validate it:
# check_env raises an error if the environment does not follow the expected
# interface (warn=True is passed through to the checker).
env = Connect4()
check_env(env=env, warn=True)

## Train the model

In [3]:
# Train an A2C agent with an MLP policy on `env` for 10000 timesteps and save it
# to 'my_model.zip'.
#
# Optional custom logging (disabled): build a logger with
#   configure("/tmp/sb3_log/", ["stdout", "csv", "tensorboard"])
# and attach it with model.set_logger(...) before calling learn().
model = A2C(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=10000)
model.save('my_model.zip')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 10.5      |
|    ep_rew_mean        | -4.04e+03 |
| time/                 |           |
|    fps                | 207       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.89     |
|    explained_variance | -0.00872  |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 475       |
|    value_loss         | 4.03e+05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 10.6      |
|    ep_rew_mean        | -5.06e+03 |
| time/                 |           |
|    fps                | 248       |
|    iterations         | 200   

In [4]:
# Evaluation environment: one Connect4 instance, created with player2='random',
# wrapped in a VecEnv.
vec_env = make_vec_env(Connect4, n_envs=1, env_kwargs={'player2': 'random'})

# Play the saved agent for at most 100 of its own moves, printing every step.
obs = vec_env.reset()
vec_env.render()
step = 0
board_states = []  # one board per step; the final entry is the terminal board
actions = []       # the agent's chosen action at each step

model = A2C.load('my_model.zip')
while step < 100:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)

    print(f"Step {step}")
    print("Action: ", action)
    print("reward: ", reward)
    actions.append(action[0])

    if not done:
        # Game still running: record the current board, display it, carry on.
        board_states.append(obs[0, :, :])
        vec_env.render()  # this prints obs
        print()
        step += 1
        continue

    # Episode finished. The VecEnv resets automatically on done, so the final
    # board is read from info[0]['terminal_observation'] rather than from obs.
    board_states.append(info[0]['terminal_observation'])
    pretty_board(info[0]['terminal_observation'])
    print(f"{info[0]['Winner']} wins!!!\n")
    print(
        "###############################\n",
        "########## GAME OVER ##########\n",
        "###############################\n",
        sep="\n",
    )
    break


  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
| O |   |   |   |   |   |   |
_____________________________

Step 0
Action:  [3]
reward:  [-1.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
| O |   | O | X |   |   |   |
_____________________________


Step 1
Action:  [3]
reward:  [-1.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   | X |   |   |   |
| O |   | O | X |   | O |   |
_____________________________


Step 2
Action:  [3]
reward:  [-1.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   

## Iterative training

In [None]:
# Five training rounds. Each round builds a new environment with
# player2='my_model', trains a freshly constructed A2C agent on it for 10000
# timesteps, and overwrites 'my_model.zip' with the result.
# NOTE(review): presumably player2='my_model' makes the environment load the
# previously saved agent as the opponent -- confirm in Connect4Env. Another cell
# passes 'my_model.zip'; verify both spellings resolve to the same file.
for i in tqdm(range(5)):
    env = Connect4(player2='my_model')
    model = A2C("MlpPolicy", env, verbose=False)
    model.learn(total_timesteps=10000)
    model.save('my_model.zip')

In [15]:
# Evaluation environment: one Connect4 instance, created with
# player2='my_model.zip', wrapped in a VecEnv.
vec_env = make_vec_env(Connect4, n_envs=1, env_kwargs={'player2': 'my_model.zip'})

# Load the agent from disk rather than relying on whatever `model` object an
# earlier cell left in the kernel, so this cell gives the same result on a
# fresh kernel (Restart & Run All) and regardless of cell execution order.
model = A2C.load('my_model.zip')

# Test the trained agent using the vecenv
obs = vec_env.reset()
vec_env.render()
board_states, actions = [], []

# Play at most 200 agent moves, printing every step.
for step in range(200):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)

    print(f"Step {step}")
    print("Action: ", action)
    print("reward: ", reward)
    actions.append(action[0])

    # `done` is a per-environment array; index the single env explicitly
    # instead of relying on the truthiness of a one-element array.
    if done[0]:
        # The VecEnv resets automatically on done, so the final board is read
        # from info[0]['terminal_observation'] rather than from obs.
        board_states.append(info[0]['terminal_observation'])
        pretty_board(info[0]['terminal_observation'])
        print(f"{info[0]['Winner']} wins!!!\n")
        print("###############################\n")
        print("########## GAME OVER ##########\n")
        print("###############################\n")
        break

    board_states.append(obs[0, :, :])  # Keep track of obs
    vec_env.render()  # this prints obs
    print()


  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   | O |   |   |   |
_____________________________

Step 0
Action:  [2]
reward:  [-1.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   | O |   |   |   |   |
|   |   | X | O |   |   |   |
_____________________________


Step 1
Action:  [2]
reward:  [-1.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   | O |   |   |   |   |
|   |   | X |   |   |   |   |
|   |   | O |   |   |   |   |
|   |   | X | O |   |   |   |
_____________________________


Step 2
Action:  [2]
reward:  [-1.]

  0   1   2   3   4   5   6  
_____________________________
|   |   | O |   

In [29]:
# Evaluation environment: one Connect4 instance, created with player2='random',
# wrapped in a VecEnv.
vec_env = make_vec_env(Connect4, n_envs=1, env_kwargs={'player2': 'random'})

# Load the agent from disk rather than relying on whatever `model` object an
# earlier cell left in the kernel, so this cell gives the same result on a
# fresh kernel (Restart & Run All) and regardless of cell execution order.
model = A2C.load('my_model.zip')

# Test the trained agent using the vecenv
obs = vec_env.reset()
vec_env.render()
board_states, actions = [], []

# Play at most 100 agent moves, printing every step.
for step in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)

    print(f"Step {step}")
    print("Action: ", action)
    print("reward: ", reward)
    actions.append(action[0])

    # `done` is a per-environment array; index the single env explicitly
    # instead of relying on the truthiness of a one-element array.
    if done[0]:
        # The VecEnv resets automatically on done, so the final board is read
        # from info[0]['terminal_observation'] rather than from obs.
        board_states.append(info[0]['terminal_observation'])
        pretty_board(info[0]['terminal_observation'])
        print(f"{info[0]['Winner']} wins!!!\n")
        print("###############################\n")
        print("########## GAME OVER ##########\n")
        print("###############################\n")
        break

    board_states.append(obs[0, :, :])  # Keep track of obs
    vec_env.render()  # this prints obs
    print()


  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   | O |   |   |   |   |
_____________________________

Step 0
Action:  [2]
reward:  [0.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   | X |   |   |   |   |
|   |   | O |   |   | O |   |
_____________________________


Step 1
Action:  [2]
reward:  [0.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |   |   |   |
|   |   |   |   |   |   |   |
|   |   | O |   |   |   |   |
|   |   | X |   |   |   |   |
|   |   | X |   |   |   |   |
|   |   | O |   |   | O |   |
_____________________________


Step 2
Action:  [5]
reward:  [0.]

  0   1   2   3   4   5   6  
_____________________________
|   |   |   |   |  