A notebook containing a PPO RL agent for the ConnectX competition (rows = 6, columns = 7, inarow = 4).

Created using stable-baselines3.

# 1. Install dependencies

In [None]:
!pip install stable-baselines3

# 2. Import global libraries
Documentation
- OpenAI Gym: https://gym.openai.com/
- NumPy: https://numpy.org/doc/stable/index.html
- PyTorch: https://pytorch.org/
- Stable-Baselines3: https://stable-baselines3.readthedocs.io/en/master/index.html
- Kaggle Environments: https://github.com/Kaggle/kaggle-environments

In [None]:
import gym
import os
import sys
import inspect
import numpy as np

import torch as th
import torch.nn as nn
th.set_printoptions(profile="full")

from tqdm.auto import tqdm
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.monitor import Monitor, load_results
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
# Disregard the gfootball error, it's a known issue
# Relevant issue: https://github.com/Kaggle/kaggle-environments/issues/102
from kaggle_environments import make, evaluate

# 3. Create ConnectX Environment

## 3.1. Environment
The agent will use a custom OpenAI Gym environment built on top of the Kaggle ConnectX environment.

In [None]:
# Directory that receives the training monitor logs, the evaluation logs and
# the best model saved during evaluation
LOG_DIR = "log/"

class Connect4(gym.Env):
    """
    Connect 4 game environment based on OpenAI Gym standard and Kaggle helpers
    
    Attributes
    ----------
    INVALID_MOVE_REWARD : int
        Reward returned (together with ``done = True``) when the agent picks a
        column that is already full
    env : kaggle_environment.Environment 
        Class representing the ConnectX environment
    switch_prob : float
        A number between 0.0 and 1.0 representing probability of switching which agent
        will play first
    agents : list of [str or func]
        Agents that will use the environment
    trainer : dict of {str : func}
        Dictionary with ``reset()`` and ``step()`` functions which reset and
        step through the game board
    board_template : tuple of int
        Template for the game board (dimensions)
    board : numpy.ndarray of int
        Current game board
    reward_range : tuple of int (min, max)
        Lowest and highest reward the environment can return
    action_space : gym.spaces.Discrete
        Object representing environment action space (columns from 0 to 6)
    observation_space : gym.spaces.Box
        Object representing environment observation space 
        (game board dimensions and possible values)
    """
    INVALID_MOVE_REWARD = -10

    def __init__(self, opponent = 'random', switch_prob = 0.5):
        super().__init__()
        self.env = make('connectx', debug = False)
        self.switch_prob = switch_prob
        self.agents = [None, opponent] # The agent will train in place of None
        self.trainer = self.env.train(self.agents)
        config = self.env.configuration
        # PyTorch Conv2d expect 4 dimensional data 
        # (nSamples x nChannels x Height x Width)
        self.board_template = (1, config.rows, config.columns) 
        self.board = np.zeros(self.board_template, int)

        # Define required gym fields
        # Gym expects ``reward_range`` to be a (min, max) pair. The game itself
        # rewards lose: -1, draw: 0, win: 1, and ``step()`` additionally returns
        # ``INVALID_MOVE_REWARD`` for illegal moves, so that is the lower bound.
        self.reward_range = (self.INVALID_MOVE_REWARD, 1)
        self.action_space = gym.spaces.Discrete(config.columns)
        self.observation_space = gym.spaces.Box(
            # ``low`` and ``high`` represent the possible values
            low = 0,
            high = 2,
            shape = self.board_template,
            dtype = int
        )
    
    def switch_starting_positions(self):
        """Swap the order of the agents and rebuild the trainer accordingly."""
        self.agents = self.agents[::-1]
        self.trainer = self.env.train(self.agents)

    def step(self, action):
        """
        Play ``action`` (a column index) and return the gym step tuple
        ``(board, reward, done, info)``.

        A move into a column whose top cell is occupied is not forwarded to the
        Kaggle trainer: the episode ends immediately with
        ``INVALID_MOVE_REWARD`` and the board is left unchanged.
        """
        action = int(action)
        info = {}

        # Check for invalid moves (top row of the single-channel board)
        if self.board[0][0][action] != 0:
            reward, done = self.INVALID_MOVE_REWARD, True
        else:
            observation, reward, done, info = self.trainer.step(action)
            self.board = np.array(observation['board']).reshape(self.board_template)
                        
        return self.board, reward, done, info
    
    def reset(self):
        """
        Start a new game and return the initial board.

        With probability ``switch_prob`` the starting positions of the two
        agents are swapped first.
        """
        if np.random.random() < self.switch_prob:
            self.switch_starting_positions()
            
        self.board = np.array(
            self.trainer.reset()['board']
        ).reshape(self.board_template)

        return self.board

# The monitor writes its log file into LOG_DIR, so make sure it exists
os.makedirs(LOG_DIR, exist_ok = True)

# Game against the built-in random opponent, wrapped in a Monitor for logging
monitored_env = Monitor(
    Connect4('random'),
    LOG_DIR,
    allow_early_resets = True
)

# Vectorize environment (expected by stable-baselines3 algorithms)
training_env = DummyVecEnv([lambda: monitored_env])

## 3.2. Progress bar callback

In [None]:
class ProgressBarCallback(BaseCallback):
    """
    Callback for displaying the progress bar in realtime
    
    Attributes
    ----------
    pbar : tqdm.pbar
        Progress bar object
    """
    def __init__(self, pbar):
        super().__init__()
        self.pbar = pbar

    def _on_step(self):
        """
        Sync the progress bar with the number of timesteps done so far.

        Returns
        -------
        continue_training : bool
            Always ``True``. stable-baselines3 aborts training when a callback
            returns a falsy value, so the implicit ``None`` must be avoided.
        """
        self.pbar.n = self.num_timesteps
        # ``update(0)`` only redraws the bar, the position was set above
        self.pbar.update(0)

        return True

class ProgressBarManager:
    """
    Context manager that creates the progress bar on entry, hands out the
    callback which drives it, and completes/closes the bar on exit
    
    Attributes
    ----------
    pbar : tqdm.pbar
        Progress bar object (``None`` until the ``with`` block is entered)
    total_timesteps : int
        Number of training steps
    """
    def __init__(self, total_timesteps):
        self.total_timesteps = total_timesteps
        self.pbar = None

    def __enter__(self):
        progress_bar = tqdm(total = self.total_timesteps)
        self.pbar = progress_bar

        return ProgressBarCallback(progress_bar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        progress_bar = self.pbar
        # Force the bar to its final position before closing it
        progress_bar.n = self.total_timesteps
        progress_bar.update(0)
        progress_bar.close()

# 4. Create an Agent

## 4.1. Neural network model

In [None]:
class CustomCNN(BaseFeaturesExtractor):
    """
    Feature extractor: one convolutional stage followed by one linear stage
    
    Attributes
    ----------
    cnn : torch.nn.Sequential
        Convolution, batch normalization, ReLU and flatten
    linear : torch.nn.Sequential
        Linear layer, batch normalization and ReLU producing ``features_dim``
        features
    """
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int):
        super().__init__(observation_space, features_dim)

        # A single convolution with a 4x4 kernel, chosen to match the
        # "4 marks in a row" pattern the game is about
        conv_channels = 64
        convolution = nn.Conv2d(
            in_channels = observation_space.shape[0],
            out_channels = conv_channels,
            kernel_size = 4,
            stride = 1
        )
        self.cnn = nn.Sequential(
            convolution,
            nn.BatchNorm2d(conv_channels),
            nn.ReLU(),
            nn.Flatten()
        )

        # Push one sampled observation through the CNN to learn how many
        # values the flattened output holds
        with th.no_grad():
            sample = th.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample).shape[1]

        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim),
            nn.BatchNorm1d(features_dim),
            nn.ReLU()
        )

    def forward(self, x):
        """
        Forward propagation
    
        Parameters
        ----------
        x : torch.Tensor
            Batch of observations taken from the environment
        
        Returns
        ------
        x : torch.Tensor
            Features extracted by the CNN stage followed by the linear stage
        """
        return self.linear(self.cnn(x))

## 4.3. Creating/loading agent model

In [None]:
# Architecture of the network placed on top of the features extractor:
# the first number is the size of the shared layer whose output feeds
#   pi - policy network which generates actions
#   vf - value network which scores provided observations
NET_ARCH = [64, dict(pi=[32, 16], vf=[32, 16])]

POLICY_KWARGS = dict(
    features_extractor_class = CustomCNN,
    # Activation function for MlpPolicy part of the network
    activation_fn = th.nn.ReLU,
    net_arch = NET_ARCH,
    # ``features_dim`` is the number of features the features extractor outputs
    features_extractor_kwargs = {'features_dim': 768},
)

# Path to model zip. Leave as empty string to create a new model.
MODEL_PATH = ''

# Create a fresh agent unless a saved model was requested
if not MODEL_PATH:
    print('Creating a new model\n')
    agent = PPO(
        policy = 'MlpPolicy',
        env = training_env,
        policy_kwargs = POLICY_KWARGS,
        verbose = 0
    )
else:
    print('Loading existing model\n')
    agent = PPO.load(
        path = MODEL_PATH,
        env = training_env,
        verbose = 0
    )

print(agent.policy)

## 4.2. Training

In [None]:
%%time

# Training constants
TOTAL_TIMESTEPS = 100_000
EVAL_FREQ = 2048 # The policy is updated every 2048 timesteps
EVAL_EPISODES = 100 # How many episodes the agent should be tested for evaluation

# Evaluate on a dedicated environment. Evaluating on ``training_env`` itself
# would reset the training environment in the middle of a rollout and mix the
# evaluation episodes into the training monitor log plotted below.
# The Monitor gets no file name, so nothing extra is written to LOG_DIR.
eval_env = DummyVecEnv([lambda: Monitor(Connect4('random'))])

with ProgressBarManager(TOTAL_TIMESTEPS) as progress_callback:
    agent.learn(
        total_timesteps = TOTAL_TIMESTEPS,
        callback = [
            progress_callback,
            EvalCallback(
                eval_env = eval_env,
                n_eval_episodes = EVAL_EPISODES,
                best_model_save_path = LOG_DIR,
                log_path = LOG_DIR,
                eval_freq = EVAL_FREQ,
                deterministic = True,
                render = False
            )
        ]
    )

# Display training result (rolling mean of the episode rewards)
df = load_results(LOG_DIR)['r']
df.rolling(window = 1000).mean().plot()

## 4.4. Neural network agent
For testing purposes - unoptimized and not encapsulated (not a valid submission)

In [None]:
def trained_nn_agent(observation, config = {'rows': 6, 'columns': 7, 'inarow': 4}):
    """
    Pick the next move by asking the trained ``agent`` for a prediction
    
    Parameters
    ----------
    observation: mapping
        Environment observation; its ``'board'`` entry holds the game board
    config: dict of {str: int}
        A dictionary with configuration parameters. Normally, it's passed to the
        agent by the environment.
        
    Returns
    -------
    action: int
        Number representing the column chosen by the agent 
    """
    import numpy as np

    # Same (channel, rows, columns) layout as the training observations
    board_shape = (1, config['rows'], config['columns'])
    board = np.array(observation['board']).reshape(board_shape)

    action, _ = agent.predict(board)

    return int(action)

# 5. Agent evaluation

## 5.1. Encapsulated heuristic agent
A simple rule-based agent using the minimax algorithm and custom heuristic weights, used for benchmarking. Ready for submission (it reached around 47th place on the leaderboard).

In [None]:
def heuristic_agent(observation, config):
    """
    Calculate next move with a depth-limited minimax search over a built in
    board heuristic. Largely unoptimized in terms of speed.
    
    Parameters
    ----------
    observation: kaggle_environments.object
        An object representing the observation of the environment. Provides
        ``board`` (flat, row-major sequence of cells) and ``mark`` (the piece
        this agent plays).
    config: dict of {str: int}
        A dictionary with environment configuration (``rows``, ``columns``,
        ``inarow``). Normally, it's passed to the agent by the environment.
        
    Returns
    -------
    action: int
        Number representing the column chosen by the agent 
    """
    import random
    import numpy as np
    
    # Max depth of game state tree search
    LOOKAHEAD = 3
    
    def drop_piece(board, column, piece, config):
        """Return a copy of ``board`` with ``piece`` dropped into ``column``."""
        next_board = board.copy()
        
        # Scan from the bottom row upwards for the first empty cell
        for row in range(config.rows - 1, -1, -1):
            if next_board[row][column] == 0:
                break
                
        next_board[row][column] = piece
        
        return next_board
    
    def get_windows(board, config):
        """
        Collect every line of ``config.inarow`` consecutive cells

        Parameters
        ----------
        board : numpy.ndarray
            A ``numpy.ndarray`` representing game board
        config : dict of {str: int}
            A dictionary with environment configuration 
            
        Returns
        -------
        windows : list of list of int
            All horizontal, vertical and diagonal windows of the board
        """
        grid = board.tolist()
        inarow = config.inarow
        # Number of valid window start positions along each axis. A window of
        # length ``inarow`` can start at indices 0 .. size - inarow inclusive.
        row_starts = config.rows - inarow + 1
        column_starts = config.columns - inarow + 1
        offsets = range(inarow)
        windows = []
        
        # Horizontal
        for row in range(config.rows):
            for column in range(column_starts):
                windows.append(grid[row][column : column + inarow])
                
        # Vertical
        for row in range(row_starts):
            for column in range(config.columns):
                windows.append([grid[row + i][column] for i in offsets])
                
        # Positive diagonal
        for row in range(row_starts):
            for column in range(column_starts):
                windows.append([grid[row + i][column + i] for i in offsets])
                
        # Negative diagonal
        for row in range(inarow - 1, config.rows):
            for column in range(column_starts):
                windows.append([grid[row - i][column + i] for i in offsets])
                
        return windows
    
    def check_window(window, num_discs, piece, config):
        """Check that a window holds ``num_discs`` pieces and is otherwise empty."""
        return (
            window.count(piece) == num_discs and
            window.count(0) == config.inarow - num_discs
        )
    
    def count_windows(windows, num_discs, piece, config):
        """
        Count number of windows satisfying the specified heuristic conditions

        Parameters
        ----------
        windows : list of list of int
            Windows of the game board, as returned by ``get_windows``
        num_discs : int
            Number representing how many discs in a row should be counted
        piece : int
            Number representing which piece the agent is playing
        config : dict of {str: int}
            A dictionary with environment configuration 
            
        Returns
        -------
        num_windows : int
            Number representing how many windows satisfy the parameters
        """
        return sum(
            1 for window in windows
            if check_window(window, num_discs, piece, config)
        )
    
    def get_heuristic(board, piece, config):
        """
        Calculate the value of a given game board with built in heuristic.
        The heuristic was chosen arbitrarily.

        Parameters
        ----------
        board : numpy.ndarray
            A ``numpy.ndarray`` representing game board
        piece : int
            Number representing which piece the agent is playing
        config : dict of {str: int}
            A dictionary with environment configuration 
            
        Returns
        -------
        score : float
            Number representing the score calculated for a given game board
        """
        ENEMY_PIECE = piece % 2 + 1
        # The windows are the same for every count, so extract them only once
        windows = get_windows(board, config)
        
        num_fours = count_windows(windows, 4, piece, config)
        num_threes = count_windows(windows, 3, piece, config)
        num_twos = count_windows(windows, 2, piece, config)
        num_fours_opp = count_windows(windows, 4, ENEMY_PIECE, config)
        num_threes_opp = count_windows(windows, 3, ENEMY_PIECE, config)
        num_twos_opp = count_windows(windows, 2, ENEMY_PIECE, config)
        
        score = 1e6 * num_fours + 1e3 * num_threes + 1e2 * num_twos \
        - 1e5 * num_fours_opp - 1e4 * num_threes_opp - 1e2 * num_twos_opp 
        
        return score
    
    def score_move(board, column, piece, config, lookahead):
        """
        Use minimax algorithm to calculate the value of dropping piece in selected
        column

        Parameters
        ----------
        board : numpy.ndarray
            A ``numpy.ndarray`` representing game board
        column : int
            Number representing column
        piece : int
            Number representing which piece the agent is playing
        config : dict of {str: int}
            Dictionary with environment configuration
        lookahead : int
            Number representing max depth to which the game board tree is explored
            
        Returns
        -------
        score : float
            Number representing the score calculated for a given game board
        """
        next_board = drop_piece(board, column, piece, config)
        
        # The agent has just moved, so the opponent (minimizer) is next
        return minimax(next_board, lookahead - 1, False, piece, config)

    def is_terminal_window(window, config):
        """Check whether a window is completely filled by one player."""
        return window.count(1) == config.inarow or window.count(2) == config.inarow

    def is_terminal_node(board, config):
        """
        Check if the current game board finishes the game

        Parameters
        ----------
        board : numpy.ndarray
            A ``numpy.ndarray`` representing game board
        config : dict of {str: int}
            A dictionary with environment configuration 
            
        Returns
        -------
        is_terminal : bool
            ``True`` when the top row is full (draw) or either player has
            ``config.inarow`` pieces in a line
        """
        # Check for draw 
        if 0 not in board[0].tolist():
            return True
        
        # Check for win: horizontal, vertical, or diagonal
        return any(
            is_terminal_window(window, config)
            for window in get_windows(board, config)
        )

    # Minimax implementation
    def minimax(node, depth, isMaximizingPlayer, piece, config):
        """
        Maximize or minimize the heuristic score over the game tree

        Parameters
        ----------
        node : numpy.ndarray
            A ``numpy.ndarray`` representing game board in a current node
        depth : int
            Number representing remaining tree depth to explore
        isMaximizingPlayer : bool
            Flag which determines if the algorithm should maximize or minimize
        piece : int
            Number representing which pieces should be maximized or minimized
        config : dict of {str: int}
            A dictionary with environment configuration
            
        Returns
        -------
        value : float
            Best heuristic score reachable from ``node`` for the player to move
        """
        if depth == 0 or is_terminal_node(node, config):
            return get_heuristic(node, piece, config)
        
        VALID_MOVES = [col for col in range(config.columns) if node[0][col] == 0]
        
        if isMaximizingPlayer:
            value = float('-inf')
            for column in VALID_MOVES:
                child = drop_piece(node, column, piece, config)
                value = max(value, minimax(child, depth - 1, False, piece, config))        
        else:
            value = float('inf')
            for column in VALID_MOVES:
                child = drop_piece(node, column, piece % 2 + 1, config)
                value = min(value, minimax(child, depth - 1, True, piece, config))
        
        return value
    
    # Get list of valid moves (columns whose top cell is still empty)
    valid_moves = [
        col for col in range(config.columns) if observation.board[col] == 0
    ]
    
    # Convert the board to a 2D board
    board = np.asarray(observation.board).reshape(config.rows, config.columns)
    
    # Use the heuristic to assign a score to each possible board in the next step
    scores = {
        column: score_move(board, column, observation.mark, config, LOOKAHEAD)
        for column in valid_moves
    }
    
    # Get a list of columns (moves) that maximize the heuristic
    best_score = max(scores.values(), default = None)
    max_columns = [
        column for column, score in scores.items() if score == best_score
    ]
    
    # Select at random from the maximizing columns
    return random.choice(max_columns)

## 5.2. Agent validation
Check agent predictions in a single game with console output

In [None]:
# Single game with console output enabled
env = make("connectx", debug = 1)

# The trained agent moves first, the heuristic agent second
players = [trained_nn_agent, heuristic_agent]
env.run(players)

# Show the game
env.render(mode="ipython")

## 5.3. Evaluate an Agent
Evaluate the agent performance against selected agents. Kaggle also provides two agents:
- 'random' - does a random valid move
- 'negamax' - a negamax agent with 4 moves lookahead

The heuristic agents are unfortunately much better. RL agent would need to use a bit more sophisticated techniques and train longer to defeat them.

In [None]:
def evaluate_agent(
    agent1,
    agent2,
    config = {'rows': 6, 'columns': 7, 'inarow': 4},
    num_episodes = 100
):
    """
    Play ``num_episodes`` games between two agents and print the win rates
    and the number of invalid plays of each agent.

    Parameters
    ----------
    agent1, agent2 : str or func
        Agents accepted by ``kaggle_environments.evaluate``
    config : dict of {str: int}
        Environment configuration forwarded to ``evaluate``
    num_episodes : int
        Total number of games, split between both starting orders
    """
    episodes_agent1_first = num_episodes // 2
    episodes_agent2_first = num_episodes - episodes_agent1_first

    # Agent 1 goes first (roughly) half the time
    outcomes = evaluate(
        environment = "connectx",
        agents = [agent1, agent2],
        configuration = config,
        num_episodes = episodes_agent1_first
    )

    # Agent 2 goes first (roughly) half the time
    swapped_outcomes = evaluate(
        environment = "connectx",
        agents = [agent2, agent1],
        configuration = config,
        num_episodes = episodes_agent2_first
    )
    # Flip each pair so that the first entry always belongs to agent 1
    outcomes.extend([second, first] for first, second in swapped_outcomes)

    total_games = len(outcomes)
    agent1_wins = outcomes.count([1,-1])
    agent2_wins = outcomes.count([-1,1])

    print("Agent 1 Win Percentage:", np.round(agent1_wins / total_games, 2))
    print("Agent 2 Win Percentage:", np.round(agent2_wins / total_games, 2))
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))

# Opponents to benchmark the trained agent against, with their report titles.
# The heuristic match-up may take a while.
matchups = [
    ('Trained agent vs Random', 'random'),
    ('Trained agent vs Heuristic', heuristic_agent),
]

for position, (title, opponent) in enumerate(matchups):
    # Separate consecutive reports
    if position > 0:
        print('\n')

    print(title)
    evaluate_agent(trained_nn_agent, opponent)

# 6. Create submission
For more information see:
https://www.kaggle.com/c/connectx

## 6.1. Create an encapsulated agent function

In [None]:
%%writefile submission.py

def submission_agent(observation, config):
    import numpy as np
    import torch as th
    from torch import nn as nn
    from torch import tensor
    
    class ActorCriticPolicy(nn.Module):
        """
        Neural network for a actor critic method. The feature extractor part is the
        same as in ``CustomCNN``, but there are additional parts that were previously
        provided by the agent policy. Required for encapsulation.

        Attributes
        ----------
        features_extractor : torch.nn.Sequential
            Object representing CNN part of the neural network
        shared_net : torch.nn.Sequential
            Object representing part of the neural network that was transforming the
            ``features_extractor`` into policy and value networks (provided by MlpPolicy)
        policy_net : torch.nn.Sequential
            Object representing part of the neural network that was transforming the
            output of ``shared_net`` into policy 
            (doesn't need to be related to the action space)
        action_net : torch.nn.Sequential
            Object representing part of the neural network that was transforming the
            output of ``policy_net`` into one raw score (logit) per action
        """
        def __init__(self):
            super().__init__()
            
            self.features_extractor = nn.Sequential(
                nn.Conv2d(
                    in_channels = 1,
                    out_channels = 64,
                    kernel_size = 4,
                    stride = 1
                ),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.Flatten(),
                nn.Linear(768, 768),
                nn.BatchNorm1d(768),
                nn.ReLU()
            )
            
            self.shared_net = nn.Sequential(
                nn.Linear(768, 64),
                nn.ReLU()
            )
            
            self.policy_net = nn.Sequential(
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Linear(32, 16),
                nn.ReLU()
            )
            
            # The trained action head is a plain linear layer producing logits.
            # No activation may follow it: a ReLU would clamp every negative
            # logit to 0 and change which action ``argmax`` selects.
            # Kept inside ``nn.Sequential`` so the parameters are still named
            # ``action_net.0.weight`` / ``action_net.0.bias``.
            self.action_net = nn.Sequential(
                nn.Linear(16, 7)
            )

        def forward(self, x):
            """Return the index of the action with the highest logit."""
            x = self.features_extractor(x)
            x = self.shared_net(x)
            x = self.policy_net(x)
            x = self.action_net(x)
            x = x.argmax() # Take action with highest weight
            
            return x

## 6.2. Append neural network weights to the file

In [None]:
print('Trained model network:\n')
print(agent.policy.state_dict().keys())

agent_path = 'submission.py'

# Export the parameters from the CPU copy of the policy
trained_state = agent.policy.to('cpu').state_dict()

# Parameter names of layers with and without batch normalization statistics
PLAIN_PARAMS = ('weight', 'bias')
BATCHNORM_PARAMS = PLAIN_PARAMS + (
    'running_mean', 'running_var', 'num_batches_tracked'
)

# (name inside ActorCriticPolicy, name inside the trained policy, parameters).
# Activation functions and flatten hold no parameters and are skipped.
LAYER_MAP = [
    ('features_extractor.0', 'features_extractor.cnn.0', PLAIN_PARAMS),
    ('features_extractor.1', 'features_extractor.cnn.1', BATCHNORM_PARAMS),
    ('features_extractor.4', 'features_extractor.linear.0', PLAIN_PARAMS),
    ('features_extractor.5', 'features_extractor.linear.1', BATCHNORM_PARAMS),
    ('shared_net.0', 'mlp_extractor.shared_net.0', PLAIN_PARAMS),
    ('policy_net.0', 'mlp_extractor.policy_net.0', PLAIN_PARAMS),
    ('policy_net.2', 'mlp_extractor.policy_net.2', PLAIN_PARAMS),
    ('action_net.0', 'action_net', PLAIN_PARAMS),
]

# Reassign the neural network parameters to the attributes of ActorCriticPolicy class.
state_dict = {
    f'{target}.{param}': trained_state[f'{origin}.{param}']
    for target, origin, params in LAYER_MAP
    for param in params
}

# NOTE(review): the tensors are written using their printed representation,
# which follows the torch print options configured earlier in the notebook;
# confirm the printed precision is sufficient for the exported weights.
with open(agent_path, mode='a') as file:
    file.write(f'    state_dict = {state_dict}\n')
    print('\nAppending model weights successful!')

## 6.3. Append the ``move`` function
Steps necessary for agent to make a move

In [None]:
%%writefile -a submission.py

    model = ActorCriticPolicy()
    # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.float
    model = model.float()
    # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict
    model.load_state_dict(state_dict)
    # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to 
    # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.cpu
    model = model.to('cpu')
    # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.eval
    model = model.eval()
    
    observation = tensor(observation['board']).reshape(
        1, 1, config.rows, config.columns
    ).float()
    action = model(observation)
    
    return int(action)

# 7. Validate Submission

## 7.1. Play against itself
Play the submission agent against itself. This is the first episode the competition runs to weed out erroneous agents, so it roughly verifies that the agent is fully encapsulated and can be run remotely.

In [None]:
# Load the generated submission file and execute it to define
# ``submission_agent`` in this notebook
with open(agent_path, mode = 'r') as file:
    source = file.read()

exec(source)

env = make("connectx", debug = True)

# The submission agent plays both sides
env.run([submission_agent] * 2)

# Show the game
env.render(mode="ipython")

## 7.2. Evaluate agent
To make sure that the neural network weights are assigned correctly

In [None]:
# Benchmark the encapsulated submission agent against the random agent
evaluate_agent(submission_agent, 'random')

# Submit to Competition

1. Commit this kernel.
2. View the committed version.
3. Go to "Data" section and find submission.py file.
4. Click "Submit to Competition"
5. Go to [My Submissions](https://kaggle.com/c/connectx/submissions) to view your score and episodes being played.