In [None]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import random
from numpy.typing import NDArray
import pandas as pd
import itertools
import json
import time
from typing import List, Tuple, Deque, Optional, Callable
import gym_connect4
import gym
import torch
import collections
import torch.nn.init as init
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import _LRScheduler

In [None]:
# Shell command: enable the ipywidgets notebook extension.
# NOTE(review): presumably required so the tqdm.notebook progress bars render — confirm.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
# Create the Connect4-v0 environment with height=6, width=9 and connect=4.
env = gym.make('Connect4-v0', height=6, width=9, connect=4)

In [None]:
class ObjectiveFunction:
    """Callable wrapper meant to score a set of policy parameters on an environment.

    The evaluation itself is still a stub: `eval` counts the call, resolves the
    per-call overrides against the defaults given at construction time, and
    returns 0.
    """

    def __init__(self, env, policy,best_policy, num_episodes=1, max_time_steps=float('inf'), minimization_solver=True):
        """Store the collaborators and the default evaluation settings.

        Args:
            env: Environment the policy is meant to be evaluated on.
            policy: Policy under evaluation; must expose `num_params`.
            best_policy: Reference policy (stored, not used by the stub).
            num_episodes: Default number of episodes per evaluation.
            max_time_steps: Default cap on steps per episode.
            minimization_solver: Stored flag (not used by the stub).
        """
        # Dimension of the search space is dictated by the policy.
        self.ndim = policy.num_params

        self.env, self.policy, self.best_policy = env, policy, best_policy
        self.num_episodes, self.max_time_steps = num_episodes, max_time_steps
        self.minimization_solver = minimization_solver

        # Number of times `eval` has been invoked so far.
        self.num_evals = 0

    def eval(self, policy_params, num_episodes=None, max_time_steps=None, render=False,win_threshold=0.7):
        """Count one evaluation and return the (placeholder) average reward, 0."""
        self.num_evals += 1

        # Fall back to the constructor defaults when no override is given.
        num_episodes = self.num_episodes if num_episodes is None else num_episodes
        max_time_steps = self.max_time_steps if max_time_steps is None else max_time_steps

        # No episodes are played yet, so the average reward is always zero.
        return 0

    def __call__(self, policy_params, num_episodes=None, max_time_steps=None, render=False):
        """Shorthand for `eval` (the default `win_threshold` is used)."""
        return self.eval(policy_params, num_episodes, max_time_steps, render)

In [None]:
class QNetwork(torch.nn.Module):
    """Fully connected Q-value estimator with five hidden layers.

    The layers are exposed as `layer1` .. `layer6` (the last one is the linear
    output head) and every hidden layer is followed by a LeakyReLU.
    """

    def __init__(self, n_observations: int, n_actions: int, nn_l1: int, nn_l2: int, nn_l3=0, nn_l4=0, nn_l5 =0):
        """Build the layers and apply Xavier-uniform initialisation to the weights.

        Args:
            n_observations: Size of the input vector.
            n_actions: Number of Q-values produced.
            nn_l1, nn_l2: Widths of the first two hidden layers.
            nn_l3, nn_l4, nn_l5: Widths of the remaining hidden layers; a value
                of 0 means "same width as the previous hidden layer".
        """
        super(QNetwork, self).__init__()

        # Resolve the "0 = inherit previous width" convention, front to back.
        nn_l3 = nn_l2 if nn_l3 == 0 else nn_l3
        nn_l4 = nn_l3 if nn_l4 == 0 else nn_l4
        nn_l5 = nn_l4 if nn_l5 == 0 else nn_l5

        widths = [n_observations, nn_l1, nn_l2, nn_l3, nn_l4, nn_l5, n_actions]
        layer_numbers = range(1, len(widths))

        # Create layer1..layer6 in order, then the shared activation.
        for number in layer_numbers:
            setattr(self, "layer{}".format(number), torch.nn.Linear(widths[number - 1], widths[number]))
        self.leaky_relu = torch.nn.LeakyReLU()

        # Re-initialise the weights (biases keep the torch.nn.Linear default).
        for number in layer_numbers:
            init.xavier_uniform_(getattr(self, "layer{}".format(number)).weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the Q-values for `x`, casting it to the network's dtype first."""
        activations = x.to(self.layer1.weight.dtype)
        for hidden in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            activations = self.leaky_relu(hidden(activations))
        return self.layer6(activations)

In [None]:
class Policy:
    """Epsilon-greedy move selector backed by a `QNetwork`."""

    def __init__(self, env,epsilon=0.5,epsilon_min=0.013, epsilon_decay=0.9875):
        """Create a fresh Q-network sized from the board and store the exploration settings.

        Args:
            env: Environment exposing `width`, `height` and `get_moves()`.
            epsilon: Initial probability of picking a random legal move.
            epsilon_min: Floor applied by `decay_epsilon`.
            epsilon_decay: Multiplicative factor applied by `decay_epsilon`.
        """
        self.number_inputs = env.width * env.height
        self.number_actions = env.width
        # One input per board cell, one output per column; the two explicit
        # hidden widths are the board height and the number of columns.
        self.qnetwork = QNetwork(
            n_observations=self.number_inputs,
            n_actions=env.width,
            nn_l1=env.height,
            nn_l2=self.number_actions,
        )
        self.epsilon = epsilon
        self.env = env
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

    def __call__(self,state,no_epsilon = False):
        """Pick a move for `state` among the environment's currently legal moves.

        With probability `epsilon` (unless `no_epsilon` is true) a random legal
        move is returned; otherwise the legal move with the highest Q-value,
        ties going to the move listed first.
        """
        legal_moves = self.env.get_moves()
        scores = self.qnetwork(torch.tensor(state))

        # The random draw happens on every call, even when exploration is disabled.
        roll = random.random()
        if roll < self.epsilon and not no_epsilon:
            return random.choice(legal_moves)

        chosen = legal_moves[0]
        for candidate in legal_moves:
            if scores[candidate] > scores[chosen]:
                chosen = candidate
        return chosen

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor without going below `epsilon_min`."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = decayed if decayed > self.epsilon_min else self.epsilon_min

    def reset_epsilon(self, epsilon = 0.5):
        """Set epsilon back to `epsilon`."""
        self.epsilon = epsilon

In [None]:
def better_step(env,action,player):
    """Play `action` in `env` and summarise the outcome from `player`'s point of view.

    Returns:
        A 3-tuple `(state, reward, done)` where `state` is the flattened
        difference between planes 1 and 2 of `player`'s observation, `reward`
        is `player`'s entry of the reward vector, and `done` is truthy when the
        environment's third return value is truthy or no legal action remains.
    """
    observations, rewards, winner, info = env.step(action)

    # Plane 0 of the observation is not needed; planes 1 and 2 are combined
    # into a single signed board.
    view = observations[player]
    signed_board = view[1] - view[2]

    finished = winner or (not len(info['legal_actions']))
    return signed_board.flatten(), rewards[player], finished

In [None]:
class MinimumExponentialLR(torch.optim.lr_scheduler.ExponentialLR):
    """Exponential learning-rate decay that never drops below `min_lr`."""

    def __init__(self, optimizer: torch.optim.Optimizer, lr_decay: float, last_epoch: int = -1, min_lr: float = 1e-6):
        """Create the scheduler.

        Args:
            optimizer: Optimizer whose learning rates are scheduled.
            lr_decay: Multiplicative decay applied per scheduler step.
            last_epoch: Index of the last epoch when resuming; -1 starts fresh.
            min_lr: Lower bound applied to every learning rate.
        """
        # min_lr must exist before the parent constructor runs, because the
        # parent performs an initial step that calls get_lr().
        self.min_lr = min_lr
        # Forward the caller's last_epoch instead of always restarting at -1.
        super().__init__(optimizer, lr_decay, last_epoch=last_epoch)

    def get_lr(self) -> List[float]:
        """Return base_lr * gamma ** last_epoch for each group, clamped at `min_lr`."""
        return [
            max(base_lr * self.gamma ** self.last_epoch, self.min_lr)
            for base_lr in self.base_lrs
        ]

Naive training

In [None]:
def train_naive_agent_against(env: gym.Env,
                      optimizer: torch.optim.Optimizer,
                      first_agent: Policy,
                      oponnent:Policy,
                      loss_fn: Callable,
                      device: torch.device,
                      lr_scheduler: _LRScheduler,
                      num_episodes: int,
                      gamma: float) -> List[float]:
    """Train `first_agent` with one-step Q-learning (no replay buffer) against `oponnent`.

    Each episode a coin flip decides who moves first. After every trainee turn
    (trainee move plus the opponent's reply, if any) a single gradient step is
    taken on that one transition.

    Args:
        env: Connect 4 environment shared by both policies.
        optimizer: Optimizer over `first_agent.qnetwork` parameters.
        first_agent: Policy being trained.
        oponnent: Policy providing the opposing moves (not updated here).
        loss_fn: Loss between the predicted and target Q-value.
        device: Unused; kept for signature compatibility.
        lr_scheduler: Stepped once per episode.
        num_episodes: Number of episodes to play.
        gamma: Discount factor.

    Returns:
        One entry per episode: +1 if the trainee won, -1 if it lost, 0 otherwise.
    """
    trainee_wins = 0
    trainer_wins = 0
    episode_reward_list = []
    q_network = first_agent.qnetwork

    # Play exactly num_episodes episodes so the rates printed below use the
    # right denominator.
    for episode_index in tqdm(range(num_episodes)):
        state = env.reset()
        state = state[0][1].flatten()
        episode_reward = 0
        player = np.random.choice([0, 1])

        if player == 1:
            # The opponent opens; better_step returns (state, reward, done).
            opponent_action = oponnent(state)
            state, _, _ = better_step(env, opponent_action, player)

        for t in itertools.count():
            # Nothing left to play: stop before asking the policy for a move.
            if len(env.get_moves()) == 0:
                break

            q_values = q_network(torch.tensor(state))
            action = first_agent(state)
            calculated_reward = 0

            next_state, step_reward, done = better_step(env, action, 1 - player)
            if done:
                # A finished game is a win only if the environment reports a
                # non-zero reward; a full board with zero reward is a draw.
                # NOTE(review): assumes the env's reward vector is all zeros on
                # a draw — confirm against gym_connect4.
                if step_reward != 0:
                    calculated_reward = 1
                    trainee_wins += 1
            elif len(env.get_moves()) == 0:
                done = True
            else:
                opponent_action = oponnent(next_state)
                next_state, step_reward, done = better_step(env, opponent_action, player)
                if done and step_reward != 0:
                    calculated_reward = -1
                    trainer_wins += 1

            episode_reward += calculated_reward
            current_q_value = q_values[action]

            with torch.no_grad():
                target_q_values = torch.as_tensor(calculated_reward, dtype=current_q_value.dtype)
                # Terminal transitions have no future value to bootstrap from.
                if not done:
                    target_q_values = target_q_values + gamma * torch.max(q_network(torch.tensor(next_state)))

            loss = loss_fn(current_q_value, target_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if done:
                break

            state = next_state

        episode_reward_list.append(episode_reward)
        first_agent.decay_epsilon()
        lr_scheduler.step()

    games = max(num_episodes, 1)  # avoid a division by zero when no episode is requested
    print("End of episode: Trainer winrate : {:.2f}%, Trainee winrate : {:.2f}%, Draw rate : {:.2f}%".format(100* (trainer_wins/games), 100* (trainee_wins/games), 100* ((num_episodes -trainee_wins-trainer_wins)/games)))
    return episode_reward_list

Replay Buffer

In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity: int):
        """Create an empty buffer holding at most `capacity` transitions."""
        self.capacity = capacity
        # A bounded deque evicts the oldest transition automatically when full.
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state: np.ndarray, action: np.int64, reward: float, next_state: np.ndarray, done: bool):
        """Append one transition, discarding the oldest one if the buffer is full."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size: int) -> Tuple[np.ndarray, tuple, tuple, np.ndarray, tuple]:
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            `(states, actions, rewards, next_states, dones)`; states and next
            states are stacked into arrays, the other three are tuples.

        Raises:
            ValueError: If `batch_size` exceeds the number of stored transitions.
        """
        states, actions, rewards, next_states, dones = zip(*random.sample(self.buffer, batch_size))
        return np.array(states), actions, rewards, np.array(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

Training with replay buffer

In [None]:
def train_dqn1_agent(env: gym.Env,
                     first_agent:Policy,
                     oponnent:Policy,
                     optimizer: torch.optim.Optimizer,
                     loss_fn: Callable,
                     device: torch.device,
                     lr_scheduler: _LRScheduler,
                     num_episodes: int,
                     gamma: float,
                     batch_size: int,
                     replay_buffer: ReplayBuffer,
                     render:bool) -> List[float]:
    """Train `first_agent` against `oponnent` with Q-learning on replay-buffer minibatches.

    Each episode a coin flip decides who moves first. One transition is stored
    per trainee turn (trainee move plus the opponent's reply, if any). As soon
    as the buffer holds at least `batch_size` transitions, every turn also
    performs one gradient step on a random minibatch and one scheduler step.

    Args:
        env: Connect 4 environment shared by both policies.
        first_agent: Policy being trained.
        oponnent: Policy providing the opposing moves (not updated here).
        optimizer: Optimizer over `first_agent.qnetwork` parameters.
        loss_fn: Loss between predicted and target Q-values.
        device: Device on which minibatch tensors are created.
            NOTE(review): the network is never moved to `device` here, so the
            two must already match — confirm on GPU machines.
        lr_scheduler: Stepped after every gradient step.
        num_episodes: Number of episodes to play.
        gamma: Discount factor.
        batch_size: Minibatch size, also the minimum buffer fill before learning starts.
        replay_buffer: Buffer receiving the transitions.
        render: Unused; kept for signature compatibility.

    Returns:
        One entry per episode: the sum of the trainee's rewards in that episode.
    """
    episode_reward_list = []
    q_network = first_agent.qnetwork

    # Play exactly num_episodes episodes.
    for episode_index in range(num_episodes):
        state = env.reset()
        state = state[0][1].flatten()
        episode_reward = 0
        player = np.random.choice([0, 1])

        if player == 1:
            # The opponent opens; better_step returns (state, reward, done).
            opponent_action = oponnent(state)
            state, _, _ = better_step(env, opponent_action, player)

        for t in itertools.count():
            # Nothing left to play: there is no transition to record.
            if len(env.get_moves()) == 0:
                break

            action = first_agent(state)
            next_state, step_reward, done = better_step(env, action, 1 - player)
            # Only the player who just moved can have won, so the sign comes
            # from who moved and only the magnitude from the environment.
            # NOTE(review): assumes a zero reward on draws — confirm against gym_connect4.
            reward = float(abs(step_reward)) if done else 0.0

            if not done:
                if len(env.get_moves()) == 0:
                    done = True
                else:
                    opponent_action = oponnent(next_state)
                    next_state, step_reward, done = better_step(env, opponent_action, player)
                    reward = -float(abs(step_reward)) if done else 0.0

            done = bool(done)  # keeps the sampled `dones` a boolean tensor for `~dones`
            replay_buffer.add(state, action, reward, next_state, done)
            # Accumulate before the terminal break so the final reward is counted.
            episode_reward += reward

            if len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

                states = torch.tensor(states, device=device)
                actions = torch.tensor(actions, device=device)
                rewards = torch.tensor(rewards, device=device)
                next_states = torch.tensor(next_states, device=device)
                dones = torch.tensor(dones, device=device)

                q_values = q_network(states)
                # The bootstrap target is a constant: no gradient flows through it.
                with torch.no_grad():
                    next_q_values = q_network(next_states)
                    target_q_values = rewards + gamma * torch.max(next_q_values, dim=1).values * ~dones
                    target_q_values = target_q_values.to(q_values.dtype)

                loss = loss_fn(q_values.gather(1, actions.unsqueeze(1)), target_q_values.unsqueeze(1))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                lr_scheduler.step()

            if done:
                break

            state = next_state

        episode_reward_list.append(episode_reward)
        first_agent.decay_epsilon()

    return episode_reward_list

<h1> Training with DQN (replay buffer) instead of the naive function </h1>

We train several agents, each using a different batch size when sampling from the replay buffer, and save each of them to disk.

In [None]:
# Re-create the environment with its default settings (this rebinds `env`, replacing the
# instance built earlier with explicit height/width/connect) and list its attributes.
# NOTE(review): the dir() dump looks like an exploration leftover — confirm it is still wanted.
env = gym.make('Connect4-v0')
print(dir(env))

In [None]:
# Counter incremented once at the end of each training cell below.
number_of_training_done = 0

In [None]:
# Batch size = 1
env = gym.make('Connect4-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUMBER_OF_TRAININGS = 1000
# Three parallel columns: episode index, episode reward, training-round index.
naive_trains_result_list = [[], [], []]

# Trainee and sparring partner both start from freshly initialised networks.
first_agent = Policy(env)
opponent_agent = Policy(env)

for train_index in tqdm(range(NUMBER_OF_TRAININGS)):
    # Every round restarts the optimizer, LR schedule, replay buffer and exploration rate.
    optimizer = torch.optim.AdamW(first_agent.qnetwork.parameters(), lr=0.004, amsgrad=True)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(10000)
    first_agent.epsilon = 0.9

    # Nominally 200 episodes x 1000 rounds = 200k games.
    episode_reward_list = train_dqn1_agent(
        env, first_agent, opponent_agent, optimizer, loss_fn, device, lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        replay_buffer=replay_buffer,
        batch_size=1,
        render=False,
    )

    naive_trains_result_list[0] += range(len(episode_reward_list))
    naive_trains_result_list[1] += episode_reward_list
    naive_trains_result_list[2] += [train_index] * len(episode_reward_list)

    # Every tenth round the opponent becomes a snapshot of the current trainee.
    if train_index % 10 == 0:
        opponent_agent = Policy(env)
        opponent_agent.qnetwork.load_state_dict(first_agent.qnetwork.state_dict())

naive_trains_result_df = pd.DataFrame(
    np.array(naive_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
naive_trains_result_df["agent"] = "Naive"

# The whole module object is saved, not just its state dict.
torch.save(first_agent.qnetwork, "DQN_200k_with_1_Buffer.pth")

number_of_training_done+=1
env.close()

In [None]:
# Batch size = 10
env = gym.make('Connect4-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUMBER_OF_TRAININGS = 1000
# Three parallel columns: episode index, episode reward, training-round index.
naive_trains_result_list = [[], [], []]

# Trainee and sparring partner both start from freshly initialised networks.
first_agent = Policy(env)
opponent_agent = Policy(env)

for train_index in tqdm(range(NUMBER_OF_TRAININGS)):
    # Every round restarts the optimizer, LR schedule, replay buffer and exploration rate.
    optimizer = torch.optim.AdamW(first_agent.qnetwork.parameters(), lr=0.004, amsgrad=True)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(10000)
    first_agent.epsilon = 0.9

    episode_reward_list = train_dqn1_agent(
        env, first_agent, opponent_agent, optimizer, loss_fn, device, lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        replay_buffer=replay_buffer,
        batch_size=10,
        render=False,
    )

    naive_trains_result_list[0] += range(len(episode_reward_list))
    naive_trains_result_list[1] += episode_reward_list
    naive_trains_result_list[2] += [train_index] * len(episode_reward_list)

    # Every tenth round the opponent becomes a snapshot of the current trainee.
    if train_index % 10 == 0:
        opponent_agent = Policy(env)
        opponent_agent.qnetwork.load_state_dict(first_agent.qnetwork.state_dict())

naive_trains_result_df = pd.DataFrame(
    np.array(naive_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
naive_trains_result_df["agent"] = "Naive"

# The whole module object is saved, not just its state dict.
torch.save(first_agent.qnetwork, "DQN_200k_with_10_Buffer.pth")

number_of_training_done+=1
env.close()

In [None]:
# Batch size = 100
env = gym.make('Connect4-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUMBER_OF_TRAININGS = 1000
# Three parallel columns: episode index, episode reward, training-round index.
naive_trains_result_list = [[], [], []]

# Trainee and sparring partner both start from freshly initialised networks.
first_agent = Policy(env)
opponent_agent = Policy(env)

for train_index in tqdm(range(NUMBER_OF_TRAININGS)):
    # Every round restarts the optimizer, LR schedule, replay buffer and exploration rate.
    optimizer = torch.optim.AdamW(first_agent.qnetwork.parameters(), lr=0.004, amsgrad=True)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(10000)
    first_agent.epsilon = 0.9

    episode_reward_list = train_dqn1_agent(
        env, first_agent, opponent_agent, optimizer, loss_fn, device, lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        replay_buffer=replay_buffer,
        batch_size=100,
        render=False,
    )

    naive_trains_result_list[0] += range(len(episode_reward_list))
    naive_trains_result_list[1] += episode_reward_list
    naive_trains_result_list[2] += [train_index] * len(episode_reward_list)

    # Every tenth round the opponent becomes a snapshot of the current trainee.
    if train_index % 10 == 0:
        opponent_agent = Policy(env)
        opponent_agent.qnetwork.load_state_dict(first_agent.qnetwork.state_dict())

naive_trains_result_df = pd.DataFrame(
    np.array(naive_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
naive_trains_result_df["agent"] = "Naive"

# The whole module object is saved, not just its state dict.
torch.save(first_agent.qnetwork, "DQN_200k_with_100_Buffer.pth")

number_of_training_done+=1
env.close()

In [None]:
# Batch size = 1000
env = gym.make('Connect4-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUMBER_OF_TRAININGS = 1000
# Three parallel columns: episode index, episode reward, training-round index.
naive_trains_result_list = [[], [], []]

# Trainee and sparring partner both start from freshly initialised networks.
first_agent = Policy(env)
opponent_agent = Policy(env)

for train_index in tqdm(range(NUMBER_OF_TRAININGS)):
    # Every round restarts the optimizer, LR schedule, replay buffer and exploration rate.
    optimizer = torch.optim.AdamW(first_agent.qnetwork.parameters(), lr=0.004, amsgrad=True)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(10000)
    first_agent.epsilon = 0.9

    episode_reward_list = train_dqn1_agent(
        env, first_agent, opponent_agent, optimizer, loss_fn, device, lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        replay_buffer=replay_buffer,
        batch_size=1000,
        render=False,
    )

    naive_trains_result_list[0] += range(len(episode_reward_list))
    naive_trains_result_list[1] += episode_reward_list
    naive_trains_result_list[2] += [train_index] * len(episode_reward_list)

    # Every tenth round the opponent becomes a snapshot of the current trainee.
    if train_index % 10 == 0:
        opponent_agent = Policy(env)
        opponent_agent.qnetwork.load_state_dict(first_agent.qnetwork.state_dict())

naive_trains_result_df = pd.DataFrame(
    np.array(naive_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
naive_trains_result_df["agent"] = "Naive"

# The whole module object is saved, not just its state dict.
torch.save(first_agent.qnetwork, "DQN_200k_with_1k_Buffer.pth")

number_of_training_done+=1
env.close()

In [None]:
# Batch size = 5000
env = gym.make('Connect4-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUMBER_OF_TRAININGS = 1000
# Three parallel columns: episode index, episode reward, training-round index.
naive_trains_result_list = [[], [], []]

# Trainee and sparring partner both start from freshly initialised networks.
first_agent = Policy(env)
opponent_agent = Policy(env)

for train_index in tqdm(range(NUMBER_OF_TRAININGS)):
    # Every round restarts the optimizer, LR schedule, replay buffer and exploration rate.
    # NOTE(review): the buffer is recreated each round, so it only ever holds one round's
    # transitions; check that 200 episodes can produce 5000 of them, otherwise no
    # gradient step is ever taken.
    optimizer = torch.optim.AdamW(first_agent.qnetwork.parameters(), lr=0.004, amsgrad=True)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(10000)
    first_agent.epsilon = 0.9

    episode_reward_list = train_dqn1_agent(
        env, first_agent, opponent_agent, optimizer, loss_fn, device, lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        replay_buffer=replay_buffer,
        batch_size=5000,
        render=False,
    )

    naive_trains_result_list[0] += range(len(episode_reward_list))
    naive_trains_result_list[1] += episode_reward_list
    naive_trains_result_list[2] += [train_index] * len(episode_reward_list)

    # Every tenth round the opponent becomes a snapshot of the current trainee.
    if train_index % 10 == 0:
        opponent_agent = Policy(env)
        opponent_agent.qnetwork.load_state_dict(first_agent.qnetwork.state_dict())

naive_trains_result_df = pd.DataFrame(
    np.array(naive_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
naive_trains_result_df["agent"] = "Naive"

# The whole module object is saved, not just its state dict.
torch.save(first_agent.qnetwork, "DQN_200k_with_5k_Buffer.pth")

number_of_training_done+=1
env.close()

In [None]:
# Batch size = 9000
env = gym.make('Connect4-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUMBER_OF_TRAININGS = 1000
# Three parallel columns: episode index, episode reward, training-round index.
naive_trains_result_list = [[], [], []]

# Trainee and sparring partner both start from freshly initialised networks.
first_agent = Policy(env)
opponent_agent = Policy(env)

for train_index in tqdm(range(NUMBER_OF_TRAININGS)):
    # Every round restarts the optimizer, LR schedule, replay buffer and exploration rate.
    # NOTE(review): the buffer is recreated each round, so it only ever holds one round's
    # transitions; check that 200 episodes can produce 9000 of them, otherwise no
    # gradient step is ever taken.
    optimizer = torch.optim.AdamW(first_agent.qnetwork.parameters(), lr=0.004, amsgrad=True)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(10000)
    first_agent.epsilon = 0.9

    episode_reward_list = train_dqn1_agent(
        env, first_agent, opponent_agent, optimizer, loss_fn, device, lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        replay_buffer=replay_buffer,
        batch_size=9000,
        render=False,
    )

    naive_trains_result_list[0] += range(len(episode_reward_list))
    naive_trains_result_list[1] += episode_reward_list
    naive_trains_result_list[2] += [train_index] * len(episode_reward_list)

    # Every tenth round the opponent becomes a snapshot of the current trainee.
    if train_index % 10 == 0:
        opponent_agent = Policy(env)
        opponent_agent.qnetwork.load_state_dict(first_agent.qnetwork.state_dict())

naive_trains_result_df = pd.DataFrame(
    np.array(naive_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
naive_trains_result_df["agent"] = "Naive"

# The whole module object is saved, not just its state dict.
torch.save(first_agent.qnetwork, "DQN_200k_with_9k_Buffer.pth")

number_of_training_done+=1
env.close()

<h1> Testing two agents in 1 vs 1 games </h1>


In [None]:
def play_a_game(env, first_agent, oponnent, no_epsilon_1 = False, no_epsilon_2 = False,render=False):
    """Play one game between `first_agent` and `oponnent` without any learning.

    A coin flip decides who moves first.

    Args:
        env: Connect 4 environment shared by both policies.
        first_agent: Policy whose result is reported.
        oponnent: Opposing policy.
        no_epsilon_1: If true, `first_agent` plays greedily (no exploration).
        no_epsilon_2: If true, `oponnent` plays greedily (no exploration).
        render: If true, render the board after every move.

    Returns:
        1 if `first_agent` won, -1 if `oponnent` won, 0 for a draw.
    """
    state = env.reset()
    state = state[0][1].flatten()
    player = np.random.choice([0, 1])

    if player == 1:
        # The opponent opens; better_step returns (state, reward, done).
        opponent_action = oponnent(state, no_epsilon_2)
        state, _, _ = better_step(env, opponent_action, player)
        if render:
            env.render()

    while True:
        # Check for a full board before asking the policy for a move.
        if len(env.get_moves()) == 0:
            return 0

        action = first_agent(state, no_epsilon=no_epsilon_1)
        next_state, step_reward, done = better_step(env, action, 1 - player)
        if render:
            env.render()
        if done:
            # A finished game with zero reward is a draw, not a win.
            # NOTE(review): assumes the env's reward vector is all zeros on a
            # draw — confirm against gym_connect4.
            return 1 if step_reward != 0 else 0

        if len(env.get_moves()) == 0:
            return 0

        opponent_action = oponnent(next_state, no_epsilon=no_epsilon_2)
        next_state, step_reward, done = better_step(env, opponent_action, player)
        if render:
            env.render()
        if done:
            return -1 if step_reward != 0 else 0

        state = next_state

In [None]:
def play_a_game_with_train(env, first_agent, oponnent,optimizer, loss, gamma, lr_scheduler,render=False,replay_buffer = ReplayBuffer(10000),batch_size=128):
    """Play one game while `first_agent` keeps learning from a replay buffer.

    A coin flip decides who moves first. One transition is stored per trainee
    turn; once the buffer holds at least `batch_size` transitions, every turn
    also performs one gradient step on a random minibatch.

    Args:
        env: Connect 4 environment shared by both policies.
        first_agent: Policy being adapted; its result is reported.
        oponnent: Opposing policy (not updated here).
        optimizer: Optimizer over `first_agent.qnetwork` parameters.
        loss: Loss function applied to predicted and target Q-values.
        gamma: Discount factor.
        lr_scheduler: Stepped after every gradient step and once more per game.
        render: Unused; kept for signature compatibility.
        replay_buffer: Buffer receiving the transitions.
            NOTE(review): the default instance is created once at definition
            time and is therefore shared by every call that omits this
            argument; existing callers rely on it to accumulate transitions
            across games.
        batch_size: Minibatch size, also the minimum buffer fill before learning starts.

    Returns:
        1 if `first_agent` won, -1 if `oponnent` won, 0 for a draw.
    """
    state = env.reset()
    state = state[0][1].flatten()
    player = np.random.choice([0, 1])
    q_network = first_agent.qnetwork
    # Build minibatches on the network's own device instead of a notebook global.
    batch_device = next(q_network.parameters()).device

    if player == 1:
        # The opponent opens; better_step returns (state, reward, done).
        opponent_action = oponnent(state)
        state, _, _ = better_step(env, opponent_action, player)

    calculated_reward = 0
    while True:
        # Nothing left to play: there is no transition to record.
        if len(env.get_moves()) == 0:
            break

        action = first_agent(state)
        calculated_reward = 0
        next_state, step_reward, done = better_step(env, action, 1 - player)
        if done:
            # A finished game with zero reward is a draw, not a win.
            # NOTE(review): assumes the env's reward vector is all zeros on a
            # draw — confirm against gym_connect4.
            if step_reward != 0:
                calculated_reward = 1
        elif len(env.get_moves()) == 0:
            done = True
        else:
            opponent_action = oponnent(next_state)
            next_state, step_reward, done = better_step(env, opponent_action, player)
            if done and step_reward != 0:
                calculated_reward = -1

        done = bool(done)  # keeps the sampled `dones` a boolean tensor for `~dones`
        replay_buffer.add(state, action, calculated_reward, next_state, done)

        if len(replay_buffer) >= batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

            states = torch.tensor(states, device=batch_device)
            actions = torch.tensor(actions, device=batch_device)
            rewards = torch.tensor(rewards, device=batch_device)
            next_states = torch.tensor(next_states, device=batch_device)
            dones = torch.tensor(dones, device=batch_device)

            q_values = q_network(states)
            # The bootstrap target is a constant: no gradient flows through it.
            with torch.no_grad():
                next_q_values = q_network(next_states)
                target_q_values = rewards + gamma * torch.max(next_q_values, dim=1).values * ~dones
                target_q_values = target_q_values.to(q_values.dtype)

            # Use the loss function passed by the caller.
            batch_loss = loss(q_values.gather(1, actions.unsqueeze(1)), target_q_values.unsqueeze(1))

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            lr_scheduler.step()

        if done:
            break
        state = next_state

    first_agent.decay_epsilon()
    lr_scheduler.step()
    return calculated_reward


In [None]:
# Shared settings for the evaluation cells below.
# NOTE(review): `optimizer` is whatever the last executed training cell left behind, so
# this cell fails on a fresh kernel unless a training cell has been run first.
# (A plain ExponentialLR used to be created here and was immediately overwritten; removed.)
lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
loss_fn = torch.nn.MSELoss()

NUMBER_OF_GAMES = 10000

no_epsilon_1 = False #True if games are played without epsilon policy
no_epsilon_2 = False

In [None]:
# Evaluate the batch-size-1 agent against an untrained opponent: first frozen, then
# while it keeps adapting.
Agent_1 = Policy(env)
# NOTE(review): this unpickles a whole module object; only load files you produced yourself.
Agent_1.qnetwork = torch.load("DQN_200k_with_1_Buffer.pth")
Agent_2 = Policy(env)

optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)
# Bind the schedule to the optimizer created above; a scheduler built earlier would keep
# adjusting a different optimizer and leave this one at a constant learning rate.
lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)

# Pass 1: no learning.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if(res == 1):
        agent_wins+=1
    elif(res ==-1):
        opponent_wins +=1
    else:
        draws+=1
    # Win rate over all games played so far; recorded every game, draws included, so
    # the list index is the game number.
    wr0 = agent_wins/(i+1)
    wrs0.append(wr0)

# Pass 2: the agent keeps training while it plays.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2,optimizer, loss_fn,0.5,lr_scheduler,batch_size=1)
    if(res == 1):
        agent_wins+=1
    elif(res ==-1):
        opponent_wins +=1
    else:
        draws+=1
    wr = agent_wins/(i+1)
    wrs.append(wr)

In [None]:
# Skip the first 600 recorded points (presumably the noisy start of the running average)
# without overwriting `wrs0`, so re-running this cell plots the same curve.
wrs0_tail = wrs0[600:]
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
plt.title(r'Batch size = 1, no adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1,len(wrs0_tail), len(wrs0_tail)),wrs0_tail,color='r')
plt.show()

In [None]:
# Skip the first 600 recorded points (presumably the noisy start of the running average)
# without overwriting `wrs`, so re-running this cell plots the same curve.
wrs_tail = wrs[600:]
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
plt.title(r'Batch size = 1, with adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1,len(wrs_tail), len(wrs_tail)),wrs_tail,color='g')
plt.show()

In [None]:
# Evaluate the batch-size-10 agent against an untrained opponent: first frozen, then
# while it keeps adapting.
Agent_1 = Policy(env)
# NOTE(review): this unpickles a whole module object; only load files you produced yourself.
Agent_1.qnetwork = torch.load("DQN_200k_with_10_Buffer.pth")
Agent_2 = Policy(env)

optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)
# Bind the schedule to the optimizer created above; a scheduler built earlier would keep
# adjusting a different optimizer and leave this one at a constant learning rate.
lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)

# Pass 1: no learning.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if(res == 1):
        agent_wins+=1
    elif(res ==-1):
        opponent_wins +=1
    else:
        draws+=1
    # Win rate over all games played so far; recorded every game, draws included, so
    # the list index is the game number.
    wr0 = agent_wins/(i+1)
    wrs0.append(wr0)

# Pass 2: the agent keeps training while it plays.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2,optimizer, loss_fn,0.5,lr_scheduler,batch_size=10)
    if(res == 1):
        agent_wins+=1
    elif(res ==-1):
        opponent_wins +=1
    else:
        draws+=1
    wr = agent_wins/(i+1)
    wrs.append(wr)

In [None]:
# Skip the first 600 recorded points (presumably the noisy start of the running average)
# without overwriting `wrs0`, so re-running this cell plots the same curve.
wrs0_tail = wrs0[600:]
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
plt.title(r'Batch size = 10, no adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1,len(wrs0_tail), len(wrs0_tail)),wrs0_tail,color='r')
plt.show()

In [None]:
# Skip the first 600 recorded points (presumably the noisy start of the running average)
# without overwriting `wrs`, so re-running this cell plots the same curve.
wrs_tail = wrs[600:]
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
plt.title(r'Batch size = 10, with adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1,len(wrs_tail), len(wrs_tail)),wrs_tail,color='g')
plt.show()

In [None]:
# Evaluate the batch-size-100 agent against an untrained opponent: first frozen, then
# while it keeps adapting.
Agent_1 = Policy(env)
# NOTE(review): this unpickles a whole module object; only load files you produced yourself.
Agent_1.qnetwork = torch.load("DQN_200k_with_100_Buffer.pth")
Agent_2 = Policy(env)

optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)
# Bind the schedule to the optimizer created above; a scheduler built earlier would keep
# adjusting a different optimizer and leave this one at a constant learning rate.
lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)

# Pass 1: no learning.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if(res == 1):
        agent_wins+=1
    elif(res ==-1):
        opponent_wins +=1
    else:
        draws+=1
    # Win rate over all games played so far; recorded every game, draws included, so
    # the list index is the game number.
    wr0 = agent_wins/(i+1)
    wrs0.append(wr0)

# Pass 2: the agent keeps training while it plays.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2,optimizer, loss_fn,0.5,lr_scheduler,batch_size=100)
    if(res == 1):
        agent_wins+=1
    elif(res ==-1):
        opponent_wins +=1
    else:
        draws+=1
    wr = agent_wins/(i+1)
    wrs.append(wr)

In [None]:
# Baseline (no adaptation) win-rate curve for batch size 100.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs0 each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs0_plot = wrs0[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 100, no adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs0_plot), len(wrs0_plot)), wrs0_plot, color='r')
plt.show()

In [None]:
# Win-rate curve of the run played with training (adaptation), batch size 100.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs_plot = wrs[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 100, with adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs_plot), len(wrs_plot)), wrs_plot, color='g')
plt.show()

In [None]:
# Experiment: batch_size = 1000.
# Agent_1 gets its Q-network from a saved checkpoint; Agent_2 is a freshly
# constructed Policy and acts as the opponent.
# NOTE(security): torch.load unpickles the file and can execute arbitrary code;
# only load checkpoints that come from a trusted source.
Agent_1 = Policy(env)
Agent_1.qnetwork = torch.load("DQN_200k_with_1k_Buffer.pth")
Agent_2 = Policy(env)

# Optimizer bound to the parameters of the network that was just loaded.
# NOTE(review): loss_fn and lr_scheduler come from earlier cells; if lr_scheduler
# wraps a different optimizer instance it will not adjust this one - confirm.
optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)

# Phase 1 - baseline games via play_a_game (no optimizer involved).
# res == 1 is counted as an agent win, res == -1 as an opponent win and any
# other value as a draw.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue  # draws are tallied but add no point to the curve
    # Win rate over decisive games, taken from the counters. The incremental
    # form (wr*i + 1)/(i+1) mis-weights the mean as soon as a draw occurs,
    # because i keeps advancing while no sample is recorded for the draw.
    wr0 = agent_wins / (agent_wins + opponent_wins)
    wrs0.append(wr0)

# Phase 2 - same match-up, played through play_a_game_with_train.
# The 0.5 is passed positionally; its meaning is defined by
# play_a_game_with_train, which is not visible from this cell.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2, optimizer, loss_fn, 0.5, lr_scheduler, batch_size=1000)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue
    wr = agent_wins / (agent_wins + opponent_wins)
    wrs.append(wr)

In [None]:
# Baseline (no adaptation) win-rate curve for batch size 1000.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs0 each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs0_plot = wrs0[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 1000, no adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs0_plot), len(wrs0_plot)), wrs0_plot, color='r')
plt.show()

In [None]:
# Win-rate curve of the run played with training (adaptation), batch size 1000.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs_plot = wrs[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 1000, with adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs_plot), len(wrs_plot)), wrs_plot, color='g')
plt.show()

In [None]:
# Experiment: batch_size = 5000.
# Agent_1 gets its Q-network from a saved checkpoint; Agent_2 is a freshly
# constructed Policy and acts as the opponent.
# NOTE(security): torch.load unpickles the file and can execute arbitrary code;
# only load checkpoints that come from a trusted source.
Agent_1 = Policy(env)
Agent_1.qnetwork = torch.load("DQN_200k_with_5k_Buffer.pth")
Agent_2 = Policy(env)

# Optimizer bound to the parameters of the network that was just loaded.
# NOTE(review): loss_fn and lr_scheduler come from earlier cells; if lr_scheduler
# wraps a different optimizer instance it will not adjust this one - confirm.
optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)

# Phase 1 - baseline games via play_a_game (no optimizer involved).
# res == 1 is counted as an agent win, res == -1 as an opponent win and any
# other value as a draw.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue  # draws are tallied but add no point to the curve
    # Win rate over decisive games, taken from the counters. The incremental
    # form (wr*i + 1)/(i+1) mis-weights the mean as soon as a draw occurs,
    # because i keeps advancing while no sample is recorded for the draw.
    wr0 = agent_wins / (agent_wins + opponent_wins)
    wrs0.append(wr0)

# Phase 2 - same match-up, played through play_a_game_with_train.
# The 0.5 is passed positionally; its meaning is defined by
# play_a_game_with_train, which is not visible from this cell.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2, optimizer, loss_fn, 0.5, lr_scheduler, batch_size=5000)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue
    wr = agent_wins / (agent_wins + opponent_wins)
    wrs.append(wr)

In [None]:
# Baseline (no adaptation) win-rate curve for batch size 5000.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs0 each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs0_plot = wrs0[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 5000, no adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs0_plot), len(wrs0_plot)), wrs0_plot, color='r')
plt.show()

In [None]:
# Win-rate curve of the run played with training (adaptation), batch size 5000.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs_plot = wrs[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 5000, with adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs_plot), len(wrs_plot)), wrs_plot, color='g')
plt.show()

In [None]:
# Experiment: batch_size = 9000.
# Agent_1 gets its Q-network from a saved checkpoint; Agent_2 is a freshly
# constructed Policy and acts as the opponent.
# NOTE(security): torch.load unpickles the file and can execute arbitrary code;
# only load checkpoints that come from a trusted source.
Agent_1 = Policy(env)
Agent_1.qnetwork = torch.load("DQN_200k_with_9k_Buffer.pth")
Agent_2 = Policy(env)

# Optimizer bound to the parameters of the network that was just loaded.
# NOTE(review): loss_fn and lr_scheduler come from earlier cells; if lr_scheduler
# wraps a different optimizer instance it will not adjust this one - confirm.
optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)

# Phase 1 - baseline games via play_a_game (no optimizer involved).
# res == 1 is counted as an agent win, res == -1 as an opponent win and any
# other value as a draw.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue  # draws are tallied but add no point to the curve
    # Win rate over decisive games, taken from the counters. The incremental
    # form (wr*i + 1)/(i+1) mis-weights the mean as soon as a draw occurs,
    # because i keeps advancing while no sample is recorded for the draw.
    wr0 = agent_wins / (agent_wins + opponent_wins)
    wrs0.append(wr0)

# Phase 2 - same match-up, played through play_a_game_with_train.
# The 0.5 is passed positionally; its meaning is defined by
# play_a_game_with_train, which is not visible from this cell.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2, optimizer, loss_fn, 0.5, lr_scheduler, batch_size=9000)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue
    wr = agent_wins / (agent_wins + opponent_wins)
    wrs.append(wr)

In [None]:
# Baseline (no adaptation) win-rate curve for batch size 9000.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs0 each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs0_plot = wrs0[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 9000, no adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs0_plot), len(wrs0_plot)), wrs0_plot, color='r')
plt.show()

In [None]:
# Win-rate curve of the run played with training (adaptation), batch size 9000.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs_plot = wrs[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 9000, with adaptation, with $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs_plot), len(wrs_plot)), wrs_plot, color='g')
plt.show()

<h1> Impact of epsilon-greedy policy </h1>

In [None]:
# Flags handed to play_a_game in the cells below:
# True means the games are played without the epsilon policy.
no_epsilon_1 = no_epsilon_2 = True

In [None]:
# Experiment: batch_size = 9000, run with the no_epsilon_* flags currently set.
# Agent_1 gets its Q-network from a saved checkpoint; Agent_2 is a freshly
# constructed Policy and acts as the opponent.
# NOTE(security): torch.load unpickles the file and can execute arbitrary code;
# only load checkpoints that come from a trusted source.
Agent_1 = Policy(env)
Agent_1.qnetwork = torch.load("DQN_200k_with_9k_Buffer.pth")
Agent_2 = Policy(env)

# Optimizer bound to the parameters of the network that was just loaded.
# NOTE(review): loss_fn and lr_scheduler come from earlier cells; if lr_scheduler
# wraps a different optimizer instance it will not adjust this one - confirm.
optimizer = torch.optim.AdamW(Agent_1.qnetwork.parameters(), lr=0.004, amsgrad=True)

# Phase 1 - baseline games via play_a_game (no optimizer involved).
# res == 1 is counted as an agent win, res == -1 as an opponent win and any
# other value as a draw.
agent_wins = 0
opponent_wins = 0
draws = 0
wr0 = 1
wrs0 = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game(env, Agent_1, Agent_2, no_epsilon_1, no_epsilon_2)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue  # draws are tallied but add no point to the curve
    # Win rate over decisive games, taken from the counters. The incremental
    # form (wr*i + 1)/(i+1) mis-weights the mean as soon as a draw occurs,
    # because i keeps advancing while no sample is recorded for the draw.
    wr0 = agent_wins / (agent_wins + opponent_wins)
    wrs0.append(wr0)

# Phase 2 - same match-up, played through play_a_game_with_train.
# The 0.5 is passed positionally; its meaning is defined by
# play_a_game_with_train, which is not visible from this cell.
# NOTE(review): the no_epsilon_* flags are not passed to play_a_game_with_train;
# whether the training games honour them cannot be seen from here - confirm.
agent_wins = 0
opponent_wins = 0
draws = 0
wr = 1
wrs = []
for i in tqdm(range(NUMBER_OF_GAMES)):
    res = play_a_game_with_train(env, Agent_1, Agent_2, optimizer, loss_fn, 0.5, lr_scheduler, batch_size=9000)
    if res == 1:
        agent_wins += 1
    elif res == -1:
        opponent_wins += 1
    else:
        draws += 1
        continue
    wr = agent_wins / (agent_wins + opponent_wins)
    wrs.append(wr)

In [None]:
# Baseline (no adaptation) win-rate curve for batch size 9000, without epsilon-greedy.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs0 each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs0_plot = wrs0[600:]
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 9000, no adaptation, without $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs0_plot), len(wrs0_plot)), wrs0_plot, color='r')
plt.show()

In [None]:
# Win-rate curve of the run played with training (adaptation), batch size 9000,
# without epsilon-greedy.
# The slice goes into a new name so that re-running this cell does not strip
# another 600 entries from wrs each time.
# The first 600 recorded entries are dropped - presumably to hide the noisy
# start of the running average; confirm.
wrs_plot = wrs[600:]
# Title says "with adaptation": wrs is filled by the play_a_game_with_train loop,
# and every other green wrs plot in this notebook is labelled that way.
# Raw string: '\e' is not a valid Python escape sequence and triggers a warning.
plt.title(r'Batch size = 9000, with adaptation, without $\epsilon$-greedy policy')
plt.xlabel('Number of Games')
plt.ylabel('Winrate')
plt.plot(np.linspace(1, len(wrs_plot), len(wrs_plot)), wrs_plot, color='g')
plt.show()

<h1> Checking Inside an Agent </h1>

In [None]:
# Print the tensor of every Q-network parameter of Agent_1 whose name
# contains 'weight', each followed by an empty line.
first_agent = Agent_1
for name, param in first_agent.qnetwork.named_parameters():
    if 'weight' not in name:
        continue  # only parameters with 'weight' in their name are shown
    print(f'Layer: {name}, Coefficients:', param.data, '', sep='\n')

In [None]:
# Print the tensor of every Q-network parameter of Agent_2 whose name
# contains 'weight', each followed by an empty line.
opponent = Agent_2

for name, param in opponent.qnetwork.named_parameters():
    if 'weight' not in name:
        continue  # only parameters with 'weight' in their name are shown
    print(f'Layer: {name}, Coefficients:', param.data, '', sep='\n')