In [56]:
from kaggle_environments import make
import torch
from torch import tensor, save, load, optim, cat, zeros
import torch.nn.functional as F
from metrics import mean_reward_with_enemy as mean_reward
from memory import Memory
from model import Model
import numpy as np
import random

from submit import write_agent

# Notebook-wide torch device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [57]:
def get_model():
    """Return a new ``Model`` carrying the weights of ``submission.model``.

    The state dict of ``submission.model`` is first saved to the relative
    path ``'model'`` and then loaded back from that file into a freshly
    constructed ``Model``, which is returned.
    """
    # Function-scope import: `submission` is only looked up when this runs.
    from submission import model as submitted

    checkpoint_path = 'model'
    save(submitted.state_dict(), checkpoint_path)

    restored = Model()
    restored.load_state_dict(load(checkpoint_path))
    return restored

In [58]:
def optimize_model(policy_model, target_model, memory, optimizer, BATCH_SIZE):
    """Run one DQN optimisation step on a batch sampled from ``memory``.

    Args:
        policy_model: network being trained. It is called on a batch of
            observations and must return one value per action (last axis).
        target_model: network used to evaluate the next observations.
        memory: replay buffer. Must support ``memory == 500`` and
            ``memory.sample(BATCH_SIZE)`` returning records of the form
            ``(prev_obs, action, next_obs, reward)`` where ``next_obs`` is
            ``None`` for a terminal transition.
        optimizer: optimizer over ``policy_model``'s parameters.
        BATCH_SIZE: number of transitions requested from ``memory.sample``.

    Returns:
        ``False`` when the step is skipped because the memory gate below is
        not satisfied; ``None`` after an optimisation step has been applied.
    """
    GAMMA = 0.99
    # NOTE(review): what `Memory.__eq__` compares is not visible from here, and
    # the caller in this notebook gates on `memory == memory_size` (default
    # 1000) rather than 500. If the comparison is against the number of stored
    # transitions this gate may only pass momentarily -- confirm the intent.
    if not (memory == 500):
        return False

    # Build every tensor on the device the policy network lives on.
    dev = next(policy_model.parameters()).device

    prev_obs, actions, next_obs, rewards = zip(*memory.sample(BATCH_SIZE))

    def as_batch(observations):
        # Stack per-step observations (tensors or plain sequences) into (N, ...).
        return torch.stack(
            [torch.as_tensor(o, dtype=torch.float) for o in observations]
        ).to(dev)

    prev_obs_batch = as_batch(prev_obs)
    # gather() needs int64 indices of shape (N, 1).
    act_batch = tensor(actions, dtype=torch.long, device=dev).unsqueeze(1)
    # Kept 1-D so it adds element-wise to the 1-D next-state values below.
    reward_batch = tensor(rewards, dtype=torch.float, device=dev)
    non_final_mask = tensor([o is not None for o in next_obs],
                            device=dev, dtype=torch.bool)

    # Q(s, a) for the actions that were actually taken; shape (N, 1).
    policy_action_values = policy_model(prev_obs_batch).gather(1, act_batch)

    # max_a' Q_target(s', a'); stays 0 for terminal transitions.
    next_state_values = zeros(len(next_obs), device=dev)
    if non_final_mask.any():
        non_final_obs = as_batch([o for o in next_obs if o is not None])
        next_state_values[non_final_mask] = (
            target_model(non_final_obs).max(-1)[0].detach()
        )

    # Bellman target r + GAMMA * max_a' Q_target(s', a'), shaped (N, 1) to
    # match policy_action_values. The loss must be taken against this value,
    # not against the raw next-state values.
    expected_state_action_values = (
        next_state_values * GAMMA + reward_batch
    ).unsqueeze(1)

    loss = F.smooth_l1_loss(policy_action_values, expected_state_action_values)

    optimizer.zero_grad()
    loss.backward()
    for param in policy_model.parameters():
        # Parameters that took no part in the loss have no gradient.
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    optimizer.step()


In [69]:
def train(model,
          episods=1000,
          memory_size = 1000,
          batch_size=64,
          target_update=25
          ):
    """Train ``model`` on ConnectX against the built-in ``"negamax"`` agent.

    A second ``Model`` is created as the target network and initialised from
    ``model``'s weights. Every environment step is stored in a ``Memory`` of
    capacity argument ``memory_size`` and followed by a call to
    ``optimize_model``. Actions are drawn uniformly from 7 columns until
    ``memory == memory_size`` holds; from then on the policy network's
    highest-valued action is played.

    Args:
        model: policy network to train; it is moved to ``device`` in place.
        episods: number of episodes to play.
        memory_size: value passed to ``Memory(...)`` and used in the
            ``memory == memory_size`` switch from random to greedy play.
        batch_size: batch size handed to ``optimize_model``.
        target_update: the target network is re-synchronised with the policy
            network (and the episode index printed) every this many episodes.

    Returns:
        The trained policy network (the same object as ``model``).
    """
    env = make("connectx")
    trainer = env.train([None, "negamax"])
    p_model = model
    t_model = Model()
    t_model.load_state_dict(p_model.state_dict())
    memory = Memory(memory_size)
    p_model.to(device)
    t_model.to(device)
    # The target network is only ever used for inference, so keep it in
    # evaluation mode rather than training mode.
    t_model.eval()
    optimizer = optim.RMSprop(p_model.parameters())
    for e in range(episods):
        obs = tensor(trainer.reset().board, dtype=torch.float)
        while not env.done:
            _obs = obs
            if memory == memory_size:
                # Greedy action; no autograd graph is needed to pick a move.
                with torch.no_grad():
                    pred = p_model(obs.to(device))
                act = int(pred.max(-1)[1])
            else:
                act = random.randrange(7)
            # step() returns (observation, reward, done, info)
            next_obs, reward, done, _ = trainer.step(act)
            if done == 1 or reward is None:
                # Terminal transition: recorded with no next observation.
                obs = None
            else:
                obs = tensor(next_obs.board, dtype=torch.float)
            if reward is None:
                # A missing reward is stored as a penalty of -1.
                reward = -1
            memory.memorize([_obs, act, obs, reward])
            optimize_model(p_model, t_model, memory, optimizer, batch_size)
        if e % target_update == 0:
            print(e)
            t_model.load_state_dict(p_model.state_dict())

    return p_model

            # if(done):
            #     reward += 10
        # env.render(mode='ipython')
# NOTE(review): no cell above this one (in the visible notebook) assigns a
# top-level `model`; `get_model()` is defined but never called. The execution
# counts suggest `model` came from a cell that appears later in the document.
# Confirm this survives Restart Kernel -> Run All.
model = train(model)

0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975


In [70]:
# Hand the trained parameters to write_agent (imported from the project's
# `submit` module). Presumably this generates the `submission` module that the
# next cell imports from -- TODO confirm.
write_agent(model.paramsToList())

In [71]:
from submission import my_agent

In [66]:
# NOTE(review): this rebinds the notebook-global `model` to a newly constructed
# Model(). Run top to bottom, it replaces the network returned by train(), and
# the `my_agent` defined just below shadows the one imported from `submission`.
# Confirm which model this evaluation is meant to measure.
model = Model()
def my_agent(observation, configuration):
    """Pick a ConnectX move with the notebook-global ``model``.

    The board is passed through ``model`` and the index with the highest
    output is the preferred move. If the board cell at that index is not 0,
    a random index among the first seven board cells that equal 0 is
    returned instead. ``configuration`` is accepted but not used.
    """
    scores = model(torch.tensor(observation.board, dtype=torch.float))
    best = int(scores.data.max(-1)[1])
    if observation.board[best] == 0:
        return int(best)
    # Preferred cell is occupied: fall back to a random open one.
    open_cells = np.argwhere(np.array(observation.board)[:7] == 0)
    return int(random.choice(open_cells)[0])
# Five evaluation runs of my_agent against the built-in 'random' opponent.
# The last argument (100) is presumably the number of games per run -- TODO
# confirm against metrics.mean_reward_with_enemy.
for i in range(5):
    mean_reward(my_agent, 'random', 100)

My Agent vs Random Agent: 0.86
My Agent vs Random Agent: 0.79
My Agent vs Random Agent: 0.82
My Agent vs Random Agent: 0.76
My Agent vs Random Agent: 0.855


In [80]:
# Five runs of 'negamax' against itself. The recorded output is still labelled
# "My Agent vs Random Agent", so that label is not derived from the arguments.
for i in range(5):
    mean_reward('negamax', 'negamax', 1)

My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0


In [53]:
# Second, newly constructed Model; it backs the opponent agent `my_agent_2` below.
model_2 = Model()
def my_agent_2(observation, configuration):
    """Pick a ConnectX move with the notebook-global ``model_2``.

    Same decision rule as ``my_agent``: take the index with the highest
    output of ``model_2``; if the board cell at that index is not 0, return a
    random index among the first seven board cells that equal 0.
    ``configuration`` is accepted but not used.
    """
    scores = model_2(torch.tensor(observation.board, dtype=torch.float))
    best = int(scores.data.max(-1)[1])
    if observation.board[best] == 0:
        return int(best)
    # Preferred cell is occupied: fall back to a random open one.
    open_cells = np.argwhere(np.array(observation.board)[:7] == 0)
    return int(random.choice(open_cells)[0])
# Five evaluation runs of my_agent against my_agent_2 (passed as a callable,
# unlike the string-named opponents used in other cells).
for i in range(5):
    mean_reward(my_agent, my_agent_2, 100)

My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0
My Agent vs Random Agent: 0.0


In [55]:
# Play one ConnectX game between the two agents in a new environment and
# render the resulting board.
env = make("connectx")
env.run([my_agent, my_agent_2])
env.render()

+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 0 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 1 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 1 | 0 | 0 |
+---+---+---+---+---+---+---+
| 2 | 0 | 0 | 0 | 1 | 1 | 0 |
+---+---+---+---+---+---+---+

