In [2]:
!pip install kaggle-environments -U > /dev/null 2>&1s
!cp -r ../input/lux-ai-2021/* .

In [3]:
import numpy as np
import json
from pathlib import Path
import os
import random
from tqdm.notebook import tqdm
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.model_selection import train_test_split
import math
from itertools import count

In [4]:
from lux.constants import Constants
from lux.game import Game
from kaggle_environments import make

### 0) Set Inputs

In [5]:
# Global Variables
# < SYSTEM >
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CUDA when available, otherwise CPU
SAVE_PATH = './policy_network'  # destination handed to torch.save by optimizeModel

# < TRAINING >
# Epsilon-greedy schedule evaluated in get_action:
#   eps = EPS_END + (EPS_START - EPS_END) * exp(-GAME_STEP / EPS_DECAY)
EPS_START = 0.5
EPS_END = 0.01
EPS_DECAY = 200
BATCH_SIZE = 32  # transitions sampled per update; optimizeModel only runs when STEPS_DONE is a multiple of it
NUM_EPOCHS = 1000  # number of episodes played by trainModel
GAMMA = 0.999  # discount applied to the next-state value in optimizeModel
GAME_STEP = 0  # incremented by optimizeModel on every update; drives the epsilon decay
STEPS_DONE = 0  # incremented by trainModel after every stored transition
REPLAY_CAPACITY = 1000  # maximum number of transitions kept in the replay memory
TARGET_UPDATE = 10  # trainModel copies policyNet into targetNet every TARGET_UPDATE epochs
LEARNING_RATE = 1e-2  # learning rate given to the AdamW optimizer
BEST_ACCURACY = 0  # best accuracy seen so far; updated by optimizeModel

# < MAP >
N_ACTIONS = 5  # number of entries in unit_actions / outputs of the network
MAX_WIDTH = 32  # width of the padded feature planes built by updateMap
MAX_HEIGHT = 32  # height of the padded feature planes built by updateMap
WIDTH = 24  # map width used for the training environment
HEIGHT = 24  # map height used for the training environment

In [6]:
game_state = None
def get_game_state(observation):
    """Bring the module-level `game_state` in line with `observation` and return it.

    On step 0 a new `Game` is created, initialised from the update list, fed
    the updates from index 2 onwards and tagged with the observing player's
    id. On every later step the existing object simply receives the updates.
    """
    global game_state

    if observation["step"] != 0:
        # Regular turn: apply this turn's updates to the existing state.
        game_state._update(observation["updates"])
        return game_state

    # First turn: build the state from scratch.
    game_state = Game()
    game_state._initialize(observation["updates"])
    game_state._update(observation["updates"][2:])
    game_state.id = observation["player"]
    return game_state


def in_city(pos):
    """Return True when `pos` holds a city tile owned by the current player.

    The lookup goes through the module-level `game_state`. Any ordinary
    failure while resolving the cell (for example a missing or out-of-range
    position) is treated as "not in a city" and yields False.
    """
    try:
        city = game_state.map.get_cell_by_pos(pos).citytile
        return city is not None and city.team == game_state.id
    except Exception:
        # Best-effort lookup: only ordinary errors are absorbed, so that
        # KeyboardInterrupt / SystemExit can still stop a long training run.
        return False


def call_func(obj, method, args=()):
    """Call `obj.<method>(*args)` and return its result.

    Args:
        obj: object that owns the method.
        method: name of the method to look up with getattr.
        args: iterable of positional arguments (empty by default). The
            default is an immutable tuple so it cannot be shared and mutated
            between calls.
    """
    return getattr(obj, method)(*args)


unit_actions = [('move', 'n'), ('move', 's'), ('move', 'w'), ('move', 'e'), ('build_city',)]
def get_action(policy, unit, dest):
    """Choose a command for `unit` using epsilon-greedy exploration.

    A single random draw decides whether this call exploits the policy or
    explores. Labels are visited from the highest to the lowest score; when
    exploring, every visit picks a random entry of `unit_actions` instead.
    The first candidate whose target cell is not already listed in `dest`
    (or lies in one of the player's cities) is issued.

    Args:
        policy: per-action scores, one per entry of `unit_actions`.
        unit: the unit to command; must offer `pos`, `move` and the methods
            named in `unit_actions`.
        dest: positions already claimed by previously handled units.

    Returns:
        Tuple of (command produced by the unit, position the unit targets).
        Falls back to a centre move on the current position when every
        candidate was rejected.
    """
    sample = random.random()
    decay = math.exp(-1. * GAME_STEP / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    greedy = sample > eps_threshold

    for label in np.argsort(policy)[::-1]:
        # Exploit the ranked label, or add noise with a random action.
        if greedy:
            act = unit_actions[label]
        else:
            act = unit_actions[random.randrange(N_ACTIONS)]
        # Non-move actions give no translated position, so stay in place.
        pos = unit.pos.translate(act[-1], 1) or unit.pos

        # Skip targets another unit already claimed, unless it is a city tile.
        if pos in dest and not in_city(pos):
            continue

        if greedy:
            print( f'[GET EXEC ACTION] policy {label} at {pos}/[{dest}] - act:{act}' )
        else:
            print( f'[GET RAND ACTION] policy {label} at {pos}/[{dest}] - act:{act}' )
        return call_func(unit, *act), pos

    print( f'[GET ACTION] move center {label} at {pos}/[{dest}] - act:{act}' )
    return unit.move('c'), unit.pos


def agent(observation, configuration):
    """Kaggle agent entry point: return the list of commands for this turn.

    City tiles build a worker while the player has fewer units than city
    tiles, otherwise they research until uranium is unlocked. Every unit that
    can act (and is not sheltering in a city during the last 10 turns of a
    40-turn cycle) gets an action from a CPU copy of `targetNet`.

    Args:
        observation: environment observation; read for 'step', 'player',
            'width', 'height' and 'updates'.
        configuration: environment configuration (unused).

    Returns:
        List of command strings produced by the lux kit objects.
    """
    global game_state, policyNet, targetNet
    model = CLuxNet(N_ACTIONS)
    game_state = get_game_state(observation)
    # `game_state` is a module global shared by every agent in this process,
    # and its id is only written on step 0. Re-assert it so in_city() checks
    # the calling player's team when the same function plays both sides.
    game_state.id = observation.player
    player = game_state.players[observation.player]
    actions = []

    # City Actions
    unit_count = len(player.units)
    for city in player.cities.values():
        for city_tile in city.citytiles:
            if city_tile.can_act():
                if unit_count < player.city_tile_count:
                    actions.append(city_tile.build_worker())
                    unit_count += 1
                elif not player.researched_uranium():
                    actions.append(city_tile.research())
                    player.research_points += 1

    # Worker Actions
    dest = []
    model.load_state_dict(targetNet.state_dict())
    # A freshly built module is in training mode; switch to eval so that
    # BatchNorm uses the loaded running statistics rather than the statistics
    # of the single sample being evaluated.
    model.eval()

    # Loop-invariant inputs for the feature planes: the map is centred inside
    # the MAX_WIDTH x MAX_HEIGHT canvas.
    nStep: int = observation['step']
    nXShift: int = (MAX_WIDTH - observation['width']) // 2
    nYShift: int = (MAX_HEIGHT - observation['height']) // 2
    nTeam: int = observation.player

    for unit in player.units:
        if unit.can_act() and (game_state.turn % 40 < 30 or not in_city(unit.pos)):
            # Team-relative planes must be built from the acting player's
            # point of view, not always from team 0.
            state = updateMap(nStep,
                              nXShift,
                              nYShift,
                              nTeam,
                              unit.id,
                              observation['updates'])
            with torch.no_grad():
                p = model(torch.from_numpy(state).float().unsqueeze(0))

            policy = p.squeeze(0).numpy()

            action, pos = get_action(policy, unit, dest)
            actions.append(action)
            dest.append(pos)

    return actions

In [7]:
INPUT_CONSTANTS = Constants.INPUT_CONSTANTS
RESOURCE_TYPES = Constants.RESOURCE_TYPES

def updateMap(nStep: int,
              nXShift: int,
              nYShift: int,
              nTeam: int,
              sUId: str,
              updateList: list) -> np.ndarray:
    """Encode one turn's update strings as a (20, MAX_WIDTH, MAX_HEIGHT) array.

    Plane layout (team-relative planes put `nTeam` first):
        0:2    research points (nTeam, other team), broadcast over the map
        2:5    resource amount (wood, coal, uranium)
        5:7    the controlled unit `sUId` (presence, cargo)
        7:10   other units of nTeam (presence, cooldown, cargo)
        10:13  units of the other team (presence, cooldown, cargo)
        13:17  city tiles (nTeam presence, fuel ratio, other presence, fuel ratio)
        17     position inside the 40-turn day/night cycle
        18     game progress (step / 360)
        19     mask of the playable area inside the padded canvas

    Args:
        nStep: current game step.
        nXShift: x offset that centres the map in the padded canvas.
        nYShift: y offset that centres the map in the padded canvas.
        nTeam: team whose point of view is encoded.
        sUId: id of the unit the network is deciding for.
        updateList: space-separated update strings for this turn.

    Returns:
        numpy array of shape (20, MAX_WIDTH, MAX_HEIGHT).
    """
    rpStart = 0
    rStart = 2
    uStart = 5
    ctStart = 13  # city tiles follow the 8 unit planes (5:13)
    resourceOffset = {'wood': 0, 'coal': 1, 'uranium': 2}

    gameMap = np.zeros((20, MAX_WIDTH, MAX_HEIGHT))
    cityDict: dict = {}

    for update in updateList:
        cmdList: list[str] = update.split(' ')

        sIdentifier: str = cmdList[0]
        if INPUT_CONSTANTS.RESEARCH_POINTS == sIdentifier:
            team = int(cmdList[1])
            rp = int(cmdList[2])
            idx = rpStart + (team - nTeam) % 2
            gameMap[idx, :] = min(rp, 200) / 200

        elif INPUT_CONSTANTS.RESOURCES == sIdentifier:
            rtype = cmdList[1]
            x = int(cmdList[2]) + nXShift
            y = int(cmdList[3]) + nYShift
            amt = int(float(cmdList[4]))
            idx = rStart + resourceOffset[rtype]
            gameMap[idx, x, y] = amt / 800

        elif INPUT_CONSTANTS.UNITS == sIdentifier:
            team = int(cmdList[2])
            uid = cmdList[3]
            # Unit coordinates need the same centring shift as every other
            # spatial feature, otherwise they land on the wrong cells.
            x = int(cmdList[4]) + nXShift
            y = int(cmdList[5]) + nYShift
            cooldown = float(cmdList[6]) / 6.0
            wood = int(cmdList[7])
            coal = int(cmdList[8])
            uranium = int(cmdList[9])
            resources = (wood + coal + uranium) / 100

            if sUId == uid:
                idx = uStart
                gameMap[idx:idx+2, x, y] = (1, resources)
            else:
                # Three planes per team, nTeam's units first, so friendly
                # and enemy units are distinguishable and nothing spills
                # into the city-tile planes.
                idx = uStart + 2 + (team - nTeam) % 2 * 3
                gameMap[idx:idx+3, x, y] = (1, cooldown, resources)

        elif INPUT_CONSTANTS.CITY == sIdentifier:
            cid: str = cmdList[2]
            fuel = float(cmdList[3])
            lightupkeep = float(cmdList[4])
            # Turns of fuel left, capped at 10 and scaled to [0, 1]; looked
            # up later by this city's tiles.
            cityDict[cid] = min(fuel / lightupkeep, 10) / 10

        elif INPUT_CONSTANTS.CITY_TILES == sIdentifier:
            team = int(cmdList[1])
            cid: str = cmdList[2]
            x = int(cmdList[3]) + nXShift
            y = int(cmdList[4]) + nYShift
            idx = ctStart + (team - nTeam) % 2 * 2
            gameMap[idx:idx+2, x, y] = (1, cityDict[cid])

        elif INPUT_CONSTANTS.ROADS == sIdentifier:
            # Road levels are not part of the feature planes.
            pass

    # Day/Night Cycle
    gameMap[17, :] = nStep % 40 / 40
    # Turns
    gameMap[18, :] = nStep / 360
    # Map Size
    gameMap[19, nXShift:MAX_WIDTH-nXShift, nYShift:MAX_HEIGHT-nYShift] = 1

    return gameMap
    

In [8]:
def toLabel(player, action):
    """Turn a step's action list into a `(unit_id, label)` pair.

    Only the first command of `action` is inspected. Moves map to
    n=0, s=1, w=2, e=3 (a centre move gives None), `bcity` maps to 4 and any
    other command gives None. With no action at all, the id is the
    placeholder `u_<player>` and the label is None.
    """
    if action is None or len(action) < 1:
        return f'u_{player}', None

    tokens = action[0].split(' ')
    unit_id = tokens[1]
    command = tokens[0]

    if command == 'bcity':
        return unit_id, 4
    if command != 'm':
        return unit_id, None

    direction_labels = {'c': None, 'n': 0, 's': 1, 'w': 2, 'e': 3}
    return unit_id, direction_labels[tokens[2]]

def depletedResources(obs):
    """Return True when none of `obs['updates']` is a resource (`r ...`) entry."""
    has_resource = any(entry.split(' ')[0] == 'r' for entry in obs['updates'])
    return not has_resource

### 1) Network

In [9]:
class CBasicConv2d(nn.Module):
    """2-D convolution with 'same' padding and optional batch normalisation."""

    def __init__(self, input_dim, output_dim, kernel_size, bn):
        """
        Args:
            input_dim: number of input channels.
            output_dim: number of output channels.
            kernel_size: (height, width) of the kernel; padding is half of
                each side so the spatial size is preserved for odd kernels.
            bn: when truthy, a BatchNorm2d layer follows the convolution.
        """
        super().__init__()
        self.conv = nn.Conv2d(
            input_dim, output_dim,
            kernel_size=kernel_size,
            padding=(kernel_size[0] // 2, kernel_size[1] // 2)
        )
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = self.conv(x)
        h = self.bn(h) if self.bn is not None else h
        return h

class CLuxNet(nn.Module):
    """Residual convolutional network that scores the unit actions.

    The 20 input planes pass through a stem convolution and 12 residual
    convolution blocks. The resulting features are pooled at the cells marked
    in the controlled-unit presence plane and projected to `nActions` scores.
    """

    def __init__( self, nActions, unitChannel=5 ):
        """
        Args:
            nActions: number of output scores.
            unitChannel: index of the input plane that marks the controlled
                unit's cell (updateMap writes it at plane 5).
        """
        super().__init__()
        layers, filters = 12, 32
        self.unitChannel = unitChannel
        self.conv = CBasicConv2d(20, filters, (3, 3), True)
        self.blocks = nn.ModuleList([
            CBasicConv2d(filters, filters, (3, 3), True) for _ in range( layers )
        ])
        self.head = nn.Linear( filters, nActions, bias=False )

    def forward(self, x):
        """Return a (batch, nActions) tensor of scores for input (batch, 20, H, W)."""
        h = F.relu_( self.conv(x) )
        for b in self.blocks:
            h = F.relu_( h + b( h ) )
        # Pool the features at the controlled unit's cell. Plane 0 holds the
        # broadcast research points, so masking with it would zero the output
        # whenever research is 0 and ignore where the unit actually is.
        unitMask = x[:, self.unitChannel:self.unitChannel + 1]
        h = ( h * unitMask ).view( h.size(0), h.size(1), -1 ).sum(-1)
        ret = self.head( h )
        return ret

### 2) Replay Memory

In [10]:
# Input for ReplayMemory
from collections import namedtuple, deque
# One stored transition. As used by trainModel:
#   state      - feature planes before the action
#   action     - label of the action that was taken
#   next_state - feature planes after the action
#   reward     - reward reported for the step
Data = namedtuple('Data', ['state', 'action', 'next_state', 'reward'])


class CReplayMemory(object):
    """Bounded FIFO buffer of `Data` transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen silently drops the oldest entry once full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition built from (state, action, next_state, reward)."""
        transition = Data(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return `batch_size` distinct transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)
    
# Module-level replay buffer that trainModel pushes transitions into; keeps at most REPLAY_CAPACITY entries.
memory = CReplayMemory(REPLAY_CAPACITY)

### 3) Select Action w/ noise

### 4) Optimize Model

In [11]:
def optimizeModel( memory: CReplayMemory, \
                   policyNet: CLuxNet, \
                   targetNet: CLuxNet, \
                   optimizer ) -> None:
    """Run one optimisation step of `policyNet` on a batch from `memory`.

    The step only happens when STEPS_DONE is a positive multiple of
    BATCH_SIZE and the memory holds at least BATCH_SIZE transitions. The loss
    is the cross-entropy between the policy network's scores and the actions
    that were taken. The target network is used for reporting only: how often
    its best next-state action matches the taken action ("accuracy") and the
    range of the discounted value estimate. Whenever the accuracy improves,
    `targetNet` is saved to SAVE_PATH.

    Args:
        memory: replay buffer to sample from.
        policyNet: network being trained.
        targetNet: frozen network used for the reported statistics.
        optimizer: optimizer bound to `policyNet`'s parameters.
    """
    global BEST_ACCURACY, GAME_STEP
    # -1) return exceptions
    if STEPS_DONE == 0 or STEPS_DONE % BATCH_SIZE != 0 or len(memory) < BATCH_SIZE:
        return

    GAME_STEP += 1

    # 1) fetch a batch and transpose it into one Data of tuples
    batch = Data(*zip(*memory.sample(BATCH_SIZE)))

    # 2) build the input tensors directly on DEVICE
    # Open question from the author: how can a centre move be kept as a None
    # action and still be handled here? Such steps are currently filtered out
    # before they reach the memory.
    states = torch.stack(
        [torch.as_tensor(s, dtype=torch.float32) for s in batch.state]
    ).to(DEVICE)
    actions = torch.tensor(batch.action, dtype=torch.long, device=DEVICE)
    # Rewards may already live on the GPU, so concatenate them as tensors
    # instead of going through numpy, and keep the result of .to().
    rewards = torch.cat(
        [torch.as_tensor(r).reshape(-1) for r in batch.reward]
    ).to(DEVICE)

    # 3) next state mask
    nextStateMask = torch.tensor(
        [n_s is not None for n_s in batch.next_state],
        device=DEVICE,
        dtype=torch.bool
    )

    # 4) policy scores for the sampled states (single forward pass, so the
    #    BatchNorm running statistics are updated once per step)
    pred = policyNet(states)

    # 5) V(s_{t+1}) and greedy next action from the target network, for the
    #    transitions that have a next state
    vValue = torch.zeros(BATCH_SIZE, device=DEVICE)
    vValueIdx = torch.zeros(0, dtype=torch.long, device=DEVICE)
    validNext = [
        torch.as_tensor(n_s, dtype=torch.float32)
        for n_s in batch.next_state if n_s is not None
    ]
    if validNext:
        nextStates = torch.stack(validNext).to(DEVICE)
        with torch.no_grad():
            nextQ = targetNet(nextStates)
        vValue[nextStateMask] = nextQ.max(1)[0]
        vValueIdx = nextQ.argmax(1)

    # 6) expected value with discount rate (reported only)
    expcQValue = (vValue * GAMMA + rewards).unsqueeze(1)

    # 7) loss: cross-entropy against the actions that were taken
    # criterion = nn.SmoothL1Loss()
    criterion = nn.CrossEntropyLoss()
    loss = criterion(pred, actions)

    takenNext = actions[nextStateMask]
    print( "-------- PREDICTION ----------\n", vValue[nextStateMask], "\n", vValueIdx, "\n", takenNext )
    # Guard against a batch without any valid next state.
    if takenNext.numel() > 0:
        acc = (vValueIdx == takenNext).float().mean().item()
    else:
        acc = 0.0

    # 8) Save the best model
    if BEST_ACCURACY < acc:
        BEST_ACCURACY = acc
        torch.save(targetNet, SAVE_PATH)

    # 9) Log
    print(f':: {STEPS_DONE} STEP :: Acc({acc}/{BEST_ACCURACY}), loss({loss}), QValue({expcQValue.min()}, {expcQValue.max()})')

    # 10) Optimize the model, clamping every gradient element to [-1, 1]
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policyNet.parameters(), 1.0)
    optimizer.step()

### 5) Training

In [12]:
LOG_LEVEL = 1 # 1 - ERROR / 2 - WARN / 3 - INFO
def trainModel(policyNet: CLuxNet, \
               targetNet: CLuxNet, \
               optimizer, \
               width: int, \
               height: int) -> None:
    """Play NUM_EPOCHS self-play episodes and train `policyNet` on them.

    Each episode is played by `agent` against itself. The recorded steps of
    the first player are replayed: every step with a usable action label is
    stored in the global replay `memory` as (previous planes, action, current
    planes, reward) and followed by a call to `optimizeModel`. Every
    TARGET_UPDATE epochs the target network is synchronised with the policy
    network.

    Args:
        policyNet: network being trained (put in train mode).
        targetNet: target network (put in eval mode).
        optimizer: optimizer bound to `policyNet`'s parameters.
        width: map width requested from the environment.
        height: map height requested from the environment.
    """
    global STEPS_DONE

    # Follow DEVICE instead of forcing CUDA so the loop also runs on CPU.
    targetNet.to(DEVICE)
    policyNet.to(DEVICE)

    targetNet.eval()
    policyNet.train()

    for epoch in range(NUM_EPOCHS):
        env = make("lux_ai_2021", configuration={"width": width, "height": height, "loglevel": LOG_LEVEL, "annotations": True}, debug=True)
        steps = env.run([agent, agent])

        xShift, yShift = 0, 0
        nextState = torch.zeros((20, MAX_WIDTH, MAX_HEIGHT), dtype=torch.float32)
        for s in steps:
            # Each recorded step is a list with one entry per agent; the
            # first player's entry is the one that is replayed.
            step = s[0]

            observation = step['observation']

            nStep: int = observation['step']
            nTeam: int = observation['player']

            # Work out the centring offsets on the first step, before steps
            # without an action are skipped. The first step usually carries
            # no action, and skipping first would leave the offsets at zero
            # for the whole episode.
            if nStep == 0:
                mapWidth, mapHeight = observation['width'], observation['height']
                xShift, yShift = (MAX_WIDTH - mapWidth) // 2, (MAX_HEIGHT - mapHeight) // 2

            sUId, action = toLabel(nTeam, step['action'])
            if action is None:
                continue

            state = nextState
            reward = torch.tensor([step['reward'] if step['reward'] is not None else 0], device=DEVICE)
            nextState = updateMap(nStep, xShift, yShift, nTeam, sUId, observation['updates'])

            memory.push(state, action, nextState, reward)
            optimizeModel(memory, policyNet, targetNet, optimizer)

            STEPS_DONE += 1

        if epoch % TARGET_UPDATE == 0:
            targetNet.load_state_dict(policyNet.state_dict())
            print(":: NETWORK UPDATED ::")
        

### 6) Run

In [13]:
# Network that is optimised, and a second network of the same architecture used as the target.
policyNet = CLuxNet(N_ACTIONS).to(DEVICE)
targetNet = CLuxNet(N_ACTIONS).to(DEVICE)
# Start the target network from the policy network's weights.
targetNet.load_state_dict(policyNet.state_dict())
# Only policyNet's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(policyNet.parameters(), lr=LEARNING_RATE)

In [14]:
# Run the training loop (NUM_EPOCHS episodes) with the configured map size.
trainModel(policyNet, targetNet, optimizer, WIDTH, HEIGHT)

In [None]:
# Print the shape and one slice of the first parameter tensor of each network,
# as a quick check of whether the two sets of weights currently agree.
for net in (policyNet, targetNet):
    for firstParam in net.parameters():
        print( firstParam.shape, firstParam[0][2] )
        break

# Submission

In [None]:
from kaggle_environments import make

# Play one 24x24 self-play episode with the agent as a check before packaging.
env = make("lux_ai_2021", configuration={"width": 24, "height": 24, "loglevel": 2, "annotations": True}, debug=False)
steps = env.run([agent, agent])
# Uncomment to replay the episode inline in the notebook.
#env.render(mode="ipython", width=1200, height=800)

In [None]:
# Pack everything in the working directory into the submission archive.
!tar -czf submission.tar.gz *

In [None]:
# Call the training loop again on the same networks and optimizer (continues from their current weights).
trainModel(policyNet, targetNet, optimizer, WIDTH, HEIGHT)