In [1]:
import torch
from torch import nn
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import copy
from collections import deque
import random

class QWOP_Agent:
    def __init__(self, q_net, lr, sync_freq, exp_replay_size):
        self.q_net = q_net
        self.target_net = copy.deepcopy(self.q_net)

        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=lr)

        self.network_sync_freq = sync_freq
        self.network_sync_counter = 0
        self.gamma = 0.9
        self.experience_replay = deque(maxlen=exp_replay_size)
        self.max_exp_replay_size = exp_replay_size

        self.device = torch.device("cpu")
    
    def to(self, device):
        self.device = device
        self.q_net.to(device)
        self.target_net.to(device)
        return self
    
    def load(self, model_path="models/dqn.pth"):
        self.q_net.load_state_dict(torch.load(model_path))
        self.target_net = copy.deepcopy(self.q_net)
        return self

    def save(self, model_path="models/dqn.pth"):
        torch.save(self.q_net.state_dict(), model_path)
        return self
    
    def get_q(self, state):
        with torch.no_grad():
            qp = self.target_net(state)
            
            return torch.max(qp, axis=0)[0]
    
    def get_action(self, state, temperature=0, epsilon=0):
        with torch.no_grad():
            Qp = self.q_net(state)
        
            if temperature > 0:
                # use boltzman exploration
                A = torch.multinomial(torch.softmax(Qp / temperature, 0), num_samples=1)
            elif epsilon > 0 and torch.rand(1, ).item() < epsilon:
                # epsilon greedy
                A = torch.randint(0, len(Qp), (1,))
            else:
                # best move
                A = torch.max(Qp, axis=-1)[1]
            
            return A
    
    def collect_experience(self, experience):
        self.experience_replay.append(experience)

    def sample_experience(self, sample_size):
        if len(self.experience_replay) < sample_size:
            sample_size = len(self.experience_replay)
        return random.sample(self.experience_replay, sample_size)
        s = torch.stack([exp[0] for exp in sample]).float()
        a = torch.tensor([exp[1] for exp in sample]).long()
        rn = torch.tensor([exp[2] for exp in sample]).float()
        sn = torch.stack([exp[3] for exp in sample]).float()
        return s, a, rn, sn

    def train(self, batch_size):
        samples = self.sample_experience(batch_size)
        if self.network_sync_counter == self.network_sync_freq:
            self.optimizer.zero_grad()
            self.target_net.load_state_dict(self.q_net.state_dict())
            self.network_sync_counter = 0

        loss = 0

        for s, a, rn, sn in samples:
            # predict expected return of current state using main network
            pred_return = self.q_net(s.to(self.device))[a]

            # get target return using target network
            q_next = self.get_q(sn.to(self.device))
            target_return = rn + q_next * self.gamma

        # print(qp, pred_return, pred_return.shape, target_return.shape)

            loss += self.loss_fn(pred_return, target_return)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.network_sync_counter += 1
        return loss.item()

In [3]:

class QWOP_Agent:
    def __init__(self, q_net, lr, sync_freq, exp_replay_size, input_shape):
        self.q_net: nn.Module = q_net
        self.target_net = copy.deepcopy(self.q_net)

        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=lr)

        self.optim_step_freq = 64
        self.optim_step_counter = 0
        self.network_sync_freq = sync_freq
        self.network_sync_counter = 0
        self.gamma = 0.9

        self.input_shape = input_shape
        self.state_len = input_shape[0]
        # experience
        self.states = torch.zeros(
            (exp_replay_size+self.state_len, *input_shape[1:]),
            device=device)
        self.actions = -torch.ones(
            (exp_replay_size+self.state_len-1,),
            device=device, dtype=torch.long)
        self.rewards = torch.zeros(
            (exp_replay_size,),
            device=device)

        self.loc = - self.state_len + 1 # set to zero at initialization
        self.experience_filled = False

    def load(self, model_path="models/dqn.pth"):
        self.q_net.load_state_dict(torch.load(model_path)).to(device)
        self.target_net = copy.deepcopy(self.q_net)
        return self

    def save(self, model_path="models/dqn.pth"):
        torch.save(self.q_net.state_dict(), model_path)
        return self

    def get_q(self, state):
        with torch.no_grad():
            qp = self.target_net(state)
            return torch.max(qp, axis=0)[0]

    def get_action(self, state, temperature=0, epsilon=0):
        with torch.no_grad():
            Qp = self.q_net(state)

            if temperature > 0:
                # use boltzman exploration
                A = torch.multinomial(torch.softmax(
                    Qp / temperature, 0), num_samples=1)
            elif epsilon > 0 and torch.rand(1, ).item() < epsilon:
                # epsilon greedy
                A = torch.randint(0, len(Qp), (1,))
            else:
                # best move
                A = torch.max(Qp, axis=-1)[1]

            return A

    def collect_experience(self, state, action, reward):
        # - 0 1 2 3 4 ... 
        # S 1 1 1 1 2 ...
        # A 1 2 ...
        # R 1 2 ...

        # states can be set beyond the experience buffer size, it is then copied to the front
        self.states[self.loc+self.state_len-1] = state
        if self.loc == len(self.rewards):
            self.states[:self.state_len] = self.states[-self.state_len:]
            self.actions[self.loc] = action
            self.loc = 0
            self.experience_filled = True

        self.actions[self.loc] = action
        # invalidate the next |self.state_len| experiences
        self.actions[self.loc+1:self.loc+self.state_len] = -1
        self.rewards[self.loc] = reward

        self.loc += 1

    def initialize(self, state):
        self.loc += self.state_len - 1
        if self.loc >= len(self.rewards):
            self.loc = -1
        
        for i in range(self.state_len-1):
            self.states[self.loc+i] = state

    def train(self):
        # pick random index
        max_idx = len(self.rewards) if self.experience_filled else self.loc
        if max_idx == 0:
            return 0
        index = torch.randint(0, max_idx, (1,))
        if self.actions[index] == -1 or index == self.loc:
            # index invalid, unlucky, next...
            return 0

        # cloned for grad
        state = torch.clone(self.states[index:index+self.state_len])
        next_state = self.states[index+1:index+self.state_len+1]
        action = self.actions[index]
        reward = self.rewards[index]

        pred_return = self.q_net(state)[action]
        q_next = self.get_q(next_state)
        target_return = reward + q_next * self.gamma

        loss = self.loss_fn(pred_return, target_return)
        loss.backward()

        # step optimizer every self.batch_size trains
        self.optim_step_counter += 1
        if self.optim_step_counter >= self.optim_step_freq:
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.optim_step_counter = 0

            self.network_sync_counter += 1
        # sync target_net every self.network_sync_freq times
        # do this not in the same step as optimzer step
        elif self.network_sync_counter >= self.network_sync_freq:
            self.target_net.load_state_dict(self.q_net.state_dict())
            self.network_sync_counter = 0

        return loss.item()

In [4]:
q_net = nn.Sequential(
    nn.Conv2d(3, 8, 4, 2),
    nn.LeakyReLU(),
    nn.Conv2d(8, 16, 4, 2),
    nn.LeakyReLU(),
    nn.Conv2d(16, 32, 4, 2),
    nn.LeakyReLU(),
    nn.Flatten(start_dim=0),
    nn.Linear(23040, 5760), # / 4
    nn.LeakyReLU(),
    nn.Linear(5760, 1440), # / 4
    nn.LeakyReLU(),
    nn.Linear(1440, 360), # / 4
    nn.LeakyReLU(),
    nn.Linear(360, 9)
).to(device)
input_shape = (4, 3, 100, 160)
agent = QWOP_Agent(q_net, 1e-4, 100, 10000, input_shape)# .load()
state = torch.randn(input_shape).to(device)
agent.get_action(state, epsilon=0.0)

agent.state_len

4

In [5]:
from qwop_env import QWOP_Env
# from qwop_env_multi import QWOP_Env_Multi
import game_host

game_host.start()
env = QWOP_Env(headless=False)

In [6]:
env.reset().shape

(3, 100, 160)

In [7]:
from tqdm import tqdm

def train(env, agent: QWOP_Agent, episodes=20000, epsilons=(0.4, 0.05, 2e-5)):
    epsilon = epsilons[0]

    index = 0
    pbar = tqdm(range(episodes))
    for i in pbar:
        obs, done, losses, ep_len, rew = env.reset(), False, 0, 0, 0
        
        obs_list = deque([obs] * 4, maxlen=4)
        # print("init")
        agent.initialize(torch.tensor(obs, dtype=float, device=device))

        experience = None
        for _ in range(300):
            ep_len += 1
            obs = torch.tensor(list(obs_list), dtype=torch.float32, device=device) # np.array(obslist)).to(device)
            A = agent.get_action(obs, epsilon=epsilon)
            obs_next, reward, done = env.step(A)

            obs_list.append(obs_next)
            obs_next = torch.tensor(list(obs_list), dtype=torch.float32, device=device)
            
            if experience != None:
                experience[2] = reward
                agent.collect_experience(*experience)
                # print(agent.states.sum(dim=(1, 2, 3)))
                # print(agent.actions)
                # print(agent.rewards)
                # print()

                loss = agent.train()
                losses += loss
            
            experience = [obs[-1], A.item(), reward]

            rew += reward

            if epsilon > epsilons[1]:
                epsilon -= epsilons[2]
                pbar.set_postfix({"rew": rew, "eps": epsilon})
            
            if done: # experience is for state only since next action will be invalid
                agent.collect_experience(*experience)
                break

            pbar.set_postfix({"rew": rew, "eps": epsilon})
        

        agent.save()
        # agent.save("models/dqn_backup.pth")

        pbar.set_postfix({"rew": rew, "eps": epsilon})

train(env, agent, epsilons=(0.350, 0.05, 4e-7))

  obs = torch.tensor(list(obs_list), dtype=torch.float32, device=device) # np.array(obslist)).to(device)
  thighs = action // 3
100%|██████████| 20000/20000 [33:33:40<00:00,  6.04s/it, rew=14.3, eps=0.351]       


In [8]:
# transfer joint trainig to image
image_net = nn.Sequential(
    nn.Conv2d(3, 8, 4, 2),
    nn.LeakyReLU(),
    nn.Conv2d(8, 16, 4, 2),
    nn.LeakyReLU(),
    nn.Conv2d(16, 32, 4, 2),
    nn.LeakyReLU(),
    nn.Flatten(start_dim=0),
    nn.Linear(23040, 5760), # / 4
    nn.LeakyReLU(),
    nn.Linear(5760, 1440), # / 4
    nn.LeakyReLU(),
    nn.Linear(1440, 360), # / 4
    nn.LeakyReLU(),
    nn.Linear(360, 9)
)

for _ in range(1000):
    obs = torch.tensor(obs, dtype=torch.float32, device=device)
    A = agent.get_action(obs.unsqueeze(0), epsilon=0.5)

    Qp = agent.q_net(state)
    obs_next, reward, done = env.step(A)

    # obslist.popleft()
    # obslist.append(obs_next)
    obs_next = torch.tensor(obs_next, dtype=torch.float32, device=device)
    agent.collect_experience([obs, A.item(), reward, obs_next])

NameError: name 'obs' is not defined