In [1]:
import torch
from torch import nn
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import copy
from collections import deque
import random

class QWOP_Agent:
    def __init__(self, q_net, lr, sync_freq, exp_replay_size):
        self.q_net = q_net
        self.target_net = copy.deepcopy(self.q_net)

        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=lr)

        self.network_sync_freq = sync_freq
        self.network_sync_counter = 0
        self.gamma = 0.9
        self.experience_replay = deque(maxlen=exp_replay_size)
        self.max_exp_replay_size = exp_replay_size

        self.device = torch.device("cpu")
    
    def to(self, device):
        self.device = device
        self.q_net.to(device)
        self.target_net.to(device)
        return self
    
    def load(self, model_path="models/dqn.pth"):
        self.q_net.load_state_dict(torch.load(model_path))
        return self

    def save(self, model_path="models/dqn.pth"):
        torch.save(self.q_net.state_dict(), model_path)
        return self
    
    def get_q(self, state):
        with torch.no_grad():
            qp = self.target_net(state)
            
            return torch.max(qp, axis=1)[0]
    
    def get_action(self, state, temperature=0, epsilon=0):
        with torch.no_grad():
            Qp = self.q_net(state)
        
            if temperature > 0:
                # use boltzman exploration
                A = torch.multinomial(torch.softmax(Qp / temperature, 0), num_samples=1)
            elif epsilon > 0 and torch.rand(1, ).item() < epsilon:
                # epsilon greedy
                A = torch.randint(0, len(Qp[0]), (1,))
            else:
                # best move
                A = torch.max(Qp, axis=-1)[1]
            
            return A
    
    def collect_experience(self, experience):
        self.experience_replay.append(experience)

    def sample_experience(self, sample_size):
        if len(self.experience_replay) < sample_size:
            sample_size = len(self.experience_replay)
        sample = random.sample(self.experience_replay, sample_size)
        s = torch.stack([exp[0] for exp in sample]).float()
        a = torch.tensor([exp[1] for exp in sample]).long()
        rn = torch.tensor([exp[2] for exp in sample]).float()
        sn = torch.stack([exp[3] for exp in sample]).float()
        return s, a, rn, sn

    def train(self, batch_size):
        s, a, rn, sn = self.sample_experience(batch_size)
        if self.network_sync_counter == self.network_sync_freq:
            self.optimizer.zero_grad()
            self.target_net.load_state_dict(self.q_net.state_dict())
            self.network_sync_counter = 0

        # predict expected return of current state using main network
        qp = self.q_net(s.to(self.device))
        pred_return = qp[range(batch_size), a]

        # get target return using target network
        q_next = self.get_q(sn.to(self.device))
        target_return = rn.to(self.device) + q_next * self.gamma

        # print(qp, pred_return, pred_return.shape, target_return.shape)

        loss = self.loss_fn(pred_return, target_return)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.network_sync_counter += 1
        return loss.item()

In [3]:
q_net = nn.Sequential(
    nn.Linear(71, 128),
    nn.LeakyReLU(),
    nn.Linear(128, 128),
    nn.LeakyReLU(),
    nn.Linear(128, 128),
    nn.LeakyReLU(),
    nn.Linear(128, 9)
)
agent = QWOP_Agent(q_net, 1e-4, 100, 20000).load().to(device)
state = torch.randn(71).to(device)
agent.get_action(state, epsilon=0.0)

tensor(5, device='cuda:0')

In [4]:
from qwop_env import QWOP_Env
# from qwop_env_multi import QWOP_Env_Multi
import game_host

game_host.start()
env = QWOP_Env(headless=False)

In [8]:
from tqdm import tqdm

def train(env, agent, episodes=20000, epsilons=(0.4, 0.05, 1e-5)):
    epsilon = epsilons[0]

    index = 0
    pbar = tqdm(range(episodes))
    for i in pbar:
        obs, done, losses, ep_len, rew = env.reset(), False, 0, 0, 0
        # for _ in range(4): 
        #     obslist.append(obs)
    
        for _ in range(1000):
            ep_len += 1
            obs = torch.tensor(obs, dtype=torch.float32, device=device) # np.array(obslist)).to(device)
            A = agent.get_action(obs.unsqueeze(0), epsilon=epsilon)
            obs_next, reward, done = env.step(A)

            # obslist.popleft()
            # obslist.append(obs_next)
            obs_next = torch.tensor(obs_next, dtype=torch.float32, device=device)
            agent.collect_experience([obs, A.item(), reward, obs_next])

            rew += reward
            index += 1

            if index > 128:
                index = 0
                loss = agent.train(64)
                losses += loss

                if epsilon > epsilons[1]:
                    epsilon -= epsilons[2]
                    # env.feet_calf_limit = max(0.6, 1e-6)
                    pbar.set_postfix({"rew": rew, "eps": epsilon})
            
            if done:
                break

            pbar.set_postfix({"rew": rew, "eps": epsilon})
        

        agent.save()
        agent.save("models/dqn_backup.pth")

        pbar.set_postfix({"rew": rew, "eps": epsilon})

train(env, agent) #, epsilons=(0, 0, 0))

  obs = torch.tensor(obs, dtype=torch.float32, device=device) # np.array(obslist)).to(device)
  0%|          | 0/20000 [00:16<?, ?it/s, rew=22]           


KeyboardInterrupt: 