In [None]:
import gym, random, os, math
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from atari_wrappers import make_atari, wrap_deepmind,LazyFrames
from tqdm import tqdm
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [None]:
# Build the Pong environment and apply the DeepMind-style wrappers in a single
# expression, so `env` is bound exactly once.
# scale=False: presumably leaves pixel values unnormalised (the agent's
# observe() divides by 255 itself) -- TODO confirm against atari_wrappers.
env = wrap_deepmind(
    make_atari('PongNoFrameskip-v4'),
    scale=False,
    frame_stack=True,
)

In [None]:
class QNetwork(nn.Module):
    """Q-value network: convolutional encoder, linear encoder and a linear Q head.

    By default both encoder stages are restored from whole-module checkpoints
    on disk, and only the Q head (``self.DQN``) starts from a fresh
    initialisation. Passing ``None`` for a checkpoint path builds that stage
    from scratch instead of loading it.

    Args:
        in_channels: Channel count of the input. Only used when the
            convolutional encoder is built from scratch.
        hidden_dim: Width of the encoder output, i.e. the input width of the
            Q head. Must match the output width of a loaded linear encoder.
        num_actions: Number of Q-values produced per input.
        conv_path: Checkpoint holding the convolutional encoder module, or
            ``None`` to build a new one.
        linear_path: Checkpoint holding the linear encoder module, or
            ``None`` to build a new one.
    """

    def __init__(self, in_channels, hidden_dim, num_actions,
                 conv_path='saved_model/daqn_pre_conv',
                 linear_path='saved_model/daqn_pre_linear'):
        super(QNetwork, self).__init__()
        # Each stage is either restored or built -- never built and then thrown away.
        if conv_path is None:
            self.encoder_conv = self._build_conv(in_channels)
        else:
            self.encoder_conv = self._load_pretrained(conv_path)
        if linear_path is None:
            self.encoder_linear = self._build_linear(hidden_dim)
        else:
            self.encoder_linear = self._load_pretrained(linear_path)
        self.DQN = nn.Linear(hidden_dim, num_actions) # bs*num_actions

    @staticmethod
    def _build_conv(in_channels):
        """Return a freshly initialised convolutional encoder."""
        # Shape comments assume an 84x84 input, which is what the 7*7*64
        # flatten size used by the linear encoder implies.
        return nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), # bs*32*20*20
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2), # bs*64*9*9
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), # bs*64*7*7
            nn.ReLU()
        )

    @staticmethod
    def _build_linear(hidden_dim):
        """Return a freshly initialised linear encoder (flattened conv features -> hidden)."""
        return nn.Sequential(
            nn.Linear(7 * 7 * 64, 512), # bs*512
            nn.ReLU(),
            nn.Linear(512, hidden_dim), # bs*hid_dim
            nn.ReLU()
        )

    @staticmethod
    def _load_pretrained(path):
        """Load a pickled ``nn.Module`` from ``path`` onto the CPU.

        Loading onto the CPU keeps checkpoints written on a GPU machine usable
        on CPU-only hosts; callers move the whole network to another device
        afterwards if they need to.
        """
        # NOTE(security): whole-module checkpoints are unpickled, which can run
        # arbitrary code -- only load files from a trusted source.
        try:
            # Releases whose torch.load defaults to weights_only=True reject
            # pickled modules unless this is disabled explicitly.
            return torch.load(path, map_location='cpu', weights_only=False)
        except TypeError:
            # Older releases do not accept the weights_only keyword.
            return torch.load(path, map_location='cpu')

    def forward(self, x):
        """Map a batch of inputs to one Q-value per action (bs*num_actions)."""
        ## encoder:input->hidden
        hidden = self.encoder_conv(x)
        hidden = torch.flatten(hidden, start_dim=1)
        hidden = self.encoder_linear(hidden)
        ## DQN:hidden->qtable
        qtable = self.DQN(hidden)

        return qtable

In [None]:
class Memory(object):
    """Fixed-capacity replay buffer that overwrites its oldest entry once full.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples in ``self.buffer``; ``self.next_idx`` is the slot the next
    overwrite will use.

    Args:
        memory_size: Maximum number of transitions kept. Must be positive.

    Raises:
        ValueError: If ``memory_size`` is not positive.
    """

    def __init__(self, memory_size=100000):
        if memory_size <= 0:
            raise ValueError("memory_size must be a positive integer")
        self.buffer = []
        self.memory_size = memory_size
        self.next_idx = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, replacing the oldest one when the buffer is full."""
        data = (state, action, reward, next_state, done)
        # Strict '<' so the buffer never grows past memory_size; with '<=' one
        # extra element would be appended and then never be overwritten.
        if len(self.buffer) < self.memory_size:
            self.buffer.append(data)
        else: # buffer is full
            self.buffer[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.memory_size

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def __len__(self):
        """Allow ``len(memory)`` as a synonym for ``memory.size()``."""
        return len(self.buffer)

In [None]:
class DAQNAgent:
    """DQN-style agent: behaviour network, target network and replay memory.

    Args:
        in_channels: Channel count handed to both Q-networks.
        action_space: Action space object; it must expose an integer ``n``
            attribute (the placeholder default ``[]`` does not, so a real
            action space has to be supplied).
        hidden_dim: Hidden width handed to both Q-networks.
        USE_CUDA: When true, networks and tensors are moved to the GPU.
        memory_size: Capacity of the replay memory.
        epsilon: Default exploration probability used by ``act``.
        lr: Learning rate of the Adam optimiser.
    """

    def __init__(self, in_channels = 1, action_space = [], hidden_dim = 6, USE_CUDA = False, memory_size = 10000, epsilon  = 1, lr = 1e-4):
        self.epsilon = epsilon
        self.action_space = action_space
        self.memory = Memory(memory_size)
        self.behaviourNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        self.targetNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        # Start the target network as an exact copy of the behaviour network.
        self.targetNet.load_state_dict(self.behaviourNet.state_dict())

        self.USE_CUDA = USE_CUDA
        if USE_CUDA:
            self.behaviourNet = self.behaviourNet.cuda()
            self.targetNet = self.targetNet.cuda()
        # Only the behaviour network is optimised.
        self.optimizer = torch.optim.Adam(self.behaviourNet.parameters(),lr=lr)

    def observe(self, lazyframe):
        """Convert one stored frame into a float tensor with a leading batch axis.

        Accepts either an object exposing ``_force()`` (the lazy frame stack
        used by the environment wrappers) or anything ``np.asarray`` can turn
        into a 3-D array. The last axis is moved to the front and values are
        divided by 255.
        """
        if hasattr(lazyframe, '_force'):
            frames = lazyframe._force()
        else:
            frames = np.asarray(lazyframe)
        state = torch.from_numpy(frames.transpose(2, 0, 1)[None] / 255).float()
        if self.USE_CUDA:
            state = state.cuda()
        return state

    def value(self, state):
        """Return the behaviour network's Q-values for ``state``."""
        q_values = self.behaviourNet(state)
        return q_values

    def act(self, state, epsilon = None):
        """Choose an action epsilon-greedily and return it as a Python int.

        Args:
            state: Batched state tensor; the first row is used for the greedy choice.
            epsilon: Exploration probability; ``None`` falls back to ``self.epsilon``.
        """
        if epsilon is None:
            epsilon = self.epsilon
        if random.random() < epsilon:
            # Exploring: no forward pass is needed.
            return random.randrange(self.action_space.n)
        # Action selection never needs gradients.
        with torch.no_grad():
            q_values = self.value(state)
        return int(q_values.argmax(1)[0].item())

    def compute_td_loss(self, states, actions, rewards, next_states, is_done, gamma = 0.99):
        """Return the smooth-L1 loss between Q(s, a) and the one-step TD target.

        Args:
            states: Batch of state tensors.
            actions: Sequence of action indices, one per state.
            rewards: Sequence of rewards, one per state.
            next_states: Batch of successor state tensors.
            is_done: Sequence of terminal flags, one per state.
            gamma: Discount factor.
        """
        device = states.device
        actions = torch.tensor(actions, dtype=torch.long, device=device)    # shape: [batch_size]
        rewards = torch.tensor(rewards, dtype=torch.float, device=device)   # shape: [batch_size]
        is_done = torch.tensor(is_done, dtype=torch.bool, device=device)    # shape: [batch_size]

        # Q(s_t, a_t) for the actions that were actually taken.
        predicted_qvalues = self.behaviourNet(states)
        predicted_qvalues_for_actions = predicted_qvalues.gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is a constant with respect to the loss, so build it without autograd.
        with torch.no_grad():
            # max_a Q_target(s_t+1, a)
            next_state_values = self.targetNet(next_states).max(-1)[0]
            bootstrapped = rewards + gamma * next_state_values
            # Terminal transitions use the reward alone.
            target_qvalues_for_actions = torch.where(is_done, rewards, bootstrapped)

        loss = F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions)

        return loss

    def sample_from_buffer(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where the state
            entries are concatenated tensors and the others are Python lists.

        Raises:
            ValueError: If the replay memory is empty.
        """
        stored = self.memory.size()
        if stored == 0:
            raise ValueError("cannot sample from an empty replay memory")
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for _ in range(batch_size):
            idx = random.randint(0, stored - 1)
            frame, action, reward, next_frame, done = self.memory.buffer[idx]
            states.append(self.observe(frame))
            actions.append(action)
            rewards.append(reward)
            next_states.append(self.observe(next_frame))
            dones.append(done)
        return torch.cat(states), actions, rewards, torch.cat(next_states), dones

    def learn_from_experience(self, batch_size, gamma = 0.99):
        """Run one optimisation step on a sampled batch and return the loss as a float."""
        states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
        td_loss = self.compute_td_loss(states, actions, rewards, next_states, dones, gamma)
        self.optimizer.zero_grad()
        td_loss.backward()
        # Clamp every gradient element to [-1, 1]; parameters that received no
        # gradient are skipped instead of raising.
        torch.nn.utils.clip_grad_value_(self.behaviourNet.parameters(), 1)
        self.optimizer.step()
        return td_loss.item()

    def save_model(self, path = None):
        """Pickle the whole behaviour network to ``path``.

        Args:
            path: Destination file; defaults to ``saved_model/DAQN`` relative
                to the working directory. Missing parent directories are created.
        """
        if path is None:
            # os.path.join keeps the path portable; a literal backslash is only
            # a directory separator on Windows.
            path = os.path.join('saved_model', 'DAQN')
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.behaviourNet, path)

In [None]:
## HyperParameters
# Device selection
USE_CUDA = torch.cuda.is_available()

# Optimisation
learning_rate = 2e-4
batch_size = 32
gamma = 0.99  # discount factor; not referenced again in the visible code

# Exploration: epsilon goes from epsilon_max towards epsilon_min, with
# eps_decay as the decay constant measured in frames.
epsilon_max, epsilon_min, eps_decay = 1, 0.05, 30000

# Run length, replay capacity and bookkeeping intervals (all counted in frames)
frames = 1000000
max_buff = 100000
update_tar_interval = 1000
print_interval = 10000

# Problem dimensions taken from the environment
action_space = env.action_space
action_dim = action_space.n
state_channel = env.observation_space.shape[2]  # third axis of the observation shape
hidden_dim = 6

agent = DAQNAgent(
    in_channels=state_channel,
    action_space=action_space,
    hidden_dim=hidden_dim,
    USE_CUDA=USE_CUDA,
    lr=learning_rate,
    memory_size=max_buff,
)

# Warm-up: play 100 complete episodes with uniformly random actions so the
# replay memory already holds transitions before the first learning step.
for _ in tqdm(range(100)):
    frame = env.reset()
    while True:
        action = random.randrange(agent.action_space.n)
        next_frame, reward, done, _ = env.step(action)
        agent.memory.push(frame, action, reward, next_frame, done)
        if done:
            break
        frame = next_frame

# Start from a fresh episode for whatever runs next.
frame = env.reset()

# Bookkeeping for the training run.
episode_reward = 0   # return accumulated in the current episode
all_rewards = []     # return of every finished episode
avg_rewards = []     # running mean over the last 50 episodes, one entry per episode
losses = []          # TD loss of every optimisation step
episode_num = 0
save_flag = False    # becomes True once a model has been saved


# e-greedy decay
def epsilon_by_frame(frame_idx):
    """Exponentially anneal epsilon from epsilon_max towards epsilon_min."""
    return epsilon_min + (epsilon_max - epsilon_min) * math.exp(
        -1. * frame_idx / eps_decay)


for i in tqdm(range(frames)):
    epsilon = epsilon_by_frame(i)
    state_tensor = agent.observe(frame)
    action = agent.act(state_tensor, epsilon)

    next_frame, reward, done, _ = env.step(action)

    episode_reward += reward
    agent.memory.push(frame, action, reward, next_frame, done)
    frame = next_frame

    # One gradient step per environment step.
    loss = agent.learn_from_experience(batch_size)
    losses.append(loss)

    if i % print_interval == 0:
        # Until the first episode finishes there is nothing to average; report
        # nan explicitly instead of letting np.mean warn about an empty slice.
        recent_reward = np.mean(all_rewards[-10:]) if all_rewards else float('nan')
        print("frames: %5d, reward: %5f, loss: %4f, epsilon: %5f, episode: %4d" % (i, recent_reward, loss, epsilon, episode_num))

    if i % update_tar_interval == 0:
        # Periodically copy the behaviour weights into the target network.
        agent.targetNet.load_state_dict(agent.behaviourNet.state_dict())

    if done:
        frame = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        episode_num += 1

        avg_reward = np.mean(all_rewards[-50:])
        avg_rewards.append(avg_reward)

        # Save once the 50-episode average exceeds 19, and afterwards whenever
        # it matches or beats the best average saved so far.
        if avg_reward > 19 and (not save_flag or avg_reward >= max_reward):
            agent.save_model()
            save_flag = True
            max_reward = avg_reward
            print("model saved, episode:", episode_num, ",avg reward:", avg_reward)

In [None]:
# save learning curve
learning_curve = np.array(avg_rewards)
# os.path.join keeps the path portable (a literal backslash is only a directory
# separator on Windows), and the output directory is created if it is missing.
# np.save appends the ".npy" extension itself.
curve_dir = 'curve'
os.makedirs(curve_dir, exist_ok=True)
np.save(os.path.join(curve_dir, 'DAQN'), learning_curve)