# Deep Q-Learning (DQN) :

The main idea of DQN is to use the Q-Learning algorithm with a neural network that approximates the action-value function (the Q-function).

Q-learning works well when the environment is simple, but as the number of states and actions grows, the problem becomes much more complex (more parameters and slower training), and in practice a tabular solution becomes intractable. That is why we approximate the action-value function with a neural network instead of using a lookup table.

DQN is an unstable algorithm. To mitigate this, an Experience Replay is used: at each step we store the transition (state, action, reward and next state) in a memory. We can then subsample n transitions at random from this memory to train the network.

$Q(s_{t}, a) \leftarrow Q(s_{t}, a)+\alpha\left(r_{t+1}+\gamma \max_{p} Q(s_{t+1}, p)-Q(s_{t}, a)\right)$

A further improvement that reduces the instability is to use a target network (a frozen network) alongside a policy network (used for selecting the action during the experiment). The target network is updated every n steps (as a copy of the policy network).
$r_{t+1}+\gamma \max_{p} Q(s_{t+1}, p)$ is computed by the target network.

<img src="img/dqn.JPG" >    

  


In [19]:
import copy
import random
from collections import deque

import gym
import numpy as np
import torch
from matplotlib import pyplot as plt
from moviepy.editor import ImageSequenceClip
from torch import nn
from torch.nn import functional as F


In [2]:
import os

def seed_everything(seed_value):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    CPU generator, and stores the seed in the ``PYTHONHASHSEED`` environment
    variable.  When CUDA is available, the CUDA generators are seeded too and
    cuDNN is set to deterministic mode with benchmarking turned off.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(42)

In [3]:
# Empty dict created up front; no other visible cell of this notebook reads or writes it.
history = {}

In [4]:
# Build the CartPole environment used for both training and evaluation.
env = gym.make("CartPole-v0")
# Remember the step limit the environment shipped with (not referenced again
# in the visible cells).
original_max_ep = env._max_episode_steps
# Raise the per-episode step cap to 500.  `_max_episode_steps` is a private
# attribute (presumably of gym's time-limit wrapper -- confirm for the installed
# gym version); test_model and the training loop read it back as the loop bound.
env._max_episode_steps = 500


In [17]:
def test_model(env, policy_model, epsilon, device, gif_name=None):
    """Roll out one rendered episode with ``policy_model`` and optionally save a GIF.

    The episode runs for at most ``env._max_episode_steps`` steps.  Each step's
    frame is captured with ``env.render(mode='rgb_array')`` before the action
    is chosen by ``select_action``.  When the environment reports ``done`` the
    index of the final step is printed and the rollout stops.

    Args:
        env: Environment exposing ``reset``, ``step``, ``render``, ``close``
            and ``_max_episode_steps``.
        policy_model: Network passed to ``select_action`` to pick actions.
        epsilon: Exploration rate forwarded to ``select_action`` (``None``
            selects the purely greedy branch there).
        device: Device forwarded to ``select_action``.
        gif_name: Output path for the GIF; nothing is written when ``None``.
    """
    frames = []
    try:
        observation = env.reset()
        state = np.array(observation).reshape(1, -1).astype(np.float32)
        for curr_step in range(env._max_episode_steps):
            frames.append(env.render(mode='rgb_array'))
            action = select_action(env, policy_model, state, epsilon=epsilon, device=device)
            # Reward and info are not needed for a pure evaluation rollout.
            observation, _, done, _ = env.step(action)
            state = np.array(observation).reshape(1, -1).astype(np.float32)
            if done:
                print(f"num_steps:{curr_step} ")
                break
    finally:
        # Always release the render window, even if the rollout raised.
        env.close()

    if gif_name is not None:
        clip = ImageSequenceClip(frames, fps=20)
        clip.write_gif(gif_name, fps=20)


In [6]:
class ReplayMemory(object):
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Once ``max_memory`` items are stored, each new item evicts the OLDEST one.
    (The previous list-based version called ``list.pop()``, which removed the
    newest item, so a full buffer stayed stuck on its earliest transitions.)

    Attributes are plain and public because other code reads them directly:
        batch_size: Number of items returned by ``sample``.
        max_memory: Maximum number of items retained.
        memory: The underlying ``collections.deque`` of stored items.
    """

    def __init__(self, batch_size=32, max_memory=10000):
        self.batch_size = batch_size
        self.max_memory = max_memory
        # A bounded deque discards from the left (oldest) on overflow.
        self.memory = deque(maxlen=max_memory)

    def update(self, x):
        """Append item ``x``, evicting the oldest item if the buffer is full."""
        self.memory.append(x)

    def sample(self):
        """Return ``batch_size`` stored items drawn uniformly with replacement.

        Uses NumPy's global generator, so results follow ``np.random.seed``.
        Raises ``ValueError`` (from ``np.random.randint``) if the buffer is empty.
        """
        idx = np.random.randint(0, len(self.memory), self.batch_size)
        return [self.memory[i] for i in idx.tolist()]

    def __len__(self):
        return len(self.memory)
    
class ValueNetwork(nn.Module):
    """MLP with two ReLU hidden layers that outputs one value per action.

    Args:
        HIDDEN_LAYER: Width of both hidden layers.
        n_observations: Size of the input feature vector (default 4, the
            value that was previously hard-coded).
        n_actions: Number of outputs (default 2, the value that was
            previously hard-coded).

    The layer attribute names (``l1``, ``l2``, ``outputs``) are kept so that
    previously saved state dicts still load.
    """

    def __init__(self, HIDDEN_LAYER=30, n_observations=4, n_actions=2):
        super().__init__()
        self.l1 = nn.Linear(n_observations, HIDDEN_LAYER)
        self.l2 = nn.Linear(HIDDEN_LAYER, HIDDEN_LAYER)
        self.outputs = nn.Linear(HIDDEN_LAYER, n_actions)

    def forward(self, x):
        """Map a batch of inputs ``(..., n_observations)`` to ``(..., n_actions)``."""
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        return self.outputs(x)

In [7]:
def select_action(env, model, state, epsilon=None, device="cuda"):
    """Choose an action epsilon-greedily from the model's output.

    Args:
        env: Environment whose ``action_space.sample()`` supplies random actions.
        model: Callable returning one row of action values for ``state``.
        state: Array-like (or tensor) accepted by ``torch.as_tensor``; the
            first row of the model output is the one that is used.
        epsilon: Probability of taking a random action.  ``None`` means always
            greedy, and in that case no random number is drawn.
        device: Device the state tensor is moved to before the forward pass.

    Returns:
        The sampled random action, or the argmax index of the first output row.
    """
    take_random = epsilon is not None and np.random.uniform() < epsilon
    if take_random:
        return env.action_space.sample()

    # Greedy branch: no gradient tracking is needed for action selection.
    with torch.no_grad():
        q_values = model(torch.as_tensor(state).to(device))
    best_per_row = np.argmax(q_values.detach().cpu().numpy(), axis=-1)
    return best_per_row[0]

In [8]:
# --- Exploration schedule -------------------------------------------------
epsilon = 1.0          # starting exploration probability
expdecay = 5000        # episodes over which the training loop lowers epsilon to 0
numberExp = 10_000     # base episode budget (the training loop runs numberExp + 100)

# --- Learning setup -------------------------------------------------------
discount = 0.98        # discount factor handed to train_step
device = "cpu"
loss_fn = nn.MSELoss()

# --- Bookkeeping ----------------------------------------------------------
save_name = "dqn_mlp.pth"   # checkpoint path for the best policy weights
best_score = 0              # best 20-episode mean return seen so far
counts_steps = []           # final step index of each finished episode
success = 0.0               # not referenced by the other visible cells

In [9]:
# Replay buffer: mini-batches of 64 transitions, at most 5000 stored.
replay = ReplayMemory(batch_size=64, max_memory=5000)
# Policy network: selects actions and is the only network the optimizer updates.
policy_model = ValueNetwork().to(device)
policy_model.train()
# Target network: starts as an exact copy of the policy weights and is
# re-synchronised periodically by the training loop.
target_model = ValueNetwork().to(device)
target_model.load_state_dict(policy_model.state_dict())
# Adam (AMSGrad variant) over the policy network's parameters only.
optimizer = torch.optim.Adam(policy_model.parameters(), lr=1e-2, amsgrad=True)


In [10]:
def train_step(policy_model, target_model, replay, optimizer, loss_fn, discount=0.95, device="cuda", terminal_reward=-10):
    """Run one optimisation step of the policy network on a replay mini-batch.

    Each stored transition is ``[state, action, reward, new_state]`` where
    ``state``/``new_state`` are tensors with a leading batch axis of size 1 and
    ``action``/``reward`` are 0-dim tensors.  A transition whose reward equals
    ``terminal_reward`` is treated as terminal: its target is the reward alone,
    with no bootstrapped value from ``target_model``.

    Args:
        policy_model: Network being trained; its output is indexed by action.
        target_model: Frozen network used to evaluate the next state.
        replay: Object exposing ``memory``, ``batch_size`` and ``sample()``.
        optimizer: Optimizer over ``policy_model``'s parameters.
        loss_fn: Loss called as ``loss_fn(prediction, target)``.
        discount: Discount factor applied to the next-state value.
        device: Unused; the sampled tensors are consumed on whatever device
            they were stored on.  Kept for interface compatibility.
        terminal_reward: Sentinel reward marking terminal transitions.

    Returns:
        The loss as a Python float, or ``None`` when the buffer holds fewer
        than ``replay.batch_size`` transitions and no step was taken.
    """
    if len(replay.memory) < replay.batch_size:
        return None

    state, action, reward, new_state = zip(*replay.sample())
    state = torch.cat(state)
    new_state = torch.cat(new_state)
    action = torch.stack(action, dim=-1)
    reward = torch.stack(reward, dim=-1)
    # 1.0 where the episode continued, 0.0 where the transition was terminal.
    not_terminal = (reward != terminal_reward).float()

    # Value of the action actually taken.  gather() is used instead of a
    # one-hot mask because F.one_hot infers the class count from the batch and
    # produces a too-narrow mask when only action 0 was sampled.
    q = policy_model(state).gather(1, action.long().view(-1, 1)).squeeze(1)

    # Bootstrapped target from the frozen network; no gradient flows through it.
    with torch.no_grad():
        max_q = target_model(new_state).max(1)[0]
        targets = reward + discount * max_q * not_terminal

    optimizer.zero_grad()
    loss = loss_fn(q, targets)
    loss.backward()
    optimizer.step()
    return loss.item()

In [11]:
# Main training loop: play episodes with the epsilon-greedy policy, store every
# transition in the replay buffer, and update the policy network.
#
# R      -- one entry per finished episode: the sum of the rewards of its
#           non-terminal steps.
# steps  -- running count of non-terminal environment steps over all episodes.
# leave  -- set to True once the stopping criterion is reached.
R = []
steps = 0
leave=False
for exp in range(numberExp+100):
    
    # Start a new episode.
    observation = env.reset()
    total_r = 0.
    state = np.array(observation).reshape(1,-1).astype(np.float32)
    # The bound is far above env._max_episode_steps; the loop is left through
    # the `break` in the `if done:` branch below.
    for curr_step in range(env._max_episode_steps+5000):
        action = select_action(env, policy_model, state, epsilon=epsilon, device=device)
        observation, reward, done, info = env.step(action)
        new_state = np.array(observation).reshape(1,-1).astype(np.float32)
        
        
        # Overwrite the reward of the last step of the episode with -10;
        # train_step uses that exact value to recognise terminal transitions.
        # NOTE(review): `done` presumably also fires when the step limit is
        # reached, in which case a full-length episode is penalised too -- confirm.
        if done:
            reward = -10
            
        # Convert the transition to tensors before storing it.  `state` is a
        # tensor from here on (and on later iterations via `state=new_state`).
        state, action  = torch.as_tensor(state).to(device), torch.as_tensor(action).to(device)
        reward, new_state = torch.as_tensor(reward).to(device), torch.as_tensor(new_state).to(device)
        replay.update([state, action, reward, new_state])
        
        
            
        if done:
            R.append(total_r)
            # Checkpoint only once exploration is fully off (epsilon == 0) and
            # the mean return of the last 20 episodes matches or beats the best.
            if np.mean(R[-20:]) >= best_score and epsilon == 0:
                best_score = np.mean(R[-20:])
                torch.save(policy_model.state_dict(), save_name)
                # Stop training when that 20-episode mean is exactly 499.
                if best_score == 499:
                    print(R[-50:])
                    print("finished")
                    leave=True

            if exp%200 == 0:
                print(f" exp : {exp}  num_steps:{curr_step}, epsilon={epsilon}")

            counts_steps.append(curr_step)    
            
            break
        # Only reached for non-terminal steps, so the terminal -10 is never
        # added to the episode return.
        total_r += reward.cpu().numpy()
        state=new_state
        steps += 1
    # NOTE(review): this sits at episode level, so there is one gradient step
    # per episode rather than per environment step -- confirm this is intended.
    train_step(policy_model, target_model, replay, optimizer, loss_fn, discount, device)
    # Copy the policy weights into the target network every 50 episodes.
    if exp % 50 == 0:
        target_model.load_state_dict(policy_model.state_dict())
    # Linear decay from 1 to 0 over the first `expdecay` episodes, then 0.
    epsilon = max(0.00, (expdecay-exp)/expdecay)
    if leave:
        break

env.close()

 exp : 0  num_steps:13, epsilon=1.0
 exp : 200  num_steps:10, epsilon=0.9602
 exp : 400  num_steps:23, epsilon=0.9202
 exp : 600  num_steps:69, epsilon=0.8802
 exp : 800  num_steps:46, epsilon=0.8402
 exp : 1000  num_steps:32, epsilon=0.8002
 exp : 1200  num_steps:54, epsilon=0.7602
 exp : 1400  num_steps:15, epsilon=0.7202
 exp : 1600  num_steps:56, epsilon=0.6802
 exp : 1800  num_steps:13, epsilon=0.6402
 exp : 2000  num_steps:12, epsilon=0.6002
 exp : 2200  num_steps:33, epsilon=0.5602
 exp : 2400  num_steps:15, epsilon=0.5202
 exp : 2600  num_steps:102, epsilon=0.4802
 exp : 2800  num_steps:18, epsilon=0.4402
 exp : 3000  num_steps:13, epsilon=0.4002
 exp : 3200  num_steps:107, epsilon=0.3602
 exp : 3400  num_steps:14, epsilon=0.3202
 exp : 3600  num_steps:81, epsilon=0.2802
 exp : 3800  num_steps:90, epsilon=0.2402
 exp : 4000  num_steps:13, epsilon=0.2002
 exp : 4200  num_steps:53, epsilon=0.1602
 exp : 4400  num_steps:99, epsilon=0.1202
 exp : 4600  num_steps:92, epsilon=0.0802


In [12]:
# Restore the checkpoint the training loop wrote to `save_name`.
policy_model.load_state_dict(torch.load(save_name))

<All keys matched successfully>

In [15]:
# Display the best 20-episode mean return recorded by the training loop.
best_score

499.0

In [20]:
# Render one episode with the restored policy and save it as a GIF.  `epsilon`
# is whatever value the training loop left behind.
test_model(env, policy_model, epsilon, device, gif_name="DQN_cartpole.gif")

t:   2%|█▎                                                                  | 10/501 [00:00<00:05, 95.16it/s, now=None]

num_steps:499 
MoviePy - Building file DQN_cartpole.gif with imageio.


                                                                                                                       