## Reinforcement learning - TME 4 - DQN

L'objectif du TME est d'implémenter les algorithmes de renforcement value-based étudiés en cours (Q-learning et ses variantes) et de les tester dans un framework classique (Gym d'OpenAI, MDP GridWorld).

In [1]:
import matplotlib
#from matplotlib import pyplot as plt

matplotlib.use("TkAgg")
import gym
import gridworld
from gym import wrappers, logger
import numpy as np
import copy
import torch
from torch import nn

### Implémentation des algorithmes 

In [2]:
class RandomAgent(object):
    """Baseline agent that picks actions uniformly at random and never learns."""

    def __init__(self, action_space):
        # Kept so that act() can size its draw from the space when possible.
        self.action_space = action_space

    def act(self, observation, reward, done):
        """Return a uniformly random action index; all arguments are ignored.

        The number of actions is read from ``action_space.n`` when that
        attribute exists. Otherwise the historical two-action behaviour
        (0 or 1) is kept.
        """
        n_actions = getattr(self.action_space, "n", 2)
        # np.random (not the space's own sampler) so np.random.seed controls it.
        return np.random.choice(n_actions)

    def learn(self, observation, reward, done):
        """Do nothing: a random agent has no parameters to update."""
        return

In [3]:
class NN(nn.Module):
    """Fully connected network with a leaky ReLU between consecutive layers.

    Args:
        inSize: number of input features.
        outSize: number of output features.
        layers: iterable of hidden-layer widths, in order. Empty (the
            default) yields a single linear map from inSize to outSize.
    """

    def __init__(self, inSize, outSize, layers=()):
        super(NN, self).__init__()
        # Consecutive pairs of widths give each linear layer's (in, out) sizes.
        widths = [inSize] + list(layers) + [outSize]
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        )

    def forward(self, x):
        """Apply the layers in order; no activation follows the last one."""
        remaining = iter(self.layers)
        x = next(remaining)(x)
        for layer in remaining:
            x = layer(torch.nn.functional.leaky_relu(x))
        return x

Paramètres CartPole

epsilon=0.01, epsilonDecay=0.99999
gamma=0.999
batchSize=100, capacity=100000
ctarget=100
layers=[200]
lr=0.001

LunarLander (convergence après environ 10000 episodes):
epsilon=0.1, epsilonDecay=0.99999
gamma=0.99
batchSize=1, capacity=1
ctarget=1000
layers=[200]
lr=0.0001

  Pour Gridworld  (convergence après environ 2000 episodes sur plan0 avec rewards={0:-0.001,3:1,4:1,5:-1,6:-1}):
epsilon=0.1, epsilonDecay=0.9999 (epsilon multiplié par epsilonDecay à chaque passage dans act)
gamma=0.99
batchSize=10, capacity=1000000
ctarget=1000 (fréquence de mise à jour du réseau cible)
layers=[30,30]
lr=0.0001 (learning rate)

In [4]:
class DQN(object):
    """DQN agent whose network scores one (state, action) pair per forward pass.

    The Q-network takes the state features with the action index appended
    and outputs a single value, so evaluating every action costs one
    forward pass per action.

    Expected keys in ``params``:
        N: replay-buffer capacity (number of transitions).
        C: number of learn() calls between target-network refreshes.
        batch: minibatch size sampled from the buffer at each learn() call.
        layers: hidden-layer widths passed to NN.
        eps, epsDecay: exploration rate and its per-act() multiplier.
        gamma: discount factor.
        lr: Adam learning rate.
        state_dim (optional, default 4): number of state features.
        n_actions (optional, default 2): number of discrete actions.
    """

    def __init__(self, env, params):
        self.env = env
        # Defaults reproduce the previously hard-coded sizes (4 features, 2 actions).
        self.state_dim = params.get('state_dim', 4)
        self.n_actions = params.get('n_actions', 2)
        self.N = params['N']
        # One row per transition: state | action | reward | next state | done.
        self.D = np.zeros((self.N, 2 * self.state_dim + 3))
        self.C = params['C']
        self.batch = params['batch']
        self.Q = NN(self.state_dim + 1, 1, params['layers'])
        self.Qhat = NN(self.state_dim + 1, 1, params['layers'])
        # Start the target network as an exact copy of Q instead of leaving
        # it with unrelated random weights until the first refresh.
        self.Qhat.load_state_dict(self.Q.state_dict())
        self.eps = params['eps']
        self.epsDecay = params['epsDecay']
        self.state = []
        self.action = None  # set by act(); learn() needs it to store a transition
        self.step = -1
        self.gamma = params['gamma']
        self.loss = torch.nn.SmoothL1Loss()
        self.optim = torch.optim.Adam(params=self.Q.parameters(), lr=params['lr'])

    def _q_all_actions(self, net, states):
        """Return a (len(states), n_actions) numpy array of net's Q-values.

        Runs without gradient tracking: the result is only used for action
        selection and for building regression targets.
        """
        states = np.asarray(states, dtype=np.float64)
        per_action = []
        with torch.no_grad():
            for a in range(self.n_actions):
                action_col = np.full((states.shape[0], 1), a, dtype=np.float64)
                inputs = torch.from_numpy(np.append(states, action_col, axis=1)).float()
                per_action.append(net(inputs).numpy().reshape(-1))
        return np.stack(per_action, axis=1)

    def act(self, observation, reward, done):
        """Choose an epsilon-greedy action for ``observation`` and remember both.

        ``reward`` and ``done`` are accepted for interface compatibility and
        are not used. Epsilon is multiplied by epsDecay on every call.
        """
        self.state = observation
        if np.random.rand() < self.eps:
            self.action = np.random.choice(self.n_actions)
        else:
            state_row = np.asarray(observation, dtype=np.float64).reshape(1, -1)
            self.action = int(np.argmax(self._q_all_actions(self.Q, state_row)))

        self.eps *= self.epsDecay
        return self.action

    def learn(self, observation, reward, done):
        """Store the latest transition and take one gradient step on Q.

        ``observation`` is the state reached after the action returned by the
        previous act() call; ``reward`` and ``done`` describe that transition.

        Raises:
            RuntimeError: if called before any act() call.
        """
        if self.action is None:
            raise RuntimeError("learn() called before act(): no transition to store")

        self.step += 1
        # Circular buffer: the oldest transition is overwritten once full.
        self.D[self.step % self.N] = (
            list(self.state) + [self.action, reward] + list(observation) + [int(done)]
        )

        # Sample with replacement among the rows filled so far.
        n_stored = min(self.N, self.step + 1)
        sample = self.D[np.random.randint(0, n_stored, self.batch)]

        sd = self.state_dim
        state_action = torch.from_numpy(sample[:, :sd + 1]).float()
        rewards = sample[:, sd + 1]
        next_states = sample[:, sd + 2:2 * sd + 2]
        not_done = 1 - sample[:, -1]

        # Bootstrapped target; the (1 - done) factor drops the bootstrap term
        # for transitions stored with done=True.
        next_best = self._q_all_actions(self.Qhat, next_states).max(axis=1)
        y = rewards + self.gamma * not_done * next_best
        target = torch.from_numpy(y).float()

        self.optim.zero_grad()
        # SmoothL1Loss expects (input, target): prediction first.
        l = self.loss(self.Q(state_action).flatten(), target)
        l.backward()
        self.optim.step()

        if (self.step % self.C) == 0:
            self.Qhat.load_state_dict(self.Q.state_dict())
        

In [9]:
# Train the DQN agent on CartPole-v1 for 500 episodes and print each
# episode's cumulated reward.
env = gym.make('CartPole-v1')
env.seed(0)  # seed the environment's pseudo-random generator
np.random.seed(5)  # numpy drives exploration and replay sampling in the agent
torch.manual_seed(0)  # torch drives the network weight initialisation
params = {'eps':0.01, 'epsDecay':0.99999, 'batch':100, 'C':100, 'lr':0.005, 'layers':[200], 'gamma':0.999, 'N':100000}
agent = DQN(env, params)
# Baseline for comparison:
#agent = RandomAgent(env.action_space)

outdir = 'cartpole-v0/random-agent-results'
envm = wrappers.Monitor(env, directory=outdir, force=True, video_callable=False)

try:
    for i in range(500):
        rsum = 0
        obs = envm.reset()
        reward = 0
        done = False

        while True:
            action = agent.act(obs, reward, done)
            obs, reward, done, _ = envm.step(action)
            agent.learn(obs, reward, done)
            rsum += reward
            if done:
                break
        print(rsum)
finally:
    # Close the wrapper rather than the bare env, and do so even if
    # training is interrupted.
    envm.close()


10.0
9.0
8.0
11.0
8.0
9.0
10.0
9.0
10.0
9.0
8.0
11.0
8.0
9.0
10.0
8.0
10.0
8.0
9.0
11.0
8.0
9.0
10.0
10.0
9.0
10.0
10.0
8.0
9.0
10.0
10.0
9.0
9.0
9.0
9.0
10.0
9.0
9.0
9.0
10.0
9.0
9.0
9.0
10.0
9.0
9.0
9.0
10.0
9.0
11.0
10.0
10.0
9.0
10.0
11.0
10.0
9.0
8.0
10.0
10.0
16.0
29.0
10.0
43.0
31.0
10.0
62.0
8.0
8.0
8.0
11.0
29.0
11.0
24.0
37.0
51.0
49.0
51.0
32.0
40.0
41.0
37.0
13.0
47.0
31.0
24.0
26.0
22.0
12.0
14.0
15.0
17.0
15.0
15.0
16.0
14.0
18.0
18.0
80.0
17.0
100.0
101.0
104.0
94.0
107.0
115.0
134.0
138.0
158.0
152.0
186.0
141.0
131.0
126.0
119.0
106.0
118.0
128.0
112.0
120.0
121.0
117.0
122.0
135.0
158.0
138.0
114.0
105.0
125.0
126.0
136.0
129.0
124.0
115.0
128.0
117.0
118.0
164.0
103.0
146.0
110.0
126.0
117.0
120.0
124.0
110.0
166.0
195.0
100.0
107.0
196.0
95.0
92.0
92.0
95.0
104.0
92.0
108.0
114.0
122.0
123.0
106.0
117.0
117.0
116.0
115.0
109.0
97.0
105.0
113.0
97.0
122.0
103.0
115.0
114.0
107.0
118.0
120.0
129.0
117.0
116.0
145.0
353.0
282.0
316.0
23.0
92.0
94.0
91.0
107.0
103.0
116