In [1]:
import random
import numpy as np
import cv2

from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim

from AutoEncoder import VAE

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def image_processing(image, height=83, width=83):
    """Convert a colour frame to a scaled grayscale array and crop it.

    The frame is converted with ``cv2.COLOR_BGR2GRAY``, cast to ``float``,
    divided by 255 and then cropped to its top-left ``height`` x ``width``
    corner.

    Args:
        image: Colour image accepted by ``cv2.cvtColor`` together with the
            ``COLOR_BGR2GRAY`` conversion code.
        height: Number of rows kept, counted from the top.
        width: Number of columns kept, counted from the left.

    Returns:
        The cropped grayscale image as a float array. Values fall in
        [0, 1] when the input is 8-bit (assumption - the input dtype is
        not checked here).
    """
    # NOTE(review): the conversion code assumes BGR channel order - confirm
    # this matches the frames produced by the caller and the preprocessing
    # the pretrained VAE was trained with before changing it.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    scaled = grayscale.astype(float) / 255.0
    return scaled[:height, :width]

In [4]:
import math

def dim_encoder(Hin, Win, padding, dilation, kernel_size, stride):
    """Compute the spatial output size of a 2-D convolution-style layer.

    Applies, independently per axis,
    ``floor((size + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)``.

    Args:
        Hin: Input height.
        Win: Input width.
        padding: ``(height, width)`` padding.
        dilation: ``(height, width)`` dilation.
        kernel_size: ``(height, width)`` kernel size.
        stride: ``(height, width)`` stride.

    Returns:
        Tuple ``(Hout, Wout)`` of floored output sizes.
    """
    # Evaluate both axes first, then floor, so the two results are treated alike.
    unrounded = [
        (size + 2 * padding[axis] - dilation[axis] * (kernel_size[axis] - 1) - 1) / stride[axis] + 1
        for axis, size in enumerate((Hin, Win))
    ]
    return math.floor(unrounded[0]), math.floor(unrounded[1])

# Trace an 83x83 input through two (dim_encoder, halve) stages and print the
# resulting spatial size.
# NOTE(review): presumably mirrors the VAE encoder's conv + pooling layout -
# confirm against AutoEncoder.VAE.
Hin, Win = 83, 83

# Stage 1: 7x7 kernel with stride 3, then each side is halved (rounded down).
Hout, Wout = dim_encoder(Hin, Win, kernel_size=(7, 7), stride=(3, 3), padding=(0, 0), dilation=(1, 1))
Hout, Wout = Hout // 2, Wout // 2

# Stage 2: 4x4 kernel with stride 1, then each side is halved again.
Hout, Wout = dim_encoder(Hout, Wout, kernel_size=(4, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1))
Hout, Wout = Hout // 2, Wout // 2

print(Hout, Wout)

5 5


In [1]:
class DQN(nn.Module):
    """Fully connected Q-network: one output value per action.

    The network is ``input_dim -> 64 -> 32 -> len(action_space)`` with
    in-place ReLU activations between the linear layers and no activation
    on the output.

    Args:
        action_space: Sized collection of actions; only its length is used,
            to set the width of the output layer.
        input_dim: Width of the input feature vector. Defaults to 20, the
            value that used to be hard-coded (presumably the VAE latent
            size - confirm against AutoEncoder.VAE).
    """

    def __init__(self, action_space, input_dim=20):
        super().__init__()

        # Layer order is unchanged so state-dict keys stay compatible with
        # previously saved weights.
        self.linear_layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, 32),
            nn.ReLU(True),
            nn.Linear(32, len(action_space)),
        )

        # Not applied in forward(); kept only so existing attribute access
        # keeps working.
        self.sig = nn.Sigmoid()

    def forward(self, X):
        """Return the raw (unbounded) per-action outputs for input ``X``."""
        return self.linear_layers(X)
    
    
class DQNAgent:
    """DQN agent with experience replay acting on VAE-encoded states.

    The agent owns an online Q-network (``model``), a target Q-network
    (``target``), a bounded replay memory and a pretrained VAE used to
    encode states.

    Args:
        lr: Learning rate of the Adam optimizers.
        weight_decay: Weight decay of the Adam optimizers.
        epochs: Gradient steps performed by ``fit`` on each batch.
        batch_size: Number of transitions sampled by ``replay_memory``.
        memory_size: Maximum number of transitions kept in memory.
        epsilon: Probability of taking the greedy action in ``play_action``
            (a random action is taken with probability ``1 - epsilon``).
        gamma: Discount factor used when building the Q targets.
    """

    def __init__(self, lr=0.001, weight_decay=1e-5, epochs=1, batch_size=128, memory_size=5000, epsilon=0.98, gamma=0.99):
        # NOTE(review): presumably (steering, gas, brake) triples for the
        # environment - confirm against the environment's action format.
        self.action_space = [
            (0, 1, 0), (-1, 0, 0),
            (1, 0, 0), (0, 0, 1),
        ]

        # Memory for the experience replay
        self.memory = deque(maxlen=memory_size)
        # Gamma for the Q update
        self.gamma = gamma
        # Epsilon to deal with the exploration/exploitation trade-off.
        # The attribute name is misspelt; it is kept as-is so existing code
        # that reads or sets ``espilon`` keeps working.
        self.espilon = epsilon

        # Models of the agent
        self.model = DQN(self.action_space).to(device)
        self.target = DQN(self.action_space).to(device)
        # Start the target network from the online weights; otherwise the
        # bootstrapped targets come from an unrelated random network until
        # the first update_target() call.
        self.target.load_state_dict(self.model.state_dict())
        self.optim_model = optim.Adam(params=self.model.parameters(), lr=lr, weight_decay=weight_decay)
        # Never stepped inside this class (the target is only updated by
        # copying weights); kept for attribute compatibility.
        self.optim_target = optim.Adam(params=self.target.parameters(), lr=lr, weight_decay=weight_decay)
        self.loss = nn.MSELoss()
        self.epochs = epochs
        self.batch_size = batch_size

        # Variational AutoEncoder used by the agent.
        # map_location lets a checkpoint saved from a CUDA device load on a
        # CPU-only machine; the encoder itself stays on the CPU as before.
        # NOTE(review): the encoder is left in its default (training) mode;
        # check whether VAE needs .eval() for inference.
        self.encoder = VAE()
        self.encoder.load_state_dict(torch.load("models/vae", map_location="cpu"))

    def play_action(self, state):
        """Choose an action for ``state``.

        With probability ``espilon`` the action with the highest predicted
        value is returned, otherwise a uniformly random one.

        Args:
            state: Tensor fed to the online network; the first row of the
                network output is used.

        Returns:
            The selected entry of ``action_space``.
        """
        # Exploitation
        if np.random.rand() < self.espilon:
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                q_values = self.model(state.to(device)).cpu().numpy()[0]
            action = np.argmax(q_values)
        # Exploration
        else:
            action = np.random.randint(0, len(self.action_space))

        return self.action_space[action]

    def fit(self, X, y):
        """Run ``epochs`` gradient steps of the online network on one batch.

        Args:
            X: NumPy array of states; the network output is indexed with
                ``[:, 0, :]``, so a singleton second axis is expected.
            y: NumPy array of target values, one row per state.
        """
        X = torch.from_numpy(X.astype('float32')).to(device)
        y = torch.from_numpy(y.astype('float32'))

        for _ in range(self.epochs):
            # The loss is computed on the CPU, where the targets live.
            y_pred = self.model(X).cpu()[:, 0, :]
            loss = self.loss(y_pred, y)
            self.optim_model.zero_grad()
            loss.backward()
            self.optim_model.step()

    def replay_memory(self):
        """Sample a minibatch from memory and train the online network on it.

        For each sampled transition the target vector is the online
        network's current prediction with the entry of the taken action
        replaced by ``reward`` (terminal transition) or
        ``reward + gamma * max(target(next_state))``.

        Does nothing when fewer than ``batch_size`` transitions are stored.
        """
        # random.sample raises ValueError when asked for more items than
        # the memory holds.
        if len(self.memory) < self.batch_size:
            return

        # Random selection from memory to create a batch of data
        minibatch = random.sample(self.memory, self.batch_size)
        train_state = []
        train_target = []

        # Targets are constants for the regression: build them without
        # recording an autograd graph.
        with torch.no_grad():
            for state, action, reward, next_state, terminate in minibatch:
                # Prediction of the target
                state = state.to(device)
                target = self.model(state).cpu().numpy()[0]
                if terminate:
                    target[action] = reward
                # Update of the target according to the next state
                else:
                    next_q = self.target(next_state.to(device)).cpu().numpy()[0]
                    target[action] = reward + self.gamma * np.amax(next_q)
                # detach() covers stored states that still require grad.
                train_state.append(state.detach().cpu().numpy())
                train_target.append(target)

        # Gradient descent
        self.fit(np.array(train_state), np.array(train_target))

    def encode_state(self, state):
        """Encode ``state`` by sampling from the VAE latent distribution.

        The encoder runs under ``torch.no_grad()`` so the returned tensor
        carries no autograd graph; states stored in the replay memory would
        otherwise each keep the encoder's graph alive.

        Args:
            state: Input passed to ``encoder.encode``.

        Returns:
            The result of ``encoder.reparameterize(mu, log_var)``.
        """
        with torch.no_grad():
            mu, log_var = self.encoder.encode(state)
            return self.encoder.reparameterize(mu, log_var)

    def memorize(self, state, action, reward, next_state, terminate):
        """Append a transition to memory, storing the action as its index.

        Raises:
            ValueError: If ``action`` is not an entry of ``action_space``.
        """
        self.memory.append((state, self.action_space.index(action), reward, next_state, terminate))

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target.load_state_dict(self.model.state_dict())
            


NameError: name 'nn' is not defined

In [6]:
import gym
from gym.envs.box2d.car_racing import CarRacing
import logging
import traceback

EPISODES = 100
BATCH_SIZE = 128
UPDATE_TARGET_FREQUENCY = 5
# Frames stepped with a None action right after reset, before the agent acts.
WARMUP_FRAMES = 50
# Number of consecutive environment steps each chosen action is applied for.
FRAME_SKIP = 2
# Consecutive decisions whose second action component is not 1 that are
# tolerated before the episode is cut short.
MAX_IDLE_STEPS = 50


def _frame_to_tensor(frame):
    """Preprocess a raw frame into a float32 tensor with two leading singleton axes."""
    processed = image_processing(frame)
    processed = np.expand_dims(processed, axis=0)
    return torch.from_numpy(processed.astype('float32')).unsqueeze(dim=0)


def _run_episode(env, car):
    """Play one episode, training the agent along the way; return its total reward."""
    frame = env.reset()

    # Step through the warm-up frames and keep the LAST observation, so the
    # first decision (and the first stored transition) is based on what the
    # environment shows when the agent starts acting, not on the reset frame.
    for _ in range(WARMUP_FRAMES):
        frame = env.step(None)[0]

    # Encoding of the state
    current_state = car.encode_state(_frame_to_tensor(frame))

    total_reward = 0
    counter_non_moving = 0

    while True:
        # Predict action
        action = car.play_action(current_state)

        # Play the action for FRAME_SKIP frames, summing the rewards
        reward = 0
        for _ in range(FRAME_SKIP):
            next_frame, step_reward, terminate, _info = env.step(action)
            reward += step_reward
            if terminate:
                break

        # Count consecutive decisions whose second component is not 1, to
        # avoid long episodes where the agent does not move at all
        if action[1] != 1:
            counter_non_moving += 1
        else:
            counter_non_moving = 0

        # Update of the reward
        total_reward += reward

        # Encode the next state and memorize the transition
        next_state = car.encode_state(_frame_to_tensor(next_frame))
        car.memorize(current_state, action, reward, next_state, terminate)

        # Termination conditions check
        if terminate or total_reward < 0 or counter_non_moving >= MAX_IDLE_STEPS:
            return total_reward

        # Experience replay
        if len(car.memory) > BATCH_SIZE:
            car.replay_memory()

        current_state = next_state


env = gym.make("CarRacing-v0")
# Pass BATCH_SIZE explicitly so the replay guard above and the agent's
# sample size cannot drift apart.
car = DQNAgent(batch_size=BATCH_SIZE)
reward_history = []

file = open("logs/dqn_ae/dqn_ae_log.txt", "w")

try:
    for eps in range(1, EPISODES + 1):
        total_reward = _run_episode(env, car)

        summary = f"Episode {eps}/{EPISODES}, total reward: {total_reward}"
        file.write(summary + "\n")
        print(summary)

        # Update of the target model
        if eps % UPDATE_TARGET_FREQUENCY == 0:
            car.update_target()

        # Update reward history over time
        reward_history.append(total_reward)

except (Exception, KeyboardInterrupt):
    # Log the failure (or manual interruption) and fall through to cleanup.
    logging.error(traceback.format_exc())
finally:
    # Always release the log file and the environment, whatever happened.
    file.close()
    env.close()

Track generation: 1179..1481 -> 302-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1265..1585 -> 320-tiles track
Episode 1/100, total reward: -0.18244514106571832
Track generation: 1056..1324 -> 268-tiles track
Episode 2/100, total reward: -0.07340823970032528
Track generation: 1182..1482 -> 300-tiles track
Episode 3/100, total reward: -0.04414715719056625
Track generation: 1172..1469 -> 297-tiles track
Episode 4/100, total reward: -0.10810810810806581
Track generation: 1231..1552 -> 321-tiles track
Episode 5/100, total reward: -0.1499999999999947
Track generation: 1159..1453 -> 294-tiles track
Episode 6/100, total reward: -0.16109215017063563
Track generation: 1027..1290 -> 263-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1106..1394 -> 288-tiles track
Episode 7/100, total reward: -0.01951219512219035
Track generation: 1035..1298 -> 263-tiles track
Episode 8/1