In [1]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import cv2
import pickle
from PIL import Image

In [2]:
def get_speed_wheel(image):
    '''
    Read the speed and steering indicators from the bottom bar of a frame.

    speed: for each brightness level 240, 210, ..., 30, count the pixels of
    image[90:94, 12:14] whose three channels all exceed that level; the speed
    is half of the summed counts, rounded up and capped at 20.

    wheel: count the pure-green pixels (G > 250, R == 0, B == 0) in the left and
    right halves of image[86:92, 36:60]; the magnitude is a quarter of the
    larger count, rounded up, and the value is negated whenever the right half
    contains at least one green pixel. 'ERR' is printed if both halves do.

    Returns the pair (speed, wheel).
    '''
    speedometer = image[90:94, 12:14]
    brightness_hits = 0
    # 240, 210, 180, 150, 120, 90, 60, 30: brighter pixels pass more levels
    for level in range(240, 0, -30):
        above_level = np.all(speedometer > [level, level, level], axis=-1)
        brightness_hits += np.sum(above_level)
    speed = min(math.ceil(brightness_hits / 2), 20)

    indicator = image[86:92, 36:60]
    is_green = (indicator[:, :, 1] > 250) & (indicator[:, :, 0] == 0) & (indicator[:, :, 2] == 0)
    n_left = np.sum(is_green[:, :12])
    n_right = np.sum(is_green[:, 12:])
    if n_left > 0 and n_right > 0:
        print('ERR')

    magnitude = math.ceil(max(n_left, n_right) / 4)
    wheel = -magnitude if n_right > 0 else magnitude
    return speed, wheel

def highlight_track(image):
    '''
    Binarize an RGB frame so that only plain dark-grey pixels stay black.

    A pixel becomes white (255, 255, 255) when it is bright (all three channels
    above 130) or coloured (any two channels differ by more than 10). Every
    other pixel becomes black (0, 0, 0).

    image: array of shape (..., 3) holding R, G, B in the last axis.
    Returns a new array with the same shape and dtype as `image`. The input is
    not modified, so it is safe to pass a slice (view) of a frame that is still
    needed elsewhere.
    '''
    # Work in a signed type: on uint8 data `r - g` wraps around (100 - 105 == 251),
    # which would flag almost-grey pixels as coloured.
    channels = image.astype(np.int32)
    r, g, b = channels[..., 0], channels[..., 1], channels[..., 2]

    is_bright = (r > 130) & (g > 130) & (b > 130)
    is_coloured = (np.abs(r - g) > 10) | (np.abs(r - b) > 10) | (np.abs(g - b) > 10)

    highlighted = np.zeros_like(image)
    highlighted[is_bright | is_coloured] = [255, 255, 255]
    return highlighted

def is_out_of_track(image):
    '''
    Decide from a frame whether the car has left the track.

    The crop image[0:64, 14:78] is binarized with highlight_track and run
    through Canny edge detection; only the bottom row (row 63) of the edge map
    is inspected:
      - no edge pixel at all            -> out of track
      - more than two edge pixels       -> keep the two closest to column 32
      - exactly two edge pixels, both right of column 37 or both left of
        column 27                       -> out of track
    Anything else counts as on track.

    The frame passed in is left untouched.
    Returns True when the car is considered out of the track, False otherwise.
    '''
    # A plain slice is a view onto the caller's frame; take a private copy so
    # nothing done while binarizing the crop can leak back into a frame that is
    # later stored in the replay memory and fed to the network.
    zoom_state = image[0:64, 14:78].copy()
    h_t = highlight_track(zoom_state)
    edges = cv2.Canny(h_t, threshold1=80, threshold2=120)

    down_indices = np.where(edges[63, :] > 0)[0]
    if len(down_indices) == 0:
        return True
    if len(down_indices) > 2:
        # keep the 2 edge columns nearest to the centre of the crop
        down_indices = np.array(sorted(down_indices, key=lambda elem: abs(elem - 32))[:2])

    if len(down_indices) == 2 and (np.all(down_indices > 37) or np.all(down_indices < 27)):
        return True
    return False

def discretize_state(state):
    '''
    Convert a raw frame into the two tensors consumed by the network.

    The crop state[0:64, 16:80] is resized to 32x32, binarized with
    highlight_track and turned into a Canny edge map divided by 255.
    Speed and wheel come from get_speed_wheel on the full frame.

    Returns (image_tensor, numeric_values_tensor):
      image_tensor          float32, shape (1, 32, 32), the scaled edge map
      numeric_values_tensor float32, shape (1, 2), holding [speed, wheel]
    '''
    speed, wheel = get_speed_wheel(state)

    track_view = cv2.resize(state[0:64, 16:80], (32, 32), interpolation=cv2.INTER_LINEAR)
    edge_map = cv2.Canny(highlight_track(track_view), threshold1=80, threshold2=120)
    scaled_edges = edge_map / 255.0  # scale the edge map before it goes to the CNN

    numeric_values_tensor = torch.tensor([speed, wheel], dtype=torch.float32).unsqueeze(0)
    image_tensor = torch.tensor(scaled_edges, dtype=torch.float32).unsqueeze(0)
    return image_tensor, numeric_values_tensor

In [3]:
plt.ion() #activate interaction mode
# Device on which the networks and their input tensors are placed:
# CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class DQN(nn.Module):
    '''
    Q-network with two input branches that are merged before the output layer.

    - image branch: three strided convolutions over a single-channel 32x32
      image, flattened and projected to 64 features;
    - numeric branch: a linear layer mapping the 2 numeric values to 16 features.

    The 64 + 16 features are concatenated, passed through one hidden layer and
    mapped to one Q-value per action.
    '''

    def __init__(self, n_actions):
        super().__init__()

        # Image branch (spatial sizes assume a 32x32 input).
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2)   # 1x32x32  -> 16x14x14
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)  # 16x14x14 -> 32x5x5
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=2)  # 32x5x5   -> 32x2x2
        self.fc1 = nn.Linear(32 * 2 * 2, 64)  # 128 flattened conv features -> 64

        # Numeric branch (speed and wheel).
        self.fc2 = nn.Linear(2, 16)

        # Head: merge both branches, then one Q-value per action.
        self.fc3 = nn.Linear(64 + 16, 64)
        self.fc4 = nn.Linear(64, n_actions)

    def forward(self, image, numeric_values):
        '''
        image: (batch, 1, H, W), or a single (1, H, W) image which gets a batch
               dimension added.
        numeric_values: (batch, 2).
        Returns a (batch, n_actions) tensor of Q-values.
        '''
        features = image.unsqueeze(0) if image.dim() == 3 else image

        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))
        # flatten everything except the batch dimension
        features = F.relu(self.fc1(features.view(features.size(0), -1)))

        numeric_features = F.relu(self.fc2(numeric_values))

        merged = torch.cat((features, numeric_features), dim=1)
        hidden = F.relu(self.fc3(merged))
        return self.fc4(hidden)

In [5]:
class DQNAgent:
    '''
    Epsilon-greedy DQN agent with a replay memory and a target network.

    policy_net is the network being trained; target_net is a copy used only to
    compute bootstrap targets and is refreshed through update_target_net().
    Transitions are stored as raw frames and preprocessed with
    discretize_state each time they are used.
    '''

    def __init__(self, action_size, gamma=0.99, epsilon=0.9, epsilon_decay=0.999, lr=1e-4):
        '''
        action_size:   number of discrete actions
        gamma:         discount applied to the bootstrapped next-state value
        epsilon:       initial probability of taking a random action
        epsilon_decay: stored for the training loop, which decays epsilon itself
        lr:            learning rate of the Adam optimizer
        '''
        self.action_size = action_size
        self.memory = deque([], maxlen=10000)  # oldest transitions are dropped first
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = 0.15  # floor read by the training loop when decaying epsilon
        self.epsilon_decay = epsilon_decay
        self.learning_rate = lr
        self.policy_net = DQN(action_size).to(device)
        self.target_net = DQN(action_size).to(device)
        self.update_target_net()  # start with both networks in sync
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

    def update_target_net(self):
        ''' copy the weights of the policy network into the target network '''
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    def remember(self, state, action, reward, next_state, truncated, terminated):
        ''' append the transition (state, action, reward, next_state, truncated, terminated) to the memory '''
        self.memory.append((state, action, reward, next_state, truncated, terminated))

    def act(self, state):
        '''
        choose an action with an epsilon-greedy policy: a uniformly random action
        with probability epsilon, otherwise the action with the highest Q-value
        '''
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        im_state, num_state = discretize_state(state)
        # Action selection never needs gradients; skip building the autograd graph.
        with torch.no_grad():
            act_values = self.policy_net(im_state.to(device), num_state.to(device))
        return torch.argmax(act_values).item()

    def replay(self, batch_size):
        '''
        train the policy network on batch_size transitions sampled from the memory

        Each sampled transition gets its own optimizer step. The target for the
        taken action is `reward` for an ending transition and
        `reward + gamma * max_a target_net(next_state)` otherwise; the targets of
        the other actions equal the current predictions, so they add no gradient.

        Returns without training while the memory holds fewer than batch_size
        transitions (random.sample would raise ValueError in that case).
        '''
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, truncated, terminated in minibatch:
            im_state, num_state = discretize_state(state)
            im_state = im_state.to(device)
            num_state = num_state.to(device)

            if not (truncated or terminated):
                im_state_next, num_state_next = discretize_state(next_state)
                with torch.no_grad():
                    next_q = self.target_net(im_state_next.to(device), num_state_next.to(device))
                target = reward + self.gamma * torch.max(next_q).item()
            else:
                target = reward

            # One forward pass: the target vector is a detached copy of the
            # prediction with only the taken action's entry replaced.
            q_values = self.policy_net(im_state, num_state)
            target_f = q_values.detach().clone()
            target_f[0][action] = target

            self.optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(q_values, target_f)
            loss.backward()
            self.optimizer.step()

    def test_trained_model(self, state):
        '''
        return the greedy action (highest Q-value) for the given state, with the
        policy network switched to eval mode and gradients disabled
        '''
        self.policy_net.eval()

        im_state, num_state = discretize_state(state)
        im_state = im_state.to(device)
        num_state = num_state.to(device)

        with torch.no_grad():
            q_values = self.policy_net(im_state, num_state)
            action = torch.argmax(q_values).item()
        return action


In [6]:
def train(agent, env, batch_size, num_episodes=100):
    '''
    Run num_episodes training episodes of `agent` on `env`.

    Per episode:
      - the first 45 steps are played with action 0 and not recorded (the
        rendered frame is still zooming in);
      - every following step is chosen by agent.act, stored with agent.remember
        and added to the episode score;
      - leaving the track (is_out_of_track) overrides the reward with -100 and
        ends the episode;
      - when the episode ends the target network is synchronised, then one
        agent.replay(batch_size) pass is run and epsilon is decayed down to
        agent.epsilon_min.

    The replay pass is skipped while the memory holds fewer than batch_size
    transitions, so a very short first episode cannot make the sampling fail.
    '''
    for e in range(num_episodes):
        state, _ = env.reset()
        step = 0
        score = 0
        while True:
            if step < 45: #skip the first phase of the episode while the image of the output is zooming in
                state, _, _, _, _ = env.step(0)
                step += 1
                continue

            action = agent.act(state)
            # Gymnasium returns (obs, reward, terminated, truncated, info), in this order.
            next_state, reward, terminated, truncated, _ = env.step(action)
            if is_out_of_track(next_state): #check if the car is out of the track
                print('OUT')
                reward = -100
                terminated = True

            agent.remember(state, action, reward, next_state, truncated, terminated)
            score += reward
            state = next_state

            if truncated or terminated:
                agent.update_target_net() # update the target network at the end of each episode
                print(f"Episode: {e + 1}/{num_episodes}, epsilon: {agent.epsilon}, Episode-reward: {score}, steps_done: {step}")
                break
            step += 1

        # train on a batch of stored transitions at the end of each episode,
        # once enough of them have been collected
        if len(agent.memory) >= batch_size:
            agent.replay(batch_size)
        if agent.epsilon > agent.epsilon_min:
            agent.epsilon *= agent.epsilon_decay

In [41]:
def evaluate(agent, env, num_episodes=100, render=False, random_a=False):
    '''
    Play num_episodes episodes and return the average episode score.

    agent:        object exposing test_trained_model(state); unused when random_a is True
    env:          Gymnasium environment with a discrete action space
    num_episodes: number of episodes to average over
    render:       when True, roughly one frame in four is saved to
                  ./im_good/<step>.png and shown inline
    random_a:     when True, actions are drawn at random from the environment's
                  action space instead of being asked to the agent

    The first 45 steps of each episode are played with action 0 and their
    rewards are not counted. Unlike train(), leaving the track does not end
    the episode here.
    '''
    if render:
        # Image.save does not create missing directories.
        import os
        os.makedirs('./im_good', exist_ok=True)

    total_score = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        truncated = terminated = False
        episode_score = 0
        step = 0
        while not (truncated or terminated):
            if step < 45: #skip the first zoom samples
                state, _, _, _, _ = env.step(0)
                step += 1
                continue

            if random_a:
                # size the draw from the environment instead of assuming 5 actions
                action = random.randrange(int(env.action_space.n))
            else:
                action = agent.test_trained_model(state)

            # Gymnasium returns (obs, reward, terminated, truncated, info), in this order.
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode_score += reward
            state = next_state

            if render and random.randint(0, 3) == 0:
                img = env.render()
                img_to_save = Image.fromarray(img)
                img_to_save.save(f'./im_good/{step}.png')
                plt.imshow(img)
                plt.axis('off')
                plt.show()
                clear_output(wait=True)
            step += 1
        total_score += episode_score
    average_score = total_score / num_episodes
    return average_score

In [20]:
# Discrete-action CarRacing; "rgb_array" rendering lets env.render() return frames as arrays.
env = gym.make("CarRacing-v2", domain_randomize=False, continuous=False, render_mode="rgb_array")

# Hyperparameters
batch_size = 512        # transitions replayed after each training episode
episodes = 3000         # number of training episodes
gamma = 0.99            # discount factor for the bootstrapped next-state value
epsilon = 1             # initial exploration rate: the first episodes act fully at random
epsilon_decay = 0.9992  # multiplied into epsilon once per episode until it drops to the agent's floor of 0.15 (about 2,370 episodes)
lr = 1e-4               # Adam learning rate

agent = DQNAgent(env.action_space.n, gamma=gamma, epsilon=epsilon, epsilon_decay=epsilon_decay, lr=lr)

In [None]:
# Baseline before training: average score of random actions over 15 episodes.
print(evaluate(agent, env, 15, render=False, random_a=True))

In [None]:
# Train the agent for `episodes` episodes, replaying `batch_size` transitions after each one.
train(agent, env, batch_size, episodes)

In [None]:
# After training: average score of the trained policy (greedy actions) over 15
# episodes. random_a must be False here, otherwise this cell would only repeat
# the random-action measurement and never exercise the trained network.
print(evaluate(agent, env, 15, render=False, random_a=False))

In [25]:
''' save the agent, the environment and the hyperparameters and the results of the evaluation '''
# NOTE(review): the block below pickles the whole agent and environment objects.
# Such pickles presumably only load back while the class definitions in this
# notebook are unchanged; saving agent.policy_net.state_dict() is likely more
# robust. Confirm before re-enabling.
# with open('DQN/env7.pkl', 'wb') as f:
# 	pickle.dump(env, f)
# with open('DQN/agent7.pkl', 'wb') as f:
# 	pickle.dump(agent, f)
# with open('hyperparameters7.txt', 'w') as f:
# 	f.write(f'batch_size: {batch_size}\n')
# 	f.write(f'episodes: {episodes}\n')
# 	f.write(f'gamma: {gamma}\n')
# 	f.write(f'epsilon: {epsilon}\n')
# 	f.write(f'epsilon_decay: {epsilon_decay}\n')
# 	f.write(f'lr: {lr}\n')
# 	f.write(f'result_random_15_ep: {evaluate(agent, env, 15, render=False, random_a=True)}\n')
# 	f.write(f'result_trained_15_ep: {evaluate(agent, env, 15, render=False, random_a=False)}\n')


In [46]:
''' load pretrained model with pkl and test it '''
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load agent/env pickles that were produced locally or come from a trusted source.
# with open('DQN/agent1.pkl', 'rb') as f:
# 	agent_test = pickle.load(f)
# with open('DQN/env1.pkl', 'rb') as f:
# 	env_test = pickle.load(f)

# print(evaluate(agent_test, env_test, 1, render=True))

891.5550161812156
