In [1]:
import math
import random
import itertools
import matplotlib.pyplot as plt
from tqdm import tqdm

import numpy as np
import numba
import numpy.random
from collections import deque

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam

In [2]:
class GridWorld:
    def __init__(self, taille, position_start, good_end_position, bad_end_position):
        self.current_state = position_start  # État actuel (ligne, colonne)
        self.states = [[x, y] for x in range(taille[0]) for y in range(taille[1])]
        self.end_good_state = good_end_position  # État final (ligne, colonne)
        self.end_bad_state = bad_end_position
        self.grid_size = taille  # Taille de la grille (lignes, colonnes)
        self.stateSpace = {}
        self.matchStates()
        self.currentIntState = self.getStateInt(self.current_state)
        self.num_actions = 4  # Nombre total d'actions possibles (haut, bas, gauche, droite)
        self.reward = 0  # Récompense actuelle
        self.done = False  # Indique si la partie est terminée
        self.generate_grid()
        self.actions = [0, 1, 2, 3]
        self.rewards = [0, 1, 3]
        self.actionSpace = {0: -self.grid_size[0], 1: self.grid_size[0],
                            2: -1, 3: 1}

    
    def reset(self):
        self.done = False
        self.current_state = [np.random.randint(0,self.grid_size[0]-1), np.random.randint(0,self.grid_size[1]-1)]
        self.currentIntState = self.getStateInt(self.current_state)
        self.reward = 0
        
    def matchStates(self):
        i=0
        for s in self.states:
            self.stateSpace[str(s)] = i
            i = i+1
    
    def getStateInt(self, st):
        return self.stateSpace[str(st)]
    
    def getStateCouple(self, st):
        n_state = {i for i in self.stateSpace if self.stateSpace[i]==st}
        return list(n_state)
        
    
    def step(self, action):
        if action == 0:
            if self.current_state[0] == 0 :
                self.current_state[0] = self.grid_size[0] - 1
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour traverser le mur
                self.generate_grid()
                self.endgame()
            else:
                self.current_state[0] = self.current_state[0] - 1
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()

        elif action == 1:
            if self.current_state[0] == self.grid_size[0] - 1:
                self.current_state[0] = 0
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()
            else :
                self.current_state[0] = self.current_state[0] + 1
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()

        elif action == 2:
            if self.current_state[1] == 0:
                self.current_state[1] = self.grid_size[1] - 1
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()
            else:
                self.current_state[1] = self.current_state[1] - 1
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()

        elif action == 3:
            if self.current_state[1] == self.grid_size[1] - 1:
                self.current_state[1] = 0
                self.currentIntState = self.getStateInt(self.current_state)
                #print(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()
            else:
                self.current_state[1] = self.current_state[1] + 1
                self.currentIntState = self.getStateInt(self.current_state)
                self.reward = -1  # Pas de récompense pour avancer
                self.generate_grid()
                self.endgame()
                # Si l'on atteint l'état final, la partie est terminée
        return self.currentIntState, self.reward, self.done

    def endgame(self):
        if self.current_state == self.end_good_state:
            self.reward = 10  # Récompense de 1 pour atteindre l'état final
            self.done = True
        elif self.current_state == self.end_bad_state:
            self.reward = -10
            self.done = True

    # def update_grid(self):
    #     new_grid = [["_", "_", "_", "_"],
    #                 ["_", "_", "_", "_"],
    #                 ["_", "_", "_", "_"],
    #                 ["_", "_", "_", "_"]]
    #     new_grid[self.current_state[0]][self.current_state[1]] = "X"
    #     for i in new_grid:
    #         print(i)

    def generate_grid(self):
        grid=[]
        for i in range(self.grid_size[0]):
            grid.append([])
            for j in range(self.grid_size[1]):
                grid[i].append("_")
        grid[self.current_state[0]][self.current_state[1]] = "X"
        for i in grid:
            print(i)
        print("\n")
    

In [3]:
def build_compile_model(env):
    model = tf.keras.models.Sequential()
    model.add(Dense(24, input_dim=1, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(len(env.actions), activation='linear'))

    model.compile(loss='mse', optimizer=Adam(learning_rate=0.01))
    return model

In [4]:
def ddqn(env, episodes=1000, gamma=0.99, alpha=0.1, epsilon=0.1):
    q_network = build_compile_model(env)
    target_network = build_compile_model(env)
    nb_steps = 0
    first_episode = True
    
    step = 0
    reward = 0
    reward_per_episode = []
    step_by_episode = []
    batch_size = 32
    memory = deque(maxlen=2000)
    
    for episode in range(episodes):
        env.reset()
        print("Episode : ", episode)
        cumumated_reward = 0
        done = False
        current_state = env.currentIntState
        while not done:
            actions = env.actions

            q_values = q_network.predict(np.array([current_state]))[0]
            if np.random.rand() < epsilon:
                a = np.random.choice(actions)
            else:
                a= np.argmax(q_values)

            old_reward = env.reward
            new_state, reward, done = env.step(a)
            memory.append((old_reward, a, reward, new_state, done))
            
            if (len(memory) > batch_size):
                minibatch = random.sample(memory, batch_size)
                for s, ac, reward, ns, terminated in minibatch:
                    q_values = q_network.predict(np.array([s]))[0]
                    if terminated:
                        q_values[a] = reward
                    else:
                        t = target_network.predict(np.array([ns]))[0]
                        q_values[a] = reward + gamma * np.amax(t)
                    q_network.fit(np.array([s]), np.array([q_values]), verbose=0)
            
            cumumated_reward += reward
            step += 1

            if done:
                target_network.set_weights(q_network.get_weights())
                
        reward_per_episode.append(cumumated_reward)
        print(reward_per_episode)
    return reward_per_episode

In [None]:
if __name__ == '__main__':
    world = GridWorld([4,4], [3,3], [3,3], [0,0])
    scores = ddqn(world, episodes = 100)
    plt.plot(scores)

['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', 'X']


Episode :  0
['_', '_', 'X', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']


['_', 'X', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']


['X', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']


[-12]
Episode :  1
['X', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']


[-12, -10]
Episode :  2
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', 'X']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', 'X', '_', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['X', '_', '_', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', 'X']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', 'X', '_', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['X', '_', '_', '_']
['_', '_', '_', '_']




['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', 'X']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']




['_', '_', '_', '_']
['_', '_', '_', '_']
['_', 'X', '_', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['X', '_', '_', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', 'X']
['_', '_', '_', '_']




['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']




['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', 'X', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['X', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', '_', 'X']




[-12, -10, -60]
Episode :  3
['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', '_', '_']
['_', 'X', '_', '_']
['_', '_', '_', '_']




['_', '_', '_', '_']
['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']


['_', '_', '_', '_']
['_', '_', 'X', '_']
['_', '_', '_', '_']
['_', '_', '_', '_']


