In [1]:
import math
import random
import itertools
import matplotlib.pyplot as plt
from tqdm import tqdm

import numpy as np
import numba
import numpy.random
from collections import deque

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam

In [2]:
class LineWorld:
    def __init__(self, nb_cells=10, start_cell=1, good_end_cell=9, bad_end_cell=0):
        self.done = None
        self.current_state = start_cell  # État actuel
        self.end_good_state = good_end_cell  # État final
        self.end_bad_state = bad_end_cell
        self.reward = 0.0
        self.num_states = nb_cells  # Nombre total d'états
        self.states = [i for i in range(nb_cells)]
        self.actions = [0, 1]
        self.num_actions = 2  # Nombre total d'actions possibles
        self.line_world = ["_"] * (self.num_states - 1)
        self.line_world.insert(self.current_state, "X")

    def reset(self):
        self.reward = 0.0
        self.done = False
        self.current_state = np.random.randint(1, 8)

    def state_description(self):
        return np.array([self.current_state / (self.num_states - 1) * 2.0 - 1.0])

    def state_dim(self):
        return len(self.state_description())

    def step(self, action):
        # Si l'action est 1, on avance à droite
        if (action == 1) and (self.current_state != self.end_good_state) and (self.current_state != self.end_bad_state):
            self.current_state += 1
            self.reward = 0  # Pas de récompense pour avancer
            self.line_world.remove("X")
            self.line_world.insert(self.current_state, "X")
            print(self.line_world)
        # Si l'action est 0, on avance à gauche
        elif (action == 0) and (self.current_state != self.end_good_state) and (
                self.current_state != self.end_bad_state):
            self.current_state -= 1
            self.reward = 0  # Pas de récompense pour avancer
            self.line_world.remove("X")
            self.line_world.insert(self.current_state, "X")
            print(self.line_world)
        # Si l'on atteint l'état final, la partie est terminée
        if self.current_state == self.end_good_state:
            self.reward = 1  # Récompense de 1 pour atteindre l'état final
            # print(self.line_world)
            self.done = True
        elif self.current_state == self.end_bad_state:
            self.reward = -1
            # print(self.line_world)
            self.done = True
        return self.current_state, self.reward, self.done

In [17]:
def build_compile_model(env):
    model = tf.keras.models.Sequential()
    model.add(Dense(24, input_dim=1, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(len(env.actions), activation='linear'))

    model.compile(loss='mse', optimizer=Adam(learning_rate=0.01))
    return model

In [8]:
def ddqn_per(env, max_iter=1000, gamma=0.99, alpha=0.1, epsilon=0.1):
    q_network = build_compile_model(env)
    target_network = build_compile_model(env)
    nb_steps = 0
    first_episode = True
    
    step = 0
    reward = 0
    cumumated_reward = 0
    reward_per_episode = []
    step_by_episode = []
    batch_size = 32
    memory = deque(maxlen=2000)
    priority = deque(maxlen=2000)
    
    for iteration in range(max_iter):
        print("Episode : ", iteration)
        if env.done:
            reward_per_episode.append(env.reward)
            step_by_episode.append(step)
            env.reset()
            step = 0
            cumumated_reward = 0
        
        actions = env.actions
        current_state = env.current_state
        q_values = q_network.predict(np.array([current_state]))[0]
        if np.random.rand() < epsilon:
            a = np.random.choice(actions)
        else:
            a= np.argmax(q_values)

        old_reward = env.reward
        new_state, reward, done = env.step(a)
        print(reward)
        memory.append((old_reward, a, reward, new_state, done))
        priority.append(abs(reward + gamma * np.amax(target_network.predict(np.array([new_state]))[0]) - q_values[a]))
            
        if (len(memory) > batch_size):
            priority_sum = np.sum(priority)
            probabilities = [p/priority_sum for p in priority]
            minibatch = np.random.choice(len(memory), batch_size, p=probabilities, replace=False)
            for m in minibatch:
                s, ac, rreward, ns, terminated = memory[m]
                q_values = q_network.predict(np.array([s]))[0]
                if terminated:
                    q_values[a] = rreward
                else:
                    t = target_network.predict(np.array([ns]))[0]
                    q_values[a] = rreward + gamma * np.amax(t)
                q_network.fit(np.array([s]), np.array([q_values]), verbose=0)
            
        cumumated_reward += reward
        step += 1

        if done:
            target_network.set_weights(q_network.get_weights())
                
    return reward_per_episode, step_by_episode

In [19]:
world = LineWorld()
scores, steps = ddqn_per(world, max_iter = 1000)
plt.plot(scores)
plt.show()
plt.plot(steps)
plt.show()

Episode :  0
['_', '_', 'X', '_', '_', '_', '_', '_', '_', '_']
0
Episode :  1
['_', '_', '_', 'X', '_', '_', '_', '_', '_', '_']
0
Episode :  2
['_', '_', '_', '_', 'X', '_', '_', '_', '_', '_']
0
Episode :  3
['_', '_', '_', '_', '_', 'X', '_', '_', '_', '_']
0
Episode :  4
['_', '_', '_', '_', '_', '_', 'X', '_', '_', '_']
0
Episode :  5
['_', '_', '_', '_', '_', '_', '_', 'X', '_', '_']
0
Episode :  6
['_', '_', '_', '_', '_', '_', '_', '_', 'X', '_']
0
Episode :  7
['_', '_', '_', '_', '_', '_', '_', '_', '_', 'X']
1
Episode :  8
['_', '_', '_', '_', '_', '_', '_', '_', 'X', '_']
0
Episode :  9
['_', '_', '_', '_', '_', '_', '_', '_', '_', 'X']
1
Episode :  10
['_', '_', '_', '_', '_', '_', '_', '_', 'X', '_']
0
Episode :  11
['_', '_', '_', '_', '_', '_', '_', '_', '_', 'X']
1
Episode :  12
['_', '_', '_', '_', '_', '_', 'X', '_', '_', '_']
0
Episode :  13
['_', '_', '_', '_', '_', '_', '_', 'X', '_', '_']
0
Episode :  14
['_', '_', '_', '_', '_', '_', '_', '_', 'X', '_']
0
Episo

KeyboardInterrupt: 