In [1]:
import numpy as np
import gymnasium as gym
import random
from collections import defaultdict

In [2]:
# v1
# Q-learning agent tuned for fast convergence:
#   - exponential decay of epsilon
#   - logarithmic decay of alpha
# Agent hyper-parameters
alpha_start = 0.1  # initial learning rate, before the logarithmic decay
gamma = 0.99  # discount factor
epsilon_start = 1.0  # initial exploration rate
epsilon_min = 0.01  # floor for the exploration rate
epsilon_decay = 0.0001  # rate constant of the exponential epsilon decay (per episode)
method = "q_learning"  # not read anywhere in the visible code

# Environment creation
# NOTE(review): Gymnasium's Blackjack documentation states that `natural` is
# ignored when `sab=True` — confirm which rule set is actually intended.
env = gym.make("Blackjack-v1", natural=True, sab=True)

# Q-values and visit counts keyed by state. Indexing an unseen state creates
# an all-zero row with one entry per action.
q_table = defaultdict(lambda: np.zeros(env.action_space.n))
visits = defaultdict(lambda: np.zeros(env.action_space.n))
epsilon = epsilon_start  # current exploration rate; rebound by update_q_table

def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability given by the module-level ``epsilon`` a random action is
    sampled from the environment's action space; otherwise the action with
    the highest Q-value for ``state`` is returned.
    """
    exploit = not (np.random.rand() < epsilon)
    if exploit:
        # Greedy choice from the current Q-table row.
        return np.argmax(q_table[state])
    # Exploration: uniform sample from the action space.
    return env.action_space.sample()

def update_q_table(state, action, reward, next_state, next_action=None, done=False, episode=1):
    """Apply one Q-learning update for a transition and refresh the schedules.

    Side effects: updates the module-level ``q_table`` and ``visits`` in
    place and rebinds the module-level ``epsilon``.

    Args:
        state: Hashable key of the state in which ``action`` was taken.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Hashable key of the resulting state.
        next_action: Not used by the Q-learning update; kept so existing
            callers keep working.
        done: True when the transition ended the episode.
        episode: Episode index that drives both decay schedules.
    """
    global epsilon

    # Learning rate shrinks with the log of the episode index, floored at 0.02.
    alpha = max(0.02, alpha_start / (1 + np.log(1 + episode)))

    # Exploration rate shrinks exponentially with the episode index,
    # floored at epsilon_min.
    epsilon = max(epsilon_min, epsilon_start * np.exp(-epsilon_decay * episode))

    if done:
        # No bootstrapping at the end of an episode. Skipping the lookup also
        # keeps the defaultdict from creating an all-zero row for a terminal
        # state that will never be acted from.
        target = reward
    else:
        # Q-learning target: reward plus the discounted best next Q-value.
        target = reward + gamma * np.max(q_table[next_state])

    q_table[state][action] += alpha * (target - q_table[state][action])

    # Visit bookkeeping for the (state, action) pair.
    visits[state][action] += 1

def train(num_episodes=100000):
    """Train the Q-learning agent for ``num_episodes`` episodes.

    Each step picks an action with ``choose_action``, applies it to the
    environment and feeds the transition to ``update_q_table``. Every
    10000th episode (starting with episode 0) a 5000-episode greedy
    evaluation is run and a progress line is printed.
    """
    for episode in range(num_episodes):
        obs, _ = env.reset()

        while True:
            chosen = choose_action(obs)
            new_obs, payoff, terminated, truncated, _ = env.step(chosen)
            finished = terminated or truncated
            update_q_table(obs, chosen, payoff, new_obs, done=finished, episode=episode)
            if finished:
                break
            obs = new_obs

        # Periodic progress report.
        if episode % 10000 != 0:
            continue
        win_rate = evaluate(5000, verbose=False)
        print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}, Epsilon: {epsilon:.4f}")

def evaluate(num_episodes=5000, verbose=True):
    """Play greedy episodes with the current Q-table and return the win rate.

    An episode counts as a win when its final reward is positive. The
    Q-table is only read: states that were never seen are not inserted.

    Args:
        num_episodes: Number of evaluation episodes to play.
        verbose: When True, print a one-line summary.

    Returns:
        Fraction of episodes won, or 0.0 when ``num_episodes`` is not positive.
    """
    wins = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        while not done:
            # Use .get() rather than indexing so that evaluation does not make
            # the defaultdict create rows for unseen states. Falling back to
            # index 0 matches what argmax returns for an all-zero row.
            q_values = q_table.get(state)
            action = 0 if q_values is None else np.argmax(q_values)  # pure exploitation
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

        if reward > 0:
            wins += 1

    # Guard against division by zero when no episodes were requested.
    win_rate = wins / num_episodes if num_episodes > 0 else 0.0
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

# Train for 100000 episodes, then run a final greedy evaluation.
# evaluate() uses its defaults (5000 episodes, verbose summary); its return
# value is the last expression of the cell and is shown as the cell output.
train(num_episodes=100000)
evaluate()

Episode 0/100000, Win Rate: 39.52%, Epsilon: 1.0000
Episode 10000/100000, Win Rate: 43.04%, Epsilon: 0.3679
Episode 20000/100000, Win Rate: 42.22%, Epsilon: 0.1353
Episode 30000/100000, Win Rate: 41.72%, Epsilon: 0.0498
Episode 40000/100000, Win Rate: 44.14%, Epsilon: 0.0183
Episode 50000/100000, Win Rate: 43.60%, Epsilon: 0.0100
Episode 60000/100000, Win Rate: 41.80%, Epsilon: 0.0100
Episode 70000/100000, Win Rate: 43.14%, Epsilon: 0.0100
Episode 80000/100000, Win Rate: 42.46%, Epsilon: 0.0100
Episode 90000/100000, Win Rate: 42.74%, Epsilon: 0.0100
Performance sur 5000 épisodes: 42.90% de victoires


0.429