In [1]:
import numpy as np
import gymnasium as gym
import random
from collections import defaultdict

In [2]:
#v1

#Initialisation de l'agent Q-Learning optimisé pour une phase massive d'exploration au départ
# - Phase d'exploration forcée jusqu'à 200K épisodes
# - Décroissance hybride de epsilon (exponentielle + logarithmique)
# - Apprentissage hybride (phase rapide suivie de stabilisation)

#Création de l'environnement
env = gym.make("Blackjack-v1", natural=True, sab=True)

#Paramètres de l'agent
alpha_start = 0.1
gamma = 0.95
epsilon_start = 1.0
epsilon_min = 0.05
epsilon_decay = 0.00002
method = "q_learning"

q_table = defaultdict(lambda: np.zeros(env.action_space.n))
visits = defaultdict(lambda: np.zeros(env.action_space.n))

def choose_action(state, episode):
    #Politique epsilon-greedy avec une phase d'exploration massive au début.

    #Exploration massive pendant les 200K premiers épisodes
    if episode < 200000:
        epsilon = 1.0
    else:
        epsilon = max(epsilon_min, epsilon_start / (1 + np.log(1 + episode - 200000)))

    if np.random.rand() < epsilon:
        return env.action_space.sample()
    return np.argmax(q_table[state])

def update_q_table(state, action, reward, next_state, next_action=None, done=False, episode=1):

#Mise à jour de la Q-table avec une hybridation de la réduction de alpha.

    #Warm Start amélioré pour influencer les premières décisions
    if state not in q_table:
        q_table[state] = np.array([0.6, 0.8])  # Favorise Stand légèrement au début

    #Alpha hybride : Phase rapide + stabilisation plus lente après 300K épisodes
    if episode < 300000:
        alpha = max(0.02, alpha_start / (1 + 0.00001 * episode))
    else:
        alpha = max(0.02, alpha_start * np.exp(-0.000005 * (episode - 300000)))

    #Epsilon : Mélange exponentiel + logarithmique après 200K épisodes
    if episode < 200000:
        epsilon = 1.0
    else:
        epsilon = max(epsilon_min, epsilon_start * np.exp(-epsilon_decay * (episode - 200000)))

    # Mise à jour Q-Learning
    best_next_action = np.argmax(q_table[next_state])
    target = reward + gamma * q_table[next_state][best_next_action] * (not done)
    q_table[state][action] += alpha * (target - q_table[state][action])

    visits[state][action] += 1

def train(num_episodes=700000):
    #Entraîne l'agent avec une exploration massive suivie d'une exploitation intelligente.

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = choose_action(state, episode)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            update_q_table(state, action, reward, next_state, done=done, episode=episode)
            state = next_state

        #Suivi des performances tous les 50 000 épisodes
        if episode % 50000 == 0:
            win_rate = evaluate(5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}")

def evaluate(num_episodes=5000, verbose=True):
    #Évalue l'agent après l'entraînement.

    wins, losses, draws = 0, 0, 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = np.argmax(q_table[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state

        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1

    win_rate = wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

#Entraînement de l'agent
train(num_episodes=700000)
evaluate()

Episode 0/700000, Win Rate: 37.52%
Episode 50000/700000, Win Rate: 42.48%
Episode 100000/700000, Win Rate: 43.26%
Episode 150000/700000, Win Rate: 42.22%
Episode 200000/700000, Win Rate: 42.66%
Episode 250000/700000, Win Rate: 43.90%
Episode 300000/700000, Win Rate: 44.14%
Episode 350000/700000, Win Rate: 42.02%
Episode 400000/700000, Win Rate: 41.48%
Episode 450000/700000, Win Rate: 42.06%
Episode 500000/700000, Win Rate: 41.82%
Episode 550000/700000, Win Rate: 41.34%
Episode 600000/700000, Win Rate: 42.32%
Episode 650000/700000, Win Rate: 43.86%
Performance sur 5000 épisodes: 43.26% de victoires


0.4326

In [4]:
#v2

#Dernière version optimisée du Q-Learning :
 #- Augmentation de gamma pour décisions stratégiques
 #- Réduction plus forte de epsilon après 500K épisodes
 #- Meilleure pondération des récompenses pour un apprentissage plus efficace

# Initialisation des paramètres de l'agent
alpha_start = 0.1
gamma = 0.97  # Encourage un apprentissage plus stratégique
epsilon_start = 1.0
epsilon_min = 0.05
epsilon_decay = 0.00002
method = "q_learning"

# Création de l'environnement
env = gym.make("Blackjack-v1", natural=True, sab=True)

q_table = defaultdict(lambda: np.zeros(env.action_space.n))
visits = defaultdict(lambda: np.zeros(env.action_space.n))
epsilon = epsilon_start

def choose_action(state, episode):
    """
    Politique epsilon-greedy avec ajustement dynamique après 500K épisodes.
    """
    if np.random.rand() < epsilon:
        return env.action_space.sample()

    return np.argmax(q_table[state])

def update_q_table(state, action, reward, next_state, next_action=None, done=False, episode=1, method="q_learning"):
    """
    Mise à jour de la Q-table avec renforcement des récompenses et stabilisation après 500K épisodes.
    """
    # Warm Start conservé
    if state not in q_table:
        q_table[state] = np.array([0.5, 0.7])

    # Alpha diminue plus lentement
    alpha = max(0.02, alpha_start / (1 + 0.000005 * episode))

    # Réduction drastique de epsilon après 500K épisodes (Focus Mode)
    global epsilon
    if episode > 500000:
        epsilon = max(epsilon_min, 0.02)

    # Augmenter l’impact des victoires et défaites
    if reward > 0:
        reward = 2  # Double la valeur des victoires
    elif reward < 0:
        reward = -2  # Pénalise plus les défaites

    if method == "sarsa":
        # Mise à jour SARSA
        target = reward + gamma * q_table[next_state][next_action] * (not done)
        q_table[state][action] += alpha * (target - q_table[state][action])
    else:
        # Mise à jour Q-Learning
        best_next_action = np.argmax(q_table[next_state])
        target = reward + gamma * q_table[next_state][best_next_action] * (not done)
        q_table[state][action] += alpha * (target - q_table[state][action])

    visits[state][action] += 1

def train(num_episodes=800000, method="q_learning"):
    """
    Entraîne l'agent avec un Focus Mode après 500K épisodes.
    """
    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        action = choose_action(state, episode)

        while not done:
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_action = choose_action(next_state, episode) if method == "sarsa" else None
            update_q_table(state, action, reward, next_state, next_action, done=done, episode=episode, method=method)
            state, action = next_state, next_action if method == "sarsa" else np.argmax(q_table[next_state])

        # Suivi des performances tous les 50 000 épisodes
        if episode % 50000 == 0:
            win_rate = evaluate(5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}")

def evaluate(num_episodes=50000, verbose=True):
    """
    Évalue l'agent après l'entraînement.
    """
    wins, losses, draws = 0, 0, 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = np.argmax(q_table[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state

        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1

    win_rate = wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

# Entraînement avec la méthode Q-learning
train(num_episodes=800000, method="q_learning")
evaluate()

# Entraînement avec la méthode SARSA
train(num_episodes=800000, method="sarsa")
evaluate()

Episode 0/800000, Win Rate: 39.90%
Episode 50000/800000, Win Rate: 41.64%
Episode 100000/800000, Win Rate: 40.36%
Episode 150000/800000, Win Rate: 43.74%
Episode 200000/800000, Win Rate: 43.80%
Episode 250000/800000, Win Rate: 42.22%
Episode 300000/800000, Win Rate: 42.86%
Episode 350000/800000, Win Rate: 42.80%
Episode 400000/800000, Win Rate: 42.76%
Episode 450000/800000, Win Rate: 43.38%
Episode 500000/800000, Win Rate: 43.48%
Episode 550000/800000, Win Rate: 42.92%
Episode 600000/800000, Win Rate: 42.72%
Episode 650000/800000, Win Rate: 43.18%
Episode 700000/800000, Win Rate: 42.60%
Episode 750000/800000, Win Rate: 41.76%
Performance sur 50000 épisodes: 42.94% de victoires
Episode 0/800000, Win Rate: 42.22%
Episode 50000/800000, Win Rate: 42.20%
Episode 100000/800000, Win Rate: 41.58%
Episode 150000/800000, Win Rate: 41.94%
Episode 200000/800000, Win Rate: 42.50%
Episode 250000/800000, Win Rate: 42.28%
Episode 300000/800000, Win Rate: 43.12%
Episode 350000/800000, Win Rate: 43.10%


0.42868

In [5]:
#v3
#Dernière version Q-Learning :
    #- Gamma plus agressif pour des décisions plus directes
    #- Pondération des victoires/défaites à 3
    #- Réduction forte de epsilon après 600K épisodes

# Paramètres de l'agent
alpha_start = 0.1
gamma = 0.94  # Encourage des décisions plus directes
epsilon_start = 1.0
epsilon_min = 0.05
epsilon_decay = 0.00002
method = "q_learning"

# Création de l'environnement
env = gym.make("Blackjack-v1", natural=True, sab=True)

q_table = defaultdict(lambda: np.zeros(env.action_space.n))
visits = defaultdict(lambda: np.zeros(env.action_space.n))
epsilon = epsilon_start

def choose_action(state, episode):
#Politique epsilon-greedy avec stabilisation après 600K épisodes.

    if np.random.rand() < epsilon:
        return env.action_space.sample()

    return np.argmax(q_table[state])

def update_q_table(state, action, reward, next_state, next_action=None, done=False, episode=1):
    #Mise à jour de la Q-table avec un ajustement des récompenses et epsilon.

    # Warm Start conservé
    if state not in q_table:
        q_table[state] = np.array([0.5, 0.7])

    # Alpha diminue plus lentement
    alpha = max(0.02, alpha_start / (1 + 0.000005 * episode))

    # Réduction forte de epsilon après 600K épisodes
    global epsilon
    if episode > 600000:
        epsilon = max(epsilon_min, 0.01)

    # Augmenter l’impact des victoires et défaites
    if reward > 0:
        reward = 3  # Triple la valeur des victoires
    elif reward < 0:
        reward = -3  # Pénalise plus les défaites

    # Mise à jour Q-Learning
    best_next_action = np.argmax(q_table[next_state])
    target = reward + gamma * q_table[next_state][best_next_action] * (not done)
    q_table[state][action] += alpha * (target - q_table[state][action])

    visits[state][action] += 1

def train(num_episodes=900000):
    #Entraîne l'agent avec des ajustements agressifs après 600K épisodes.

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = choose_action(state, episode)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            update_q_table(state, action, reward, next_state, done=done, episode=episode)
            state = next_state

        # Suivi des performances tous les 50 000 épisodes
        if episode % 50000 == 0:
            win_rate = evaluate(5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}")

def evaluate(num_episodes=50000, verbose=True):
    #Évalue l'agent après l'entraînement.

    wins, losses, draws = 0, 0, 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = np.argmax(q_table[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state

        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1

    win_rate = wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

# Entraînement final avec toutes les améliorations
train(num_episodes=900000)
evaluate()

Episode 0/900000, Win Rate: 38.42%
Episode 50000/900000, Win Rate: 43.10%
Episode 100000/900000, Win Rate: 44.22%
Episode 150000/900000, Win Rate: 43.66%
Episode 200000/900000, Win Rate: 43.10%
Episode 250000/900000, Win Rate: 42.68%
Episode 300000/900000, Win Rate: 42.64%
Episode 350000/900000, Win Rate: 44.00%
Episode 400000/900000, Win Rate: 43.36%
Episode 450000/900000, Win Rate: 42.80%
Episode 500000/900000, Win Rate: 42.78%
Episode 550000/900000, Win Rate: 43.62%
Episode 600000/900000, Win Rate: 43.50%
Episode 650000/900000, Win Rate: 42.34%
Episode 700000/900000, Win Rate: 41.14%
Episode 750000/900000, Win Rate: 42.68%
Episode 800000/900000, Win Rate: 42.36%
Episode 850000/900000, Win Rate: 42.34%
Performance sur 50000 épisodes: 43.11% de victoires


0.43106