In [4]:
import numpy as np
import gymnasium as gym
import random
from collections import defaultdict

In [1]:
#Agent Q-Learning optimisé pour un bon équilibre exploration/exploitation.
        #- Décroissance logarithmique de epsilon
        #- Décroissance exponentielle de alpha
        #- Initialisation biaisée pour accélérer l'apprentissage

# Paramètres globaux
gamma= 0.97
alpha_start= 0.1
epsilon_start= 1.0
epsilon_min= 0.05
epsilon_decay= 0.00005
q_table= defaultdict(lambda: np.zeros(env.action_space.n))
visits= defaultdict(lambda: np.zeros(env.action_space.n))

def choose_action(state, episode):
    # Politique epsilon-greedy avec une décroissance logarithmique de epsilon
    if np.random.rand()< epsilon_start / (1 + np.log(1 + episode)):
        return env.action_space.sample()
    return np.argmax(q_table[state])

def update_q_table(state, action, reward, next_state, done, episode):
    # Initialisation biaisée pour un apprentissage plus rapide
    if state not in q_table:
        q_table[state]= np.array([0.1, 0.2])

    # Alpha décroissant exponentiellement
    alpha= max(0.02, alpha_start * np.exp(-0.00005 * episode))

    # Mise à jour Q-Learning
    best_next_action= np.argmax(q_table[next_state])
    target= reward + gamma* q_table[next_state][best_next_action] * (not done)
    q_table[state][action] += alpha* (target- q_table[state][action])

    visits[state][action] +=1

def train(num_episodes=300000):
    for episode in range(num_episodes):
        state, _= env.reset()
        done= False

        while not done:
            action= choose_action(state, episode)
            next_state, reward, terminated, truncated, _= env.step(action)
            done= terminated or truncated
            update_q_table(state, action, reward, next_state, done, episode)
            state= next_state

        if episode % 50000 == 0:
            win_rate= evaluate(5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}, Epsilon: {epsilon_start / (1 + np.log(1 + episode)):.4f}")

def evaluate(num_episodes=5000, verbose=True):
    wins, losses, draws= 0, 0, 0

    for _ in range(num_episodes):
        state, _= env.reset()
        done= False

        while not done:
            action= np.argmax(q_table[state])
            next_state, reward, terminated, truncated, _= env.step(action)
            done= terminated or truncated
            state= next_state

        if reward > 0:
            wins+= 1
        elif reward < 0:
            losses+= 1
        else:
            draws+= 1

    win_rate= wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

env= gym.make("Blackjack-v1", natural=True, sab=True)

# Entraînement avec un équilibre stable
train(num_episodes=300000)
evaluate()

Episode 0/300000, Win Rate: 37.78%, Epsilon: 1.0000
Episode 50000/300000, Win Rate: 43.62%, Epsilon: 0.0846
Episode 100000/300000, Win Rate: 43.20%, Epsilon: 0.0799
Episode 150000/300000, Win Rate: 42.42%, Epsilon: 0.0774
Episode 200000/300000, Win Rate: 42.68%, Epsilon: 0.0757
Episode 250000/300000, Win Rate: 43.60%, Epsilon: 0.0745
Performance sur 5000 épisodes: 42.30% de victoires


0.423

In [2]:
#v2
#Agent Q-Learning optimisé pour un bon équilibre exploration/exploitation.
        #- Décroissance progressive de epsilon avec exploration forcée
        #- Décroissance lente de alpha pour stabiliser l'apprentissage
        #- Ajustement de gamma pour favoriser des décisions plus pragmatiques
       # - Warm Start amélioré pour accélérer l'apprentissage des stratégies sûres

#Paramètres globaux
gamma= 0.95
alpha_start= 0.1
epsilon_start= 1.0
epsilon_min= 0.05
epsilon_decay= 0.00002
q_table= defaultdict(lambda: np.zeros(env.action_space.n))
visits= defaultdict(lambda: np.zeros(env.action_space.n))

def choose_action(state, episode):
    # Phase d'exploration forcée pour les 50 000 premiers épisodes
    if episode<50000:
        epsilon= 1.0
    else:
        epsilon= max(epsilon_min, epsilon_start / (1+ np.log(1+episode - 50000)))

    if np.random.rand()< epsilon:
        return env.action_space.sample()
    return np.argmax(q_table[state])

def update_q_table(state, action, reward, next_state, done, episode):
    # Initialisation biaisée améliorée pour un apprentissage plus rapide
    if state not in q_table:
        q_table[state]= np.array([0.3, 0.4])

    # Alpha décroissant très lentement pour assurer un apprentissage sur le long terme
    alpha= max(0.02, alpha_start / (1+ 0.00002*episode))

    best_next_action= np.argmax(q_table[next_state])
    target= reward + gamma* q_table[next_state][best_next_action] * (not done)
    q_table[state][action] += alpha* (target- q_table[state][action])

    visits[state][action] +=1

def train(num_episodes=500000):
    for episode in range(num_episodes):
        state, _= env.reset()
        done= False

        while not done:
            action= choose_action(state, episode)
            next_state, reward, terminated, truncated, _= env.step(action)
            done= terminated or truncated
            update_q_table(state, action, reward, next_state, done, episode)
            state= next_state

        if episode % 50000 == 0:
            win_rate= evaluate(5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}")

def evaluate(num_episodes=5000, verbose=True):
    wins, losses, draws= 0, 0, 0

    for _ in range(num_episodes):
        state, _= env.reset()
        done= False

        while not done:
            action= np.argmax(q_table[state])
            next_state, reward, terminated, truncated, _= env.step(action)
            done= terminated or truncated
            state= next_state

        if reward > 0:
            wins+= 1
        elif reward < 0:
            losses+= 1
        else:
            draws+= 1

    win_rate= wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

env= gym.make("Blackjack-v1", natural=True, sab=True)

# Entraînement avec un équilibre stable
train(num_episodes=300000)
evaluate()

Episode 0/300000, Win Rate: 38.26%
Episode 50000/300000, Win Rate: 42.14%
Episode 100000/300000, Win Rate: 42.12%
Episode 150000/300000, Win Rate: 42.48%
Episode 200000/300000, Win Rate: 41.76%
Episode 250000/300000, Win Rate: 43.98%
Performance sur 5000 épisodes: 43.62% de victoires


0.4362

In [3]:
#v3
#Agent Q-Learning ultra-optimisé pour maximiser les victoires.
        #- Facteur UCB pour favoriser l'exploration des actions sous-évaluées
        #- Réduction plus lente de alpha pour améliorer l'apprentissage long terme
        #- Ajustement gamma pour décisions plus offensives
        #- Warm Start amélioré

env = gym.make("Blackjack-v1", natural=True, sab=True)

#Paramètres d'apprentissage
alpha_start= 0.1
gamma = 0.92
epsilon_start= 1.0
epsilon_min= 0.05
epsilon_decay= 0.00002

#Q-Table et visites
q_table = defaultdict(lambda: np.zeros(env.action_space.n))
visits = defaultdict(lambda: np.zeros(env.action_space.n))

def choose_action(state, episode):
    #Politique UCB + epsilon-greedy pour meilleure exploration
    if np.random.rand()< epsilon_start:
        return env.action_space.sample()  #Exploration

    #Facteur UCB pour favoriser les actions sous-explorées
    total_visits = np.sum(visits[state]) + 1e-6
    ucb_values = q_table[state] + np.sqrt(2 * np.log(total_visits) / (visits[state] + 1e-6))
    return np.argmax(ucb_values)  #Exploitation améliorée

def update_q_table(state, action, reward, next_state, done, episode):
    #Mise à jour de la Q-table avec réduction optimisée de alpha et epsilon
    #Warm Start ajusté
    if state not in q_table:
        q_table[state] = np.array([0.5, 0.7])  #Influence plus les premières décisions

    alpha = max(0.02, alpha_start / (1 + 0.00001 * episode))

    if episode < 50000:
        epsilon = 1.0
    else:
        epsilon = max(epsilon_min, epsilon_start / (1 + np.log(1 + episode - 50000)))

    best_next_action = np.argmax(q_table[next_state])
    target = reward + gamma * q_table[next_state][best_next_action] * (not done)
    q_table[state][action] += alpha * (target - q_table[state][action])
    visits[state][action] += 1

def train(num_episodes=600000):
    #Entraîne l'agent Q-Learning avec une stabilité renforcée
    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = choose_action(state, episode)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            update_q_table(state, action, reward, next_state, done, episode)
            state = next_state

        #Performances tous les 50 000 épisodes
        if episode % 50000 == 0:
            win_rate = evaluate(5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}")

def evaluate(num_episodes=5000, verbose=True):
    #Évalue l'agent après l'entraînement
    wins, losses, draws = 0, 0, 0

    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = np.argmax(q_table[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state

        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1

    win_rate = wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

#Entraînement final
train(num_episodes=600000)
evaluate()

Episode 0/600000, Win Rate: 38.28%
Episode 50000/600000, Win Rate: 42.70%
Episode 100000/600000, Win Rate: 43.94%
Episode 150000/600000, Win Rate: 42.88%
Episode 200000/600000, Win Rate: 43.52%
Episode 250000/600000, Win Rate: 42.56%
Episode 300000/600000, Win Rate: 44.68%
Episode 350000/600000, Win Rate: 43.48%
Episode 400000/600000, Win Rate: 44.22%
Episode 450000/600000, Win Rate: 43.86%
Episode 500000/600000, Win Rate: 41.68%
Episode 550000/600000, Win Rate: 43.28%
Performance sur 5000 épisodes: 42.16% de victoires


0.4216

In [5]:
#v4

#Agent Q-Learning amélioré pour dépasser 45% :
#- Phase de réexploration après 300 K épisodes
#- Décroissance plus lente de alpha pour un apprentissage long terme
#- Exploration légère en fin de partie pour ajustement

# Définition de l'agent
env = gym.make("Blackjack-v1", natural=True, sab=True)

alpha = 0.1
gamma = 0.92
epsilon_start = 1.0
epsilon_min = 0.05
epsilon_decay = 0.00002
method = "q_learning"

agent = {}
agent['env'] = env
agent['alpha_start'] = alpha
agent['gamma'] = gamma
agent['epsilon'] = epsilon_start
agent['epsilon_start'] = epsilon_start
agent['epsilon_min'] = epsilon_min
agent['epsilon_decay'] = epsilon_decay
agent['method'] = method
agent['q_table'] = defaultdict(lambda: np.zeros(env.action_space.n))
agent['visits'] = defaultdict(lambda: np.zeros(env.action_space.n))

def choose_action(agent, state, episode):
    if np.random.rand() < agent['epsilon']:
        return agent['env'].action_space.sample()

    if episode > 500000 and np.random.rand() < 0.02:  # Perturbation contrôlée
        return agent['env'].action_space.sample()

    return np.argmax(agent['q_table'][state])

def update_q_table(agent, state, action, reward, next_state, next_action=None, done=False, episode=1):
    if state not in agent['q_table']:
        agent['q_table'][state] = np.array([0.5, 0.7])

    alpha = max(0.02, agent['alpha_start'] / (1 + 0.000005 * episode))

    if 300000 <= episode < 450000:
        agent['epsilon'] = max(agent['epsilon_min'], 0.15)
    else:
        agent['epsilon'] = max(agent['epsilon_min'], agent['epsilon_start'] / (1 + np.log(1 + episode)))

    best_next_action = np.argmax(agent['q_table'][next_state])
    target = reward + agent['gamma'] * agent['q_table'][next_state][best_next_action] * (not done)
    agent['q_table'][state][action] += alpha * (target - agent['q_table'][state][action])

    agent['visits'][state][action] += 1

def train(agent, num_episodes=700000):
    for episode in range(num_episodes):
        state, _ = agent['env'].reset()
        done = False
        while not done:
            action = choose_action(agent, state, episode)
            next_state, reward, terminated, truncated, _ = agent['env'].step(action)
            done = terminated or truncated
            update_q_table(agent, state, action, reward, next_state, done=done, episode=episode)
            state = next_state
        if episode % 50000 == 0:
            win_rate = evaluate(agent, 5000, verbose=False)
            print(f"Episode {episode}/{num_episodes}, Win Rate: {win_rate:.2%}")

def evaluate(agent, num_episodes=5000, verbose=True):
    wins, losses, draws = 0, 0, 0
    for _ in range(num_episodes):
        state, _ = agent['env'].reset()
        done = False
        while not done:
            action = np.argmax(agent['q_table'][state])
            next_state, reward, terminated, truncated, _ = agent['env'].step(action)
            done = terminated or truncated
            state = next_state
        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1
    win_rate = wins / num_episodes
    if verbose:
        print(f"Performance sur {num_episodes} épisodes: {win_rate:.2%} de victoires")
    return win_rate

#Entraînement final
train(agent, num_episodes=700000)
evaluate(agent)


Episode 0/700000, Win Rate: 38.06%
Episode 50000/700000, Win Rate: 42.26%
Episode 100000/700000, Win Rate: 41.84%
Episode 150000/700000, Win Rate: 43.34%
Episode 200000/700000, Win Rate: 41.60%
Episode 250000/700000, Win Rate: 42.30%
Episode 300000/700000, Win Rate: 42.24%
Episode 350000/700000, Win Rate: 42.76%
Episode 400000/700000, Win Rate: 42.28%
Episode 450000/700000, Win Rate: 42.72%
Episode 500000/700000, Win Rate: 41.78%
Episode 550000/700000, Win Rate: 43.04%
Episode 600000/700000, Win Rate: 42.78%
Episode 650000/700000, Win Rate: 44.02%
Performance sur 5000 épisodes: 42.54% de victoires


0.4254