<a href="https://colab.research.google.com/github/Qm1ne/GameDesignBalance/blob/main/%20Q-Learningmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (

In [5]:

pip install gymnasium




In [None]:
import random
import numpy as np
import pandas as pd
import json
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import torch


def create_deck():
    suits = ['♠', '♥', '♦', '♣']
    ranks = list(range(1, 8)) + ['J', 'Q', 'K']
    return [f"{r}{s}" for s in suits for r in ranks]


def card_value(card):
    r = card[:-1]
    if r == 'J': return 8
    if r == 'Q': return 9
    if r == 'K': return 10
    return int(r)


def card_to_id(card):
    """Convertit une carte en ID numérique pour l'IA"""
    suits = ['♠', '♥', '♦', '♣']
    ranks = list(range(1, 8)) + ['J', 'Q', 'K']

    suit = card[-1]
    rank = card[:-1]

    try:
        suit_id = suits.index(suit)
        if rank.isdigit():
            rank_id = ranks.index(int(rank))
        else:
            rank_id = ranks.index(rank)

        return suit_id * len(ranks) + rank_id
    except (ValueError, IndexError):
        return 0


def id_to_card(card_id):
    """Convertit un ID numérique en carte"""
    suits = ['♠', '♥', '♦', '♣']
    ranks = list(range(1, 8)) + ['J', 'Q', 'K']

    if card_id >= len(suits) * len(ranks) or card_id < 0:
        return "1♠"

    suit_id = card_id // len(ranks)
    rank_id = card_id % len(ranks)

    if suit_id >= len(suits) or rank_id >= len(ranks):
        return "1♠"

    return f"{ranks[rank_id]}{suits[suit_id]}"


class CardGameEnv(gym.Env):
    """Environnement Gymnasium simplifié pour Q-Learning"""

    def __init__(self):
        super(CardGameEnv, self).__init__()

        # Actions: jouer une carte (0-43) ou utiliser une capacité (44-47)
        self.action_space = spaces.Discrete(48)

        # Observation: état du jeu simplifié
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(100,), dtype=np.float32
        )

        self.deck = create_deck()
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        shuffled_deck = self.deck.copy()
        random.shuffle(shuffled_deck)

        self.table = shuffled_deck[:4]
        self.player_hand = shuffled_deck[4:7]
        self.opponent_hand = shuffled_deck[7:10]
        self.remaining_deck = shuffled_deck[10:]

        self.player_captured = []
        self.opponent_captured = []
        self.turn = 0
        self.game_over = False

        # Capacités - simplement disponibles ou non
        self.player_abilities_used = [False, False, False, False]

        return self._get_observation(), {}

    def _get_observation(self):
        """État du jeu pour l'agent Q-Learning"""
        obs = np.zeros(100, dtype=np.float32)

        # Main du joueur (44 positions)
        for card in self.player_hand:
            card_id = card_to_id(card)
            if 0 <= card_id < 44:
                obs[card_id] = 1.0

        # Table (44 positions)
        for card in self.table:
            card_id = card_to_id(card)
            if 0 <= card_id < 44:
                obs[44 + card_id] = 1.0

        # Informations de base
        obs[88] = len(self.player_hand) / 10.0
        obs[89] = len(self.table) / 10.0
        obs[90] = len(self.player_captured) / 20.0
        obs[91] = self.turn / 100.0

        # Capacités disponibles
        for i, used in enumerate(self.player_abilities_used):
            obs[92 + i] = 0.0 if used else 1.0

        return obs

    def step(self, action):
        if self.game_over:
            return self._get_observation(), 0, True, False, {}

        reward = 0

        # Actions de cartes (0-43)
        if action < 44:
            reward = self._play_card_action(action)
        # Actions de capacités (44-47)
        else:
            ability_id = action - 44
            reward = self._use_ability_action(ability_id)

        # Tour de l'adversaire
        if not self.game_over:
            self._opponent_turn()

        # Vérifier fin de partie
        if not self.player_hand and not self.opponent_hand and not self.remaining_deck:
            self.game_over = True
            winner = self._determine_winner()
            if winner == "win":
                reward += 10.0
            elif winner == "lose":
                reward -= 10.0
            # draw = 0 reward

        self._distribute_cards()
        self.turn += 1

        return self._get_observation(), reward, self.game_over, False, {}

    def _play_card_action(self, card_id):
        """Joue une carte - l'agent apprend quoi faire"""
        if card_id >= 44 or card_id < 0:
            return -1.0

        card = id_to_card(card_id)

        if card not in self.player_hand:
            return -1.0  # Action invalide

        cv = card_value(card)

        # Chercher une capture possible
        captured = False
        for table_card in self.table[:]:
            if card_value(table_card) == cv:
                self.player_hand.remove(card)
                self.table.remove(table_card)
                self.player_captured.extend([card, table_card])

                # Récompense pour capture
                reward = 2.0

                # Bonus si table vide
                if not self.table:
                    reward += 3.0

                captured = True
                break

        if not captured:
            # Placer sur la table
            self.player_hand.remove(card)
            self.table.append(card)
            reward = -0.1

        return reward

    def _use_ability_action(self, ability_id):
        """Utilise une capacité - l'agent apprend quand c'est utile"""
        if ability_id >= 4 or self.player_abilities_used[ability_id]:
            return -1.0  # Capacité non disponible

        # Marquer comme utilisée
        self.player_abilities_used[ability_id] = True

        # L'agent doit apprendre quand utiliser chaque capacité
        # Pas de logique prédéfinie - juste des récompenses de base

        if ability_id == 0:  # swap_opponent
            if self.opponent_hand and self.player_hand:
                # Échange aléatoire - l'agent apprend si c'est bon
                opp_card = random.choice(self.opponent_hand)
                my_card = random.choice(self.player_hand)

                self.opponent_hand.remove(opp_card)
                self.player_hand.remove(my_card)
                self.opponent_hand.append(my_card)
                self.player_hand.append(opp_card)

                return 1.0  # Récompense neutre
            return -0.5

        elif ability_id == 1:  # swap_table
            if self.table and self.player_hand:
                table_card = random.choice(self.table)
                my_card = random.choice(self.player_hand)

                self.table.remove(table_card)
                self.player_hand.remove(my_card)
                self.table.append(my_card)
                self.player_hand.append(table_card)

                return 1.0
            return -0.5

        elif ability_id == 2:  # reveal
            # Pas d'effet mécanique - juste une récompense d'information
            return 0.5

        elif ability_id == 3:  # bonus7
            # Bonus basé sur les 7 dans la main
            sevens = sum(1 for card in self.player_hand if card_value(card) == 7)
            return sevens * 0.5

        return 0

    def _opponent_turn(self):
        """Adversaire avec stratégie basique"""
        if not self.opponent_hand:
            return

        # Stratégie simple: capturer si possible, sinon jouer première carte
        played = False
        for card in self.opponent_hand[:]:
            cv = card_value(card)
            for table_card in self.table[:]:
                if card_value(table_card) == cv:
                    self.opponent_hand.remove(card)
                    self.table.remove(table_card)
                    self.opponent_captured.extend([card, table_card])
                    played = True
                    break
            if played:
                break

        if not played and self.opponent_hand:
            card = self.opponent_hand.pop(0)
            self.table.append(card)

    def _distribute_cards(self):
        """Distribue nouvelles cartes"""
        while len(self.player_hand) < 3 and self.remaining_deck:
            self.player_hand.append(self.remaining_deck.pop(0))

        while len(self.opponent_hand) < 3 and self.remaining_deck:
            self.opponent_hand.append(self.remaining_deck.pop(0))

    def _determine_winner(self):
        """Détermine le gagnant - retourne string simple"""
        player_points = 0
        opponent_points = 0

        # Plus de cartes capturées
        if len(self.player_captured) > len(self.opponent_captured):
            player_points += 1
        elif len(self.opponent_captured) > len(self.player_captured):
            opponent_points += 1

        # Plus de 7
        player_sevens = sum(1 for card in self.player_captured if card_value(card) == 7)
        opponent_sevens = sum(1 for card in self.opponent_captured if card_value(card) == 7)

        if player_sevens > opponent_sevens:
            player_points += 1
        elif opponent_sevens > player_sevens:
            opponent_points += 1

        if player_points > opponent_points:
            return "win"
        elif player_points < opponent_points:
            return "lose"
        else:
            return "draw"


class QLearningPlayer:
    """Joueur utilisant uniquement Q-Learning (DQN)"""

    def __init__(self):
        self.model = None
        self.env = None

    def train(self, total_timesteps=500):
        """Entraîne le modèle Q-Learning"""
        print(f"🧠 Entraînement Q-Learning...")

        self.env = CardGameEnv()

        # Configuration DQN pour Q-Learning
        self.model = DQN(
            'MlpPolicy',
            self.env,
            verbose=1,
            learning_rate=1,
            buffer_size=100,
            learning_starts=10,
            batch_size=32,
            tau=1.0,
            gamma=0.99,
            exploration_fraction=0.3,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.1
        )

        self.model.learn(total_timesteps=total_timesteps)
        print(f"✅ Entraînement Q-Learning terminé!")

    def evaluate(self, n_games=50):
        """Évalue les performances"""
        if not self.model:
            print("❌ Modèle non entraîné!")
            return

        print(f"📊 Évaluation sur {n_games} parties...")

        mean_reward, std_reward = evaluate_policy(
            self.model, self.env, n_eval_episodes=n_games
        )

        print(f"Récompense moyenne: {mean_reward:.2f} ± {std_reward:.2f}")
        return mean_reward, std_reward

    def play_game(self):
        """Joue une partie et retourne le résultat"""
        if not self.model:
            return "Modèle non entraîné"

        obs, _ = self.env.reset()
        done = False

        while not done:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, reward, done, _, _ = self.env.step(action)

        return self.env._determine_winner()

    def save_model(self, filename):
        """Sauvegarde le modèle"""
        if self.model:
            self.model.save(filename)
            print(f"💾 Modèle Q-Learning sauvegardé: {filename}")

    def load_model(self, filename):
        """Charge un modèle"""
        self.model = DQN.load(filename)
        print(f"📂 Modèle Q-Learning chargé: {filename}")


def test_qlearning_performance(n_tests=100):
    """Test les performances du Q-Learning"""
    print(f"🎯 Test de performance Q-Learning sur {n_tests} parties")

    player = QLearningPlayer()
    player.train(total_timesteps=200)

    results = {"win": 0, "lose": 0, "draw": 0}

    for i in range(n_tests):
        result = player.play_game()
        results[result] += 1

        if (i + 1) % 20 == 0:
            print(f"Partie {i+1}/{n_tests}: Victoires={results['win']}, Défaites={results['lose']}, Nuls={results['draw']}")

    print("\n📊 RÉSULTATS FINAUX:")
    print(f"Victoires: {results['win']}/{n_tests} ({results['win']/n_tests*100:.1f}%)")
    print(f"Défaites: {results['lose']}/{n_tests} ({results['lose']/n_tests*100:.1f}%)")
    print(f"Nuls: {results['draw']}/{n_tests} ({results['draw']/n_tests*100:.1f}%)")

    return results


if __name__ == "__main__":
    print("🎮 Jeu de Cartes avec Q-Learning Pur")
    print("=" * 50)

    try:
        # Test rapide
        print("\n🧠 Entraînement Q-Learning...")
        player = QLearningPlayer()
        player.train(total_timesteps=100)
        player.evaluate(n_games=10)
        player.save_model("qlearning_model")

        # Test de performance
        print("\n🎯 Test de performance...")
        performance = test_qlearning_performance(50)

        print("\n✅ Test Q-Learning réussi!")

    except Exception as e:
        print(f"❌ Erreur: {e}")
        print("Vérifiez: pip install gymnasium stable-baselines3 torch")

🎮 Jeu de Cartes avec Q-Learning Pur

🧠 Entraînement Q-Learning...
🧠 Entraînement Q-Learning...
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
✅ Entraînement Q-Learning terminé!
📊 Évaluation sur 10 parties...


In [None]:
import random
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy


def create_deck():
    suits = ['♠', '♥', '♦', '♣']
    ranks = list(range(1, 8)) + ['J', 'Q', 'K']
    return [f"{r}{s}" for s in suits for r in ranks]


def card_value(card):
    r = card[:-1]
    return {'J': 8, 'Q': 9, 'K': 10}.get(r, int(r))


def card_to_id(card):
    suits = ['♠', '♥', '♦', '♣']
    ranks = list(range(1, 8)) + ['J', 'Q', 'K']
    try:
        suit_id = suits.index(card[-1])
        rank = card[:-1]
        rank_id = ranks.index(int(rank)) if rank.isdigit() else ranks.index(rank)
        return suit_id * len(ranks) + rank_id
    except:
        return 0


def id_to_card(card_id):
    suits = ['♠', '♥', '♦', '♣']
    ranks = list(range(1, 8)) + ['J', 'Q', 'K']
    if not (0 <= card_id < len(suits) * len(ranks)):
        return "1♠"
    suit_id, rank_id = divmod(card_id, len(ranks))
    return f"{ranks[rank_id]}{suits[suit_id]}"


class CardGameEnv(gym.Env):
    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(48)
        self.observation_space = spaces.Box(0, 1, shape=(100,), dtype=np.float32)
        self.deck = create_deck()
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        random.shuffle(self.deck)
        self.table = self.deck[:4]
        self.player_hand = self.deck[4:7]
        self.opponent_hand = self.deck[7:10]
        self.remaining_deck = self.deck[10:]
        self.player_captured = []
        self.opponent_captured = []
        self.turn = 0
        self.game_over = False
        self.player_abilities_used = [False]*4
        return self._get_observation(), {}

    def _get_observation(self):
        obs = np.zeros(100, dtype=np.float32)
        for c in self.player_hand:
            cid = card_to_id(c)
            if 0 <= cid < 44: obs[cid] = 1.0
        for c in self.table:
            cid = card_to_id(c)
            if 0 <= cid < 44: obs[44 + cid] = 1.0
        obs[88] = len(self.player_hand)/10
        obs[89] = len(self.table)/10
        obs[90] = len(self.player_captured)/20
        obs[91] = self.turn/100
        for i, used in enumerate(self.player_abilities_used):
            obs[92+i] = 0.0 if used else 1.0
        return obs

    def step(self, action):
        if self.game_over:
            return self._get_observation(), 0, True, False, {}
        reward = 0
        if action < 44:
            reward = self._play_card_action(action)
        else:
            ability = action - 44
            reward = self._use_ability_action(ability)
        if not self.game_over:
            self._opponent_turn()
        if not self.player_hand and not self.opponent_hand and not self.remaining_deck:
            self.game_over = True
            winner = self._determine_winner()
            reward += 10 if winner=="win" else -10 if winner=="lose" else 0
        self._distribute_cards()
        self.turn += 1
        return self._get_observation(), reward, self.game_over, False, {}

    def _play_card_action(self, card_id):
        if card_id not in range(44): return -1
        card = id_to_card(card_id)
        if card not in self.player_hand: return -1
        cv = card_value(card)
        for tc in self.table[:]:
            if card_value(tc) == cv:
                self.player_hand.remove(card)
                self.table.remove(tc)
                self.player_captured += [card, tc]
                reward = 2 + (3 if not self.table else 0)
                return reward
        self.player_hand.remove(card)
        self.table.append(card)
        return -0.1

    def _use_ability_action(self, ability_id):
        if ability_id >= 4 or self.player_abilities_used[ability_id]:
            return -1
        self.player_abilities_used[ability_id] = True
        if ability_id == 0 and self.opponent_hand and self.player_hand:
            opp_card = random.choice(self.opponent_hand)
            my_card = random.choice(self.player_hand)
            self.opponent_hand.remove(opp_card)
            self.player_hand.remove(my_card)
            self.opponent_hand.append(my_card)
            self.player_hand.append(opp_card)
            return 1
        elif ability_id == 1 and self.table and self.player_hand:
            table_card = random.choice(self.table)
            my_card = random.choice(self.player_hand)
            self.table.remove(table_card)
            self.player_hand.remove(my_card)
            self.table.append(my_card)
            self.player_hand.append(table_card)
            return 1
        elif ability_id == 2:
            return 0.5
        elif ability_id == 3:
            return 0.5 * sum(card_value(c) == 7 for c in self.player_hand)
        return 0

    def _opponent_turn(self):
        if not self.opponent_hand:
            return
        for card in self.opponent_hand[:]:
            cv = card_value(card)
            for tc in self.table[:]:
                if card_value(tc) == cv:
                    self.opponent_hand.remove(card)
                    self.table.remove(tc)
                    self.opponent_captured += [card, tc]
                    return
        if self.opponent_hand:
            self.table.append(self.opponent_hand.pop(0))

    def _distribute_cards(self):
        while len(self.player_hand) < 3 and self.remaining_deck:
            self.player_hand.append(self.remaining_deck.pop(0))
        while len(self.opponent_hand) < 3 and self.remaining_deck:
            self.opponent_hand.append(self.remaining_deck.pop(0))

    def _determine_winner(self):
        p_points = (len(self.player_captured) > len(self.opponent_captured)) + (sum(card_value(c)==7 for c in self.player_captured) > sum(card_value(c)==7 for c in self.opponent_captured))
        o_points = (len(self.opponent_captured) > len(self.player_captured)) + (sum(card_value(c)==7 for c in self.opponent_captured) > sum(card_value(c)==7 for c in self.player_captured))
        if p_points > o_points: return "win"
        elif o_points > p_points: return "lose"
        return "draw"


class QLearningPlayer:
    def __init__(self):
        self.model = None
        self.env = None

    def train(self, total_timesteps=500):
        self.env = CardGameEnv()
        self.model = DQN('MlpPolicy', self.env, verbose=0, learning_rate=5e-4, buffer_size=100, learning_starts=10, batch_size=32, gamma=0.99, exploration_fraction=0.3, exploration_initial_eps=1.0, exploration_final_eps=0.1)
        self.model.learn(total_timesteps=total_timesteps)

    def evaluate(self, n_games=50):
        if not self.model: return
        return evaluate_policy(self.model, self.env, n_eval_episodes=n_games)

    def play_game(self):
        if not self.model: return "Model not trained"
        obs, _ = self.env.reset()
        done = False
        while not done:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, reward, done, _, _ = self.env.step(action)
        return self.env._determine_winner()

    def save_model(self, filename):
        if self.model: self.model.save(filename)

    def load_model(self, filename):
        self.model = DQN.load(filename)


if __name__ == "__main__":
    player = QLearningPlayer()
    print("Training...")
    player.train(total_timesteps=100)
    mean_reward, std_reward = player.evaluate(n_games=10)
    print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")
