<a href="https://colab.research.google.com/github/Saksham17P/C_dir/blob/main/PixelmonFullnFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
import json
from gym import Env
from gym.spaces import Discrete, Box
import copy

# ===============================
# Environment with Pokémon Switching
# ===============================
class EnhancedPokemonBattleEnv(Env):
    def __init__(self, pokemon_data_path="/content/pokemon_trial.json", opponent_q_table=None):
        super(EnhancedPokemonBattleEnv, self).__init__()

        # Load Pokémon data (each with type and moves)
        self.pokemon_data = self.load_pokemon_data(pokemon_data_path)

        # Type effectiveness chart (customize as needed)
        self.type_chart = {
            "Fire": {"Grass": 2.0, "Water": 0.5, "Fire": 0.5},
            "Water": {"Fire": 2.0, "Grass": 0.5, "Water": 0.5},
            "Grass": {"Water": 2.0, "Fire": 0.5, "Grass": 0.5},
            "Electric": {"Water": 2.0, "Grass": 0.5, "Electric": 0.5},
            "Normal": {},
            "Dark": {"Psychic": 2.0, "Fighting": 0.5},
            "Dragon": {"Dragon": 2.0}
            # Add other types as needed.
        }

        # Action space: 0-3 use a move; 4-7 switch to bench slot (0-indexed)
        self.action_space = Discrete(8)
        # Observation: [agent_active_hp, opponent_active_hp, last_agent_damage, last_opponent_damage, agent_remaining]
        self.observation_space = Box(
            low=np.array([0, 0, 0, 0, 1]),
            high=np.array([500, 500, 100, 100, 4]),
            dtype=np.float32
        )

        self.max_steps = 300
        # For self-play: store the opponent's Q-table (if provided)
        self.opponent_q_table = opponent_q_table
        self.reset()

    def load_pokemon_data(self, path):
        with open(path, 'r') as f:
            return json.load(f)

    def get_type_multiplier(self, move_type, target_type):
        return self.type_chart.get(move_type, {}).get(target_type, 1.0)

    def select_random_moves(self, pokemon_name):
        all_moves = list(self.pokemon_data[pokemon_name]["moves"].values())
        if len(all_moves) <= 4:
            return all_moves
        return random.sample(all_moves, 4)

    def reset(self):
        self.steps = 0
        self.last_agent_damage = 0
        self.last_opponent_damage = 0
        self.last_move_type = None  # Track the last move type used by opponent

        # --- For the Agent ---
        self.agent_bench = []
        agent_keys = random.sample(list(self.pokemon_data.keys()), 4)
        for key in agent_keys:
            pkmn = {
                "name": key,
                "type": self.pokemon_data[key]["type"],
                "moves": self.select_random_moves(key),
                "hp": 500
            }
            self.agent_bench.append(pkmn)
        self.agent_active_idx = random.randint(0, 3)

        # --- For the Opponent ---
        self.opponent_bench = []
        opponent_keys = random.sample(list(self.pokemon_data.keys()), 4)
        for key in opponent_keys:
            pkmn = {
                "name": key,
                "type": self.pokemon_data[key]["type"],
                "moves": self.select_random_moves(key),
                "hp": 500
            }
            self.opponent_bench.append(pkmn)
        self.opponent_active_idx = random.randint(0, 3)

        return self._get_state()

    def _get_state(self):
        agent_remaining = sum(1 for p in self.agent_bench if p["hp"] > 0)
        return np.array([
            self.agent_bench[self.agent_active_idx]["hp"],
            self.opponent_bench[self.opponent_active_idx]["hp"],
            self.last_agent_damage,
            self.last_opponent_damage,
            agent_remaining
        ], dtype=np.float32)

    def _get_opponent_state(self):
        opponent_remaining = sum(1 for p in self.opponent_bench if p["hp"] > 0)
        return np.array([
            self.opponent_bench[self.opponent_active_idx]["hp"],
            self.agent_bench[self.agent_active_idx]["hp"],
            self.last_opponent_damage,
            self.last_agent_damage,
            opponent_remaining
        ], dtype=np.float32)

    def _get_opponent_action(self, state):
        if self.opponent_q_table is None:
            return self.action_space.sample()
        state_idx = discretize_state(state)
        return np.argmax(self.opponent_q_table[state_idx])

    def _find_best_type_matchup(self, move_type):
        best_resistance = 1.0
        best_idx = self.agent_active_idx
        for i, pokemon in enumerate(self.agent_bench):
            if pokemon["hp"] <= 0:
                continue
            resistance = self.get_type_multiplier(move_type, pokemon["type"])
            if resistance < best_resistance:
                best_resistance = resistance
                best_idx = i
        return best_idx

    def step(self, action):
        """
        Modified step method that accepts either a single integer (for single-agent training)
        or a tuple (agent_action, opponent_action) for adversarial testing.
        """
        # If action is a tuple, unpack it.
        if isinstance(action, tuple):
            agent_action, opponent_action = action
        else:
            agent_action = action
            opponent_action = None

        HIT_POINTS = 500.0
        self.steps += 1
        reward = 0
        self.last_agent_damage = 0
        self.last_opponent_damage = 0

        # Agent Turn: Check if agent's active Pokémon is fainted and force switch if needed.
        if self.agent_bench[self.agent_active_idx]["hp"] <= 0:
            available = [i for i, p in enumerate(self.agent_bench) if p["hp"] > 0]
            if available:
                self.agent_active_idx = random.choice(available)
            else:
                return self._get_state(), reward, True, {"info": "All agent Pokémon fainted"}

        # Optional: Use last opponent move type to decide switching
        if self.last_move_type and agent_action < 4:
            best_type_idx = self._find_best_type_matchup(self.last_move_type)
            if best_type_idx != self.agent_active_idx:
                agent_action = best_type_idx + 4
                reward += 0.1 / HIT_POINTS

        # Process Agent Action
        if agent_action >= 4:
            new_idx = agent_action - 4
            if self.agent_bench[new_idx]["hp"] <= 0:
                reward -= 0.5 / HIT_POINTS
            elif new_idx == self.agent_active_idx:
                reward -= 0.2 / HIT_POINTS
            else:
                self.agent_active_idx = new_idx
                reward -= 0.05 / HIT_POINTS
            agent_damage = 0
        else:
            active_agent = self.agent_bench[self.agent_active_idx]
            if active_agent["hp"] <= 0:
                agent_damage = 0
            else:
                move_idx = agent_action
                if move_idx >= len(active_agent["moves"]):
                    move_idx = 0
                move = active_agent["moves"][move_idx]
                base_damage = move["damage"]
                multiplier = self.get_type_multiplier(move["type"],
                                                      self.opponent_bench[self.opponent_active_idx]["type"])
                agent_damage = base_damage * multiplier
                self.opponent_bench[self.opponent_active_idx]["hp"] = max(
                    0, self.opponent_bench[self.opponent_active_idx]["hp"] - agent_damage
                )
        reward += agent_damage / HIT_POINTS
        self.last_agent_damage = agent_damage

        # Opponent Turn: Check if opponent's active Pokémon is fainted and force switch if needed.
        if self.opponent_bench[self.opponent_active_idx]["hp"] <= 0:
            available = [i for i, p in enumerate(self.opponent_bench) if p["hp"] > 0]
            if available:
                self.opponent_active_idx = random.choice(available)
            else:
                return self._get_state(), reward + 1, True, {"info": "All opponent Pokémon fainted"}

        # Process Opponent Action
        if opponent_action is None:
            if self.opponent_q_table is not None:
                opponent_state = self._get_opponent_state()
                opponent_action = self._get_opponent_action(opponent_state)
            else:
                opponent_action = random.choice(range(self.action_space.n))
        if opponent_action >= 4:
            new_idx = opponent_action - 4
            available = [i for i, p in enumerate(self.opponent_bench) if p["hp"] > 0]
            if available:
                if new_idx in available and new_idx != self.opponent_active_idx:
                    self.opponent_active_idx = new_idx
                else:
                    self.opponent_active_idx = random.choice(available)
            opponent_damage = 0
            self.last_move_type = None
        else:
            opp_active = self.opponent_bench[self.opponent_active_idx]
            if opp_active["hp"] <= 0:
                opponent_damage = 0
                self.last_move_type = None
            else:
                move_idx = opponent_action
                if move_idx >= len(opp_active["moves"]):
                    move_idx = 0
                opp_move = opp_active["moves"][move_idx]
                opp_base_damage = opp_move["damage"]
                opp_multiplier = self.get_type_multiplier(opp_move["type"],
                                                          self.agent_bench[self.agent_active_idx]["type"])
                opponent_damage = opp_base_damage * opp_multiplier
                self.agent_bench[self.agent_active_idx]["hp"] = max(
                    0, self.agent_bench[self.agent_active_idx]["hp"] - opponent_damage
                )
                self.last_move_type = opp_move["type"]
        self.last_opponent_damage = opponent_damage

        if self.opponent_bench[self.opponent_active_idx]["hp"] == 0:
            reward += 1

        done = (all(p["hp"] <= 0 for p in self.agent_bench) or
                all(p["hp"] <= 0 for p in self.opponent_bench) or
                self.steps >= self.max_steps)

        return self._get_state(), reward, done, {}

    def render(self, mode='human'):
        print(f"Step: {self.steps}")
        print(f"Agent Active: {self.agent_bench[self.agent_active_idx]['name']} [{self.agent_bench[self.agent_active_idx]['hp']:.1f} HP]")
        print(f"Opponent Active: {self.opponent_bench[self.opponent_active_idx]['name']} [{self.opponent_bench[self.opponent_active_idx]['hp']:.1f} HP]")
        if self.last_move_type:
            print(f"Last opponent move type: {self.last_move_type}")
        print("-" * 40)

# ===============================
# State Discretization Function (for Q-Learning)
# ===============================
def discretize_state(state):
    """
    State vector:
      [agent_active_hp, opponent_active_hp, last_agent_damage, last_opponent_damage, agent_remaining]
    """
    hp_bins = np.array([0, 100, 200, 300, 400, 500])
    damage_bins = np.array([0, 20, 40, 60, 80, 100])

    hp1_idx = np.digitize(state[0], hp_bins) - 1
    hp1_idx = min(hp1_idx, len(hp_bins) - 3)

    hp2_idx = np.digitize(state[1], hp_bins) - 1
    hp2_idx = min(hp2_idx, len(hp_bins) - 3)

    dmg1_idx = np.digitize(state[2], damage_bins) - 1
    dmg1_idx = min(dmg1_idx, len(damage_bins) - 3)

    dmg2_idx = np.digitize(state[3], damage_bins) - 1
    dmg2_idx = min(dmg2_idx, len(damage_bins) - 3)

    remain_idx = int(state[4]) - 1
    remain_idx = min(max(remain_idx, 0), 3)

    return (hp1_idx, hp2_idx, dmg1_idx, dmg2_idx, remain_idx)

# ===============================
# Minimax Q-Learning Implementation
# ===============================
hp_bins_count = 4
damage_bins_count = 4
remain_bins_count = 4

num_actions = 8  # 4 moves + 4 switching actions
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.9997
alpha = 0.1
gamma = 0.95

q_table_agent = np.zeros((hp_bins_count, hp_bins_count, damage_bins_count, damage_bins_count, remain_bins_count, num_actions))
q_table_opponent = np.zeros((hp_bins_count, hp_bins_count, damage_bins_count, damage_bins_count, remain_bins_count, num_actions))

num_total_episodes = 30000
snapshot_interval = 5000
q_table_snapshots = []
snapshot_record = []

env = EnhancedPokemonBattleEnv("/content/pokemon_trial.json")
recent_rewards = []

def update_minimax_q(q_table, state_idx, action, reward, next_state_idx, alpha, gamma):
    old_value = q_table[state_idx][action]
    worst_case_value = np.min(q_table[next_state_idx])
    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * worst_case_value)
    q_table[state_idx][action] = new_value
    return q_table

for episode in range(num_total_episodes):
    if episode > 0 and episode % snapshot_interval == 0:
        snapshot = copy.deepcopy(q_table_agent)
        q_table_snapshots.append(snapshot)
        snapshot_record.append(episode)
        if len(q_table_snapshots) > 0:
            opponent_q_idx = random.randint(0, len(q_table_snapshots) - 1)
            env = EnhancedPokemonBattleEnv(
                "/content/pokemon_trial.json",
                opponent_q_table=q_table_snapshots[opponent_q_idx]
            )
            print(f"Self-play started: Agent playing against snapshot from episode {snapshot_record[opponent_q_idx]}")

    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        state_idx = discretize_state(state)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table_agent[state_idx])
        next_state, reward, done, _ = env.step(action)
        next_state_idx = discretize_state(next_state)
        q_table_agent = update_minimax_q(q_table_agent, state_idx, action, reward, next_state_idx, alpha, gamma)

        opp_state_idx = (state_idx[1], state_idx[0], state_idx[3], state_idx[2], remain_bins_count - 1 - state_idx[4])
        opp_next_state_idx = (next_state_idx[1], next_state_idx[0], next_state_idx[3], next_state_idx[2], remain_bins_count - 1 - next_state_idx[4])
        if env.last_opponent_damage > 0 or env.last_move_type is not None:
            if env.last_opponent_damage > 0:
                opp_action = 0
            else:
                opp_action = 4
            q_table_opponent = update_minimax_q(q_table_opponent, opp_state_idx, opp_action, -reward, opp_next_state_idx, alpha, gamma)

        state = next_state
        total_reward += reward

    recent_rewards.append(total_reward)
    if len(recent_rewards) > 100:
        recent_rewards.pop(0)
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    if (episode + 1) % 1000 == 0:
        avg_reward = np.mean(recent_rewards)
        print(f"Episode: {episode + 1}, Avg Reward: {avg_reward:.2f}, Epsilon: {epsilon:.3f}")

# ===============================
# Test the Agent Against Different Opponents
# ===============================
def test_against_opponent(agent_q_table, opponent_q_table=None, num_episodes=10):
    test_env = EnhancedPokemonBattleEnv("/content/pokemon_trial.json", opponent_q_table=opponent_q_table)
    wins = 0
    total_rewards = []
    for episode in range(num_episodes):
        state = test_env.reset()
        done = False
        total_reward = 0
        while not done:
            state_idx = discretize_state(state)
            action_values = q_table_agent[state_idx]
            action = np.argmax(action_values)
            state, reward, done, _ = test_env.step(action)
            total_reward += reward
        agent_has_living = any(p["hp"] > 0 for p in test_env.agent_bench)
        opp_all_fainted = all(p["hp"] <= 0 for p in test_env.opponent_bench)
        if agent_has_living and opp_all_fainted:
            wins += 1
        total_rewards.append(total_reward)
        print(f"Test Episode {episode + 1} finished with total reward: {total_reward:.2f}")
    return wins, total_rewards

print("\nTesting Minimax Q-Learning agent against random opponent:")
random_wins, random_rewards = test_against_opponent(q_table_agent, None)
print(f"Win rate against random opponent: {(random_wins/10)*100:.2f}%")
print(f"Average reward: {np.mean(random_rewards):.2f}")

for i, snapshot in enumerate(q_table_snapshots):
    print(f"\nTesting Minimax Q-Learning agent against snapshot {i+1} (from episode {snapshot_record[i]}):")
    snapshot_wins, snapshot_rewards = test_against_opponent(q_table_agent, snapshot)
    print(f"Win rate against snapshot {i+1}: {(snapshot_wins/10)*100:.2f}%")
    print(f"Average reward: {np.mean(snapshot_rewards):.2f}")

print("\nTesting Minimax Q-Learning agent against itself:")
self_wins, self_rewards = test_against_opponent(q_table_agent, q_table_agent)
print(f"Win rate against itself: {(self_wins/10)*100:.2f}%")
print(f"Average reward: {np.mean(self_rewards):.2f}")

# ===============================
# Define Max Damage Agent for Testing
# ===============================
class MaxDamageAgent:
    def __init__(self, env):
        self.env = env
        self.action_dim = env.action_space.n

    def select_action(self, state):
        # The max damage agent acts as the opponent.
        # It looks at its active Pokémon's moves and selects the move (0-3) that would cause the most damage
        # to the agent's active Pokémon.
        opp_active = self.env.opponent_bench[self.env.opponent_active_idx]
        agent_active = self.env.agent_bench[self.env.agent_active_idx]
        best_move = None
        best_damage = -float('inf')
        for i, move in enumerate(opp_active["moves"]):
            multiplier = self.env.get_type_multiplier(move["type"], agent_active["type"])
            damage = move["damage"] * multiplier
            if damage > best_damage:
                best_damage = damage
                best_move = i
        if best_move is None:
            best_move = random.randint(0, 3)
        return best_move, 0  # Dummy log probability

def test_against_max_damage_agent(agent_q_table, max_damage_agent, num_episodes=10):
    test_env = EnhancedPokemonBattleEnv("/content/pokemon_trial.json")
    wins = 0
    total_rewards = []
    for episode in range(num_episodes):
        state = test_env.reset()
        done = False
        total_reward = 0
        while not done:
            state_idx = discretize_state(state)
            agent_action = np.argmax(agent_q_table[state_idx])
            opp_action, _ = max_damage_agent.select_action(state)
            # Pass tuple of (agent_action, opponent_action)
            state, reward, done, _ = test_env.step((agent_action, opp_action))
            total_reward += reward
        agent_alive = any(p["hp"] > 0 for p in test_env.agent_bench)
        opp_all_fainted = all(p["hp"] <= 0 for p in test_env.opponent_bench)
        if agent_alive and opp_all_fainted:
            wins += 1
        total_rewards.append(total_reward)
        print(f"Max Damage Test Episode {episode + 1} finished with total reward: {total_reward:.2f}")
    return wins, total_rewards

max_damage_agent = MaxDamageAgent(env)
print("\nTesting Minimax Q-Learning agent against Max Damage Agent:")
max_wins, max_rewards = test_against_max_damage_agent(q_table_agent, max_damage_agent, num_episodes=10)
print(f"Win rate against max damage agent: {(max_wins/10)*100:.2f}%")
print(f"Average reward: {np.mean(max_rewards):.2f}")

# ===============================
# Analyze the learned strategy
# ===============================
def analyze_strategy(q_table):
    print("\nStrategy Analysis:")
    low_agent_hp_high_opp_hp = (0, 3, 0, 0, 3)
    action = np.argmax(q_table[low_agent_hp_high_opp_hp])
    print(f"When agent HP is low but opponent HP is high: {'Use move' if action < 4 else 'Switch Pokemon'} (Action {action})")
    both_low_hp = (0, 0, 0, 0, 3)
    action = np.argmax(q_table[both_low_hp])
    print(f"When both have low HP: {'Use move' if action < 4 else 'Switch Pokemon'} (Action {action})")
    type_advantage = (2, 2, 3, 0, 3)
    action = np.argmax(q_table[type_advantage])
    print(f"When agent has type advantage (high damage): {'Use move' if action < 4 else 'Switch Pokemon'} (Action {action})")

analyze_strategy(q_table_agent)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Episode: 1000, Avg Reward: 4.72, Epsilon: 0.741
Episode: 2000, Avg Reward: 5.23, Epsilon: 0.549
Episode: 3000, Avg Reward: 5.30, Epsilon: 0.407
Episode: 4000, Avg Reward: 5.36, Epsilon: 0.301
Episode: 5000, Avg Reward: 5.28, Epsilon: 0.223
Self-play started: Agent playing against snapshot from episode 5000
Episode: 6000, Avg Reward: 4.06, Epsilon: 0.165
Episode: 7000, Avg Reward: 4.18, Epsilon: 0.122
Episode: 8000, Avg Reward: 4.47, Epsilon: 0.091
Episode: 9000, Avg Reward: 4.56, Epsilon: 0.067
Episode: 10000, Avg Reward: 4.33, Epsilon: 0.050
Self-play started: Agent playing against snapshot from episode 10000
Episode: 11000, Avg Reward: 4.33, Epsilon: 0.050
Episode: 12000, Avg Reward: 4.54, Epsilon: 0.050
Episode: 13000, Avg Reward: 4.36, Epsilon: 0.050
Episode: 14000, Avg Reward: 4.44, Epsilon: 0.050
Episode: 15000, Avg Reward: 4.42, Epsilon: 0.050
Self-play started: Agent playing against snapshot from episode 5000
Episode: 16000, Avg Reward: 4.65, Epsilon: 0.050
Episode: 17000, Avg 

In [2]:
def test_against_max_damage_agent(agent_q_table, max_damage_agent, num_episodes=1000):
    test_env = EnhancedPokemonBattleEnv("/content/pokemon_trial.json")
    wins = 0
    total_rewards = []
    for episode in range(num_episodes):
        state = test_env.reset()
        done = False
        total_reward = 0
        while not done:
            state_idx = discretize_state(state)
            agent_action = np.argmax(agent_q_table[state_idx])
            opp_action, _ = max_damage_agent.select_action(state)
            state, reward, done, _ = test_env.step((agent_action, opp_action))
            total_reward += reward
        agent_alive = any(p["hp"] > 0 for p in test_env.agent_bench)
        opp_all_fainted = all(p["hp"] <= 0 for p in test_env.opponent_bench)
        if agent_alive and opp_all_fainted:
            wins += 1
        total_rewards.append(total_reward)
        print(f"Max Damage Test Episode {episode + 1} finished with total reward: {total_reward:.2f}")
    return wins, total_rewards

max_damage_agent = MaxDamageAgent(env)
print("\nTesting Minimax Q-Learning agent against Max Damage Agent:")
max_wins, max_rewards = test_against_max_damage_agent(q_table_agent, max_damage_agent, num_episodes=1000)
print(f"Win rate against max damage agent: {(max_wins/1000)*100:.2f}%")
print(f"Average reward: {np.mean(max_rewards):.2f}")


Testing Minimax Q-Learning agent against Max Damage Agent:
Max Damage Test Episode 1 finished with total reward: 3.41
Max Damage Test Episode 2 finished with total reward: 5.45
Max Damage Test Episode 3 finished with total reward: 3.04
Max Damage Test Episode 4 finished with total reward: 5.92
Max Damage Test Episode 5 finished with total reward: 5.28
Max Damage Test Episode 6 finished with total reward: 3.49
Max Damage Test Episode 7 finished with total reward: 3.40
Max Damage Test Episode 8 finished with total reward: 5.52
Max Damage Test Episode 9 finished with total reward: 5.52
Max Damage Test Episode 10 finished with total reward: 5.40
Max Damage Test Episode 11 finished with total reward: 5.30
Max Damage Test Episode 12 finished with total reward: 3.85
Max Damage Test Episode 13 finished with total reward: 5.54
Max Damage Test Episode 14 finished with total reward: 5.34
Max Damage Test Episode 15 finished with total reward: 5.81
Max Damage Test Episode 16 finished with total re