# In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import pygame
import matplotlib.pyplot as plt  # Import matplotlib for plotting

# Define your PongEnvironment class
class PongEnvironment:
    """Minimal single-paddle Pong simulation drawn with pygame.

    The paddle guards the right wall: ``step`` reflects the ball when it
    reaches that wall inside the paddle's vertical extent and ends the episode
    with reward -1 otherwise, or when the ball reaches the left wall.

    Constructing an instance opens a window via ``pygame.display.set_mode``.
    """

    # Actions are handed to ``move_paddle`` unchanged and added to the paddle
    # position.
    # NOTE(review): with [0, 1] the paddle can only hold still or increase its
    # position; nothing ever moves it back. Confirm whether an "up" action
    # was intended.
    ACTION_SPACE = [0, 1]  # Define the action space

    def __init__(self, width=400, height=300):
        """Set up geometry, initial ball/paddle state and the pygame window.

        Args:
            width: Playfield width in pixels.
            height: Playfield height in pixels.
        """
        self.width = width
        self.height = height
        self.ball_radius = 10
        self.paddle_width = 10
        self.paddle_height = 60
        self.paddle_offset = 20  # not referenced by any method of this class
        # ball_pos, ball_vel and paddle_pos are initialised by reset() so the
        # starting values live in exactly one place.
        self.reset()
        self.clock = pygame.time.Clock()
        self.screen = pygame.display.set_mode((self.width, self.height))
        pygame.display.set_caption("Pong")

    def generate_expert_transitions(self, num_transitions=100):
        """Collect transitions produced by uniformly random paddle moves.

        Only the paddle is moved (``move_paddle``); the ball is not advanced
        here, so the reward depends on wherever the ball already is.

        Args:
            num_transitions: Number of transitions to generate.

        Returns:
            A list of ``(state, action, reward, next_state, done)`` tuples.
            The reward is 2 when the ball is at the paddle's column and within
            its vertical extent, -1 (with ``done=True``) when it is at that
            column but outside the paddle, and 0 otherwise.
        """
        transitions = []

        for _ in range(num_transitions):
            action = np.random.choice(self.ACTION_SPACE)
            state = self.get_state()

            # Execute action in the environment
            self.move_paddle(action)
            next_state = self.get_state()

            # Determine reward and done
            reward = 0
            done = False

            # Check if the ball hits the paddle
            if self.ball_pos[0] >= self.width - self.ball_radius - self.paddle_width:
                if self.paddle_pos - self.paddle_height / 2 <= self.ball_pos[1] <= self.paddle_pos + self.paddle_height / 2:
                    reward = 2  # Reward for hitting the ball
                else:
                    reward = -1  # Penalty for missing the ball
                    done = True

            transitions.append((state, action, reward, next_state, done))

        return transitions

    def reset(self):
        """Put ball and paddle back at their start values.

        Returns:
            The initial state vector (see ``get_state``). Callers assign the
            result of ``reset()`` as the first state of an episode, so
            returning nothing would leave them with ``None``.
        """
        self.ball_pos = np.array([self.width // 2, self.height // 2], dtype=float)
        self.ball_vel = np.array([0.03, 0.01], dtype=float)
        self.paddle_pos = self.height // 2
        return self.get_state()

    def step(self, action):
        """Apply ``action`` to the paddle and advance the ball by one tick.

        Args:
            action: Value forwarded to ``move_paddle``.

        Returns:
            ``(state, reward, done)``. ``reward`` is -1 and ``done`` is True
            when the ball reaches the left wall, or the right wall outside the
            paddle; otherwise ``(state, 0, False)``.
        """
        self.move_paddle(action)
        self.ball_pos += self.ball_vel

        # Reflect off the top and bottom walls.
        if self.ball_pos[1] <= self.ball_radius or self.ball_pos[1] >= self.height - self.ball_radius:
            self.ball_vel[1] *= -1

        if self.ball_pos[0] <= self.ball_radius:
            return self.get_state(), -1, True  # Reward -1 for missing the ball, set done to True

        if self.ball_pos[0] >= self.width - self.ball_radius:
            if self.paddle_pos - self.paddle_height / 2 <= self.ball_pos[1] <= self.paddle_pos + self.paddle_height / 2:
                self.ball_vel[0] *= -1  # paddle hit: send the ball back
            else:
                return self.get_state(), -1, True  # Reward -1 for missing the ball, set done to True

        return self.get_state(), 0, False

    def move_paddle(self, action):
        """Add ``action`` to the paddle centre, keeping the paddle on screen."""
        half_height = self.paddle_height / 2
        self.paddle_pos = np.clip(self.paddle_pos + action, half_height, self.height - half_height)

    def get_state(self):
        """Return the observation as a float32 vector of length 5.

        Layout: ball x / width, ball y / height, ball velocity x,
        ball velocity y, paddle centre / height.
        """
        return np.array([
            self.ball_pos[0] / self.width,
            self.ball_pos[1] / self.height,
            self.ball_vel[0],
            self.ball_vel[1],
            self.paddle_pos / self.height
        ], dtype=np.float32)  # Ensure consistent data type

    def render(self):
        """Draw the paddle and ball on the pygame window and flip the display."""
        white = (255, 255, 255)
        self.screen.fill((0, 0, 0))
        # The paddle is hit-tested against the right wall in step(), so it is
        # drawn flush with the right edge rather than at x = 0.
        paddle_left = self.width - self.paddle_width
        paddle_top = int(self.paddle_pos - self.paddle_height / 2)
        pygame.draw.rect(self.screen, white, pygame.Rect(paddle_left, paddle_top, self.paddle_width, self.paddle_height))
        pygame.draw.circle(self.screen, white, (int(self.ball_pos[0]), int(self.ball_pos[1])), self.ball_radius)
        pygame.display.flip()

# Define the neural network for approximating the model Mθ
class ModelApproximator(nn.Module):
    """Fully connected network mapping a state vector to one score per action.

    Two hidden layers of ``hidden_size`` units with ReLU activations, followed
    by a linear output layer of ``action_size`` units.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        # Layers are created in input-to-output order and stored under
        # ``fc`` so parameter names stay ``fc.0``, ``fc.2`` and ``fc.4``.
        stack = [
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        scores = self.fc(x)
        return scores

# Define the DQN class
class DQN(nn.Module):
    """Three-layer perceptron producing one value per action for a state."""

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        # Consecutive widths of the linear layers: input -> hidden -> hidden -> output.
        widths = (state_size, hidden_size, hidden_size, action_size)
        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        # No activation after the output layer, so drop the trailing ReLU.
        self.fc = nn.Sequential(*modules[:-1])

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        return self.fc(x)

# Define the ReplayBuffer class
class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)``."""

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        # deque(maxlen=...) discards the oldest entry once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Store one transition, silently ignoring incomplete ones.

        A transition whose ``state``, ``action`` or ``next_state`` is ``None``
        is dropped: a ``None`` entry cannot be stacked into a numeric batch
        and would make every later ``sample`` call that draws it fail.
        """
        if state is None or action is None or next_state is None:
            return
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw up to ``batch_size`` distinct transitions uniformly at random.

        Args:
            batch_size: Requested number of transitions; capped at the number
                currently stored.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` as tensors with
            dtypes float32, int64, float32, float32, float32, or a tuple of
            five ``None`` values when the buffer is empty.
        """
        if not self.buffer:
            return None, None, None, None, None

        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = zip(*batch)
        # Explicit dtypes keep the result independent of whether the stored
        # values were Python ints/bools or NumPy scalars.
        return (
            torch.as_tensor(np.stack(states), dtype=torch.float32),
            torch.as_tensor(np.asarray(actions), dtype=torch.int64),
            torch.as_tensor(np.asarray(rewards), dtype=torch.float32),
            torch.as_tensor(np.stack(next_states), dtype=torch.float32),
            torch.as_tensor(np.asarray(dones), dtype=torch.float32),
        )

# Define the ExpertAgent class
class ExpertAgent:
    """Epsilon-greedy agent trained with one-step Q-learning targets.

    The agent owns a ``ModelApproximator`` network, an Adam optimizer and a
    replay buffer, and interacts with the environment passed to it.
    """

    def __init__(self, env):
        """Build the network, replay buffer and optimizer for ``env``.

        Args:
            env: Environment exposing ``ACTION_SPACE``, ``get_state()``,
                ``reset()`` and ``step(action)``.
        """
        self.env = env
        self.model = ModelApproximator(state_size=env.get_state().shape[0], action_size=len(env.ACTION_SPACE))
        self.replay_buffer = ReplayBuffer(capacity=10000)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)  # Define optimizer

    def _reset_env(self):
        """Reset the environment and return its initial state.

        Falls back to ``env.get_state()`` when ``reset()`` returns ``None``, so
        the first transition of an episode is never recorded without a state.
        """
        state = self.env.reset()
        return self.env.get_state() if state is None else state

    def train(self, num_episodes, batch_size):
        """Run ``num_episodes`` training episodes.

        After every environment step, once the replay buffer holds at least
        ``batch_size`` transitions, one gradient step is taken on a sampled
        batch.

        Returns:
            ``(rewards_per_episode, losses)``: total reward and mean training
            loss (0 if no update happened) for each episode.
        """
        rewards_per_episode = []
        losses = []

        for episode in range(num_episodes):
            state = self._reset_env()
            total_reward = 0
            episode_losses = []

            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done = self.env.step(action)
                self.replay_buffer.add(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if len(self.replay_buffer.buffer) >= batch_size:
                    states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
                    loss = self.train_step(states, actions, rewards, next_states, dones)
                    episode_losses.append(loss)

            average_loss = np.mean(episode_losses) if episode_losses else 0
            rewards_per_episode.append(total_reward)
            losses.append(average_loss)
            print(f'Expert Agent - Episode {episode}, Total Reward: {total_reward}, Average Loss: {average_loss}')

        return rewards_per_episode, losses

    def test(self, num_episodes):
        """Run ``num_episodes`` episodes without learning.

        Actions still come from ``select_action`` with its default epsilon.

        Returns:
            List with the total reward of each episode.
        """
        rewards_per_episode = []

        for episode in range(num_episodes):
            state = self._reset_env()
            total_reward = 0

            done = False
            while not done:
                action = self.select_action(state)
                state, reward, done = self.env.step(action)
                total_reward += reward

            rewards_per_episode.append(total_reward)
            print(f'Expert Agent - Test Episode {episode}, Total Reward: {total_reward}')

        return rewards_per_episode

    def select_action(self, state, epsilon=0.1):
        """Pick an action index epsilon-greedily.

        Args:
            state: Current observation, or ``None`` to force a random action.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a plain ``int``.
        """
        num_actions = len(self.env.ACTION_SPACE)
        if state is None:  # Handle None state
            return random.choice(range(num_actions))

        if random.random() > epsilon:
            with torch.no_grad():
                state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                action_values = self.model(state_t)
            # .item() yields a Python int, matching the random branch.
            return int(action_values.argmax(dim=1).item())
        return random.choice(range(num_actions))

    def train_step(self, states, actions, rewards, next_states, dones, gamma=0.99):
        """Take one gradient step towards the one-step bootstrapped target.

        Args:
            states: Float tensor of shape (batch, state_size).
            actions: Int64 tensor of action indices, one per row of ``states``.
            rewards: Float tensor of rewards, one per row.
            next_states: Float tensor shaped like ``states``.
            dones: Float tensor, 1.0 where the transition ended the episode.
            gamma: Discount factor applied to the next-state value.

        Returns:
            The mean-squared-error loss as a Python float.
        """
        # squeeze(1) removes only the gathered column; a bare squeeze() would
        # also collapse a batch of one to a 0-d tensor.
        state_action_values = self.model(states).gather(1, actions.view(-1, 1)).squeeze(1)
        with torch.no_grad():
            next_state_values = self.model(next_states).max(1)[0]
        # Terminal transitions (dones == 1) do not bootstrap.
        expected_state_action_values = (next_state_values * gamma * (1 - dones)) + rewards
        loss = nn.MSELoss()(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def get_observations(self, num_samples=10):
        """Collect ``num_samples`` single-step transitions from a fresh reset.

        Each sample resets the environment, applies one uniformly random
        action from ``ACTION_SPACE`` and records the outcome. Because the
        environment is reset on every sample, calling this while an episode is
        running on the same environment object restarts that episode.

        Returns:
            List of ``(state, action, reward, next_state, done)`` tuples.
        """
        observations = []
        for _ in range(num_samples):
            state = self._reset_env()
            action = random.choice(self.env.ACTION_SPACE)
            next_state, reward, done = self.env.step(action)
            observations.append((state, action, reward, next_state, done))
        return observations

# Define the NonExpertAgent class
class NonExpertAgent:
    """Agent trained as an action classifier on its own and expert transitions.

    The network is fitted with cross-entropy to predict the recorded action
    from the state, using a replay buffer that mixes the agent's own
    transitions with observations gathered from ``expert_agents``.
    """

    def __init__(self, env, expert_agents):
        """Build the network, replay buffer and optimizer for ``env``.

        Args:
            env: Environment exposing ``ACTION_SPACE``, ``get_state()``,
                ``reset()`` and ``step(action)``.
            expert_agents: Iterable of agents exposing
                ``get_observations(num_samples)``.
        """
        self.env = env
        self.model = ModelApproximator(state_size=env.get_state().shape[0], action_size=len(env.ACTION_SPACE))
        self.replay_buffer = ReplayBuffer(capacity=10000)
        self.expert_agents = expert_agents
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)  # Initialize optimizer

    def _reset_env(self):
        """Reset the environment and return its initial state.

        Falls back to ``env.get_state()`` when ``reset()`` returns ``None``.
        """
        state = self.env.reset()
        return self.env.get_state() if state is None else state

    def train(self, num_episodes, batch_size):
        """Run ``num_episodes`` training episodes.

        Expert observations are gathered once at the start of each episode,
        before the environment is reset. The experts reset the environment
        themselves while observing; gathering in the middle of an episode on
        a shared environment would restart that episode on every step and it
        would never finish.

        Returns:
            ``(rewards_per_episode, losses)``: total reward and mean training
            loss (0 if no update happened) for each episode.
        """
        rewards_per_episode = []
        losses = []
        criterion = nn.CrossEntropyLoss()

        for episode in range(num_episodes):
            self.collect_from_experts(batch_size)
            state = self._reset_env()
            total_reward = 0
            episode_losses = []

            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done = self.env.step(action)
                self.replay_buffer.add(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if len(self.replay_buffer.buffer) >= batch_size:
                    states, actions, _rewards, _next_states, _dones = self.replay_buffer.sample(batch_size)
                    logits = self.model(states)
                    # Use CrossEntropyLoss for classification; targets must be int64.
                    loss = criterion(logits, actions.long())
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    episode_losses.append(loss.item())

            average_loss = np.mean(episode_losses) if episode_losses else 0
            rewards_per_episode.append(total_reward)
            losses.append(average_loss)
            print(f'Non-Expert Agent - Episode {episode}, Total Reward: {total_reward}, Average Loss: {average_loss}')

        return rewards_per_episode, losses

    def test(self, num_episodes):
        """Run ``num_episodes`` episodes without learning.

        Actions come from ``select_action`` with its default epsilon.

        Returns:
            ``(rewards_per_episode, steps_per_episode)``: total reward and
            number of environment steps for each episode. The pair mirrors the
            two-value shape returned by ``train``.
        """
        rewards_per_episode = []
        steps_per_episode = []

        for episode in range(num_episodes):
            state = self._reset_env()
            total_reward = 0
            steps = 0

            done = False
            while not done:
                action = self.select_action(state)
                state, reward, done = self.env.step(action)
                total_reward += reward
                steps += 1

            rewards_per_episode.append(total_reward)
            steps_per_episode.append(steps)
            print(f'Non-Expert Agent - Test Episode {episode}, Total Reward: {total_reward}')

        return rewards_per_episode, steps_per_episode

    def collect_from_experts(self, batch_size):
        """Add ``batch_size`` observations from every expert to the buffer.

        The expert's recorded action is kept: the training loss uses the
        stored action as the class label, so a transition without one cannot
        be trained on. Observations missing a state, action or next state are
        skipped.
        """
        for expert_agent in self.expert_agents:
            for state, action, reward, next_state, done in expert_agent.get_observations(batch_size):
                if state is None or action is None or next_state is None:
                    continue
                self.replay_buffer.add(state, action, reward, next_state, done)

    def select_action(self, state, epsilon=0.1):
        """Pick an action index epsilon-greedily.

        Args:
            state: Current observation, or ``None`` to force a random action.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a plain ``int``.
        """
        num_actions = len(self.env.ACTION_SPACE)
        if state is None:  # Handle None state
            return random.choice(range(num_actions))

        if random.random() > epsilon:
            with torch.no_grad():
                state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                action_values = self.model(state_t)
            return int(action_values.argmax(dim=1).item())
        return random.choice(range(num_actions))

# Main function
if __name__ == "__main__":
    # Initialise pygame, then build the environment (its constructor opens
    # the display window) and the agents that share it.
    pygame.init()
    env = PongEnvironment()
    expert_agents = [ExpertAgent(env) for _ in range(3)]  # Create multiple expert agents
    non_expert_agent = NonExpertAgent(env, expert_agents)

    num_episodes, batch_size = 100, 32

    # Training order: every expert first, the non-expert last.
    for agent in (*expert_agents, non_expert_agent):
        agent.train(num_episodes, batch_size)

    pygame.quit()

    # Figure 1: evaluation rewards of each expert, one curve per agent.
    for i, expert_agent in enumerate(expert_agents):
        rewards_per_episode = expert_agent.test(num_episodes)
        plt.plot(rewards_per_episode, label=f'Expert Agent {i+1}')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Rewards per Episode for Expert Agents')
    plt.legend()
    plt.show()

    # Figure 2: evaluation rewards of the non-expert agent. Only the first
    # element of the returned pair is plotted.
    rewards_per_episode, _ = non_expert_agent.test(num_episodes)
    plt.plot(rewards_per_episode, label='Non-Expert Agent')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Rewards per Episode for Non-Expert Agent')
    plt.legend()
    plt.show()
