# In [18]:
import gym
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt

# Build the MountainCar task the agent is trained on.
env = gym.make('MountainCar-v0')

# Fix the seed of every random source used in this script (Python's `random`,
# PyTorch, and the environment itself) so that runs are repeatable.
random.seed(0)
torch.manual_seed(0)
env.seed(0)

# Network input width (observation vector length) and output width
# (number of discrete actions), both read from the environment.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# --- Hyperparameters ---------------------------------------------------------
# Optimisation
learning_rate = 1e-3          # step size handed to the agent's Adam optimizer
gamma = 0.99                  # discount factor applied to bootstrapped values
batch_size = 64               # transitions sampled per gradient step
target_update_interval = 10   # steps between target-network syncs

# Training length
episodes = 100                # number of training episodes
max_steps = 200               # cap on environment steps per episode

# Epsilon-greedy schedule: epsilon_start * epsilon_decay**episode,
# never going below epsilon_end.
epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 0.995

# Define the Q-network
class QNetwork(nn.Module):
    """Three-layer fully connected network producing one Q-value per action.

    Args:
        state_size: Number of input features (length of a state vector).
        action_size: Number of outputs (one Q-value per discrete action).
        hidden_size: Width of both hidden layers. Defaults to 64, the value
            that used to be hard-coded, so existing callers are unaffected.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        # Layer names and creation order are kept as-is: they determine the
        # state_dict keys and the order in which seeded initial weights are drawn.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the Q-values for ``x``.

        ``x`` may be a single state of shape ``(state_size,)`` or a batch of
        shape ``(batch, state_size)``; the output has ``action_size`` in place
        of the last dimension.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the output layer: Q-values are unbounded.
        return self.fc3(hidden)

# Define the Double Q-learning agent
class DoubleQLearningAgent:
    """Double DQN agent with an online network, a target network and a replay buffer.

    The online network (``model``) picks actions and is the one being trained;
    the target network (``target_model``) is a periodically synchronised copy
    used to evaluate the action the online network selects for the next state.

    Args:
        state_size: Length of a state vector.
        action_size: Number of discrete actions.
        learning_rate: Learning rate of the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        memory_size: Maximum number of transitions kept in the replay buffer;
            the oldest ones are dropped first. Defaults to the previously
            hard-coded 100000.
    """

    def __init__(self, state_size, action_size, learning_rate, gamma, memory_size=100000):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.gamma = gamma

        self.model = QNetwork(state_size, action_size)
        self.target_model = QNetwork(state_size, action_size)
        # Start with both networks holding identical weights.
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.memory = deque(maxlen=memory_size)

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest online-network Q-value.
        """
        if random.random() < epsilon:
            return random.randint(0, self.action_size - 1)

        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    @staticmethod
    def _stack_states(states):
        """Turn a sequence of state vectors into one ``(batch, state_size)`` float tensor."""
        # Stacking per-state tensors avoids the slow (and warning-emitting)
        # path of building a tensor directly from a sequence of ndarrays.
        return torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states])

    def update_q_networks(self, batch_size):
        """Run one gradient step on the online network from a sampled minibatch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states_tensor = self._stack_states(states)
        next_states_tensor = self._stack_states(next_states)
        # Every per-sample quantity is kept as a (batch, 1) column so it lines
        # up with the output of ``gather``. Mixing (batch,) vectors with
        # (batch, 1) columns silently broadcasts to (batch, batch) and pairs
        # each estimate with every sample's target.
        actions_tensor = torch.tensor(actions, dtype=torch.int64).unsqueeze(1)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1)
        dones_tensor = torch.tensor(
            [float(d) for d in dones], dtype=torch.float32
        ).unsqueeze(1)

        # The target is a constant with respect to the loss: no gradients needed.
        with torch.no_grad():
            # Double Q-learning: the online network chooses the next action...
            best_next_actions = self.model(next_states_tensor).argmax(dim=1, keepdim=True)
            # ...and the target network supplies that action's value.
            next_values = self.target_model(next_states_tensor).gather(1, best_next_actions)
            # (1 - done) removes the bootstrap term for terminal transitions.
            q_value_targets = rewards_tensor + self.gamma * next_values * (1 - dones_tensor)

        q_value_estimates = self.model(states_tensor).gather(1, actions_tensor)

        loss = F.mse_loss(q_value_estimates, q_value_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

# Create the agent
agent = DoubleQLearningAgent(state_size, action_size, learning_rate, gamma)

# Per-episode history of returns and exploration rates (plotted after training)
rewards = []
epsilons = []

# Training loop
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    # Exploration rate decays geometrically with the episode index and is
    # floored at epsilon_end.
    epsilon = max(epsilon_end, epsilon_start * (epsilon_decay ** episode))

    for step in range(max_steps):
        # Select an action
        action = agent.select_action(state, epsilon)

        # Take a step in the environment
        next_state, reward, done, info = env.step(action)

        # Gym's TimeLimit wrapper ends an episode with done=True when the step
        # budget runs out and marks that case with info["TimeLimit.truncated"].
        # Such a cut-off is not a terminal state of the task, so the stored
        # transition must keep its bootstrap term; only true terminations are
        # recorded as done. If the key is absent this is the same as `done`.
        terminal = done and not info.get("TimeLimit.truncated", False)

        # Store the experience in agent's memory
        agent.remember(state, action, reward, next_state, terminal)

        # Update the Q-networks
        agent.update_q_networks(batch_size)

        # Sync the target network every `target_update_interval` steps
        # (the step counter restarts each episode, so step 0 always syncs)
        if step % target_update_interval == 0:
            agent.update_target_network()

        # Update the state and total reward
        state = next_state
        total_reward += reward

        # The episode itself still stops on any `done`, truncated or not
        if done:
            break

    # Track the rewards and epsilon values
    rewards.append(total_reward)
    epsilons.append(epsilon)

    # Print the episode information
    print(f"Episode: {episode + 1}/{episodes}, Total Reward: {total_reward}, Epsilon: {epsilon:.4f}")

# Draw one figure per tracked series: episode returns first, then the
# exploration rate. Both share the same x-axis (episode index).
for series, title, y_label in (
    (rewards, "Rewards", "Total Reward"),
    (epsilons, "Epsilon", "Epsilon Value"),
):
    plt.plot(series)
    plt.title(title)
    plt.xlabel("Episode")
    plt.ylabel(y_label)
    plt.show()

