# In [None]:  (notebook cell marker left over from the export; not Python code)
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt


# 1. Instantiate the CartPole-v1 environment that the agent interacts with
env = gym.make(id="CartPole-v1")

# 2. Define the Neural Network
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    The network is two ReLU-activated hidden layers of equal width followed
    by a linear output layer.

    Args:
        state_size: Number of input features (length of one state vector).
        action_size: Number of outputs (one Q-value per discrete action).
        hidden_size: Width of both hidden layers. Defaults to 24, the width
            that used to be hard-coded, so existing two-argument calls build
            exactly the same architecture.
    """

    def __init__(self, state_size: int, action_size: int, hidden_size: int = 24):
        super().__init__()
        # Attribute names are kept as fc1/fc2/fc3 so saved state_dicts stay loadable.
        self.fc1 = nn.Linear(state_size, hidden_size)    # First hidden layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)   # Second hidden layer
        self.fc3 = nn.Linear(hidden_size, action_size)   # Output layer (one Q-value per action)

    def forward(self, x):
        """Return the Q-values for ``x``.

        ReLU is applied after each hidden layer; the output layer is left
        linear so Q-values are unbounded.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# 3. Hyperparameters
# Problem dimensions are read straight from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Learning settings
gamma = 0.99                     # Discount factor applied to the next-state value
learning_rate = 1e-3             # Step size handed to the Adam optimizer
batch_size = 64                  # Number of transitions sampled per update
memory = deque(maxlen=10_000)    # Replay buffer; the oldest entry is dropped once full

# Epsilon-greedy exploration schedule: starts fully random and is decayed
# multiplicatively once per episode toward epsilon_min.
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995

# 4. Build the Q-network together with its optimizer and loss
qnetwork = QNetwork(state_size, action_size)
optimizer = optim.Adam(params=qnetwork.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

# 5. Helper function to choose action
def act(state, epsilon):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    A uniform random draw decides the branch: when the draw is at most
    ``epsilon`` a random action index in ``range(action_size)`` is returned,
    otherwise the index of the largest Q-value predicted by the module-level
    ``qnetwork`` is returned.
    """
    should_explore = random.random() <= epsilon
    if not should_explore:
        # Exploit: add a leading batch dimension and query the network
        # without recording gradients.
        batched_state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action_values = qnetwork(batched_state)
        return torch.argmax(action_values).item()
    # Explore: any action, uniformly at random.
    return random.choice(range(action_size))

# 6. Training Loop
# Runs `episodes` episodes of epsilon-greedy interaction. After every
# environment step (once the replay buffer holds at least `batch_size`
# transitions) one gradient step is taken on a randomly sampled minibatch.
episodes = 500
rewards_per_episode = []

for e in range(episodes):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        action = act(state, epsilon)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Either signal ends the episode loop...
        done = terminated or truncated
        # ...but only `terminated` is stored. A truncation (time limit) is not
        # a terminal state, so the TD target must keep bootstrapping from
        # next_state; storing `done` here would wrongly zero that term.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward

        # Learning from a minibatch
        if len(memory) >= batch_size:
            minibatch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, terminals = zip(*minibatch)

            # Stack the per-step arrays into one ndarray first: building a
            # tensor directly from a tuple of ndarrays is the slow path and
            # makes PyTorch emit a warning.
            states = torch.FloatTensor(np.array(states))
            actions = torch.LongTensor(actions).unsqueeze(1)
            rewards = torch.FloatTensor(rewards).unsqueeze(1)
            next_states = torch.FloatTensor(np.array(next_states))
            terminals = torch.FloatTensor(terminals).unsqueeze(1)

            # Q(s, a) for the actions that were actually taken.
            q_values = qnetwork(states).gather(1, actions)
            with torch.no_grad():
                # Greedy value of the next state; (1 - terminals) removes the
                # bootstrap term for true terminal transitions only.
                q_next = qnetwork(next_states).max(1)[0].unsqueeze(1)
                q_targets = rewards + gamma * q_next * (1 - terminals)

            loss = loss_fn(q_values, q_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    rewards_per_episode.append(total_reward)

    # Decay epsilon once per episode, never letting it drop below epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if (e + 1) % 10 == 0:
        print(f"Episode {e+1}: Total Reward = {total_reward:.2f}, Epsilon = {epsilon:.3f}")
# 7. Plot the rewards (runs once, after the training loop has finished)

# Release the environment, then chart the total reward of every episode.
env.close()
_, ax = plt.subplots(figsize=(10, 5))
ax.plot(rewards_per_episode, label='Total Reward per Episode')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('Deep Q-Learning Performance on CartPole')
ax.legend()
ax.grid()
plt.show()



# (stray notebook output, apparently an echo of a training-loop line; not executable code)
#   states = torch.FloatTensor(states)
