In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import numpy as np

In [2]:
1

1

In [None]:
# Simple Q-Network
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.m = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.m(x)


class DQNAgent:
    def __init__(self, state_dim, action_dim):    
        self.main = DQN(state_dim, action_dim).cuda()
        self.target = DQN(state_dim, action_dim).cuda()
        self.target.load_state_dict(self.main.state_dict())

        self.action_dim = action_dim
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.min_epsilon = 0.05

        self.gamma = 0.99

        self.memory = deque(maxlen = 10000)
        self.batch_size = 64
        self.optimiser = torch.optim.Adam(self.main.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def remember(self, x):
        self.memory.append(x)

    def act(self, state):
        if random.random() <= self.epsilon:
            return random.randrange(self.action_dim)
        
        with torch.no_grad():
            q_values = self.main(torch.FloatTensor(state).cuda())
        return q_values.argmax().item()
    
    def replay(self):
        if len(self.memory) < self.batch_size:
            return
    
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, new_states, dones = zip(*batch)

        states = torch.FloatTensor(states).cuda()
        actions = torch.LongTensor(actions).cuda()
        rewards = torch.FloatTensor(rewards).cuda()
        new_states = torch.FloatTensor(new_states).cuda()
        dones = torch.FloatTensor(dones).cuda()

        q_values = self.main(states)
        q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        
        with torch.no_grad():
            next_q = self.target(new_states).max(1)[0]
            q_target = rewards + self.gamma*(1-dones)*next_q

        loss = self.loss(q_values, q_target)
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay

    def update_target(self):
        self.target.load_state_dict(self.main.state_dict())


agent = DQNAgent(4, 2)


In [34]:
env = gym.make('CartPole-v1', render_mode='human')  # Remove render_mode for faster training

scores = []
for ep in range(300):
    state, _ = env.reset()
    state = state + np.random.normal(0, 0.1, size=state.shape)
    state = state + np.random.normal(0, 0.1, size=state.shape)
    env.unwrapped.state = state     
    score = 0

    for i in range(500):
        action = agent.act(state)
        next_state, reward, done, truncated, _ = env.step(action)
        agent.remember((state, action, reward, next_state, done))

        agent.replay()

        reward = reward if not done else -10 # why this?

        state = next_state
        score += reward

        if done or truncated:
            break        

    scores.append(score)

    print(f"Episode {ep}, Score: {score:.0f}, Epsilon: {agent.epsilon:.3f}, Avg: {np.mean(scores[-10:]):.1f}")
    
    if ep % 10 == 0:
        agent.update_target()

    if len(scores) >= 100 and np.mean(scores[-50:]) >= 195:
        print(f"Solved in {ep} episodes!")
        break

env.close()

Episode 0, Score: -1, Epsilon: 1.000, Avg: -1.0
Episode 1, Score: 6, Epsilon: 1.000, Avg: 2.5
Episode 2, Score: 1, Epsilon: 1.000, Avg: 2.0
Episode 3, Score: 15, Epsilon: 0.980, Avg: 5.2
Episode 4, Score: 1, Epsilon: 0.869, Avg: 4.4
Episode 5, Score: 43, Epsilon: 0.505, Avg: 10.8
Episode 6, Score: 15, Epsilon: 0.389, Avg: 11.4
Episode 7, Score: 1, Epsilon: 0.345, Avg: 10.1
Episode 8, Score: 2, Epsilon: 0.302, Avg: 9.2
Episode 9, Score: -2, Epsilon: 0.276, Avg: 8.1
Episode 10, Score: -1, Epsilon: 0.250, Avg: 8.1
Episode 11, Score: 9, Epsilon: 0.204, Avg: 8.4
Episode 12, Score: -1, Epsilon: 0.185, Avg: 8.2
Episode 13, Score: 3, Epsilon: 0.161, Avg: 7.0
Episode 14, Score: 6, Epsilon: 0.135, Avg: 7.5
Episode 15, Score: -1, Epsilon: 0.122, Avg: 3.1
Episode 16, Score: 3, Epsilon: 0.106, Avg: 1.9
Episode 17, Score: 6, Epsilon: 0.090, Avg: 2.4
Episode 18, Score: 1, Epsilon: 0.079, Avg: 2.3
Episode 19, Score: -1, Epsilon: 0.072, Avg: 2.4
Episode 20, Score: 1, Epsilon: 0.064, Avg: 2.6
Episode 21

In [35]:
# After training, watch the trained agent
def watch_agent(agent, env, episodes=5):
    env = gym.make('CartPole-v1', render_mode='human')
    for ep in range(episodes):
        state, _ = env.reset()
        env.unwrapped.state = env.unwrapped.state + np.random.normal(0, 0.1, size=4)
        state = env.unwrapped.state.copy()
        done = False
        score = 0
        while not done:
            action = agent.act(state)
            state, reward, done, truncated, _ = env.step(action)
            score += reward
            done = done or truncated
        print(f"Episode {ep+1}, Score: {score}")
    env.close()

watch_agent(agent, env)

Episode 1, Score: 314.0
Episode 2, Score: 233.0
Episode 3, Score: 324.0
Episode 4, Score: 290.0
Episode 5, Score: 195.0
