In [1]:
!pip install numpy==1.23.5 --upgrade --force-reinstall

Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incompatible.
db-dtypes 1.4.3 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
imbalanced-learn 0.13.0 requires numpy<3,>=1.24.3, but you have numpy 1.23.5 which is incompatible.
xarray 2025.3.1 requires numpy>=1.24, but you

In [2]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# Seed every RNG this notebook draws from directly, so that network weight
# initialisation, epsilon-greedy draws and replay sampling repeat across runs.
# NOTE: the environment's own RNG (initial cart/pole state) is not seeded
# here, so whole episodes are still not bit-for-bit reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create environment using default API (old style)
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]  # length of the observation vector
action_size = env.action_space.n             # number of discrete actions

# Hyperparameters
gamma = 0.99            # discount applied to the bootstrapped next-state value
epsilon = 1.0           # initial probability of taking a random action
epsilon_min = 0.01      # decay stops once epsilon is no longer above this
epsilon_decay = 0.995   # multiplicative decay applied inside replay()
lr = 0.001              # Adam learning rate
episodes = 100          # number of training episodes
batch_size = 64         # transitions sampled per replay() call
memory = deque(maxlen=2000)  # replay buffer; oldest transitions are evicted first

# DQN model
class DQN(nn.Module):
    """Fully connected Q-network.

    Two 24-unit hidden layers with ReLU activations feed a linear head
    that emits one value per action.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 24
        # Layers are created in the order fc1, fc2, out so that seeded
        # initialisation and state_dict keys stay stable.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.out = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the network output for input tensor ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.out(activations)

# Build the Q-network, then the Adam optimizer over its parameters and the
# mean-squared-error criterion used as the training loss.
model = DQN(state_size=state_size, action_size=action_size)
optimizer = optim.Adam(params=model.parameters(), lr=lr)
loss_fn = nn.MSELoss(reduction="mean")

# Choose action (epsilon-greedy)
def act(state):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    Draws one uniform sample; if it is <= the module-level ``epsilon`` a
    random action index in ``range(action_size)`` is returned. Otherwise
    the index of the largest value predicted by ``model`` is returned as
    a Python int.
    """
    explore = np.random.rand() <= epsilon
    if not explore:
        # Greedy branch: inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            predicted_q = model(torch.FloatTensor(state))
        return torch.argmax(predicted_q).item()
    return random.randrange(action_size)

# Train on experience replay
def replay():
    """Run one batched Q-learning update from the replay buffer.

    Samples ``batch_size`` transitions from ``memory`` and takes a single
    optimizer step on the MSE between Q(s, a) and the one-step target
    ``r + gamma * max_a' Q(s', a')`` (just ``r`` for transitions whose
    ``done`` flag is set). Afterwards the module-level ``epsilon`` is
    multiplied by ``epsilon_decay`` while it is still above ``epsilon_min``.

    Returns None. Does nothing while ``memory`` holds fewer than
    ``batch_size`` transitions.
    """
    global epsilon
    if len(memory) < batch_size:
        return

    states, actions, rewards, next_states, dones = zip(
        *random.sample(memory, batch_size)
    )

    # np.vstack stacks (1, state_size) rows (and also plain (state_size,)
    # vectors) into one (batch_size, state_size) array.
    state_batch = torch.as_tensor(np.vstack(states), dtype=torch.float32)
    next_state_batch = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
    action_batch = torch.as_tensor(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
    # Explicit float32 keeps targets in the same dtype as the network output
    # regardless of whether the environment returns Python or NumPy scalars.
    reward_batch = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
    not_done = torch.as_tensor(1.0 - np.asarray(dones, dtype=np.float32))

    # Targets are constants with respect to the loss, so no graph is built.
    with torch.no_grad():
        next_q_max = model(next_state_batch).max(dim=1).values
    target_q = reward_batch + gamma * not_done * next_q_max

    # Q-value of the action actually taken in each sampled transition.
    current_q = model(state_batch).gather(1, action_batch).squeeze(1)

    loss = loss_fn(current_q, target_q)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

# Training loop
for episode in range(episodes):
    # Observations are kept as (1, state_size) row vectors throughout.
    state = np.reshape(env.reset(), [1, state_size])
    total_reward = 0
    for t in range(200):  # hard cap on steps per episode
        action = act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # Store the transition before advancing to the next state.
        memory.append((state, action, reward, next_state, done))
        total_reward += reward
        state = next_state
        if not done:
            # Learn only after non-terminal steps; a terminal step ends the episode.
            replay()
            continue
        break
    print(f"Episode {episode+1}/{episodes} - Score: {total_reward} - Epsilon: {epsilon:.2f}")


  deprecation(
  deprecation(


Episode 1/100 - Score: 23.0 - Epsilon: 1.00
Episode 2/100 - Score: 44.0 - Epsilon: 0.99
Episode 3/100 - Score: 17.0 - Epsilon: 0.91
Episode 4/100 - Score: 18.0 - Epsilon: 0.83
Episode 5/100 - Score: 41.0 - Epsilon: 0.68
Episode 6/100 - Score: 20.0 - Epsilon: 0.62
Episode 7/100 - Score: 14.0 - Epsilon: 0.58
Episode 8/100 - Score: 12.0 - Epsilon: 0.55
Episode 9/100 - Score: 12.0 - Epsilon: 0.52
Episode 10/100 - Score: 9.0 - Epsilon: 0.50
Episode 11/100 - Score: 18.0 - Epsilon: 0.46
Episode 12/100 - Score: 23.0 - Epsilon: 0.41
Episode 13/100 - Score: 13.0 - Epsilon: 0.39
Episode 14/100 - Score: 9.0 - Epsilon: 0.37
Episode 15/100 - Score: 21.0 - Epsilon: 0.34
Episode 16/100 - Score: 15.0 - Epsilon: 0.31
Episode 17/100 - Score: 9.0 - Epsilon: 0.30
Episode 18/100 - Score: 9.0 - Epsilon: 0.29
Episode 19/100 - Score: 11.0 - Epsilon: 0.28
Episode 20/100 - Score: 13.0 - Epsilon: 0.26
Episode 21/100 - Score: 11.0 - Epsilon: 0.25
Episode 22/100 - Score: 9.0 - Epsilon: 0.24
Episode 23/100 - Score: 