In [2]:
# %pip install --upgrade gym
# %pip install numpy==1.23.5
import gym
import numpy as np

# Compatibility shim: when the installed NumPy exposes no ``bool8`` attribute,
# alias it to ``np.bool_`` so code that still refers to ``np.bool8`` keeps working.
try:
    np.bool8
except AttributeError:
    np.bool8 = np.bool_

# Build the deterministic (non-slippery) FrozenLake environment
env = gym.make(
    "FrozenLake-v1",
    is_slippery=False,
)

# Detect reset() return format
def reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Supports both Gym reset conventions:

    * new API -- ``reset()`` returns ``(observation, info)`` where ``info`` is a dict;
    * old API -- ``reset()`` returns the observation by itself.

    Args:
        env: Any object exposing a zero-argument ``reset()`` method.

    Returns:
        The initial observation, unchanged.
    """
    result = env.reset()
    # Only unwrap an exact (observation, info-dict) pair. Testing merely for
    # "is a tuple" would mangle old-API environments whose observation is
    # itself a tuple (e.g. a 3-tuple state), returning just its first element.
    is_new_api_pair = (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[1], dict)
    )
    if is_new_api_pair:  # New Gym API
        return result[0]
    return result  # Old Gym API

# Normalize step() output across Gym API versions
def step_env(env, action):
    """Advance ``env`` by one ``action`` and return a 4-tuple.

    A 5-element result ``(obs, reward, terminated, truncated, info)`` is folded
    into ``(obs, reward, terminated or truncated, info)``. Any other result is
    unpacked as the legacy ``(obs, reward, done, info)`` and returned as such.
    """
    outcome = env.step(action)

    if len(outcome) != 5:  # Old Gym API
        observation, reward, finished, info = outcome
        return observation, reward, finished, info

    # New Gym API: the episode is over if it terminated or was truncated
    observation, reward, terminated, truncated, info = outcome
    return observation, reward, terminated or truncated, info

# Initialize Q-table: one row per state, one column per action, all zeros
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Hyperparameters
alpha = 0.8            # step size of the Q-value update
gamma = 0.95           # discount applied to the best next-state value
epsilon = 1.0          # initial probability of taking a random action
epsilon_decay = 0.99   # multiplicative epsilon decay applied after each episode
episodes = 2000        # number of training episodes
seed = 42              # seed for every random source used during training

# Seed both random sources used below (np.random for the explore/exploit coin
# flip, the action space for random actions) so that a fresh
# "Restart Kernel -> Run All" reproduces the same Q-table.
np.random.seed(seed)
env.action_space.seed(seed)

# Training
for episode in range(episodes):
    state = reset_env(env)
    done = False

    while not done:
        # Choose action (epsilon-greedy)
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(Q[state, :])     # Exploit

        # Take action
        next_state, reward, done, info = step_env(env, action)

        # Update Q-table: move Q[state, action] towards the one-step TD target
        Q[state, action] = Q[state, action] + alpha * (
            reward + gamma * np.max(Q[next_state, :]) - Q[state, action]
        )

        state = next_state

    # Decay epsilon, never dropping below 0.01
    epsilon = max(0.01, epsilon * epsilon_decay)

print("Training complete. Final Q-table:")
print(Q)

# Testing trained agent: follow the greedy policy for one episode
# NOTE(review): on Gym >= 0.26 render() presumably only produces output when
# render_mode was passed to gym.make() -- confirm against the installed version.
state = reset_env(env)
env.render()
done = False
total_reward = 0

while not done:
    action = np.argmax(Q[state, :])
    next_state, reward, done, info = step_env(env, action)
    env.render()
    state = next_state
    total_reward += reward

print("Final Reward:", total_reward)
env.close()


Training complete. Final Q-table:
[[0.73509189 0.77378094 0.6983373  0.73509189]
 [0.73509189 0.         0.56656416 0.69149778]
 [0.68646594 0.         0.         0.40468869]
 [0.         0.         0.         0.        ]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.90249769 0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.857375   0.95       0.         0.85732504]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.90082281 0.95       0.85049776]
 [0.9025     0.95       1.         0.90249998]
 [0.         0.         0.         0.        ]]
Final Reward: 1.0


In [3]:
# %pip install numpy==1.23.5
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

# Hyperparameters
EPISODES = 500          # number of training episodes
GAMMA = 0.95            # discount factor in the TD target
LR = 1e-3               # learning rate handed to the Adam optimizer
BATCH_SIZE = 64         # transitions sampled from memory per optimization step
EPSILON_START = 1.0     # initial exploration probability
EPSILON_END = 0.01      # floor for the exploration probability
EPSILON_DECAY = 0.995   # multiplicative epsilon decay applied once per episode
MEMORY_SIZE = 10_000    # maximum number of transitions kept in the replay buffer
TARGET_UPDATE = 10      # sync the target network every this many episodes

# Q-Network
class DQN(nn.Module):
    """Three-layer fully connected network mapping a state to per-action values.

    Args:
        state_size: Number of input features.
        action_size: Number of outputs (one value per action).
        hidden_size: Width of both hidden layers. Defaults to 24, the value
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, state_size, action_size, hidden_size=24):
        super().__init__()
        # Attribute names are kept as fc1/fc2/fc3 so state_dict keys stay stable.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``.

        ReLU is applied after the first two layers; the last layer is linear.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Replay Buffer: bounded FIFO of (state, action, reward, next_state, terminal)
memory = deque(maxlen=MEMORY_SIZE)

# Environment
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Networks: the policy net is trained; the target net is a periodically
# refreshed copy used to compute the TD targets.
policy_net = DQN(state_size, action_size)
target_net = DQN(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
loss_fn = nn.MSELoss()  # built once instead of on every optimization step

epsilon = EPSILON_START

# Training Loop
for episode in range(EPISODES):
    state = env.reset()
    if isinstance(state, tuple):  # new Gym API returns (observation, info)
        state = state[0]
    state = np.reshape(state, [1, state_size])
    total_reward = 0

    done = False
    while not done:
        # Epsilon-greedy
        if random.random() < epsilon:
            action = random.randrange(action_size)
        else:
            with torch.no_grad():
                action = torch.argmax(policy_net(torch.FloatTensor(state))).item()

        # Step (accept both the 5-tuple and the legacy 4-tuple result, matching
        # the dual-API handling already applied to reset() above)
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Legacy API: a single `done` flag; the TimeLimit wrapper reports a
            # time-limit cut-off through info["TimeLimit.truncated"].
            next_state, reward, legacy_done, info = step_result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(legacy_done) and not truncated
        done = terminated or truncated
        next_state = np.reshape(next_state, [1, state_size])

        # Store experience. Only true termination is recorded as terminal: a
        # time-limit truncation ends the episode loop but the next state still
        # has value, so the TD target must keep bootstrapping from it.
        memory.append((state, action, reward, next_state, float(terminated)))
        state = next_state
        total_reward += reward

        # Train from memory
        if len(memory) >= BATCH_SIZE:
            minibatch = random.sample(memory, BATCH_SIZE)
            states, actions, rewards, next_states, terminals = zip(*minibatch)

            states = torch.FloatTensor(np.vstack(states))
            actions = torch.LongTensor(actions).unsqueeze(1)
            rewards = torch.FloatTensor(rewards)
            next_states = torch.FloatTensor(np.vstack(next_states))
            terminals = torch.FloatTensor(terminals)

            # Q values. squeeze(1) drops only the gathered column, so the batch
            # dimension survives even when BATCH_SIZE is 1.
            current_q = policy_net(states).gather(1, actions).squeeze(1)
            with torch.no_grad():  # targets are constants; skip autograd tracking
                next_q = target_net(next_states).max(1)[0]
                target_q = rewards + (GAMMA * next_q * (1 - terminals))

            # Loss and optimize
            loss = loss_fn(current_q, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decay epsilon
    epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)

    # Update target network
    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode} - Reward: {total_reward}, Epsilon: {epsilon:.3f}")

env.close()


Episode 0 - Reward: 19.0, Epsilon: 0.995
Episode 1 - Reward: 55.0, Epsilon: 0.990
Episode 2 - Reward: 16.0, Epsilon: 0.985
Episode 3 - Reward: 9.0, Epsilon: 0.980
Episode 4 - Reward: 30.0, Epsilon: 0.975
Episode 5 - Reward: 41.0, Epsilon: 0.970
Episode 6 - Reward: 8.0, Epsilon: 0.966
Episode 7 - Reward: 66.0, Epsilon: 0.961
Episode 8 - Reward: 56.0, Epsilon: 0.956
Episode 9 - Reward: 22.0, Epsilon: 0.951
Episode 10 - Reward: 28.0, Epsilon: 0.946
Episode 11 - Reward: 15.0, Epsilon: 0.942
Episode 12 - Reward: 17.0, Epsilon: 0.937
Episode 13 - Reward: 10.0, Epsilon: 0.932
Episode 14 - Reward: 13.0, Epsilon: 0.928
Episode 15 - Reward: 26.0, Epsilon: 0.923
Episode 16 - Reward: 31.0, Epsilon: 0.918
Episode 17 - Reward: 37.0, Epsilon: 0.914
Episode 18 - Reward: 24.0, Epsilon: 0.909
Episode 19 - Reward: 11.0, Epsilon: 0.905
Episode 20 - Reward: 15.0, Epsilon: 0.900
Episode 21 - Reward: 37.0, Epsilon: 0.896
Episode 22 - Reward: 20.0, Epsilon: 0.891
Episode 23 - Reward: 46.0, Epsilon: 0.887
Epis

KeyboardInterrupt: 