In [2]:
import gym
import torch
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [None]:
!pip install numpy==1.23.5 --quiet
import os
os.kill(os.getpid(), 9)  # Force restart the runtime

In [1]:
import gym
import torch
import numpy as np
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

✅ Using device: cuda


# Q-learning CPU

In [6]:
import gym
import numpy as np
import time

# Environment
env = gym.make("FrozenLake-v1", is_slippery=False)
n_states = env.observation_space.n
n_actions = env.action_space.n

# Hyperparameters
episodes = 1000
max_steps = 100
alpha = 0.8
gamma = 0.95
epsilon = 1.0
min_epsilon = 0.01
epsilon_decay = 0.995

# Q-table
q_table = np.zeros((n_states, n_actions))

start_cpu = time.time()

for ep in range(episodes):
    state = env.reset()
    done = False

    for _ in range(max_steps):
        if np.random.rand() < epsilon:
            action = np.random.randint(n_actions)
        else:
            action = np.argmax(q_table[state])

        new_state, reward, done, _ = env.step(action)
        if reward > 0:
            print(f"🏁 Goal reached at episode {ep}")

        q_table[state, action] += alpha * (reward + gamma * np.max(q_table[new_state]) - q_table[state, action])
        state = new_state

        if done:
            break

    epsilon = max(min_epsilon, epsilon * epsilon_decay)

end_cpu = time.time()
cpu_time = end_cpu - start_cpu

print(f"🧠 CPU Q-learning time: {cpu_time:.4f} seconds")
print("Q-table (rounded):\n", np.round(q_table, 2))


🏁 Goal reached at episode 7
🏁 Goal reached at episode 57
🏁 Goal reached at episode 58
🏁 Goal reached at episode 69
🏁 Goal reached at episode 73
🏁 Goal reached at episode 75
🏁 Goal reached at episode 85
🏁 Goal reached at episode 86
🏁 Goal reached at episode 97
🏁 Goal reached at episode 100
🏁 Goal reached at episode 101
🏁 Goal reached at episode 103
🏁 Goal reached at episode 115
🏁 Goal reached at episode 116
🏁 Goal reached at episode 117
🏁 Goal reached at episode 118
🏁 Goal reached at episode 120
🏁 Goal reached at episode 124
🏁 Goal reached at episode 128
🏁 Goal reached at episode 131
🏁 Goal reached at episode 133
🏁 Goal reached at episode 134
🏁 Goal reached at episode 135
🏁 Goal reached at episode 137
🏁 Goal reached at episode 138
🏁 Goal reached at episode 141
🏁 Goal reached at episode 143
🏁 Goal reached at episode 144
🏁 Goal reached at episode 145
🏁 Goal reached at episode 147
🏁 Goal reached at episode 150
🏁 Goal reached at episode 152
🏁 Goal reached at episode 153
🏁 Goal reached at ep

## Q - Learning GPU

In [9]:
import gym
import torch
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("⚡ Using device:", device)

env = gym.make("FrozenLake-v1", is_slippery=False)
n_states = env.observation_space.n
n_actions = env.action_space.n

episodes = 1000
max_steps = 200
alpha = 0.8
gamma = 0.95
epsilon = 1.0
min_epsilon = 0.01
epsilon_decay = 0.995

Q = torch.zeros((n_states, n_actions), dtype=torch.float32, device=device)

start_gpu = time.time()

for ep in range(episodes):
    state = env.reset()
    if isinstance(state, tuple):  # Gym version compatibility
        state = state[0]
    state = int(state)

    done = False

    for _ in range(max_steps):
        if torch.rand(1).item() < epsilon:
            action = torch.randint(0, n_actions, (1,), device=device).item()
        else:
            action = torch.argmax(Q[state]).item()

        next_state, reward, done, _ = env.step(action)
        if isinstance(next_state, tuple):
            next_state = next_state[0]
        next_state = int(next_state)

        if reward > 0:
            print(f"🏁 Goal reached at episode {ep}")

        # Q-learning update
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * torch.max(Q[next_state]) - Q[state, action])

        state = next_state

        if done:
            break

    epsilon = max(min_epsilon, epsilon * epsilon_decay)

torch.cuda.synchronize()
end_gpu = time.time()

gpu_time = end_gpu - start_gpu
print(f"\n⚡ GPU Q-learning time: {gpu_time:.4f} seconds")
print("Q-table (rounded):\n", torch.round(Q.cpu(), decimals=2))


⚡ Using device: cuda

⚡ GPU Q-learning time: 14.7295 seconds
Q-table (rounded):
 tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])
