In [21]:
%pip install gymnasium

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Check if GPU is available
gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")

# If GPU is available, print the GPU name
if gpu_available:
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")

# Check if tensors are being moved to GPU
device = torch.device("cuda" if gpu_available else "cpu")
tensor = torch.rand(3, 3).to(device)
print(f"Tensor is on device: {tensor.device}")

GPU Available: False
Tensor is on device: cpu


In [3]:
# Setup the environment
env = gym.make("CartPole-v1")

# Get the initial state
state, info = env.reset()   
num_bins=10                         # Number of bins to Discretize
num_actions = env.action_space.n    # Number of actions (left, right)
state_space_size = (num_bins,) * 4  # State space size after discretization

In [4]:
"""
Helper Functions for Temporal Difference (TD) Learning with Linear Function Approximation
"""

def extract_features(state):
    """
    Function to extract features from the state
    """
    cart_pos, cart_vel, pole_angle, pole_vel = state
    return np.array([cart_pos / 2.4,cart_vel / 2.0,pole_angle / 0.2095,pole_vel / 3.0])

def choose_action_linear(weights, state, exploration_rate):
    """
    Function to choose an action using epsilon-greedy policy
    """
    if np.random.rand() < exploration_rate:
        # Choose a random action
        return np.random.randint(num_actions)
    else: 
        # Choose the action with the highest Q-value
        q_values = [np.dot(weights[a], extract_features(state)) for a in range(env.action_space.n)]
        return np.argmax(q_values) 

In [5]:
"""
Part 2.1: Temporal Difference (TD) Learning with Linear Function Approximation (SARSA)
"""

# Hyperparameters
learning_rate = 0.01  
discount_factor = 0.90  
num_features = 4            # Number of state features

# Initialize weights for linear approximation
weights_sarsa = np.zeros((env.action_space.n, num_features))

In [6]:
def sarsa(env, num_episodes, exploration_rate):
    """
    Function to perform SARSA with linear function approximation
    """

    # Loop over episodes
    for episode in range(num_episodes):
        # Reset the environment and choose the initial action
        state, _ = env.reset()
        action = choose_action_linear(weights_sarsa, state, exploration_rate)
        
        # Initialize metrics
        total_reward = 0  # Tracking total reward 
        total_td_error = 0  # Tracking total TD error 

        # Loop over steps within the episode
        done = False
        while not done:
            # Take a step in the environment and choose the next action
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_action = choose_action_linear(weights_sarsa, next_state, exploration_rate)

            # Compute the Q-values for the current state
            features = extract_features(state)
            q_value = np.dot(weights_sarsa[action], features)

            # Compute the Q-value for the next state
            next_features = extract_features(next_state)
            next_q_value = np.dot(weights_sarsa[next_action], next_features)

            # Compute the TD error
            td_error = np.add(reward, np.multiply(discount_factor, next_q_value)) - q_value

            # Update the weights using the TD error
            weights_sarsa[action] += learning_rate * td_error * features

            # Accumulate reward and TD error
            total_reward += reward
            total_td_error += abs(td_error)

            # Move to the next state and action
            state, action = next_state, next_action
            done = terminated or truncated

        # Print metrics every 100 episodes
        if (episode + 1) % 1000 == 0:
            print(f"SARSA Episode {episode + 1}: \tTotal Reward = {total_reward:.2f}, "
                  f"\tTotal TD Error = {total_td_error:.4f}")

In [7]:
"""
Perform SARSA with linear function approximation
"""
print("Running SARSA with Linear Function Approximation...")

# Call the SARSA function to learn the Q-values
sarsa(env, num_episodes = 20000, exploration_rate = 0.5) 

# Close the environment
env.close()

Running SARSA with Linear Function Approximation...
SARSA Episode 1000: 	Total Reward = 11.00, 	Total TD Error = 12.9850
SARSA Episode 2000: 	Total Reward = 9.00, 	Total TD Error = 9.3616
SARSA Episode 3000: 	Total Reward = 22.00, 	Total TD Error = 50.6003
SARSA Episode 4000: 	Total Reward = 65.00, 	Total TD Error = 63.8202
SARSA Episode 5000: 	Total Reward = 20.00, 	Total TD Error = 20.8382
SARSA Episode 6000: 	Total Reward = 69.00, 	Total TD Error = 69.7341
SARSA Episode 7000: 	Total Reward = 31.00, 	Total TD Error = 30.5258
SARSA Episode 8000: 	Total Reward = 36.00, 	Total TD Error = 37.2873
SARSA Episode 9000: 	Total Reward = 39.00, 	Total TD Error = 40.5374
SARSA Episode 10000: 	Total Reward = 39.00, 	Total TD Error = 38.5351
SARSA Episode 11000: 	Total Reward = 61.00, 	Total TD Error = 59.7580
SARSA Episode 12000: 	Total Reward = 15.00, 	Total TD Error = 15.2136
SARSA Episode 13000: 	Total Reward = 23.00, 	Total TD Error = 22.3227
SARSA Episode 14000: 	Total Reward = 15.00, 	Tota

In [8]:
"""
Part 2.1: Temporal Difference (TD) Learning with Linear Function Approximation (Q-Learning)
"""

# Hyperparameters
learning_rate = 0.01  
discount_factor = 0.90  
num_features = 4            # Number of state features

# Initialize weights for linear approximation
weights_q_learning = np.zeros((env.action_space.n, num_features))

In [9]:
def q_learning(env, num_episodes, exploration_rate):
    """
    Function to perform Q-Learning with linear function approximation
    """

    # Loop over episodes
    for episode in range(num_episodes):
        # Reset the environment
        state, _ = env.reset()
        
        # Initialize metrics
        total_reward = 0  # Track total reward in the episode
        total_td_error = 0  # Track total TD error in the episode

        # Loop over steps within the episode
        done = False
        while not done:
            # Choose an action using epsilon-greedy policy
            action = choose_action_linear(weights_q_learning, state, exploration_rate)

            # Take the action and observe the next state and reward
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Compute Q-value for the current state and action
            features = extract_features(state)
            q_value = np.dot(weights_q_learning[action], features)

            # Compute the Q-value for the next state
            next_features = extract_features(next_state)
            next_q_values = [np.dot(weights_q_learning[a], next_features) for a in range(env.action_space.n)]

            # Compute the TD error
            td_error = np.add(reward, np.multiply(discount_factor, np.max(next_q_values))) - q_value

            # Update the weights using the TD error
            weights_q_learning[action] = np.add(weights_q_learning[action], np.multiply(learning_rate * td_error, features))

            # Accumulate reward and TD error
            total_reward += reward
            total_td_error += abs(td_error)

            # Move to the next state
            state = next_state
            done = terminated or truncated

        # Print metrics every 500 episodes
        if (episode + 1) % 500 == 0:
            print(f"Q-Learning Episode {episode + 1}: \tTotal Reward = {total_reward:.2f}, "
                  f"\tTotal TD Error = {total_td_error:.4f}")

In [10]:
"""
Perform Q-Learning with linear function approximation
"""
print("\nRunning Q-Learning with Linear Function Approximation...")

# Call the Q-Learning function to learn the Q-values
q_learning(env, num_episodes = 5000, exploration_rate = 0.2)

# Close the environment
env.close()


Running Q-Learning with Linear Function Approximation...
Q-Learning Episode 500: 	Total Reward = 9.00, 	Total TD Error = 16.3235
Q-Learning Episode 1000: 	Total Reward = 10.00, 	Total TD Error = 22.9546
Q-Learning Episode 1500: 	Total Reward = 9.00, 	Total TD Error = 16.2150
Q-Learning Episode 2000: 	Total Reward = 9.00, 	Total TD Error = 12.8818
Q-Learning Episode 2500: 	Total Reward = 10.00, 	Total TD Error = 30.9727
Q-Learning Episode 3000: 	Total Reward = 9.00, 	Total TD Error = 65.9485
Q-Learning Episode 3500: 	Total Reward = 10.00, 	Total TD Error = 121.7913
Q-Learning Episode 4000: 	Total Reward = 500.00, 	Total TD Error = 744.7463
Q-Learning Episode 4500: 	Total Reward = 486.00, 	Total TD Error = 541.1092
Q-Learning Episode 5000: 	Total Reward = 500.00, 	Total TD Error = 520.4110


In [11]:
"""
Part 2.2: Temporal Difference (TD) Learning with Deep Q-Network (DQN)
"""

class QNetwork(nn.Module):
    """
    Deep Q-Network (DQN) class
    """
    def __init__(self, input_size, hidden_size, output_size):
        """
        Constructor for QNetwork class
        """
        super(QNetwork, self).__init__()
        # Define layers
        self.fc1 = nn.Linear(input_size, hidden_size)   # Input to hidden layer 1
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # Hidden layer 1 to hidden layer 2
        self.fc3 = nn.Linear(hidden_size, output_size)  # Hidden layer 2 to output

    def forward(self, x):
        """
        Forward pass of the network
        """
        x = torch.relu(self.fc1(x))     # ReLU activation for hidden layer 1
        x = torch.relu(self.fc2(x))     # ReLU activation for hidden layer 2
        x = self.fc3(x)                 # Output layer (raw Q-values)
        return x
    
def choose_action_dqn(network, state, exploration_rate):
    """
    Function to choose an action using epsilon-greedy policy
    """
    if np.random.rand() < exploration_rate:
        # Choose a random action
        return np.random.randint(num_actions)
    else:
        # Choose the action with the highest Q-value
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        q_values = network(state).cpu().detach().numpy()
        return np.argmax(q_values)
    

In [12]:
"""
Initialize the DQN model for SARSA and Q-Learning
"""

# Hyperparameters
input_size = env.observation_space.shape[0]     # State size (4 for CartPole)
hidden_size = 64                                # Number of neurons in the hidden layers
output_size = env.action_space.n                # Number of actions (2 for CartPole)
learning_rate = 0.00001
min_exploration_rate = 0.01

# Initialize the DQN model
q_network_sarsa = QNetwork(input_size, hidden_size, output_size)
q_network_q_learning = QNetwork(input_size, hidden_size, output_size)

# Initialize the optimizer
optimizer_sarsa = optim.Adam(q_network_sarsa.parameters(), lr=learning_rate)
optimizer_q_learning = optim.Adam(q_network_q_learning.parameters(), lr=learning_rate)

# Initialize the loss function
loss_fn = nn.MSELoss()

In [13]:
def sarsa_dqn(env, num_episodes, exploration_rate, exploration_decay):
    """
    Function to perform SARSA with DQN
    """

    # Loop over episodes
    for episode in range(num_episodes):
        # Reset the environment and choose the initial action
        state, _ = env.reset()
        action = choose_action_dqn(q_network_sarsa, state, exploration_rate)
        total_reward = 0
        total_td_error = 0

        # Loop over steps within the episode
        done = False
        while not done:
            # Take a step in the environment and choose the next action
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_action = choose_action_dqn(q_network_sarsa, next_state, exploration_rate)

            # Compute target Q-value and predicted Q-value
            target_q_value = reward + discount_factor * q_network_sarsa(
                torch.FloatTensor(next_state).unsqueeze(0)
            )[0][next_action].item()
            predicted_q_value = q_network_sarsa(torch.FloatTensor(state).unsqueeze(0))[0][action]

            # Compute TD error
            td_error = target_q_value - predicted_q_value.item()
            
            # Update the Q-value using the TD error
            loss = loss_fn(predicted_q_value, torch.tensor(target_q_value))
            optimizer_sarsa.zero_grad()
            loss.backward()
            optimizer_sarsa.step()

            # Accumulate reward and TD error
            total_reward += reward
            total_td_error += abs(td_error)

            # Move to the next state and action
            state, action = next_state, next_action
            done = terminated or truncated

        # Decay exploration rate
        exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay)

        # Print metrics every 500 episodes
        if (episode + 1) % 500 == 0:
            print(f"SARSA Episode {episode + 1}: \tTotal Reward = {total_reward}, "
                  f"\tTotal TD Error = {total_td_error:.4f}, \tExploration Rate = {exploration_rate:.4f}")# Training parameters


In [14]:
# Training parameters for SARSA with DQN
num_episodes = 5000
initial_exploration_rate = 1.0      # Initial exploration rate
exploration_decay = 0.999           # Exploration decay rate
min_exploration_rate = 0.01         # Minimum exploration rate
discount_factor = 0.99              # Discount factor
learning_rate = 0.00001             # Learning rate

# Initialize the DQN model
print("\nRunning SARSA with Deep Q-Network Approximation...")
sarsa_dqn(env, num_episodes, initial_exploration_rate, exploration_decay)

# Close the environment
env.close()


Running SARSA with Deep Q-Network Approximation...
SARSA Episode 500: 	Total Reward = 11.0, 	Total TD Error = 19.3427, 	Exploration Rate = 0.6064
SARSA Episode 1000: 	Total Reward = 11.0, 	Total TD Error = 32.3000, 	Exploration Rate = 0.3677
SARSA Episode 1500: 	Total Reward = 14.0, 	Total TD Error = 68.3480, 	Exploration Rate = 0.2230
SARSA Episode 2000: 	Total Reward = 9.0, 	Total TD Error = 65.5374, 	Exploration Rate = 0.1352
SARSA Episode 2500: 	Total Reward = 10.0, 	Total TD Error = 105.1397, 	Exploration Rate = 0.0820
SARSA Episode 3000: 	Total Reward = 10.0, 	Total TD Error = 151.2536, 	Exploration Rate = 0.0497
SARSA Episode 3500: 	Total Reward = 8.0, 	Total TD Error = 178.5381, 	Exploration Rate = 0.0301
SARSA Episode 4000: 	Total Reward = 10.0, 	Total TD Error = 310.5276, 	Exploration Rate = 0.0183
SARSA Episode 4500: 	Total Reward = 9.0, 	Total TD Error = 394.0682, 	Exploration Rate = 0.0111
SARSA Episode 5000: 	Total Reward = 8.0, 	Total TD Error = 475.6613, 	Exploration R

In [15]:
def q_learning_dqn(env, num_episodes, exploration_rate, exploration_decay):
    """
    Function to perform Q-Learning with DQN
    """
    discount_factor = 0.99  # Set your discount factor here
    
    # Loop over episodes
    for episode in range(num_episodes):
        # Reset the environment and choose the initial action
        state, _ = env.reset()
        total_reward = 0
        total_td_error = 0

        # Loop over steps within the episode
        done = False
        while not done:
            # Choose the action
            action = choose_action_dqn(q_network_q_learning, state, exploration_rate)

            # Take a step in the environment
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Compute target Q-value using max Q-value for the next state (Q-learning update)
            next_q_values = q_network_q_learning(torch.FloatTensor(next_state).unsqueeze(0))
            max_next_q_value = torch.max(next_q_values).item()

            target_q_value = reward + discount_factor * max_next_q_value
            predicted_q_value = q_network_q_learning(torch.FloatTensor(state).unsqueeze(0))[0][action]

            # Compute TD error
            td_error = target_q_value - predicted_q_value.item()

            # Update the Q-value using the TD error
            loss = loss_fn(predicted_q_value, torch.tensor(target_q_value))
            optimizer_q_learning.zero_grad()
            loss.backward()
            optimizer_q_learning.step()

            # Accumulate reward and TD error
            total_reward += reward
            total_td_error += abs(td_error)

            # Move to the next state
            state = next_state
            done = terminated or truncated

        # Decay exploration rate
        exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay)

        if (episode + 1) % 500 == 0:
            print(f"Q-Learning Episode {episode + 1}: \tTotal Reward = {total_reward}, "
                  f"\tTotal TD Error = {total_td_error:.4f}, \tExploration Rate = {exploration_rate:.4f}")


In [18]:
# Training parameters for Q-Learning with DQN
num_episodes = 5000
initial_exploration_rate = 1.0      # Initial exploration rate
exploration_decay = 0.999           # Exploration decay rate
min_exploration_rate = 0.01         # Minimum exploration rate
discount_factor = 0.99              # Discount factor
learning_rate = 0.00001             # Learning rate

# Initialize the DQN model
print("\nRunning Q-Learning with Deep Q-Network Approximation...")
q_learning_dqn(env, num_episodes, initial_exploration_rate, exploration_decay)

# Close the environment
env.close()


Running Q-Learning with Deep Q-Network Approximation...
Q-Learning Episode 500: 	Total Reward = 21.0, 	Total TD Error = 11156.4584, 	Exploration Rate = 0.6064
Q-Learning Episode 1000: 	Total Reward = 9.0, 	Total TD Error = 6629.9751, 	Exploration Rate = 0.3677
Q-Learning Episode 1500: 	Total Reward = 11.0, 	Total TD Error = 6177.3909, 	Exploration Rate = 0.2230
Q-Learning Episode 2000: 	Total Reward = 8.0, 	Total TD Error = 5334.5876, 	Exploration Rate = 0.1352
Q-Learning Episode 2500: 	Total Reward = 9.0, 	Total TD Error = 11208.8033, 	Exploration Rate = 0.0820
Q-Learning Episode 3000: 	Total Reward = 9.0, 	Total TD Error = 7842.2059, 	Exploration Rate = 0.0497
Q-Learning Episode 3500: 	Total Reward = 10.0, 	Total TD Error = 9738.6653, 	Exploration Rate = 0.0301
Q-Learning Episode 4000: 	Total Reward = 11.0, 	Total TD Error = 12063.9040, 	Exploration Rate = 0.0183
Q-Learning Episode 4500: 	Total Reward = 8.0, 	Total TD Error = 10089.2021, 	Exploration Rate = 0.0111
Q-Learning Episode