In [1]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv  


In [5]:
import gym
import numpy as np
import random


episodes = 100

def evaluate_policy(env, policy_func, episodes=100):
    total_reward = 0
    for _ in range(episodes):
        state, _ = env.reset()  
        done = False
        episode_reward = 0
        while not done:
            action = policy_func(state)
            state, reward, done, truncated, info = env.step(action) 
            episode_reward += reward
            if done or truncated:
                break
        total_reward += episode_reward
    average_reward = total_reward / episodes
    return average_reward

# Define Angle-Based Policy
def angle_based_policy(state):
    angle = state[2]  # The pole's angle is at index 2
    return 0 if angle < 0 else 1

# Define Position-Based Policy
def position_based_policy(state):
    position = state[0]  # The cart's position is at index 0
    return 0 if position < 0 else 1

# Define Velocity-Based Policy
def velocity_based_policy(state):
    velocity = state[1]  # The cart's velocity is at index 1
    return 0 if velocity < 0 else 1

# Define Combined Policy (Angle + Velocity)
def combined_policy(state):
    angle = state[2]  # The pole's angle is at index 2
    velocity = state[1]  # The cart's velocity is at index 1
    if angle < 0 and velocity < 0:
        return 0  # Move left
    elif angle > 0 and velocity > 0:
        return 1  # Move right
    else:
        return random.choice([0, 1])  # Choose randomly

# Create the CartPole environment
env = gym.make('CartPole-v1')

# Evaluate each policy
angle_policy_reward = evaluate_policy(env, angle_based_policy, episodes)
position_policy_reward = evaluate_policy(env, position_based_policy, episodes)
velocity_policy_reward = evaluate_policy(env, velocity_based_policy, episodes)
combined_policy_reward = evaluate_policy(env, combined_policy, episodes)

# Print the performance of each policy
print(f"Angle-Based Policy Average Reward: {angle_policy_reward}")
print(f"Position-Based Policy Average Reward: {position_policy_reward}")
print(f"Velocity-Based Policy Average Reward: {velocity_policy_reward}")
print(f"Combined Policy Average Reward: {combined_policy_reward}")

env.close()

Angle-Based Policy Average Reward: 42.42
Position-Based Policy Average Reward: 9.34
Velocity-Based Policy Average Reward: 9.45
Combined Policy Average Reward: 15.13


In [3]:
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env

# Create the CartPole environment with render_mode='human'
env = gym.make('CartPole-v1', render_mode='human')

# Wrap the environment in VecEnv
env = make_vec_env(lambda: env, n_envs=1)

# Define the DQN model with 2 hidden layers (24 units each)
model = DQN('MlpPolicy', env, verbose=1, learning_rate=0.001, policy_kwargs={'net_arch': [24, 24]})

# Train the model
model.learn(total_timesteps=10000)
model.save("dqn_cartpole")

# Load the trained model
model = DQN.load("dqn_cartpole", env=env)

# Evaluate the trained agent
total_reward = 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action, _states = model.predict(state)
        state, reward, done, info = env.step(action)
        episode_reward += reward
    total_reward += episode_reward

# average reward over 100 episodes
average_reward = total_reward / episodes
print(f'Average Reward over {episodes} episodes: {average_reward}')

# Visualize the trained agent for 20 episodes
for _ in range(20):
    state = env.reset()
    done = False
    while not done:
        env.render()
        action, _states = model.predict(state)
        state, reward, done, info = env.step(action)

env.close()




Using cpu device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 22.5     |
|    ep_rew_mean      | 22.5     |
|    exploration_rate | 0.914    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 26       |
|    time_elapsed     | 3        |
|    total_timesteps  | 90       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23.8     |
|    ep_rew_mean      | 23.8     |
|    exploration_rate | 0.819    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 33       |
|    time_elapsed     | 5        |
|    total_timesteps  | 190      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.535    |
|    n_updates        | 22       |
----------------------------------
----------------------------------
| rollout/            |          |
|  