In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import DQN,PPO
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv

In [None]:
"""
This notebook demonstrates how to train a PPO expert on the CartPole-v1 environment using Stable Baselines3,
and then how to clone that expert with behavioral cloning (BC) from the `imitation` library.
It includes the setup of the environment, training the agent, and evaluating its performance.
"""

In [None]:
# Create the CartPole-v1 environment with render_mode="human" (on-screen rendering).
env = gym.make("CartPole-v1",render_mode="human")


In [None]:
# Smoke-test the environment by driving it with uniformly random actions.
# Gymnasium's reset() returns an (observation, info) pair, so unpack it instead
# of binding the whole tuple to `obs`.
obs, info = env.reset()
try:
    # No explicit env.render() call: with render_mode="human" rendering is
    # handled automatically by the environment.
    for _ in range(1000):
        action = env.action_space.sample()  # Random action
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        if done:
            # Episode finished (terminated or truncated): start a new one.
            obs, info = env.reset()
finally:
    # Always release the environment, even if the loop is interrupted.
    env.close()

In [None]:
# Train the PPO expert on a dedicated, non-rendering environment. The `env`
# created earlier uses render_mode="human" and has already been closed by the
# random-rollout cell, so it should not be reused for training.
train_env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", train_env, verbose=1)

# Train the agent for 35_000 timesteps
model.learn(total_timesteps=35_000)

# Save the trained model
model.save("models/ppo_cartpole_expert")

In [None]:
# Show the current working directory; the relative "models/..." paths used in
# this notebook resolve against it.
!pwd

In [None]:
# Re-save the PPO model to the same path used right after training
# (this overwrites that file).
model.save("models/ppo_cartpole_expert")

In [None]:
# Reload the saved PPO model from disk; later cells use `expert_model` for
# action prediction and for collecting demonstrations.
expert_model = PPO.load("models/ppo_cartpole_expert")

In [None]:
# Create a fresh rendering environment for watching the expert
# (the earlier `env` was closed after the random rollout).
env = gym.make("CartPole-v1",render_mode="human")

In [None]:
# Reset the environment. The return value is not stored here; the next cell
# calls reset() again and keeps the observation.
env.reset()

In [None]:
# Roll out a single episode with the trained expert and report its return.
obs, _ = env.reset()
g_reward = 0
done = False
try:
    # Rendering is handled automatically with render_mode="human", so no
    # explicit env.render() call is needed.
    while not done:
        # Predict action (deterministic=True is passed so the same observation
        # always yields the same action)
        action, _states = expert_model.predict(obs, deterministic=True)

        # Step in environment
        obs, reward, terminated, truncated, info = env.step(action)
        g_reward += reward

        # Stop as soon as the episode ends; stepping further without a reset
        # is not valid, and without this check the loop would never exit.
        done = terminated or truncated
finally:
    # Release the environment even if the rollout is interrupted.
    env.close()

# Report the accumulated episode return once, instead of on every step.
print(f"Reward: {g_reward}")

In [None]:
# Close the environment that was opened with render_mode="human".
env.close()

In [None]:
from imitation.algorithms.bc import BC
from imitation.data import rollout

In [None]:
# Seeded random generator, shared by rollout collection and the BC trainer
rng = np.random.default_rng(seed=0)

# Single-environment vectorised CartPole (no render_mode) for the expert to act in
vec_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# Termination criterion for sampling: at least 50 episodes, no timestep minimum
sample_until_50_episodes = rollout.make_sample_until(min_timesteps=None, min_episodes=50)

# Gather the expert demonstrations
rollouts = rollout.rollout(
    expert_model,
    vec_env,
    sample_until_50_episodes,
    rng=rng,
    unwrap=False,
)

In [None]:
# Inspect the policy aliases (such as "MlpPolicy") that PPO accepts by name.
PPO.policy_aliases

In [None]:
# Look up what the "MlpPolicy" alias, as passed to PPO(...), resolves to.
PPO.policy_aliases["MlpPolicy"]

In [None]:
# Build the behavioural-cloning trainer: spaces come from the vectorised
# environment, training data from the collected expert rollouts.
bc_trainer = BC(
    rng=rng,
    demonstrations=rollouts,
    batch_size=32,
    action_space=vec_env.action_space,
    observation_space=vec_env.observation_space,
)

In [None]:

# Train the BC policy on the demonstrations given to the trainer, for 50 epochs.
bc_trainer.train(n_epochs=50)  # You can increase n_epochs for better results


In [None]:

# Save the trained BC policy to a relative path under models/.
# NOTE(review): assumes the models/ directory already exists at this point
# (the PPO save earlier targets the same directory) — TODO confirm.
bc_trainer.policy.save("models/bc_cartpole_policy")

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Evaluate the cloned policy over 10 rendered episodes.
env = gym.make("CartPole-v1", render_mode="human")
try:
    # evaluate_policy returns (mean episode reward, std of episode reward);
    # keep both instead of discarding the spread.
    reward, reward_std = evaluate_policy(bc_trainer.policy, env, n_eval_episodes=10)
finally:
    # Release the rendering environment even if evaluation fails.
    env.close()
print("Reward:", reward)
print("Reward std:", reward_std)