In [None]:
import sys
!{sys.executable} -m pip install gym==0.21.0
!{sys.executable} -m pip install numpy==1.19.5
!{sys.executable} -m pip install "gym[atari]"==0.21.0
!{sys.executable} -m pip install atari-py==0.2.9
!{sys.executable} -m pip install autorom[accept-rom-license]

In [None]:
!pip uninstall gymnasium
!pip install gym==0.21.0
!pip install numpy==1.23.5
!pip install gym[atari]
!pip install atari-py

^C
Collecting gym==0.21.0
  Using cached gym-0.21.0.tar.gz (1.5 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Collecting gym==0.21.0
  Using cached gym-0.21.0.tar.gz (1.5 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [3 lines of output]
      error in gym setup command: 'extras_require' must be a dictionary whose values are strings or lists of strings containing valid project/version requirement specifiers.
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

× Encountered error while generating package metadata.
╰─> See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.

[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Install AutoROM with its "accept-rom-license" extra
# (presumably what provides the Atari ROMs needed by Pong — TODO confirm).
!pip install autorom[accept-rom-license]

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

def _reset_observation(env):
    """Reset *env* and return only the observation, for either Gym API."""
    result = env.reset()
    # Gym >= 0.26 / Gymnasium return (observation, info); classic Gym returns
    # the bare observation. Requiring a dict in slot 1 avoids mistaking a
    # 2-tuple observation for the new-style return value.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_transition(env, action):
    """Step *env* and return (observation, reward, done) for either Gym API."""
    result = env.step(action)
    if len(result) == 5:
        # New API: the episode is over when it terminated OR was truncated.
        observation, reward, terminated, truncated, _info = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _info = result
    return observation, reward, done


def explore_environment(env_name, num_episodes=5, max_steps=1000):
    """
    Load environment, print state/action spaces, and run random agent.

    Both Gym calling conventions are supported: the classic one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    Gym >= 0.26 / Gymnasium one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env_name: Name of the Gym environment
        num_episodes: Number of episodes to run with random agent
        max_steps: Maximum steps per episode

    Returns:
        Dict with three lists: 'episode_rewards' (total reward per episode),
        'episode_lengths' (steps taken per episode) and 'reward_history'
        (every per-step reward, all episodes concatenated in order).
    """
    print(f"\n{'='*70}")
    print(f"EXPLORING ENVIRONMENT: {env_name}")
    print(f"{'='*70}\n")

    # Load environment
    env = gym.make(env_name)

    episode_rewards = []
    episode_lengths = []
    state_samples = []
    reward_history = []

    # try/finally so the environment is released even if a step or a print
    # raises part-way through.
    try:
        # Print State Space Information
        print("STATE SPACE:")
        print(f"  Type: {type(env.observation_space)}")
        print(f"  Shape: {env.observation_space.shape}")

        if hasattr(env.observation_space, 'low') and hasattr(env.observation_space, 'high'):
            print(f"  Low bounds: {env.observation_space.low}")
            print(f"  High bounds: {env.observation_space.high}")

        # Print Action Space Information
        print("\nACTION SPACE:")
        print(f"  Type: {type(env.action_space)}")
        if isinstance(env.action_space, gym.spaces.Discrete):
            print(f"  Number of actions: {env.action_space.n}")
        else:
            print(f"  Shape: {env.action_space.shape}")

        # Run random agent
        print(f"\n{'='*70}")
        print("RANDOM AGENT ANALYSIS")
        print(f"{'='*70}\n")

        for episode in range(num_episodes):
            state = _reset_observation(env)

            episode_reward = 0
            steps = 0
            episode_reward_list = []

            # Store initial state. np.array(copy=True) snapshots arrays and
            # also copes with scalar observations, which have no .copy().
            if episode == 0:
                state_samples.append(np.array(state, copy=True))

            for step in range(max_steps):
                # Random action
                action = env.action_space.sample()
                next_state, reward, done = _step_transition(env, action)

                episode_reward += reward
                episode_reward_list.append(reward)
                steps += 1

                # Store sample states
                if episode == 0 and step < 5:
                    state_samples.append(np.array(next_state, copy=True))

                if done:
                    break

            episode_rewards.append(episode_reward)
            episode_lengths.append(steps)
            reward_history.extend(episode_reward_list)

            # Guard the average: max_steps=0 yields a zero-length episode.
            average_reward = episode_reward / steps if steps else 0.0

            print(f"Episode {episode + 1}:")
            print(f"  Total Reward: {episode_reward:.2f}")
            print(f"  Episode Length: {steps} steps")
            print(f"  Average Reward per Step: {average_reward:.4f}")

        # Summary Statistics
        print(f"\n{'='*70}")
        print("SUMMARY STATISTICS")
        print(f"{'='*70}\n")

        if episode_rewards:
            print(f"Average Episode Reward: {np.mean(episode_rewards):.2f} ± {np.std(episode_rewards):.2f}")
            print(f"Average Episode Length: {np.mean(episode_lengths):.1f} ± {np.std(episode_lengths):.1f}")
            print(f"Min/Max Episode Reward: {np.min(episode_rewards):.2f} / {np.max(episode_rewards):.2f}")
            print(f"Min/Max Episode Length: {np.min(episode_lengths)} / {np.max(episode_lengths)}")
        else:
            # np.min/np.max raise on empty input, so report instead of crashing.
            print("No episodes were run.")

        # Reward distribution: one pass gives both the distinct values and
        # how often each occurred.
        unique_rewards, reward_counts = np.unique(reward_history, return_counts=True)
        total_rewards = len(reward_history)
        print(f"\nUnique Reward Values: {unique_rewards}")
        print("Reward Distribution:")
        for reward_val, count in zip(unique_rewards, reward_counts):
            count = int(count)
            percentage = (count / total_rewards) * 100
            print(f"  Reward {reward_val:6.1f}: {count:5d} times ({percentage:5.1f}%)")

        # Sample states
        print("\nSample States (first 3):")
        for i, sample in enumerate(state_samples[:3]):
            if sample.size <= 10:  # For small state spaces
                print(f"  State {i}: {sample}")
            else:  # For large state spaces (like images)
                print(f"  State {i}: Shape {sample.shape}, "
                      f"Range [{sample.min():.2f}, {sample.max():.2f}], "
                      f"Mean {sample.mean():.2f}")
    finally:
        env.close()

    return {
        'episode_rewards': episode_rewards,
        'episode_lengths': episode_lengths,
        'reward_history': reward_history
    }

def plot_results(mountaincar_results, pong_results):
    """
    Draw a 2x2 comparison figure, save it as a PNG and display it.

    The top row shows MountainCar-v0 and the bottom row Pong-v0; the left
    column plots total reward per episode and the right column episode length.

    Args:
        mountaincar_results: Mapping with 'episode_rewards' and
            'episode_lengths' sequences for MountainCar-v0
        pong_results: Mapping with the same two keys for Pong-v0
    """
    _figure, grid = plt.subplots(2, 2, figsize=(14, 10))

    # One entry per panel:
    # (grid position, results mapping, key, y-axis label, title, extra plot kwargs).
    # The first panel passes no colour so it keeps matplotlib's default.
    panels = (
        ((0, 0), mountaincar_results, 'episode_rewards',
         'Total Reward', 'MountainCar-v0: Episode Rewards', {}),
        ((0, 1), mountaincar_results, 'episode_lengths',
         'Episode Length', 'MountainCar-v0: Episode Lengths', {'color': 'orange'}),
        ((1, 0), pong_results, 'episode_rewards',
         'Total Reward', 'Pong-v0: Episode Rewards', {'color': 'green'}),
        ((1, 1), pong_results, 'episode_lengths',
         'Episode Length', 'Pong-v0: Episode Lengths', {'color': 'red'}),
    )

    for position, results, key, y_label, title, extra_style in panels:
        panel = grid[position]
        panel.plot(results[key], marker='o', **extra_style)
        panel.set_xlabel('Episode')
        panel.set_ylabel(y_label)
        panel.set_title(title)
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('environment_comparison.png', dpi=150, bbox_inches='tight')
    print("\nPlot saved as 'environment_comparison.png'")
    plt.show()

# Script entry point: explore both environments with a random agent, plot the
# comparison when both runs succeeded, then print the written observations.
if __name__ == "__main__":
    print("\n" + "="*70)
    print("GYM ENVIRONMENT EXPLORATION FOR DQN")
    print("="*70)
    
    # Explore MountainCar-v0
    mountaincar_results = explore_environment('MountainCar-v0', num_episodes=5, max_steps=200)
    
    # Explore Pong-v0 (fewer episodes due to longer runtime)
    print("\n" + "="*70)
    print("Note: Pong-v0 episodes can be very long. Running with reduced episodes.")
    print("="*70)
    try:
        pong_results = explore_environment('Pong-v0', num_episodes=3, max_steps=2000)
    except gym.error.Error as exc:
        # Pong is only available when the Atari extras and ROMs are installed.
        # Without them gym.make raises (e.g. NameNotFound); report that and
        # carry on instead of losing the rest of the run.
        pong_results = None
        print(f"\nCould not load Pong-v0: {exc}")
        print("Install the Atari extras and ROMs "
              "(e.g. gym[atari] and autorom[accept-rom-license]) and re-run.")

    # The comparison figure needs both result sets.
    if pong_results is not None:
        plot_results(mountaincar_results, pong_results)
    else:
        print("\nSkipping comparison plot because Pong-v0 results are unavailable.")
    
    # Key Observations
    print("\n" + "="*70)
    print("KEY OBSERVATIONS")
    print("="*70)
    print("""
MountainCar-v0:
- State Space: 2D continuous (position, velocity)
  * Position: [-1.2, 0.6] (negative = left, positive = right)
  * Velocity: [-0.07, 0.07] (negative = leftward, positive = rightward)
- Action Space: Discrete with 3 actions
  * Action 0: Push Left
  * Action 1: No Push (coast)
  * Action 2: Push Right
- Reward Structure: -1 for each time step until goal is reached at position 0.5
- Challenge: Sparse rewards make learning difficult; agent must learn
  to build momentum by going back and forth
- Episode Length: Maximum 200 steps
- Random Agent Performance: Always gets reward of -200 (never reaches goal)
- Key Learning Challenge: Delayed reward problem - actions early in episode
  affect ability to reach goal later

Pong-v0:
- State Space: High-dimensional (210x160x3 RGB image pixels = 100,800 values!)
  * Each pixel has 3 color channels (Red, Green, Blue)
  * Values range from 0-255
- Action Space: Discrete with 6 actions
  * Action 0: NOOP (no operation)
  * Action 1: FIRE (not useful in Pong)
  * Action 2: Move paddle UP
  * Action 3: Move paddle DOWN
  * Actions 4, 5: RIGHT-FIRE, LEFT-FIRE (not useful in Pong)
  * Effectively only actions 0, 2, 3 are needed
- Reward Structure: 
  * +1 for winning a volley (opponent misses ball)
  * -1 for losing a volley (agent misses ball)
  * 0 for most time steps (when ball is in play)
- Challenge: High-dimensional visual input requires preprocessing
- Episode Length: Varies (game ends when one player reaches 21 points)
- Random Agent Performance: Typically loses badly (around -21 total reward)

Recommended Preprocessing for Pong (for DQN):
1. Convert RGB to grayscale (reduces from 3 channels to 1)
2. Downsample image (e.g., 84x84 instead of 210x160)
3. Stack 4 consecutive frames to capture motion/velocity
4. Frame skipping (e.g., repeat action for 4 frames) to reduce computation
5. Normalize pixel values to [0, 1] range

These preprocessing steps reduce state space from 100,800 to ~28,224 values
(84x84x4) and help the network learn motion patterns.
""")


GYM ENVIRONMENT EXPLORATION FOR DQN

EXPLORING ENVIRONMENT: MountainCar-v0

STATE SPACE:
  Type: <class 'gymnasium.spaces.box.Box'>
  Shape: (2,)
  Low bounds: [-1.2  -0.07]
  High bounds: [0.6  0.07]

ACTION SPACE:
  Type: <class 'gymnasium.spaces.discrete.Discrete'>
  Number of actions: 3

RANDOM AGENT ANALYSIS

Episode 1:
  Total Reward: -200.00
  Episode Length: 200 steps
  Average Reward per Step: -1.0000
Episode 2:
  Total Reward: -200.00
  Episode Length: 200 steps
  Average Reward per Step: -1.0000
Episode 3:
  Total Reward: -200.00
  Episode Length: 200 steps
  Average Reward per Step: -1.0000
Episode 4:
  Total Reward: -200.00
  Episode Length: 200 steps
  Average Reward per Step: -1.0000
Episode 5:
  Total Reward: -200.00
  Episode Length: 200 steps
  Average Reward per Step: -1.0000

SUMMARY STATISTICS

Average Episode Reward: -200.00 ± 0.00
Average Episode Length: 200.0 ± 0.0
Min/Max Episode Reward: -200.00 / -200.00
Min/Max Episode Length: 200 / 200

Unique Reward Values

NameNotFound: Environment `Pong` doesn't exist.