# In [None]:  (Jupyter notebook cell prompt left over from export)
import gymnasium as gym
import numpy as np
import time
import argparse


def train_q_learning(env, num_episodes=10000, max_steps=100, alpha=0.1, gamma=0.99,
                     epsilon=1.0, epsilon_decay=0.999, min_epsilon=0.01):
    """
    Trains a tabular Q-learning agent on the given environment.

    The environment must expose discrete spaces (``observation_space.n`` and
    ``action_space.n``), ``reset()`` returning ``(state, info)`` and ``step()``
    returning ``(next_state, reward, terminated, truncated, info)``.

    Parameters:
        env: gym environment
        num_episodes: number of training episodes
        max_steps: maximum steps per episode
        alpha: learning rate
        gamma: discount factor
        epsilon: initial exploration rate
        epsilon_decay: decay rate for epsilon per episode
        min_epsilon: minimum exploration rate

    Returns:
        Q-table (numpy array) of shape (n_states, n_actions)
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))

    for episode in range(num_episodes):
        state, _ = env.reset()
        for _ in range(max_steps):
            # Choose action: epsilon-greedy policy
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            next_state, reward, terminated, truncated, _ = env.step(action)

            # Q-learning update. A terminated transition has no future return,
            # so the bootstrap term is dropped; a merely truncated episode
            # (time limit) still bootstraps from the next state's best value.
            if terminated:
                td_target = reward
            else:
                td_target = reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (td_target - Q[state, action])

            state = next_state
            if terminated or truncated:
                break

        # Decay epsilon once per episode, never going below min_epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)

        # Print progress every 1000 episodes
        if (episode + 1) % 1000 == 0:
            print(f"Episode {episode + 1}/{num_episodes} completed. Epsilon: {epsilon:.3f}")

    return Q

def run_trained_agent(env, Q, max_steps=100, sleep_time=0.5, render_mode="ansi"):
    """
    Runs a single episode using the trained Q-table and displays the frames.

    Parameters:
        env: gym environment
        Q: trained Q-table
        max_steps: maximum steps to run in the episode
        sleep_time: delay between frames in seconds
        render_mode: kept for backward compatibility. With the Gymnasium-style
            API this file uses, the render mode is chosen when the environment
            is created, so this value is not forwarded to ``env.render()``.

    Returns:
        total_reward: cumulative reward for the episode
        steps: number of steps taken
    """

    def _show_frame():
        # env.render() takes no arguments; it may return None when the
        # environment was created without a frame-returning render mode.
        frame = env.render()
        if frame is not None:
            print(frame)

    state, _ = env.reset()
    total_reward = 0
    steps = 0

    for step in range(max_steps):
        # Print the current frame and pause so the episode can be followed
        _show_frame()
        print(f"Step: {step+1}")
        time.sleep(sleep_time)

        # Select the best action (greedy policy)
        action = np.argmax(Q[state])
        next_state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        state = next_state
        steps += 1

        if terminated or truncated:
            # Show the final frame reached by the last action
            _show_frame()
            print(f"Episode finished after {steps} steps with total reward: {total_reward}")
            break

    return total_reward, steps

def main():
    """
    Command-line entry point: train a Q-learning agent on FrozenLake and
    replay a few greedy test episodes, printing per-episode and average stats.

    Command-line options:
        --map: FrozenLake map size, '4x4' or '8x8' (default '4x4')
        --episodes: number of training episodes (default 10000)
    """
    # Parse command line arguments for map selection and training episodes.
    parser = argparse.ArgumentParser(description="Q-learning agent for FrozenLake")
    parser.add_argument("--map", type=str, default="4x4", choices=("4x4", "8x8"),
                        help="Map size for FrozenLake ('4x4' or '8x8').")
    parser.add_argument("--episodes", type=int, default=10000,
                        help="Number of training episodes.")
    args = parser.parse_args()

    # Create the FrozenLake environment with the selected map. The render mode
    # must be set here so that env.render() returns a text frame during the
    # test episodes.
    try:
        env = gym.make("FrozenLake-v1", map_name=args.map, is_slippery=True,
                       render_mode="ansi")
    except Exception as e:
        print(f"Error creating environment: {e}")
        return

    # Ensure the environment is closed even if training or evaluation fails.
    try:
        print(f"Training on FrozenLake-{args.map} for {args.episodes} episodes...")
        Q = train_q_learning(env, num_episodes=args.episodes)
        print("Training completed!")

        # Run a few test episodes and gather statistics.
        num_test_episodes = 5
        rewards = []
        steps_taken = []
        for i in range(num_test_episodes):
            print(f"\n=== Running test episode {i+1} ===")
            reward, steps = run_trained_agent(env, Q)
            rewards.append(reward)
            steps_taken.append(steps)
            time.sleep(1)

        # Display overall stats.
        print("\n=== Test Episode Statistics ===")
        for i, (reward, steps) in enumerate(zip(rewards, steps_taken), start=1):
            print(f"Episode {i}: Reward = {reward}, Steps = {steps}")
        print(f"Average Reward: {np.mean(rewards):.2f}")
        print(f"Average Steps: {np.mean(steps_taken):.2f}")
    finally:
        env.close()

# Run the train-and-evaluate pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


# (end of exported notebook cell)