<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/ReinforcementLearning_value_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gymnasium as gym
import numpy as np
import torch
from collections import defaultdict

In [3]:
# Create the environment
env = gym.make("FrozenLake-v1", is_slippery=False)  # deterministic version

# Define parameters
num_episodes = 5000
gamma = 0.99  # discount factor
learning_rate = 0.1



In [None]:
# Monte Carlo

In [4]:
# Initialize value function as torch tensor
state_size = env.observation_space.n
value_table = torch.zeros(state_size, dtype=torch.float32)

# Helper: generate an episode
def generate_episode(env):
    episode = []
    state, _ = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()  # random policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode.append((state, action, reward))
        state = next_state
    return episode

# Monte Carlo Prediction
returns_sum = defaultdict(float)
returns_count = defaultdict(int)

for episode_idx in range(num_episodes):
    episode = generate_episode(env)
    visited_states = set()
    G = 0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward
        if state not in visited_states:
            visited_states.add(state)
            returns_sum[state] += G
            returns_count[state] += 1
            value_table[state] = returns_sum[state] / returns_count[state]

# Print the learned value function
print("Learned Value Function (state-values):")
for s in range(state_size):
    print(f"State {s}: {value_table[s].item():.3f}")


Learned Value Function (state-values):
State 0: 0.017
State 1: 0.016
State 2: 0.032
State 3: 0.015
State 4: 0.018
State 5: 0.000
State 6: 0.066
State 7: 0.000
State 8: 0.039
State 9: 0.107
State 10: 0.198
State 11: 0.000
State 12: 0.000
State 13: 0.258
State 14: 0.514
State 15: 0.000


In [None]:
# TD learning

In [7]:
# Initialize value function as torch tensor
state_size = env.observation_space.n
value_table = torch.zeros(state_size, dtype=torch.float32)

# Helper: generate an episode
def generate_episode(env):
    episode = []
    state, _ = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()  # random policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode.append((state, action, reward))
        state = next_state
    return episode

# Monte Carlo Prediction
returns_sum = defaultdict(float)
returns_count = defaultdict(int)

for episode_idx in range(num_episodes):
    episode = generate_episode(env)
    visited_states = set()
    G = 0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward
        if state not in visited_states:
            visited_states.add(state)
            returns_sum[state] += G
            returns_count[state] += 1
            value_table[state] = returns_sum[state] / returns_count[state]

# Print the learned value function
print("Learned Value Function (state-values):")
for s in range(state_size):
    print(f"State {s}: {value_table[s].item():.3f}")


Learned Value Function (state-values):
State 0: 0.013
State 1: 0.008
State 2: 0.016
State 3: 0.003
State 4: 0.016
State 5: 0.000
State 6: 0.030
State 7: 0.000
State 8: 0.034
State 9: 0.084
State 10: 0.121
State 11: 0.000
State 12: 0.000
State 13: 0.172
State 14: 0.402
State 15: 0.000
