<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/reinforcementlearning_value_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import gymnasium as gym
import numpy as np
import torch
from collections import defaultdict

import torch.nn as nn
import torch.optim as optim

In [5]:
# Create the environment
env = gym.make("FrozenLake-v1", is_slippery=False)  # deterministic version

# Define parameters
num_episodes = 5000
gamma = 0.99  # discount factor
learning_rate = 0.1
alpha = 0.1  # learning rate

state_size = env.observation_space.n

In [None]:
# Monte Carlo

In [None]:
# Initialize value function as torch tensor
value_table = torch.zeros(state_size, dtype=torch.float32)

# Helper: generate an episode
def generate_episode(env):
    episode = []
    state, _ = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()  # random policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode.append((state, action, reward))
        state = next_state
    return episode

# Monte Carlo Prediction
returns_sum = defaultdict(float)
returns_count = defaultdict(int)

for episode_idx in range(num_episodes):
    episode = generate_episode(env)
    visited_states = set()
    G = 0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward
        if state not in visited_states:
            visited_states.add(state)
            returns_sum[state] += G
            returns_count[state] += 1
            value_table[state] = returns_sum[state] / returns_count[state]

# Print the learned value function
print("Learned Value Function (state-values):")
for s in range(state_size):
    print(f"State {s}: {value_table[s].item():.3f}")


Learned Value Function (state-values):
State 0: 0.017
State 1: 0.016
State 2: 0.032
State 3: 0.015
State 4: 0.018
State 5: 0.000
State 6: 0.066
State 7: 0.000
State 8: 0.039
State 9: 0.107
State 10: 0.198
State 11: 0.000
State 12: 0.000
State 13: 0.258
State 14: 0.514
State 15: 0.000


In [None]:
# TD learning

In [6]:
# Initialize value function as torch tensor
value_table = torch.zeros(state_size, dtype=torch.float32)
# Value function (as a torch tensor)
state_size = env.observation_space.n
value_table = torch.zeros(state_size, dtype=torch.float32)

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        action = env.action_space.sample()  # random policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # TD(0) update rule: V(s) ← V(s) + α [R + γ * V(s') - V(s)]
        td_target = reward + gamma * value_table[next_state]
        td_error = td_target - value_table[state]
        value_table[state] += alpha * td_error

        state = next_state

# Print the learned value function
print("Learned Value Function (TD):")
for s in range(state_size):
    print(f"State {s}: {value_table[s].item():.3f}")

Learned Value Function (TD):
State 0: 0.007
State 1: 0.005
State 2: 0.008
State 3: 0.006
State 4: 0.010
State 5: 0.000
State 6: 0.011
State 7: 0.000
State 8: 0.025
State 9: 0.051
State 10: 0.094
State 11: 0.000
State 12: 0.000
State 13: 0.098
State 14: 0.205
State 15: 0.000


In [None]:
# TD learning: neural network

In [None]:
# One-hot encoding for discrete states
def one_hot(state, num_states):
    vec = torch.zeros(num_states, dtype=torch.float32)
    vec[state] = 1.0
    return vec

# Define the value network
class ValueNetwork(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Output: V(s)
        )

    def forward(self, x):
        return self.net(x)

# Initialize network and optimizer
value_net = ValueNetwork(state_size)
optimizer = optim.Adam(value_net.parameters(), lr=0.01)

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        action = env.action_space.sample()  # random policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Prepare inputs
        s = one_hot(state, state_size)
        s_next = one_hot(next_state, state_size)

        # Compute TD target: R + γ * V(s')
        with torch.no_grad():
            target = reward + gamma * value_net(s_next)

        # Prediction and loss
        prediction = value_net(s)
        loss = nn.functional.mse_loss(prediction, target)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state

# Show learned value estimates
print("State-Value Estimates (using neural network):")
for s in range(state_size):
    v = value_net(one_hot(s, state_size)).item()
    print(f"State {s}: {v:.3f}")

State-Value Estimates (using neural network):
State 0: 0.008
State 1: 0.007
State 2: 0.013
State 3: 0.021
State 4: 0.008
State 5: -0.008
State 6: 0.009
State 7: 0.035
State 8: 0.019
State 9: -0.002
State 10: -0.014
State 11: -0.008
State 12: 0.037
State 13: -0.003
State 14: -0.040
State 15: 0.006
