# Value Function Approximation

Notes from this [lecture](https://www.youtube.com/watch?v=UoPei5o4fps&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ&index=6)

Slides are [here](https://davidstarsilver.wordpress.com/wp-content/uploads/2025/04/lecture-6-value-function-approximation-.pdf).



- So far we looked at the value function as a look up table
- There are too many states / actions
- Solution, approximate the value function, estimate v everywhere


$$
\hat{v}(s, \mathbf{w}) \approx v_{\pi}(s)\\
\hat{q}(s, a, \mathbf{w}) \approx q_{\pi}(s, a)
$$

- We could use any function approximater, examples: Neural Networks, decision trees, linear combinations of features.
- The training data need to be non-stationary, non-iid data
- could use gradient descent from 

In [4]:
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# ----------------------------
# Q-Network
# ----------------------------
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, action_dim)
        )

    def forward(self, x):
        return self.net(x)

# ----------------------------
# Hyperparameters
# ----------------------------
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

lr = 1e-3
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995
batch_size = 64
buffer_size = 10_000
episodes = 200

replay_buffer = deque(maxlen=buffer_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

q_net = DQN(state_dim, action_dim).to(device)
optimizer = optim.Adam(q_net.parameters(), lr=lr)
loss_fn = nn.MSELoss()

# ----------------------------
# Helper: select action
# ----------------------------
def select_action(state, epsilon):
    if random.random() < epsilon:
        return env.action_space.sample()  # Explore
    else:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = q_net(state)
        return torch.argmax(q_values).item()

# ----------------------------
# Training Loop
# ----------------------------
for episode in range(episodes):
    state, info = env.reset()
    episode_reward = 0
    done = False

    while not done:
        # Choose action
        action = select_action(state, epsilon)

        # Take step
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        replay_buffer.append((state, action, reward, next_state, done))

        state = next_state
        episode_reward += reward

        # Train
        if len(replay_buffer) > batch_size:
            minibatch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)

            states = torch.FloatTensor(states).to(device)
            actions = torch.LongTensor(actions).unsqueeze(1).to(device)
            rewards = torch.FloatTensor(rewards).to(device)
            next_states = torch.FloatTensor(next_states).to(device)
            dones = torch.FloatTensor(dones).to(device)

            # Q(s,a)
            q_values = q_net(states).gather(1, actions).squeeze()

            # Target = r + gamma * max_a' Q(next_state)
            with torch.no_grad():
                max_next_q = q_net(next_states).max(1)[0]
                targets = rewards + gamma * max_next_q * (1 - dones)

            loss = loss_fn(q_values, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Epsilon decay
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode {episode}, Reward: {episode_reward}, Epsilon: {epsilon:.3f}")


Episode 0, Reward: 21.0, Epsilon: 0.995
Episode 1, Reward: 23.0, Epsilon: 0.990
Episode 2, Reward: 13.0, Epsilon: 0.985
Episode 3, Reward: 11.0, Epsilon: 0.980
Episode 4, Reward: 18.0, Epsilon: 0.975
Episode 5, Reward: 13.0, Epsilon: 0.970
Episode 6, Reward: 27.0, Epsilon: 0.966
Episode 7, Reward: 20.0, Epsilon: 0.961
Episode 8, Reward: 15.0, Epsilon: 0.956
Episode 9, Reward: 10.0, Epsilon: 0.951


  states = torch.FloatTensor(states).to(device)


Episode 10, Reward: 32.0, Epsilon: 0.946
Episode 11, Reward: 34.0, Epsilon: 0.942
Episode 12, Reward: 20.0, Epsilon: 0.937
Episode 13, Reward: 20.0, Epsilon: 0.932
Episode 14, Reward: 13.0, Epsilon: 0.928
Episode 15, Reward: 12.0, Epsilon: 0.923
Episode 16, Reward: 17.0, Epsilon: 0.918
Episode 17, Reward: 46.0, Epsilon: 0.914
Episode 18, Reward: 18.0, Epsilon: 0.909
Episode 19, Reward: 16.0, Epsilon: 0.905
Episode 20, Reward: 37.0, Epsilon: 0.900
Episode 21, Reward: 21.0, Epsilon: 0.896
Episode 22, Reward: 24.0, Epsilon: 0.891
Episode 23, Reward: 20.0, Epsilon: 0.887
Episode 24, Reward: 30.0, Epsilon: 0.882
Episode 25, Reward: 12.0, Epsilon: 0.878
Episode 26, Reward: 12.0, Epsilon: 0.873
Episode 27, Reward: 20.0, Epsilon: 0.869
Episode 28, Reward: 20.0, Epsilon: 0.865
Episode 29, Reward: 23.0, Epsilon: 0.860
Episode 30, Reward: 11.0, Epsilon: 0.856
Episode 31, Reward: 16.0, Epsilon: 0.852
Episode 32, Reward: 15.0, Epsilon: 0.848
Episode 33, Reward: 35.0, Epsilon: 0.843
Episode 34, Rewa

In [14]:
import imageio
frames = []

state, info = render_env.reset()
done = False

while not done:
    # Greedy action
    state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
    with torch.no_grad():
        action = torch.argmax(q_net(state_tensor)).item()

    next_state, reward, terminated, truncated, info = render_env.step(action)
    done = terminated or truncated

    # Collect frame
    frame = render_env.render()
    frames.append(frame)

    state = next_state

render_env.close()

imageio.mimsave("cartpole_policy.mp4", frames, fps=30)
print("Saved to cartpole_policy.mp4")




Saved to cartpole_policy.mp4


In [15]:
from IPython.display import Video
Video("cartpole_policy.mp4", embed=True)
