# TD算法
## 1.游戏环境

In [94]:
import random


# 简单环境
class SimpleEnvironment:
    """One-dimensional toy world: walk along the x axis from x=0 to x=3.

    Attributes:
        state_space: The four positions 0, 1, 2 and 3.
        action_space: 0 keeps the agent where it is, 1 moves one step in +x.
        rewards: Maps (state, action) to the immediate reward. Pairs that are
            not listed (for example anything taken from state 3) yield 0
            in step().
    """

    def __init__(self):
        self.state_space = [0, 1, 2, 3]
        self.action_space = [0, 1]

        # Standing still costs -10 from every listed position, while moving
        # right pays more the closer the agent already is to x=3.
        move_right_reward = (1, 10, 50)
        self.rewards = {}
        for position, gain in enumerate(move_right_reward):
            self.rewards[(position, 0)] = -10
            self.rewards[(position, 1)] = gain

    def reset(self):
        """Return the starting state, which is always 0."""
        return 0

    def step(self, state, action):
        """Apply ``action`` from ``state`` and report the outcome.

        Args:
            state: Position the agent is currently at.
            action: Value added to the position (0 = stay, 1 = move right).

        Returns:
            A ``(next_state, reward, done)`` tuple. ``next_state`` is clamped
            to the range 0..3, ``reward`` is looked up by ``(state, action)``
            with 0 as the fallback, and ``done`` is True once ``next_state``
            equals 3.
        """
        # The action doubles as the displacement; keep the result inside 0..3.
        landed = max(min(state + action, 3), 0)
        payoff = self.rewards.get((state, action), 0)
        return landed, payoff, landed == 3

## 2.环境初始化

In [95]:
# State-value table (V table): one float entry per state, all starting at zero.
# The TD routine in this file only estimates state values; it does not
# produce action values.
import numpy as np

V = np.zeros(shape=(4,), dtype=np.float64)

## 3.TD算法实现
TD(0) 状态值更新公式（方括号内的 $r_{t+1}+\gamma v_t(s_{t+1})$ 为 TD 目标，$\alpha_t(s_t)$ 为学习率）：
$$v_{t+1}(s_t)=v_t(s_t)-\alpha_t(s_t)[v_t(s_t)-[r_{t+1}+\gamma v_t(s_{t+1})]]$$

In [96]:
def td_learning(env, V, alpha=0.1, gamma=0.9, episodes=30, *, stay_prob=0.1):
    """Run tabular TD(0) value updates while following a fixed random policy.

    At every step the agent picks action 0 with probability ``stay_prob`` and
    action 1 otherwise, then applies the update

        V[s] <- V[s] - alpha * (V[s] - (r + gamma * V[s']))

    The entry of a state is only updated when the agent acts from it, so a
    state that is only ever reached as the final ``next_state`` of an episode
    keeps whatever value it had on entry.

    Args:
        env: Object providing ``reset()`` (returns the start state) and
            ``step(state, action)`` (returns ``(next_state, reward, done)``).
        V: Mutable table of state values indexed by state. It is updated
            IN PLACE.
        alpha: Learning rate.
        gamma: Discount factor.
        episodes: Number of episodes to run.
        stay_prob: Probability of choosing action 0 at each step. Keyword-only;
            the default 0.1 is the value that used to be hard-coded. Note that
            if action 0 never makes progress in ``env``, a value of 1.0 or
            more would keep an episode from ever finishing.

    Returns:
        The same ``V`` object that was passed in, after all updates.
    """
    for _ in range(episodes):
        state = env.reset()
        done = False

        while not done:
            # Exactly one random draw per step decides between the two actions.
            action = 0 if random.random() < stay_prob else 1

            next_state, reward, done = env.step(state, action)

            # Move V[state] a fraction alpha of the way towards the TD target.
            td_target = reward + gamma * V[next_state]
            V[state] = V[state] - alpha * (V[state] - td_target)

            state = next_state

    return V

## 4.训练与结果展示

In [97]:
# Build the environment and run TD learning on the V table created earlier.
# td_learning updates V in place and returns that same object.
env = SimpleEnvironment()
V = td_learning(env=env, V=V)
# Bare expression: as the last line of a notebook cell it displays the values.
V

array([30.12332312, 43.85805089, 45.1838773 ,  0.        ])