In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# 设置环境
env = gym.make('MountainCar-v0')
env = env.unwrapped # 解除环境200步的限制

# 参数设置
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
learning_rate = 1e-2
gamma = 0.99
num_episodes = 1000

# 定义策略网络
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 16),  # 隐藏层 16 单元
            nn.ReLU(),
            nn.Linear(16, action_dim),  # 输出层：动作的 logits
            nn.Softmax(dim=-1)  # 通过 Softmax 输出动作的概率
        )
        # 初始化权重，两个全连接层的权重矩阵初始值服从均值为0、标准差为0.01的正态分布，偏置量初始值为0.1
        nn.init.normal_(self.fc[0].weight, mean=0, std=0.01)
        nn.init.constant_(self.fc[0].bias, 0.1)
        nn.init.normal_(self.fc[2].weight, mean=0, std=0.01)
        nn.init.constant_(self.fc[2].bias, 0.1)
        
    def forward(self, state):
        return self.fc(state)
    
    # 计算折扣累积奖励
def compute_returns(rewards, gamma=0.99):
    returns = []
    R = 0
    for r in reversed(rewards):  # 反向遍历奖励，计算回报
        R = r + gamma * R
        returns.insert(0, R)  # 插入回报
    return returns

def compute_returns_standard(rewards, gamma=0.99):
    '''
    计算折扣累积奖励，并标准化
    '''
    returns = []
    R = 0
    for r in reversed(rewards):  # 反向遍历奖励，计算回报
        R = r + gamma * R
        returns.insert(0, R)  # 插入回报
    returns = np.array(returns)
    # 标准化
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns

In [1]:
# 初始化策略网络和优化器
policy = PolicyNetwork(state_dim, action_dim)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# 训练 REINFORCE
for episode in range(num_episodes):
    state = env.reset()
    log_probs = []
    rewards = []
    done = False
    step_count = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0)  # 转换为张量
        action_probs = policy(state)  # 策略网络输出动作概率
        action_dist = torch.distributions.Categorical(action_probs)  # 采样分布
        action = action_dist.sample()  # 采样动作
        
        log_probs.append(action_dist.log_prob(action))  # 记录 log 概率
        
        next_state, reward, done, _ = env.step(action.item())  # 执行动作
        step_count += 1
        rewards.append(reward)  # 记录奖励
        
        state = next_state
        # 如果到达目标位置
        '''if next_state[0] >= env.unwrapped.goal_position:
            done = True'''
    
    # 计算折扣累积奖励
    returns = compute_returns_standard(rewards, gamma)
    returns = torch.FloatTensor(returns)
    
    # 计算损失
    loss = 0
    for log_prob, R in zip(log_probs, returns):
        loss -= log_prob * R  # 策略梯度的损失函数
    
    # 更新策略网络
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # 输出训练信息
    if episode % 10 == 0:
        print(f"Episode {episode}, Total Reward: {sum(rewards):.2f}")
print(f"Episode {episode}, Total Reward: {sum(rewards):.2f}")
print("Training Complete")
env.close()


Episode 0, Total Reward: -105014.00
Episode 10, Total Reward: -7100.00
Episode 20, Total Reward: -1202.00
Episode 30, Total Reward: -3789.00
Episode 40, Total Reward: -11612.00
Episode 50, Total Reward: -8588.00
Episode 60, Total Reward: -2556.00
Episode 70, Total Reward: -12977.00
Episode 80, Total Reward: -5296.00
Episode 90, Total Reward: -5257.00
Episode 100, Total Reward: -4896.00
Episode 110, Total Reward: -11711.00
Episode 120, Total Reward: -5066.00
Episode 130, Total Reward: -4560.00
Episode 140, Total Reward: -4943.00
Episode 150, Total Reward: -5852.00
Episode 160, Total Reward: -4622.00
Episode 170, Total Reward: -3980.00
Episode 180, Total Reward: -4332.00
Episode 190, Total Reward: -2204.00
Episode 200, Total Reward: -1871.00
Episode 210, Total Reward: -1602.00
Episode 220, Total Reward: -3401.00
Episode 230, Total Reward: -2580.00
Episode 240, Total Reward: -2702.00
Episode 250, Total Reward: -2522.00
Episode 260, Total Reward: -2042.00
Episode 270, Total Reward: -949.00

In [2]:
# 修改采样逻辑为截断轨迹更新
max_steps = 200  # 每次截断的最大步数
# 初始化策略网络和优化器
policy_nsteps = PolicyNetwork(state_dim, action_dim)
optimizer_nsteps = optim.Adam(policy_nsteps.parameters(), lr=learning_rate)

for episode in range(num_episodes):
    state = env.reset()
    log_probs = []
    rewards = []
    done = False
    step = 0
    
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0)
        action_probs = policy_nsteps(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        
        log_probs.append(action_dist.log_prob(action))
        next_state, reward, done, _ = env.step(action.item())
        rewards.append(reward)
        state = next_state
        step += 1
        
        # 每 max_steps 更新一次
        if step % max_steps == 0 or done:
            # 计算截断回报
            returns = compute_returns(rewards, gamma)
            returns = torch.FloatTensor(returns)
            
            # 计算损失并更新
            loss = 0
            for log_prob, R in zip(log_probs, returns):
                loss -= log_prob * R
            
            optimizer_nsteps.zero_grad()
            loss.backward()
            optimizer_nsteps.step()
            
            # 清空缓存
            log_probs = []
            rewards = []
            #step = 0
    
    if episode % 10 == 0:
        print(f"Episode {episode}, Total Reward: {-step}")


Episode 0, Total Reward: -103613
Episode 10, Total Reward: -456816
Episode 20, Total Reward: -4969
Episode 30, Total Reward: -3210


KeyboardInterrupt: 

In [2]:
# 值函数网络
class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Linear(state_dim, 16)
        self.out = nn.Linear(16, 1)

        # 初始化权重和偏置
        nn.init.normal_(self.fc.weight, mean=0.0, std=0.01)
        nn.init.constant_(self.fc.bias, 0.1)
        nn.init.normal_(self.out.weight, mean=0.0, std=0.01)
        nn.init.constant_(self.out.bias, 0.1)

    def forward(self, x):
        x = torch.relu(self.fc(x))
        return self.out(x)


policy_net = PolicyNetwork(state_dim, action_dim)
value_net = ValueNetwork(state_dim)

policy_optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)
value_optimizer = optim.Adam(value_net.parameters(), lr=1e-3)

# 训练循环
for episode in range(num_episodes):
    state = env.reset()
    log_probs = []
    rewards = []
    values = []
    done = False
    step_count = 0
    while not done:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        
        # 策略选择动作
        action_probs = policy_net(state_tensor)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        
        log_probs.append(action_dist.log_prob(action))
        
        # 记录值函数预测
        values.append(value_net(state_tensor).squeeze(0))
        
        # 交互
        next_state, reward, done, _ = env.step(action.item())
        step_count += 1
        rewards.append(reward)
        state = next_state
        if step_count >= 50000:
            print(f"Episode {episode}, step_count >= 50000")
            done = True

    # 计算折扣累积奖励
    with torch.no_grad():
        returns = compute_returns(rewards, gamma)
        returns = torch.FloatTensor(returns)
    
    # 计算 Advantage
    values = torch.stack(values)
    advantages = returns - values.detach()
    # 标准化 Advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # 更新策略网络
    policy_loss = -torch.sum(torch.stack(log_probs) * advantages)
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    # 更新值函数网络
    value_loss = torch.nn.functional.mse_loss(values.squeeze(1), returns)
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    # 清空缓存
    log_probs.clear()
    rewards.clear()
    values.clear()

    # 日志输出
    if episode % 10 == 0:
        print(f"Episode {episode}, Total Reward: {sum(rewards):.2f}")

Episode 0, step_count >= 50000
Episode 0, Total Reward: -50000.00
Episode 1, step_count >= 50000
Episode 2, step_count >= 50000


KeyboardInterrupt: 

In [1]:
import psutil

# 获取内存使用情况
memory = psutil.virtual_memory()

# 打印内存使用情况
print(f"Total: {memory.total / (1024 ** 3):.2f} GB")
print(f"Available: {memory.available / (1024 ** 3):.2f} GB")
print(f"Used: {memory.used / (1024 ** 3):.2f} GB")
print(f"Percentage: {memory.percent}%")

# 获取当前进程的内存使用情况
process = psutil.Process()
memory_info = process.memory_info()

# 打印当前进程的内存使用情况
print(f"Current Process Memory: {memory_info.rss / (1024 ** 2):.2f} MB")


Total: 63.82 GB
Available: 38.65 GB
Used: 25.17 GB
Percentage: 39.4%
Current Process Memory: 65.01 MB


In [1]:
import gym
import time

# 初始化环境
env = gym.make('MountainCar-v0')
env = env.unwrapped
# 打印环境信息
print('观测空间 = {}'.format(env.observation_space))
print('动作空间 = {}'.format(env.action_space))
print('位置范围 = {}'.format((env.unwrapped.min_position,
        env.unwrapped.max_position)))
print('速度范围 = {}'.format((-env.unwrapped.max_speed,
        env.unwrapped.max_speed)))
print('目标位置 = {}'.format(env.unwrapped.goal_position))

# 重置环境
state = env.reset()

# 运行一个随机策略
done = False
step_count = 0
while not done:
    env.render()  # 可视化环境
    time.sleep(1 / 30)  # 控制帧率
    action = env.action_space.sample()  # 随机选择一个动作
    next_state, reward, done, info = env.step(action)  # 执行动作并获取新状态和奖励
    #print(f"step_count: {step_count}, info: {info}")
    state = next_state
    step_count += 1
    # 如果到达目标位置
    if next_state[0] >= env.unwrapped.goal_position:
        done = True
print(f"Episode finished after {step_count} steps")
env.close()


观测空间 = Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
动作空间 = Box(-1.0, 1.0, (1,), float32)
位置范围 = (-1.2, 0.6)
速度范围 = (-0.07, 0.07)
目标位置 = 0.45
Episode finished after 999 steps


In [1]:
import gym
import numpy as np

# 创建MountainCarContinuous-v0环境
env = gym.make('MountainCarContinuous-v0')

# 重置环境
state = env.reset()

# 设定最大步数
max_steps = 1000

for step in range(max_steps):
    # 渲染环境
    env.render()
    
    # 随机选择一个动作
    action = env.action_space.sample()
    
    # 执行动作
    next_state, reward, done, info = env.step(action)
    
    # 打印当前状态、动作、奖励
    print(f"Step: {step}, State: {state}, Action: {action}, Reward: {reward}")
    
    # 更新状态
    state = next_state
    
    # 如果达到终止状态，则退出循环
    if done:
        print(f"Episode finished after {step+1} steps")
        break

# 关闭环境
env.close()


Step: 0, State: [-0.47609518  0.        ], Action: [-0.6554392], Reward: -0.04296005422973259
Step: 1, State: [-0.47743341 -0.00133823], Action: [-0.0724161], Reward: -0.0005244092173041504
Step: 2, State: [-0.4792254  -0.00179199], Action: [-0.6066748], Reward: -0.03680543012855111
Step: 3, State: [-0.48225922 -0.00303382], Action: [-0.17956644], Reward: -0.0032244107439627356
Step: 4, State: [-0.48587164 -0.00361242], Action: [0.1901974], Reward: -0.003617504841741082
Step: 5, State: [-0.48948112 -0.00360948], Action: [0.31679335], Reward: -0.010035802810296791
Step: 6, State: [-0.49287084 -0.00338972], Action: [0.49758926], Reward: -0.02475950720054465
Step: 7, State: [-0.49574431 -0.00287347], Action: [-0.87949497], Reward: -0.077351139359482
Step: 8, State: [-0.50014569 -0.00440138], Action: [-0.13719268], Reward: -0.0018821831838439218
Step: 9, State: [-0.50492861 -0.00478292], Action: [0.9354454], Reward: -0.08750581485688969
Step: 10, State: [-0.50844832 -0.00351971], Action: [

In [9]:
import gym
import numpy as np
import time

def pendulum():
    # 创建Pendulum-v1环境
    env = gym.make('Pendulum-v1')

    # 重置环境
    state = env.reset()

    # 设定最大步数
    max_steps = 20000

    for step in range(max_steps):
        # 渲染环境
        env.render()
        #time.sleep(1 / 30)  # 控制帧率
        
        # 随机选择一个动作
        action = env.action_space.sample()
        
        # 执行动作
        next_state, reward, done, truncated, info = env.step(action)
        
        # 打印当前状态、动作、奖励
        print(f"Step: {step}, State: {state}, Action: {action}, Reward: {reward}")
        
        # 更新状态
        state = next_state
        
        # 如果达到终止状态，则退出循环
        if done:
            print(f"Episode finished after {step+1} steps")
            break

    # 关闭环境
    env.close()
pendulum()

Step: 0, State: (array([0.9604698 , 0.2783842 , 0.01115166], dtype=float32), {}), Action: [-0.6911717], Reward: -0.08007700759133939
Step: 1, State: [0.95883524 0.2839629  0.11626407], Action: [1.8520905], Reward: -0.08768256141827022
Step: 2, State: [0.949776   0.31293067 0.6070498 ], Action: [1.5370471], Reward: -0.14051378292772854
Step: 3, State: [0.9316414 0.3633791 1.0723048], Action: [-0.4777849], Reward: -0.25351598209549375
Step: 4, State: [0.9066377  0.42190996 1.2731714 ], Action: [0.03242992], Reward: -0.35180222345454576
Step: 5, State: [0.87015754 0.4927736  1.5944685 ], Action: [1.7973772], Reward: -0.5229711787628393
Step: 6, State: [0.8098165 0.5866833 2.2336552], Action: [-0.83318794], Reward: -0.8926909815522329
Step: 7, State: [0.72868836 0.68484545 2.5486896 ], Action: [0.5325575], Reward: -1.2189720956335455
Step: 8, State: [0.61255926 0.79042464 3.1422071 ], Action: [1.8554012], Reward: -1.8216249231493236
Step: 9, State: [0.44271794 0.8966609  4.0133357 ], Actio