In [7]:
from random import randint
import torch
import numpy as np

Carbin_Count = 9  # number of cargo holds (cabins) tracked in the state vector


# Environment definition
class Environment:
    """Ship-unloading environment.

    The state is a float tensor of shape ``(1, Carbin_Count)`` holding the
    remaining weight of every cabin.  An action is an iterable with one entry
    per crane: ``0`` means "stay idle", ``k`` (1-based) means "unload cabin
    ``k``", which removes ``0.1`` from that cabin.

    ``reset`` and ``step`` return *copies* of the internal state.  ``step``
    mutates ``self.state`` in place, so handing out the live tensor would make
    every state stored by a caller silently follow later steps.
    """

    def __init__(self):
        self.state = None
        # Number of steps taken so far.  The original note says time should be
        # penalised, but the reward computed in this class does not read it.
        self.time_spend = 0
        self.reset()

    def reset(self):
        """Start a new episode and return a copy of the initial state."""
        self.time_spend = 0
        # Weight of each cabin, drawn uniformly from [1, 2).
        self.state = torch.FloatTensor(np.random.uniform(1, 2, (1, Carbin_Count)))
        return self.state.clone()

    @staticmethod
    def _lever_arms(num):
        """Return signed lever arms for ``num`` cabins, symmetric around the centre.

        Odd ``num`` gives ``-(num-1)/2 .. (num-1)/2`` (the centre cabin has arm 0);
        even ``num`` gives ``-num/2 .. -1, 1 .. num/2``.
        """
        if num % 2 != 0:
            return [i - (num - 1) // 2 for i in range(num)]
        return [-i for i in range(num // 2, 0, -1)] + list(range(1, num // 2 + 1))

    @staticmethod
    def _is_fault(action):
        """Return True when the non-idle crane targets are not in non-decreasing order."""
        last_action = -1
        for entry in action:
            index = int(entry)  # map the action entry to a cabin index
            if index == 0:
                continue  # idle cranes do not take part in the ordering check
            if index < last_action:
                return True
            last_action = index
        return False

    def is_balance(self):
        """Return whether the absolute weighted moment of the cabins is below 5."""
        moment = 0
        # Sequential accumulation keeps the arithmetic order deterministic.
        for arm, load in zip(self._lever_arms(Carbin_Count), self.state[0]):
            moment += arm * load
        return abs(moment) < 5

    def is_over(self, action=None):
        """Classify the current situation.

        Args:
            action: optional iterable of crane targets checked for ordering
                faults.  ``None`` (the default) means "no action to check".

        Returns:
            ``-1`` if the action is faulty or the ship is unbalanced,
            ``1`` if every cabin weight is below zero,
            ``0`` otherwise.
        """
        if action is None:
            action = ()  # immutable stand-in instead of a shared mutable default
        if self._is_fault(action) or not self.is_balance():
            return -1
        # Finished once every entry of the state is below zero.
        if torch.all(self.state < 0):
            return 1
        return 0

    def get_state_reword(self, action: torch.Tensor):
        """Score ``action`` against the current (already updated) state.

        Idle cranes cost 3, working cranes earn 5, and working on a cabin whose
        weight is already negative adds ``5 * 13 * weight`` (a penalty, since
        the weight is negative).  A finished episode adds 1200; a fault or an
        unbalanced ship subtracts 200.

        Returns:
            The reward: a Python int, or a 0-dim tensor once a negative cabin
            has contributed to the sum.
        """
        reword = 0
        for entry in action:
            index = int(entry)  # map the action entry to a cabin index
            if index == 0:
                reword -= 3
                continue
            # Reward for doing work at all.
            reword += 1 * 5
            if self.state[0][index - 1] < 0:
                reword += 5 * (self.state[0][index - 1] * 13)
        is_over = self.is_over(action)
        if is_over == 1:
            reword += 1200
        elif is_over == -1:
            reword -= 200
        return reword

    def step(self, action):
        """Apply ``action`` and advance time by one step.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is a copy of the
            updated state, ``done`` is True when ``is_over`` is non-zero and
            ``info`` is always ``None``.
        """
        for entry in action:
            index = int(entry)  # map the action entry to a cabin index
            if index == 0:
                continue
            self.state[0][index - 1] -= 0.1
        self.time_spend += 1
        reword = self.get_state_reword(action)
        # Return a snapshot so stored transitions are not rewritten by later steps.
        return self.state.clone(), reword, self.is_over(action) != 0, None


class MyWrapper:
    """Episode wrapper around ``Environment`` that adds a cap on episode length."""

    N = 3  # number of quay cranes

    def __init__(self):
        self.env = Environment()
        self.step_n = 0

    def reset(self):
        """Reset the wrapped environment and the step counter; return the initial state."""
        initial_state = self.env.reset()
        self.step_n = 0
        return initial_state

    def step(self, action):
        """Forward ``action`` to the environment and return ``(state, reward, over)``.

        ``over`` is the environment's own termination flag, forced to ``True``
        once more than 200 steps have been taken in the current episode.
        """
        state, reward, terminated, _info = self.env.step(action)
        self.step_n += 1
        # Limit the maximum number of steps.
        over = True if self.step_n > 200 else terminated
        return state, reward, over

    def to_show_state(self):
        """Return a one-line summary: state, elapsed steps, and the ``is_over`` code."""
        fields = (
            str(self.env.state),
            str(self.env.time_spend),
            str(self.env.is_over()),
        )
        return '%s  步骤:%s  游戏是否结束:%s' % fields


# Smoke test of the wrapped environment. `env` is a notebook-level global that
# the later cells (network construction, play) read.
env = MyWrapper()

env.reset()
# The action holds a single 0.5 entry; Environment.step converts entries with
# int(), so this is index 0, i.e. one idle crane.
env.step(torch.full((1, 1), 0.5, dtype=torch.float32))
# Last expression of the cell: its value is rendered as the cell output.
env.to_show_state()

'tensor([[1.0408, 1.2495, 1.7930, 1.1497, 1.1469, 1.1127, 1.7079, 1.5717, 1.5242]])  步骤:1  游戏是否结束:0'

In [8]:
import torch


class A2C:
    """Advantage actor-critic learner for one actor with a delayed target critic.

    Args:
        model_actor: policy network; must output action probabilities of
            shape ``(batch, n_actions)``.
        model_critic: value network trained by ``train_critic``.
        model_critic_delay: slowly-updated copy of the critic used to build
            TD targets.  It is overwritten with the critic's weights and frozen.
        optimizer_actor: optimizer over ``model_actor``'s parameters.
        optimizer_critic: optimizer over ``model_critic``'s parameters.
        gamma: discount factor applied to the bootstrapped next-state value.
        tau: interpolation rate of the soft target update
            (``target = (1 - tau) * target + tau * critic``).
    """

    def __init__(self, model_actor, model_critic, model_critic_delay,
                 optimizer_actor, optimizer_critic, gamma=0.99, tau=0.01):
        self.model_actor = model_actor
        self.model_critic = model_critic
        self.model_critic_delay = model_critic_delay
        self.optimizer_actor = optimizer_actor
        self.optimizer_critic = optimizer_critic
        self.gamma = gamma
        self.tau = tau

        # Start the target critic from the critic's weights; it is never
        # trained by gradient descent, only by soft_update.
        self.model_critic_delay.load_state_dict(self.model_critic.state_dict())
        self.requires_grad(self.model_critic_delay, False)

    def soft_update(self, _from, _to):
        """Move every parameter of ``_to`` a fraction ``tau`` towards ``_from``."""
        keep = 1 - self.tau
        with torch.no_grad():
            for source, target in zip(_from.parameters(), _to.parameters()):
                target.copy_(target * keep + source * self.tau)

    def requires_grad(self, model, value):
        """Enable or disable gradient tracking for all parameters of ``model``."""
        for param in model.parameters():
            param.requires_grad_(value)

    def train_critic(self, state, reward, next_state, over):
        """Run one TD(0) update of the critic.

        Args:
            state: batch of states fed to the critic.
            reward: rewards, broadcastable against the critic output.
            next_state: batch of successor states.
            over: 0/1 termination flags; 1 removes the bootstrap term.

        Returns:
            The detached TD error ``target - value`` computed with the critic
            *before* this update; callers use it as the advantage.
        """
        self.requires_grad(self.model_critic, True)
        self.requires_grad(self.model_actor, False)

        # Current value estimates and bootstrapped targets.
        value = self.model_critic(state)

        with torch.no_grad():
            target = self.model_critic_delay(next_state)
        target = target * self.gamma * (1 - over) + reward

        # Temporal-difference error, i.e. the TD loss.
        loss = torch.nn.functional.mse_loss(value, target)

        loss.backward()
        self.optimizer_critic.step()
        self.optimizer_critic.zero_grad()
        self.soft_update(self.model_critic, self.model_critic_delay)

        # Subtracting the value acts as a baseline.
        return (target - value).detach()

    def train_actor(self, state, action, value):
        """Run one policy-gradient update of the actor.

        Args:
            state: batch of states fed to the actor.
            action: long tensor of shape ``(batch, 1)`` with the taken actions.
            value: per-sample weight of the log-probability (the advantage
                returned by ``train_critic``).

        Returns:
            The scalar loss as a Python float.
        """
        self.requires_grad(self.model_critic, False)
        self.requires_grad(self.model_actor, True)

        # Re-evaluate the probability of the actions that were taken.
        prob = self.model_actor(state)
        prob = prob.gather(dim=1, index=action)

        # Policy-gradient objective; Q(state, action) is replaced by the
        # critic-based estimate passed in as `value`.  The epsilon guards log(0).
        weighted_log_prob = (prob + 1e-8).log() * value
        loss = -weighted_log_prob.mean()

        loss.backward()
        self.optimizer_actor.step()
        self.optimizer_actor.zero_grad()

        return loss.item()


# One policy network per crane. The input width follows Carbin_Count (one
# feature per cabin) instead of a hard-coded 9, matching the output layer,
# which already uses Carbin_Count + 1 (action 0 = idle, 1..Carbin_Count = cabin).
model_actor = [
    torch.nn.Sequential(
        torch.nn.Linear(Carbin_Count, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, Carbin_Count + 1),
        torch.nn.Softmax(dim=1),
    ) for _ in range(env.N)
]

# A single critic and its delayed (target) copy, shared by all cranes.
model_critic, model_critic_delay = [
    torch.nn.Sequential(
        torch.nn.Linear(Carbin_Count, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, 1),
    ) for _ in range(2)
]

optimizer_actor = [
    torch.optim.Adam(model_actor[i].parameters(), lr=1e-3)
    for i in range(env.N)
]
optimizer_critic = torch.optim.Adam(model_critic.parameters(), lr=5e-3)

# One learner per crane; every learner receives the same critic, target critic
# and critic optimizer objects.
a2c = [
    A2C(model_actor[i], model_critic, model_critic_delay, optimizer_actor[i],
        optimizer_critic) for i in range(env.N)
]
# Drop the notebook-level aliases; the networks and optimizers stay reachable
# through the attributes of the A2C instances.
model_actor = None
model_critic = None
model_critic_delay = None
optimizer_actor = None
optimizer_critic = None

a2c


[<__main__.A2C at 0x12cb6cbc040>,
 <__main__.A2C at 0x12cb6cbcbb0>,
 <__main__.A2C at 0x12caed455b0>]

In [9]:

import random


# Play one episode and record the transitions
def play(show=False):
    """Roll out one episode with the current actors and return it as tensors.

    Each crane's action is sampled from its own actor's output distribution.

    Args:
        show: when True, print the chosen actions and the environment summary
            after every step.

    Returns:
        ``(state, action, reward, next_state, over, reward_sum)`` where, for an
        episode of ``T`` steps, ``state`` and ``next_state`` stack the per-step
        state tensors along a new leading axis, ``action`` has shape
        ``(T, env.N, 1)``, ``reward`` has shape ``(T, 1, 1)``, ``over`` has
        shape ``(T, 1)`` and ``reward_sum`` is the episode return as a float.
    """
    state = []
    action = []
    reward = []
    next_state = []
    over = []

    s = env.reset()
    o = False
    while not o:
        # Snapshot the observation *before* stepping: the environment may
        # update its state tensor in place, which would otherwise rewrite
        # every entry already stored in the buffers.
        s_before = s.clone()
        observation = s_before.reshape(1, -1)

        a = []
        with torch.no_grad():  # sampling only; no graph is needed
            for i in range(env.N):
                # Action probabilities of crane i for the current state.
                prob = a2c[i].model_actor(observation)[0].tolist()
                a.append(random.choices(range(Carbin_Count + 1), weights=prob, k=1)[0])

        # Execute the joint action.
        ns, r, o = env.step(a)

        state.append(s_before)
        action.append(a)
        reward.append(float(r))  # r may be an int or a 0-dim tensor
        next_state.append(ns.clone())
        over.append(o)

        s = ns

        if show:
            print(a, env.to_show_state())

    state = torch.stack(state)
    action = torch.LongTensor(action).unsqueeze(-1)
    reward = torch.FloatTensor(reward).unsqueeze(-1).unsqueeze(-1)
    next_state = torch.stack(next_state)
    over = torch.LongTensor(over).reshape(-1, 1)

    return state, action, reward, next_state, over, reward.sum().item()


# Roll out one episode with the current policies as a sanity check.
state, action, reward, next_state, over, reward_sum = play()

# Episode return and the sizes of the recorded tensors (rendered as the cell output).
reward_sum, state.size(), reward.size(), action.size()

(-185.0, torch.Size([1, 1, 9]), torch.Size([1, 1, 1]), torch.Size([1, 3, 1]))

In [12]:
def train(epochs=3_000, log_every=250, eval_episodes=20):
    """Train every crane's A2C learner on freshly played episodes.

    Each epoch plays one episode, then updates the critic and the actor of
    every learner in ``a2c`` on that episode.

    Args:
        epochs: number of training episodes.
        log_every: print a progress line every this many epochs.
        eval_episodes: number of extra episodes averaged for the progress line.

    Raises:
        ValueError: if ``log_every`` or ``eval_episodes`` is smaller than 1.
    """
    if log_every < 1 or eval_episodes < 1:
        raise ValueError("log_every and eval_episodes must be >= 1")

    # Train for `epochs` episodes.
    for epoch in range(epochs):
        state, action, reward, next_state, over, _ = play()

        # Collapse the per-step axis of size 1 so the tensors are (steps, features).
        state_c = state.flatten(start_dim=1)
        reward_c = reward.sum(dim=1)
        next_state_c = next_state.flatten(start_dim=1)

        loss = None  # stays None only when there are no cranes to train
        for i in range(env.N):
            value = a2c[i].train_critic(state_c, reward_c, next_state_c, over)
            loss = a2c[i].train_actor(state_c, action[:, i], value)

        if epoch % log_every == 0:
            # Mean episode return over a few extra rollouts; `loss` is the
            # actor loss of the last crane only.
            test_result = sum(play()[-1] for _ in range(eval_episodes)) / eval_episodes
            print(epoch, loss, test_result)


# Run training; each progress line printed below is
# (epoch, last crane's actor loss, mean return over the evaluation rollouts).
train()

0 -30.754661560058594 1009.3025482177734
250 -33.201744079589844 1228.970233154297
500 -21.294675827026367 1175.537042236328
750 12.735350608825684 890.4701263427735
1000 -59.23042297363281 1099.33642578125
1250 -34.20117950439453 1165.6182739257813
1500 -25.605192184448242 1345.0637756347655
1750 -42.75160598754883 1330.5882995605468
2000 -55.612022399902344 1258.7829162597657
2250 -29.451799392700195 1372.3510192871095
2500 -56.7697868347168 1264.2888061523438
2750 -21.67510986328125 1224.0494140625


In [11]:
# Replay one episode with per-step logging; the cell output is the episode return.
play(show=True)[-1]

[1, 5, 9] tensor([[1.2603, 1.2243, 1.9355, 1.5128, 1.7008, 1.9535, 1.7696, 1.2639, 1.5422]])  步骤:1  游戏是否结束:0
[1, 5, 9] tensor([[1.1603, 1.2243, 1.9355, 1.5128, 1.6008, 1.9535, 1.7696, 1.2639, 1.4422]])  步骤:2  游戏是否结束:0
[1, 5, 9] tensor([[1.0603, 1.2243, 1.9355, 1.5128, 1.5008, 1.9535, 1.7696, 1.2639, 1.3422]])  步骤:3  游戏是否结束:0
[1, 5, 9] tensor([[0.9603, 1.2243, 1.9355, 1.5128, 1.4008, 1.9535, 1.7696, 1.2639, 1.2422]])  步骤:4  游戏是否结束:0
[1, 5, 9] tensor([[0.8603, 1.2243, 1.9355, 1.5128, 1.3008, 1.9535, 1.7696, 1.2639, 1.1422]])  步骤:5  游戏是否结束:0
[1, 5, 9] tensor([[0.7603, 1.2243, 1.9355, 1.5128, 1.2008, 1.9535, 1.7696, 1.2639, 1.0422]])  步骤:6  游戏是否结束:0
[1, 5, 9] tensor([[0.6603, 1.2243, 1.9355, 1.5128, 1.1008, 1.9535, 1.7696, 1.2639, 0.9422]])  步骤:7  游戏是否结束:0
[1, 5, 9] tensor([[0.5603, 1.2243, 1.9355, 1.5128, 1.0008, 1.9535, 1.7696, 1.2639, 0.8422]])  步骤:8  游戏是否结束:0
[1, 5, 9] tensor([[0.4603, 1.2243, 1.9355, 1.5128, 0.9008, 1.9535, 1.7696, 1.2639, 0.7422]])  步骤:9  游戏是否结束:0
[1, 5, 9] tensor([[

1237.306396484375