In [19]:
import torch
import torch.nn as nn
from torch.autograd import Variable as V
from lib.Exercise1_1 import LQRSolver
from torch.utils.data import TensorDataset, DataLoader
import torch.optim.lr_scheduler as lr_scheduler
import time 

Proj_dtype = torch.double
Proj_device = 'cpu'

class DGMNN(nn.Module):
    def __init__(self):
        super(DGMNN, self).__init__()
        self.layer1 = nn.Linear(3, 100)  
        self.layer2 = nn.Linear(100, 100)
        self.layer3 = nn.Linear(100, 100)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.output = nn.Linear(100, 1)
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.tanh(self.layer2(x))
        x = self.relu(self.layer3(x))

        return self.output(x)
    
# Assuming the use of the provided DGMNN class for the value function approximation
class PolicyNetwork(nn.Module):
    """Neural Network for approximating the policy."""
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.layer1 = nn.Linear(3, 100)  
        self.layer2 = nn.Linear(100, 100)
        self.layer3 = nn.Linear(100, 100)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.output = nn.Linear(100, 2)
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.tanh(self.layer2(x))
        x = self.relu(self.layer3(x))

        return self.output(x)

In [20]:
# Define matrices for LQR problem

H = torch.tensor([[1.2, 0.8], [-0.6, 0.9]], dtype=Proj_dtype, device = Proj_device)
M = torch.tensor([[0.5,0.7], [0.3,1.0]], dtype=Proj_dtype, device = Proj_device)
sigma = torch.tensor([[[0.08],[0.11]]], dtype=Proj_dtype, device = Proj_device)
alpha = torch.tensor([[[1],[1]]], dtype=Proj_dtype, device = Proj_device)
C = torch.tensor([[1.6, 0.0], [0.0, 1.1]], dtype=Proj_dtype, device = Proj_device)
D = torch.tensor([[0.5, 0.0], [0.0, 0.7]], dtype=Proj_dtype, device = Proj_device)
R = torch.tensor([[0.9, 0.0], [0.0, 1.0]], dtype=Proj_dtype, device = Proj_device)
T = torch.tensor(1.0, dtype=Proj_dtype, device = Proj_device)

solver = LQRSolver(H, M, sigma, C, D, R, T=T, method="euler")


In [21]:
def get_hessian(grad,x):
    Hessian = torch.tensor([], device = Proj_device)
    
    for i in range(len(x)):
        hessian = torch.tensor([], device = Proj_device)
        for j in range(len(grad[i])):
            u_xxi = torch.autograd.grad(grad[i][j], x, grad_outputs=torch.ones_like(grad[i][j]), retain_graph=True,create_graph=True, allow_unused=True)[0]           
            hessian = torch.cat((hessian, u_xxi[i].unsqueeze(0)))
        Hessian = torch.cat((Hessian, hessian.unsqueeze(0)),dim = 0)
        #print(Hessian)
    return Hessian

def get_hessian_(model,t,x):
    Hessian = torch.tensor([], device = Proj_device)
    for i in range(len(t)):
        x_i = V(x[i],requires_grad=True)
        input = torch.cat(((t[i]).unsqueeze(0), x_i),dim=0)
        u_in = model(input)
        grad = torch.autograd.grad(u_in, x_i, grad_outputs=torch.ones_like(u_in), create_graph=True, retain_graph=True)[0]
        hessian = torch.tensor([], device = Proj_device)
        for j in range(len(grad)):
            u_xxi = torch.autograd.grad(grad[j], x_i, grad_outputs=torch.ones_like(grad[j]), retain_graph=True,create_graph=True, allow_unused=True)[0]           
            hessian = torch.cat((hessian, u_xxi.unsqueeze(0)))
        Hessian = torch.cat((Hessian, hessian.unsqueeze(0)),dim = 0)
    return Hessian

def pde_residual(value_network, policy_network, t, x):
    # 准备输入
    input = torch.cat((t.unsqueeze(1), x), dim=1)
    
    # 获取价值网络的输出
    u = value_network(input)
    u_t = torch.autograd.grad(u, t, grad_outputs=torch.ones_like(u), create_graph=True, retain_graph=True)[0]
    u_x = torch.autograd.grad(u, x, grad_outputs=torch.ones_like(u), create_graph=True, retain_graph=True)[0]

    # 使用策略网络更新alpha
    a = policy_network(input).detach()  # 防止alpha的梯度传播
    # Perform matrix multiplication correctly, assuming M is [2, 2] and a is [100, 2]
    a_transformed = M @ a.T  # Results in a [2, 100] tensor
    a_transformed = a_transformed.T  # Correct the shape to [100, 2] to match u_x for element-wise multiplication

    # Now you can multiply u_x with a_transformed element-wise directly
    residual_component = u_x * a_transformed  # This should not raise an error given compatible shapes
    # 计算二阶导数
    u_xx = get_hessian(u_x, x)
    d_a_product = D @ a.T  # 结果形状 [2, batch_size]
    d_a_product = d_a_product.T  # 调整形状为 [batch_size, 2] 以匹配 a_transformed

    # 现在，a_transformed 和 d_a_product 形状一致，可以进行逐元素乘法
    # 注意，如果原本的目的是将 a_transformed 和 D @ a 的结果相乘，这里假设两者均为 [batch_size, 2]
    residual_component_1 = a_transformed * d_a_product  # 形状为 [batch_size, 2]
    # Continue with your residual calculation
    residual = u_t + 0.5 * torch.einsum('bii->b', sigma @ sigma.transpose(1,2) @ u_xx) + \
           torch.sum(residual_component, dim=1, keepdim=True) + \
           torch.sum(x * (C @ x.T).T, dim=1, keepdim=True) + \
           torch.sum(residual_component_1, dim=1, keepdim=True)  # Ensure this operation is also corrected if needed
   
    return residual


def boundary_condition(model,t, x):

    
    T_input = T * torch.ones_like(t)

    input = torch.cat((T_input.unsqueeze(1), x),dim=1)
    u = model(input)

    return u - (x.unsqueeze(1) @ R @ x.unsqueeze(1).transpose(1,2)).squeeze()

def total_residual(model, t, x):
    
    residual_loss = pde_residual(model, t, x).pow(2).mean()
    boundary_loss = boundary_condition(model,t,x).pow(2).mean()
    
    return residual_loss + boundary_loss


def boundary_condition_loss(model, x_samples):
    # 假设边界条件为模型输出在边界上应该为0
    # 这里需要根据您的具体问题来调整
    # x_samples 应包含边界上的样本点
    
    # 获取模型在边界样本点上的预测
    boundary_predictions = model(x_samples)
    
    # 计算损失：例如，可以使用 MSE 损失来量化模型输出与0之间的差异
    boundary_loss = torch.mean(boundary_predictions ** 2)
    
    return boundary_loss


def compute_total_loss(value_network, policy_network, t_samples, x_samples):
    residual = pde_residual(value_network, policy_network, t_samples, x_samples)
    boundary_loss = boundary_condition_loss(value_network, x_samples)
    
    total_loss = torch.mean(residual.pow(2)) + torch.mean(boundary_loss.pow(2))
    return total_loss

def train_value_network(value_network, policy_network, t_samples, x_samples, epochs):
    value_network.train()
    final_loss = None
    for epoch in range(epochs):
        # 这里保持原样，不需要修改
        value_optimizer.zero_grad()
        loss = compute_total_loss(value_network, policy_network, t_samples, x_samples)  # 使用原始参数
        loss.backward()
        value_optimizer.step()
        
        if epoch == epochs - 1:
            final_loss = loss.item()
    return final_loss


def policy_loss(value_network, policy_network, t, x):
    # 计算价值网络输出对于策略网络输出的梯度
    alpha = policy_network(torch.cat((t.unsqueeze(1), x), dim=1))
    combined_input = torch.cat((t.unsqueeze(1), x, alpha), dim=1)

    # 将合并后的输入传递给价值网络
    value_output = value_network(combined_input)
    
    # 定义策略损失函数，这可能依赖于具体的任务
    # 例如，可以是负的价值函数的输出，我们希望最大化价值
    loss = -torch.mean(value_output)
    return loss

def update_policy(policy_network, value_optimizer, t_samples, x_samples):
    policy_network.train()
    policy_optimizer.zero_grad()
    
    loss = policy_loss(value_network, policy_network, t_samples, x_samples)
    loss.backward()
    
    policy_optimizer.step()


In [22]:
def new_data(num_samples):
    #num_samples = 10000
    t_samples = T * torch.rand(num_samples, dtype=Proj_dtype, device = Proj_device, requires_grad=True)
    x_ends = torch.tensor([-3,3], dtype = Proj_dtype)
    x_samples = x_ends[0] + (x_ends[1]- x_ends[0]) * torch.rand(num_samples , 2, dtype=Proj_dtype, device = Proj_device, requires_grad=True)
    return t_samples,x_samples


In [23]:
value_network = DGMNN().to(device=Proj_device, dtype=Proj_dtype)
policy_network = PolicyNetwork().to(device=Proj_device, dtype=Proj_dtype)


# Optimizers
value_optimizer = torch.optim.Adam(value_network.parameters(), lr=0.001)
policy_optimizer = torch.optim.Adam(policy_network.parameters(), lr=0.001)

# Generate data
num_samples = 10  # Adjust as needed
t_samples, x_samples = new_data(num_samples)


In [24]:
previous_loss = float('inf')
loss_threshold = 1e-4
num_iterations = 10
for iteration in range(num_iterations):
    train_value_network(value_network, policy_network, t_samples, x_samples, epochs=100)
    current_loss = update_policy(policy_network, value_optimizer, t_samples, x_samples)
    
    # 收敛性检查


RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x2 and 3x100)

In [None]:

    

def compute_total_loss(value_network, policy_network, t_samples, x_samples):
    residual = pde_residual(value_network, policy_network, t_samples, x_samples)
    boundary_loss = boundary_condition_loss(value_network, x_samples)
    
    total_loss = torch.mean(residual.pow(2)) + torch.mean(boundary_loss.pow(2))
    return total_loss

def train_value_network(value_network, policy_network, t_samples, x_samples, epochs):
    value_network.train()
    final_loss = None
    for epoch in range(epochs):
        value_optimizer.zero_grad()
        loss = compute_total_loss(value_network, policy_network, t_samples, x_samples)
        loss.backward()
        value_optimizer.step()
        
        if epoch == epochs - 1:
            final_loss = loss.item()
    return final_loss

def compute_policy_loss(value_network, policy_network, states):
    # Ensure states require gradients
    states = states.requires_grad_(True)

    # Obtain actions from the policy network
    actions = policy_network(states)
    
    # Combine states and actions in the way they're inputted to your value network
    # This step depends on your specific setup
    value_network_input = torch.cat([states, actions], dim=-1)
    
    # Get the value predictions
    values = value_network(value_network_input)
    
    # Compute gradients of values with respect to states
    value_gradients = torch.autograd.grad(outputs=values.sum(), inputs=states, create_graph=True)[0]

    # Policy loss can be formulated in several ways depending on the specific goal
    # For example, maximizing the value (encouraging positive gradients)
    policy_loss = -value_gradients.mean()
    
    return policy_loss

def update_policy(policy_network, value_network, epochs, states):
    """Update the policy network based on the current value function."""
    policy_network.train()
    for epoch in range(epochs):
        policy_optimizer.zero_grad()
        # Define a loss function for the policy network based on the updated value function
        # This could involve computing gradients of the value network's outputs with respect to its inputs
        policy_loss = policy_loss = compute_policy_loss(value_network, policy_network, states)
        policy_loss.backward()
        policy_optimizer.step()



In [None]:
previous_loss = float('inf')
loss_threshold = 1e-4  # Define a suitable threshold for loss convergence


In [None]:
num_iterations = 100

num_samples = 100 

t_samples, x_samples = new_data(num_samples)

for iteration in range(num_iterations):
    # Step 1: Train the value network for the current policy
    # Ensure train_value_network returns the current loss
    current_loss = train_value_network(value_network, policy_network, t_samples, x_samples, epochs=100)
    
    # Check for convergence based on loss
    loss_change = abs(previous_loss - current_loss)
    if loss_change < loss_threshold:
        print(f"Convergence achieved at iteration {iteration} with loss change {loss_change}")
        break
    previous_loss = current_loss
    
    # Step 2: Update the policy based on the current value function
    # Ensure update_policy function is correctly implemented
    update_policy(policy_network, value_network, t_samples, x_samples, epochs=100)

    # Optional: Print progress
    print(f"Iteration {iteration}, Loss Change: {loss_change}")

    # Implement additional checks as needed, e.g., based on policy changes

torch.Size([100, 1])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [100, 2] but got: [100, 1].