In [1]:
# Notebook最小：強制CPUでクラッシュ回避版

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # ★torch importより前！
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # ★DLL衝突対策（必要な場合のみ）

import math
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
random.seed(0)

DEVICE = "cpu"  # ★強制CPU（ここが重要）

def tsp_tour_length(coords, tour):
    B, N, _ = coords.shape
    idx = tour.unsqueeze(-1).expand(B, N, 2)
    ordered = coords.gather(1, idx)
    rolled = torch.roll(ordered, shifts=-1, dims=1)
    seg = (ordered - rolled).norm(p=2, dim=-1)
    return seg.sum(dim=1)

def sample_batch(batch_size, n_nodes):
    return torch.rand(batch_size, n_nodes, 2, device=DEVICE)

class PointerPolicy(nn.Module):
    def __init__(self, embed_dim=64):  # ★軽く
        super().__init__()
        self.embed = nn.Linear(2, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.scale = 1.0 / math.sqrt(embed_dim)

    def forward(self, coords, greedy=False):
        B, N, _ = coords.shape
        E = self.embed(coords)
        K = self.k_proj(E)

        cur = torch.zeros(B, dtype=torch.long, device=coords.device)
        visited = torch.zeros(B, N, dtype=torch.bool, device=coords.device)
        visited[:, 0] = True

        tour = []
        logps = []

        for t in range(N):
            tour.append(cur)
            if t == N - 1:
                break

            q = self.q_proj(E[torch.arange(B, device=coords.device), cur])
            scores = (K * q.unsqueeze(1)).sum(dim=-1) * self.scale
            scores = scores.masked_fill(visited, -1e9)
            probs = F.softmax(scores, dim=-1)

            if greedy:
                nxt = probs.argmax(dim=-1)
                logp = torch.log(probs.gather(1, nxt.unsqueeze(1)).squeeze(1) + 1e-12)
            else:
                dist = torch.distributions.Categorical(probs)
                nxt = dist.sample()
                logp = dist.log_prob(nxt)

            logps.append(logp)
            visited[torch.arange(B, device=coords.device), nxt] = True
            cur = nxt

        tour = torch.stack(tour, dim=1)
        logp = torch.stack(logps, dim=1).sum(dim=1)
        return tour, logp

def train_minimal(n_nodes=10, batch_size=32, steps=50, lr=1e-3, print_every=10):
    policy = PointerPolicy(embed_dim=64).to(DEVICE)
    opt = torch.optim.Adam(policy.parameters(), lr=lr)

    baseline = None
    beta = 0.9

    for step in range(1, steps + 1):
        coords = sample_batch(batch_size, n_nodes)

        tour, logp = policy(coords, greedy=False)
        length = tsp_tour_length(coords, tour)
        reward = -length

        with torch.no_grad():
            r_mean = reward.mean()
            baseline = r_mean if baseline is None else (beta * baseline + (1 - beta) * r_mean)
            adv = reward - baseline

        loss = -(adv.detach() * logp).mean()

        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(policy.parameters(), 1.0)
        opt.step()

        if step % print_every == 0:
            coords_eval = sample_batch(128, n_nodes)
            tour_g, _ = policy(coords_eval, greedy=True)
            len_g = tsp_tour_length(coords_eval, tour_g).mean().item()
            print(f"step {step:4d} | loss {loss.item():+.4f} | greedy mean length {len_g:.4f}")

    return policy

print("torch:", torch.__version__, "cuda_available:", torch.cuda.is_available())
policy = train_minimal()

coords = sample_batch(1, 10)
tour, _ = policy(coords, greedy=True)
print("tour:", tour.squeeze(0).tolist())
print("length:", tsp_tour_length(coords, tour).item())


torch: 2.10.0+cpu cuda_available: False


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [CPUBoolType [32, 10]] is at version 10; expected version 9 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).