In [3]:
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import NeighborLoader


# =========================
# 設定
# =========================
TXS_FEATURES = "txs_features.txt"
TXS_CLASSES  = "txs_classes.txt"
TXS_EDGES    = "txs_edgelist.txt"

WORKDIR = "./_prep"  # 生成物（memmapなど）置き場
EMB_DIM = 64         # ← ここが「64次元」の指定
HID_DIM = 128
EPOCHS = 3           # とりあえず短め。増やすと精度は上がる
BATCH_SIZE = 4096
NUM_NEIGHBORS = [15, 10]  # 2層分の近傍サンプル数
LR = 1e-3
SEED = 42

# class の解釈（あなたのファイルを見ると 3 が出ている）
# 典型: 1=illicit, 2=licit, 3=unknown
CLASS_TO_Y = {1: 1, 2: 0, 3: -1}  # y: 1=illicit, 0=licit, -1=unknown(学習除外)


# =========================
# ユーティリティ
# =========================
def set_seed(seed: int = 42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def fast_line_count(path: str) -> int:
    # ヘッダ含めた行数：巨大でもそこそこ速い
    # Windowsならこれが最速級だが、ここではPython内でカウント
    n = 0
    with open(path, "rb") as f:
        for _ in f:
            n += 1
    return n

def ensure_dir(d: str):
    os.makedirs(d, exist_ok=True)

def infer_feature_cols(header_cols):
    # txId, Time step 以外を全部 feature として使う
    drop = {"txId", "Time step"}
    feat_cols = [c for c in header_cols if c not in drop]
    return feat_cols

def print_once(msg: str):
    print(msg, flush=True)


# =========================
# ステップA: txId -> row index のマップを作る（featuresの順番を基準）
# =========================
def build_txid_index(features_path: str, out_path: str) -> np.ndarray:
    """
    features の txId 列だけを読み、txId の配列を保存して index を確定する。
    """
    print_once("[A] Build txId index from features (txId column only) ...")
    # txId列だけ読む（メモリ節約）
    tx = pd.read_csv(features_path, usecols=["txId"], dtype={"txId": np.int64})
    tx_ids = tx["txId"].to_numpy(np.int64)
    np.save(out_path, tx_ids)
    print_once(f"[A] tx nodes: {len(tx_ids)}  saved: {out_path}")
    return tx_ids


# =========================
# ステップB: 特徴量を memmap (float32) に落とす
# =========================
def build_features_memmap(features_path: str, tx_ids: np.ndarray, out_memmap_path: str, out_cols_path: str):
    """
    features を chunk で読み、float32 の memmap に書き込む。
    形状: [N, D] （D=183 になるはず）
    """
    print_once("[B] Build features memmap ...")
    header = pd.read_csv(features_path, nrows=0)
    feat_cols = infer_feature_cols(list(header.columns))
    D = len(feat_cols)
    np.save(out_cols_path, np.array(feat_cols, dtype=object))
    print_once(f"[B] feature dim D={D} (saved columns: {out_cols_path})")

    N = len(tx_ids)
    mm = np.memmap(out_memmap_path, dtype=np.float32, mode="w+", shape=(N, D))

    # chunk読み（RAM節約）
    chunksize = 200_000
    reader = pd.read_csv(
        features_path,
        usecols=feat_cols,
        dtype={c: np.float32 for c in feat_cols},
        chunksize=chunksize,
    )

    offset = 0
    for chunk in reader:
        arr = chunk.to_numpy(np.float32, copy=False)
        n = arr.shape[0]
        mm[offset:offset+n, :] = arr
        offset += n
        if offset % 1_000_000 < chunksize:
            print_once(f"[B] written rows: {offset}/{N}")

    mm.flush()
    if offset != N:
        raise RuntimeError(f"features rows mismatch: wrote={offset} N={N}")
    print_once(f"[B] features memmap saved: {out_memmap_path}")


# =========================
# ステップC: ラベル配列 y を作る（unknown=-1）
# =========================
def build_labels(classes_path: str, tx_ids: np.ndarray, out_path: str) -> np.ndarray:
    print_once("[C] Build labels y ...")
    # txId -> index を作る（dictは重いが txId数が数百万なら許容範囲のことが多い）
    # メモリが厳しければ、別方式（ソート＆サーチ）に置き換え可能
    id_to_idx = {int(t): i for i, t in enumerate(tx_ids)}

    y = np.full(shape=(len(tx_ids),), fill_value=-1, dtype=np.int8)  # unknown=-1
    df = pd.read_csv(classes_path, dtype={"txId": np.int64, "class": np.int16})

    updated = 0
    for txId, cls in zip(df["txId"].to_numpy(), df["class"].to_numpy()):
        idx = id_to_idx.get(int(txId), None)
        if idx is None:
            continue
        y_val = CLASS_TO_Y.get(int(cls), -1)
        y[idx] = y_val
        updated += 1

    np.save(out_path, y)
    print_once(f"[C] labels saved: {out_path}  (updated {updated} rows)")
    return y


# =========================
# ステップD: edge_index を作る（txId -> index に変換）
# =========================
def build_edge_index(edges_path: str, tx_ids: np.ndarray, out_path: str) -> np.ndarray:
    print_once("[D] Build edge_index ...")
    id_to_idx = {int(t): i for i, t in enumerate(tx_ids)}

    # edgesは比較的小さい（あなたのサイズだと数十万程度）ので一括読みOK
    edges = pd.read_csv(edges_path, dtype={"txId1": np.int64, "txId2": np.int64})
    src = edges["txId1"].to_numpy(np.int64)
    dst = edges["txId2"].to_numpy(np.int64)

    src_idx = np.empty_like(src)
    dst_idx = np.empty_like(dst)

    valid = np.ones(len(src), dtype=bool)
    for i in range(len(src)):
        a = id_to_idx.get(int(src[i]), None)
        b = id_to_idx.get(int(dst[i]), None)
        if a is None or b is None:
            valid[i] = False
            continue
        src_idx[i] = a
        dst_idx[i] = b

    src_idx = src_idx[valid]
    dst_idx = dst_idx[valid]

    edge_index = np.vstack([src_idx, dst_idx]).astype(np.int64)
    np.save(out_path, edge_index)
    print_once(f"[D] edges kept: {edge_index.shape[1]}  saved: {out_path}")
    return edge_index


# =========================
# GraphSAGE モデル
# =========================
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_dim: int, hidden: int, out_dim: int, dropout: float = 0.2):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden)
        self.conv2 = SAGEConv(hidden, out_dim)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x


# =========================
# メイン：準備→学習→埋め込み保存
# =========================
def main():
    set_seed(SEED)
    ensure_dir(WORKDIR)

    # 生成物パス
    txids_path = os.path.join(WORKDIR, "tx_ids.npy")
    feat_mm_path = os.path.join(WORKDIR, "x_float32.memmap")
    feat_cols_path = os.path.join(WORKDIR, "feature_cols.npy")
    y_path = os.path.join(WORKDIR, "y_int8.npy")
    edge_path = os.path.join(WORKDIR, "edge_index.npy")
    emb_out_path = os.path.join(WORKDIR, f"emb_{EMB_DIM}d_float32.memmap")

    # A: txId index
    if os.path.exists(txids_path):
        tx_ids = np.load(txids_path)
        print_once(f"[A] reuse: {txids_path}  N={len(tx_ids)}")
    else:
        tx_ids = build_txid_index(TXS_FEATURES, txids_path)

    # B: features memmap
    if os.path.exists(feat_mm_path) and os.path.exists(feat_cols_path):
        feat_cols = np.load(feat_cols_path, allow_pickle=True)
        D = len(feat_cols)
        print_once(f"[B] reuse memmap: {feat_mm_path}  shape=({len(tx_ids)}, {D})")
    else:
        build_features_memmap(TXS_FEATURES, tx_ids, feat_mm_path, feat_cols_path)
        feat_cols = np.load(feat_cols_path, allow_pickle=True)
        D = len(feat_cols)

    # C: labels
    if os.path.exists(y_path):
        y = np.load(y_path)
        print_once(f"[C] reuse: {y_path}")
    else:
        y = build_labels(TXS_CLASSES, tx_ids, y_path)

    # D: edges
    if os.path.exists(edge_path):
        edge_index = np.load(edge_path)
        print_once(f"[D] reuse: {edge_path}  E={edge_index.shape[1]}")
    else:
        edge_index = build_edge_index(TXS_EDGES, tx_ids, edge_path)

    # --- torch tensor にする（features は memmap を Tensor 化） ---
    # memmap → torch (cpu) 参照
    x_mm = np.memmap(feat_mm_path, dtype=np.float32, mode="r", shape=(len(tx_ids), D))
    x = torch.from_numpy(x_mm)  # 参照のみ（巨大でもRAMに全コピーしにくい）
    y_t = torch.from_numpy(y.astype(np.int64))
    edge_t = torch.from_numpy(edge_index)

    data = Data(x=x, edge_index=edge_t, y=y_t)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print_once(f"[INFO] device={device}")

    # --- mask（unknown=-1 を学習除外） ---
    labeled = (data.y != -1)
    idx = labeled.nonzero(as_tuple=False).view(-1)

    # train/val を簡易分割
    perm = idx[torch.randperm(idx.numel())]
    n_train = int(0.8 * perm.numel())
    train_idx = perm[:n_train]
    val_idx = perm[n_train:]

    # NeighborLoader（ミニバッチ学習）
    train_loader = NeighborLoader(
        data,
        input_nodes=train_idx,
        num_neighbors=NUM_NEIGHBORS,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    val_loader = NeighborLoader(
        data,
        input_nodes=val_idx,
        num_neighbors=NUM_NEIGHBORS,
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    model = GraphSAGE(in_dim=D, hidden=HID_DIM, out_dim=EMB_DIM, dropout=0.2).to(device)
    clf = torch.nn.Linear(EMB_DIM, 2).to(device)  # 2値（licit=0, illicit=1）

    opt = torch.optim.Adam(list(model.parameters()) + list(clf.parameters()), lr=LR, weight_decay=1e-4)

    def batch_loss(batch):
        batch = batch.to(device)
        z = model(batch.x, batch.edge_index)  # [batch_size, EMB_DIM] ではなくサブグラフ全ノード分が出る
        # input_nodes が先頭に来る仕様：batch.batch_size が対象ノード数
        z0 = z[:batch.batch_size]
        y0 = batch.y[:batch.batch_size]
        # unknownは入ってこない想定だが念のため除外
        m = (y0 != -1)
        z0 = z0[m]
        y0 = y0[m]
        logits = clf(z0)
        return F.cross_entropy(logits, y0)

    # --- 学習 ---
    print_once("[TRAIN] start")
    for epoch in range(1, EPOCHS + 1):
        model.train(); clf.train()
        total = 0.0
        steps = 0
        for batch in train_loader:
            opt.zero_grad()
            loss = batch_loss(batch)
            loss.backward()
            opt.step()
            total += float(loss.item())
            steps += 1
        avg = total / max(1, steps)

        # 簡易val
        model.eval(); clf.eval()
        correct = 0; n = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                z = model(batch.x, batch.edge_index)
                z0 = z[:batch.batch_size]
                y0 = batch.y[:batch.batch_size]
                m = (y0 != -1)
                z0 = z0[m]; y0 = y0[m]
                pred = clf(z0).argmax(dim=-1)
                correct += int((pred == y0).sum().item())
                n += int(y0.numel())
        acc = correct / max(1, n)
        print_once(f"[TRAIN] epoch={epoch} loss={avg:.4f} val_acc={acc:.4f}")

    # --- 全ノードの 64次元埋め込み生成（バッチで回す） ---
    # ここが「トランザクションごとに64次元」になる部分
    print_once("[EMB] generate embeddings for ALL nodes ...")

    all_idx = torch.arange(data.num_nodes)
    full_loader = NeighborLoader(
        data,
        input_nodes=all_idx,
        num_neighbors=NUM_NEIGHBORS,
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    emb_mm = np.memmap(emb_out_path, dtype=np.float32, mode="w+", shape=(data.num_nodes, EMB_DIM))

    model.eval()
    with torch.no_grad():
        offset = 0
        for batch in full_loader:
            batch = batch.to(device)
            z = model(batch.x, batch.edge_index)
            z0 = z[:batch.batch_size].detach().cpu().numpy().astype(np.float32, copy=False)
            bsz = z0.shape[0]
            # batch.n_id[:batch.batch_size] が元グラフのノードID（index）
            nid = batch.n_id[:batch.batch_size].cpu().numpy()
            emb_mm[nid, :] = z0
            offset += bsz
            if offset % 1_000_000 < BATCH_SIZE:
                print_once(f"[EMB] done nodes: {offset}/{data.num_nodes}")

    emb_mm.flush()
    print_once(f"[EMB] saved memmap: {emb_out_path}")
    print_once("[DONE]")


if __name__ == "__main__":
    main()


[A] Build txId index from features (txId column only) ...
[A] tx nodes: 203769  saved: ./_prep\tx_ids.npy
[B] Build features memmap ...
[B] feature dim D=182 (saved columns: ./_prep\feature_cols.npy)
[B] features memmap saved: ./_prep\x_float32.memmap
[C] Build labels y ...
[C] labels saved: ./_prep\y_int8.npy  (updated 203769 rows)
[D] Build edge_index ...
[D] edges kept: 234355  saved: ./_prep\edge_index.npy
[INFO] device=cpu


  x = torch.from_numpy(x_mm)  # 参照のみ（巨大でもRAMに全コピーしにくい）


[TRAIN] start


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'