In [7]:
import pandas as pd
import numpy as np
import networkx as nx
from node2vec import Node2Vec

# --- Input tables (each is read with pandas.read_csv in main()) ---
TXS_EDGES = "txs_edgelist.txt"
TXS_CLASSES = "txs_classes.txt"
TXS_FEATURES = "txs_features.txt"

# --- Output: CSV written by main() with one embedding row per txId ---
OUT_EMB_CSV = "ellipticpp_node2vec_emb.csv"

# --- Node2Vec hyperparameters ---
EMB_DIM = 64  # passed as `dimensions`; also the number of emb_* output columns
NUM_WALKS = 5  # passed as `num_walks`
WALK_LENGTH = 20  # passed as `walk_length`
WINDOW = 10  # passed as `window` to the fit() call
WORKERS = 4  # passed as `workers`

def main():
    """Train Node2Vec on the transaction graph and export the embeddings.

    Steps:
      1. Read the tables named by ``TXS_FEATURES``, ``TXS_CLASSES`` and
         ``TXS_EDGES`` and print each one's shape.
      2. Build an undirected graph from the first two columns of the edge
         list, with node ids converted to strings.
      3. Fit Node2Vec using the module-level hyperparameters.
      4. Write one row per ``txId`` of the features table to ``OUT_EMB_CSV``.
         Ids without a learned vector keep NaN in every ``emb_*`` column and
         are counted in the "missing embeddings" message.
    """

    def read_table(banner, path):
        # Announce the load, parse the CSV, then report its dimensions.
        print(banner)
        frame = pd.read_csv(path)
        print("  shape:", frame.shape)
        return frame

    df_features = read_table("Loading txs_features...", TXS_FEATURES)
    # The classes table is loaded only for its shape report; nothing below uses it.
    read_table("\nLoading txs_classes...", TXS_CLASSES)
    df_edges = read_table("\nLoading txs_edgelist...", TXS_EDGES)

    # --- Build the graph (ids are cast to str on both the edge side and the
    # feature side so that later lookups compare like with like) ---
    edge_frame = pd.DataFrame(
        {
            "s": df_edges.iloc[:, 0].astype(str),
            "t": df_edges.iloc[:, 1].astype(str),
        }
    )
    # Start with an undirected graph for Node2Vec.
    graph = nx.from_pandas_edgelist(
        edge_frame, source="s", target="t", create_using=nx.Graph()
    )
    print("\nGraph:")
    print("  nodes:", graph.number_of_nodes())
    print("  edges:", graph.number_of_edges())

    print("\nTraining Node2Vec (gensim)...")
    walker = Node2Vec(
        graph,
        dimensions=EMB_DIM,
        walk_length=WALK_LENGTH,
        num_walks=NUM_WALKS,
        workers=WORKERS,
        quiet=False,
    )
    model = walker.fit(window=WINDOW, min_count=1, batch_words=4096)

    # --- Lay the embeddings out in the txId order of the features table;
    # rows without a vector stay NaN ---
    tx_ids = df_features["txId"].astype(str).values
    embeddings = np.full((len(tx_ids), EMB_DIM), np.nan, dtype=np.float32)
    keyed_vectors = model.wv

    missing = 0
    for row, key in enumerate(tx_ids):
        if key not in keyed_vectors:
            missing += 1
            continue
        embeddings[row] = keyed_vectors[key]

    print("missing embeddings:", missing)

    emb_columns = [f"emb_{i}" for i in range(EMB_DIM)]
    out = pd.DataFrame(embeddings, columns=emb_columns)
    # Keep the original (uncast) txId values as the leading column.
    out.insert(0, "txId", df_features["txId"].values)
    out.to_csv(OUT_EMB_CSV, index=False)
    print("saved:", OUT_EMB_CSV)

    print("\nDone.")

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [2]:
# Environment check: print the version string of the installed PyTorch build.
import torch
print(torch.__version__)

2.6.0+cpu
