In [None]:
import pandas as pd

# Load the three raw tables from the working directory in one statement.
# Only the features file is read with header=None (its columns get integer
# positions); the edge list and the class table take their column names
# from their first line.
features, edges, labels = (
    pd.read_csv("txs_features.txt", header=None),
    pd.read_csv("txs_edgelist.txt"),
    pd.read_csv("txs_classes.txt"),
)

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# -------------------------------
# 1. Load the TXT files (delimiter auto-detected)
# -------------------------------
# sep=None makes pandas sniff the delimiter, which requires the Python
# parsing engine; header=None treats every line, including the first,
# as a data row.
_sniffed_csv = {"header": None, "sep": None, "engine": "python"}
features = pd.read_csv("txs_features.txt", **_sniffed_csv)
labels = pd.read_csv("txs_classes.txt", **_sniffed_csv)

# -------------------------------
# 2. Assign column names automatically
# -------------------------------
# The first feature column is named txId (the merge key used later);
# every remaining column becomes f0, f1, ...
n_cols = features.shape[1]
features.columns = ["txId"] + [f"f{i}" for i in range(n_cols - 1)]

# The label table must provide at least (txId, class). Fail with a clear
# message instead of pandas' generic length-mismatch error.
n_lbl = labels.shape[1]
if n_lbl < 2:
    raise ValueError(
        f"labels needs at least 2 columns (txId, class), got {n_lbl}"
    )
# Any columns beyond the first two are kept under placeholder names
# extra0, extra1, ...; for exactly two columns the extra list is empty.
labels.columns = ["txId", "class"] + [f"extra{i}" for i in range(n_lbl - 2)]

# Compare labels as text. Surrounding whitespace is stripped, and an
# integral float rendering such as "1.0" (produced when the column was
# parsed as float, e.g. because of missing values) is reduced to "1" so
# that a later filter on "1"/"2" does not silently discard every row.
labels["class"] = (
    labels["class"]
    .astype(str)
    .str.strip()
    .str.replace(r"^(-?\d+)\.0$", r"\1", regex=True)
)

# -------------------------------
# 3. Join features with labels
# -------------------------------
# Default (inner) join on txId: only ids present in both tables survive.
df = features.merge(labels[["txId", "class"]], on="txId")

# -------------------------------
# 4. Keep only labels '1' and '2' (drops 3, unknown, etc.)
# -------------------------------
df_bin = df.loc[df["class"].isin(["1", "2"])].copy()

print("ラベル別件数:")
print(df_bin["class"].value_counts())

# Feature columns are the ones whose name begins with "f";
# "txId" and "class" do not match and are therefore left out.
feature_cols = df_bin.columns[df_bin.columns.str.startswith("f")].tolist()
X = df_bin[feature_cols].to_numpy()
# Binary target: class '1' -> 0, class '2' -> 1.
y = df_bin["class"].map({"1": 0, "2": 1}).to_numpy()

print("サンプル数:", len(y))

# -------------------------------
# 5. Train / test split
# -------------------------------
# 80/20 split with a fixed seed, stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# -------------------------------
# 6. Fill NaNs with the median (the key point of this revision)
# -------------------------------
# The medians are learned from the training rows only and then reused
# for the test rows, so no test-set statistics enter preprocessing.
imputer = SimpleImputer(strategy="median").fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Sanity check that no NaN survived the imputation
print("NaN count (train):", np.count_nonzero(np.isnan(X_train)))
print("NaN count (test):", np.count_nonzero(np.isnan(X_test)))

# -------------------------------
# 7. AdaBoost
# -------------------------------
# Weak learner: a decision tree limited to depth 1.
base = DecisionTreeClassifier(max_depth=1)

# `estimator=` is the keyword used by scikit-learn 1.2 and later.
# fit() returns the estimator itself, so the fitted model is bound directly.
model = AdaBoostClassifier(
    estimator=base,
    n_estimators=200,
    learning_rate=0.5,
    random_state=42,
).fit(X_train, y_train)

# -------------------------------
# 8. Results
# -------------------------------
y_pred = model.predict(X_test)

# y holds the remapped target (class '1' -> 0, class '2' -> 1), so bare
# 0/1 row labels are easy to misread as the original class ids. Pin the
# label order explicitly and print names that spell out the mapping.
eval_labels = [0, 1]
eval_names = ["class 1 (y=0)", "class 2 (y=1)"]

print("Confusion Matrix:")
# Rows are true labels, columns are predictions, both in eval_labels order.
print(confusion_matrix(y_test, y_pred, labels=eval_labels))
print("\nClassification Report:")
print(
    classification_report(
        y_test, y_pred, labels=eval_labels, target_names=eval_names
    )
)


ラベル別件数:
class
2    42019
1     4545
Name: count, dtype: int64
サンプル数: 46564
NaN count (train): 0
NaN count (test): 0




Confusion Matrix:
[[ 779  130]
 [  34 8370]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.86      0.90       909
           1       0.98      1.00      0.99      8404

    accuracy                           0.98      9313
   macro avg       0.97      0.93      0.95      9313
weighted avg       0.98      0.98      0.98      9313

