# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ==== Parameters ====
# Edit these constants before running; everything below reads them.
INPUT_CSV     = "/mnt/data/cd_ui.csv"                      # your user,item file
DATASET_NAME  = "my-cd"                                    # name of the generated dataset
OUTPUT_ROOT   = "/mnt/sda1/sherry/SGL-Torch/dataset"       # the project's dataset folder
HAS_HEADER    = False                                      # set True if the first CSV line is a header
REMAP_IDS     = False                                      # set True to force re-encoding user/item as 0..U-1 / 0..I-1
SPLIT_MODE    = "by_order"                                 # "by_order" or "random"
RANDOM_SEED   = 2021                                       # used when SPLIT_MODE="random"

# ==== Load ====
# Produces `df` with exactly two integer columns, "user" and "item", and a
# clean 0..N-1 row index.
if HAS_HEADER:
    df = pd.read_csv(INPUT_CSV)
    # Match the expected column names case-insensitively ("User", "ITEM", ...).
    col_map = {c.lower(): c for c in df.columns}
    missing = [name for name in ("user", "item") if name not in col_map]
    if missing:
        raise KeyError(
            f"{INPUT_CSV}: no column named {missing} in header {list(df.columns)}"
        )
    user_col = col_map["user"]
    item_col = col_map["item"]
    df = df[[user_col, item_col]].rename(columns={user_col: "user", item_col: "item"})
else:
    # usecols pins the first two fields. Without it, a file with extra columns
    # (rating, timestamp, ...) makes pandas treat the leading columns as the
    # index, so "user"/"item" would silently be read from the wrong fields.
    df = pd.read_csv(INPUT_CSV, header=None, names=["user", "item"], usecols=[0, 1])

# Type safety: anything that is not numeric becomes NaN and is dropped.
n_loaded = len(df)
for col in ("user", "item"):
    df[col] = pd.to_numeric(df[col], errors="coerce")
# reset_index keeps row labels equal to row positions after rows are dropped;
# the split step below indexes rows by position through label-based lookups.
df = (
    df.dropna(subset=["user", "item"])
    .astype({"user": int, "item": int})
    .reset_index(drop=True)
)

n_dropped = n_loaded - len(df)
if n_dropped:
    # Typical cause: a header line in a file loaded with HAS_HEADER = False.
    print(f"Dropped {n_dropped} row(s) with non-numeric user/item values")

print("原始筆數:", len(df))
display(df.head())

# ==== Optional: re-encode ids (contiguous, starting from 0) ====
if REMAP_IDS:
    # New ids follow the order in which each raw id first appears in df.
    user_values = df["user"].unique()
    item_values = df["item"].unique()
    u2id = dict(zip(user_values, range(len(user_values))))
    i2id = dict(zip(item_values, range(len(item_values))))
    for column, mapping in (("user", u2id), ("item", i2id)):
        df[column] = df[column].map(mapping).astype(int)

# ==== Split rule ====
# Per user, in the current row order: the last row goes to test, the
# second-to-last to valid, everything earlier to train.
# NOTE(review): a user with one row lands only in test, and a user with two
# rows only in valid + test, so such users have no training rows. Confirm
# this is intended for the downstream model.
if SPLIT_MODE not in ("by_order", "random"):
    # Previously any other value silently behaved like "by_order".
    raise ValueError(
        f'SPLIT_MODE must be "by_order" or "random", got {SPLIT_MODE!r}'
    )

if SPLIT_MODE == "random":
    # rng is not consumed by the shuffle (df.sample seeds itself from
    # RANDOM_SEED); the name is kept in case code after this cell uses it.
    rng = np.random.RandomState(RANDOM_SEED)
    # Shuffle first, then treat each user's last row after shuffling as test.
    df = df.sample(frac=1.0, random_state=RANDOM_SEED)

# With "by_order" the CSV order is kept and taken as chronological order.

# Row labels must equal row positions: `_idx` is positional but is consumed
# through label-based `df.loc[...]`. Without this reset, rows dropped during
# loading leave gaps in the labels and the lookup raises KeyError.
df = df.reset_index(drop=True)
df["_idx"] = np.arange(len(df))  # position of each row in the current order
groups = df.groupby("user", sort=False)

# Distance of each row from its user's last row: 0 = last, 1 = second-to-last.
rank_from_end = groups.cumcount(ascending=False)

# The lists are in row order rather than grouped by user; the split frames
# are sorted by (user, item) afterwards, so the written files are unaffected.
test_idx = df.loc[rank_from_end == 0, "_idx"].tolist()
valid_idx = df.loc[rank_from_end == 1, "_idx"].tolist()
train_idx = df.loc[rank_from_end >= 2, "_idx"].tolist()

# Materialize the three splits from their row-label lists.
def _select_split(row_labels):
    """Return the user/item rows at `row_labels`, sorted by (user, item) and re-indexed."""
    subset = df.loc[row_labels, ["user", "item"]]
    subset = subset.sort_values(by=["user", "item"])
    return subset.reset_index(drop=True)


train_df = _select_split(train_idx)
valid_df = _select_split(valid_idx)
test_df = _select_split(test_idx)

# Quick sanity summary of what was produced.
print(f"Users: {df['user'].nunique()}, Items: {df['item'].nunique()}")
print(f"Split sizes -> train: {len(train_df)}, valid: {len(valid_df)}, test: {len(test_df)}")
display(train_df.head(), valid_df.head(), test_df.head())

# ==== Write the files SGL needs ====
# Layout: <OUTPUT_ROOT>/<DATASET_NAME>/<DATASET_NAME>.{train,valid,test},
# each a header-less "user,item" CSV.
out_dir = Path(OUTPUT_ROOT) / DATASET_NAME
out_dir.mkdir(parents=True, exist_ok=True)
prefix = out_dir / DATASET_NAME  # e.g., dataset/my-cd/my-cd

train_path = f"{prefix}.train"
valid_path = f"{prefix}.valid"
test_path  = f"{prefix}.test"

split_outputs = (
    (train_df, train_path),
    (valid_df, valid_path),
    (test_df, test_path),
)
for frame, target in split_outputs:
    frame.to_csv(target, index=False, header=False)

print("Saved:")
for _, target in split_outputs:
    print("  ", target)
