In [1]:
import os
import numpy as np
import dpdata

# === Paths and dataset constants ===
raw_dir = "/home/jovyan/Desktop/Cu-DeePMD/00.data/Cu_raw/Cu_dataset/set.000"
out_train = "/home/jovyan/Desktop/Cu-DeePMD/00.data/training_data"
out_val = "/home/jovyan/Desktop/Cu-DeePMD/00.data/validation_data"
type_map = ["Cu"]
n_atoms = 4  # atoms per frame; fixes the per-frame shape of coord/force/type


def load_raw(directory, name, frame_shape, dtype=float):
    """Load ``<directory>/<name>.raw`` as an array shaped ``(-1, *frame_shape)``.

    Args:
        directory: Folder that contains the ``.raw`` text files.
        name: File stem, e.g. ``"coord"`` for ``coord.raw``.
        frame_shape: Shape of one frame; the leading (frame) axis is inferred.
        dtype: Element type handed to ``np.loadtxt``.

    Returns:
        The file contents reshaped so that axis 0 indexes frames.

    Raises:
        ValueError: If the number of values is not a multiple of the frame size
            (raised by ``reshape``).
    """
    values = np.loadtxt(os.path.join(directory, name + ".raw"), dtype=dtype)
    return values.reshape((-1, *frame_shape))


def check_frame_counts(arrays):
    """Return the common length of axis 0 of ``arrays``, or raise if they differ.

    Args:
        arrays: Mapping of a label to an array whose axis 0 indexes frames.

    Returns:
        The shared number of frames.

    Raises:
        ValueError: If the arrays do not all have the same number of frames,
            e.g. because ``n_atoms`` does not match the raw files.
    """
    counts = {label: arr.shape[0] for label, arr in arrays.items()}
    if len(set(counts.values())) != 1:
        raise ValueError(f"inconsistent frame counts across raw files: {counts}")
    return next(iter(counts.values()))


def split_frame_indices(n_frames, val_size, rng):
    """Split ``range(n_frames)`` into training and validation frame indices.

    Args:
        n_frames: Total number of frames.
        val_size: Number of frames to draw for validation.
        rng: A ``numpy.random.Generator``; exactly one ``choice`` call is made.

    Returns:
        ``(train_idx, val_idx)``. ``val_idx`` is the array returned by
        ``rng.choice`` (drawn without replacement, in draw order) and
        ``train_idx`` is the ascending list of all remaining indices.
    """
    val_idx = rng.choice(n_frames, size=val_size, replace=False)
    # setdiff1d returns sorted unique values, so the training order is
    # deterministic instead of depending on set iteration order.
    train_idx = np.setdiff1d(np.arange(n_frames), val_idx).tolist()
    return train_idx, val_idx


# The guard is true in a notebook kernel and when run as a script, so every
# name below is still created as a global exactly as before; it only keeps the
# file I/O from running when the helpers above are imported.
if __name__ == "__main__":
    # === Load the raw files ===
    coord = load_raw(raw_dir, "coord", (n_atoms, 3))
    force = load_raw(raw_dir, "force", (n_atoms, 3))
    virial = load_raw(raw_dir, "virial", (3, 3))
    box = load_raw(raw_dir, "box", (3, 3))
    energy = load_raw(raw_dir, "energy", (1,))
    type_data = load_raw(raw_dir, "type", (n_atoms,), dtype=int)

    # Fail early if a reshape silently produced a different number of frames.
    n_frames = check_frame_counts(
        {
            "coord": coord,
            "force": force,
            "virial": virial,
            "box": box,
            "energy": energy,
        }
    )

    # === Build the data dictionary, including "orig" ===
    atom_types = type_data[0]
    data = {
        "coords": coord,
        "forces": force,
        # reshape(-1) stays 1-D even for a single frame, where squeeze()
        # would collapse to a 0-d array.
        "energies": energy.reshape(-1),
        "virials": virial,
        # NOTE(review): "box" and "types" do not look like keys dpdata reads
        # ("cells" and "atom_types" carry the same data); kept so the dict
        # contents stay backward-compatible. Confirm before removing.
        "box": box,
        "cells": box,
        "types": type_data,
        "atom_types": atom_types,
        # Number of atoms of each type, in type_map order.
        "atom_numbs": np.bincount(atom_types, minlength=len(type_map)).tolist(),
        "atom_names": list(type_map),
        # Cell origin: a zero vector. The earlier np.arange(3) placed the
        # origin at (0, 1, 2).
        "orig": np.zeros(3),
    }

    # === Build the LabeledSystem object ===
    # NOTE(review): type_map may be ignored by dpdata when data= is supplied;
    # the names above already follow type_map. Verify against the dpdata docs.
    system = dpdata.LabeledSystem(data=data, type_map=type_map)
    print(f"✅ 加载成功，总帧数: {len(system)}")

    # === Split into training / validation sets ===
    rng = np.random.default_rng(42)
    val_size = int(0.2 * n_frames)
    train_idx, val_idx = split_frame_indices(n_frames, val_size, rng)

    train_sys = system.sub_system(train_idx)
    val_sys = system.sub_system(val_idx)

    # === Write the output datasets ===
    train_sys.to_deepmd_npy(out_train)
    val_sys.to_deepmd_npy(out_val)

    print(f"🎉 数据集划分完成！训练帧: {len(train_sys)}, 验证帧: {len(val_sys)}")


✅ 加载成功，总帧数: 500
🎉 数据集划分完成！训练帧: 400, 验证帧: 100
