# 阅读json

## 阅读json并打印它的顶层键名

In [1]:
import json

with open("/home/qygao/matten/datasets/example_crystal_elasticity_tensor_n100.json", "r") as f:
    data = json.load(f)

print(data.keys())
#print(json.dumps(data, indent=2))

dict_keys(['structure', 'formula_pretty', 'crystal_system', 'elastic_tensor_full', 'elastic_tensor_voigt', 'split'])


## 这段代码是在把刚刚读进来的 data 里，取出第 0 个结构样本，然后把它的晶格参数（lattice）和前两个原子位点（sites）打印出来（格式化输出，方便看）。

In [12]:
struct = data["structure"]["0"]

print("=== lattice ===")
print(json.dumps(struct["lattice"], indent=2))

print("\n=== first 2 atoms ===")
print(json.dumps(struct["sites"][:2], indent=2))


=== lattice ===
{
  "matrix": [
    [
      3.83753856,
      0.0,
      0.0
    ],
    [
      0.0,
      3.83753856,
      0.0
    ],
    [
      0.0,
      0.0,
      3.83753856
    ]
  ],
  "pbc": [
    true,
    true,
    true
  ],
  "a": 3.83753856,
  "b": 3.83753856,
  "c": 3.83753856,
  "alpha": 90.0,
  "beta": 90.0,
  "gamma": 90.0,
  "volume": 56.5142875522
}

=== first 2 atoms ===
[
  {
    "species": [
      {
        "element": "Ba",
        "occu": 1
      }
    ],
    "abc": [
      0.0,
      0.0,
      0.0
    ],
    "xyz": [
      0.0,
      0.0,
      0.0
    ],
    "label": "Ba",
    "properties": {
      "magmom": 0.0
    }
  },
  {
    "species": [
      {
        "element": "Si",
        "occu": 1
      }
    ],
    "abc": [
      0.5,
      0.5,
      0.5
    ],
    "xyz": [
      1.91876928,
      1.91876928,
      1.91876928
    ],
    "label": "Si",
    "properties": {
      "magmom": 0.0
    }
  }
]


# 把“介电张量数据集”整理成和示例“弹性张量数据集”类似的 DataFrame 结构（列名/列顺序/含 split），然后保存成一个新的 JSON 文件，方便后续用同一套 matten 的数据管线去读。

In [4]:
import pandas as pd

from pymatgen.core.structure import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

from matten.data import split as matten_split


# ===== 1. 路径设置 =====
elastic_path = "/home/qygao/matten/datasets/example_crystal_elasticity_tensor_n100.json"  # 示例数据
dielectric_path = "/home/qygao/matten/datasets/di_pizeoelectric_tensor/dielectric_tensors.json"
save_path = "/home/qygao/matten/datasets/di_pizeoelectric_tensor/dielectric_tensors_like_elastic.json"


# ===== 2. 读取两个数据集，看一下结构 =====
df_elastic = pd.read_json(elastic_path)
print("弹性张量列名:", df_elastic.columns.tolist())
print("弹性样本数:", len(df_elastic))

df_dielectric = pd.read_json(dielectric_path)
print("介电张量原始列名:", df_dielectric.columns.tolist())
print("介电样本数:", len(df_dielectric))

print("\n介电张量第一行（方便确认结构）：")
print(df_dielectric.iloc[0].to_dict())


# ===== 3. 从 structure 提取 formula_pretty 和 crystal_system =====
# structure 这一列现在是 pymatgen.Structure 的 dict 表示

def get_formula(struct_dict):
    """从 structure 字典提取漂亮化学式."""
    struct = Structure.from_dict(struct_dict)
    return struct.composition.reduced_formula


def get_crystal_system(struct_dict):
    """从 structure 字典提取晶系（triclinic, monoclinic, ...）."""
    struct = Structure.from_dict(struct_dict)
    try:
        sga = SpacegroupAnalyzer(struct, symprec=1e-3)
        return sga.get_crystal_system()
    except Exception as e:
        print("get_crystal_system 出错, 返回 None:", e)
        return None


print("\n=== 生成 formula_pretty 和 crystal_system 列 ===")
df_dielectric["formula_pretty"] = df_dielectric["structure"].apply(get_formula)
df_dielectric["crystal_system"] = df_dielectric["structure"].apply(get_crystal_system)

print("增加列后:", df_dielectric.columns.tolist())


# ===== 4. 把介电张量列 rename 成和弹性张量一样的名字 =====
# 这里选择 total 作为主张量，对应 elastic_tensor_full
# 注意：这只是“名字对齐”，物理意义上不是弹性张量

rename_dict = {
    "dielectric_tensor_total": "elastic_tensor_full",
}
df_dielectric = df_dielectric.rename(columns=rename_dict)

# elastic_tensor_voigt 在介电问题里其实没有“规范”的 6x6 Voigt 形式，
# 为了对齐示例数据，我们先造一个占位列：
#   方案 1：干脆设成 None
#   方案 2：直接复制 total (3x3)，后续模型只用 elastic_tensor_full 这一列
df_dielectric["elastic_tensor_voigt"] = None

print("重命名后列名:", df_dielectric.columns.tolist())


# ===== 5. 划分 train / val / test，并加 split 列 =====
# 用你之前写的 train_val_test_split_dataframe
print("\n=== 划分 train/val/test 并添加 split 列 ===")

train_df, val_df, test_df = matten_split.train_val_test_split_dataframe(
    df_dielectric,
    val_size=0.1,   # 验证集 10%
    test_size=0.1,  # 测试集 10%
    stratify=None,
    random_state=42,
)

train_df["split"] = "train"
val_df["split"] = "val"
test_df["split"] = "test"

df_all = pd.concat([train_df, val_df, test_df]).sort_index()

print("各子集样本数：",
      "train =", (df_all["split"] == "train").sum(),
      "val =", (df_all["split"] == "val").sum(),
      "test =", (df_all["split"] == "test").sum())


# ===== 6. 做一个“完全对齐”的视图，并保存 =====
# 和 example_crystal_elasticity_tensor_n100.json 一样的列顺序
aligned_cols = [
    "structure",
    "formula_pretty",
    "crystal_system",
    "elastic_tensor_full",
    "elastic_tensor_voigt",
    "split",
]

df_aligned = df_all[aligned_cols]
df_aligned = df_aligned.rename(columns={
    "elastic_tensor_full": "dielectric_tensor",
    "elastic_tensor_voigt": "dielectric_tensor_voigt"
})
df_aligned.to_json(save_path)
print("\n已保存对齐后的介电数据到：", save_path)
print("最终列名:", df_aligned.columns.tolist())
print("最终样本数:", len(df_aligned))


弹性张量列名: ['structure', 'formula_pretty', 'crystal_system', 'elastic_tensor_full', 'elastic_tensor_voigt', 'split']
弹性样本数: 100
介电张量原始列名: ['structure', 'mpid', 'dielectric_tensor_total', 'dielectric_tensor_ionic', 'dielectric_tensor_electronic']
介电样本数: 7277

介电张量第一行（方便确认结构）：
{'structure': {'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0, 'lattice': {'matrix': [[-2.02233921, -3.50279292, 0.0], [-4.0446774, -1e-08, 0.0], [-2.02233921, -1.1675976399999999, -7.31350573]], 'pbc': [True, True, True], 'a': 4.0446772579, 'b': 4.0446774, 'c': 7.6772720541, 'alpha': 74.7269702201, 'beta': 74.7269717408, 'gamma': 59.9999903541, 'volume': 103.6153162731}, 'properties': {}, 'sites': [{'species': [{'element': 'K', 'occu': 1}], 'abc': [0.5, 0.5, 0.5], 'properties': {'magmom': 0.0}, 'label': 'K', 'xyz': [-4.04467791, -2.335195285, -3.656752865]}, {'species': [{'element': 'Y', 'occu': 1}], 'abc': [0.0, 0.0, 0.0], 'properties': {'magmom': 0.0}, 'label': 'Y', 'xyz': [0.0, 0.0, 0.0]}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["split"] = "test"



已保存对齐后的介电数据到： /home/qygao/matten/datasets/di_pizeoelectric_tensor/dielectric_tensors_like_elastic.json
最终列名: ['structure', 'formula_pretty', 'crystal_system', 'dielectric_tensor', 'dielectric_tensor_voigt', 'split']
最终样本数: 7277


# 同理，处理压电张量（但是处理Voigt矩阵可能有问题）

In [None]:
import pandas as pd

from pymatgen.core.structure import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

from matten.data import split as matten_split


# ===== 1. 路径设置 =====
elastic_path = "/home/qygao/matten/datasets/example_crystal_elasticity_tensor_n100.json"  # 示例数据
piezoelectric_path = "/home/qygao/matten/datasets/di_pizeoelectric_tensor/piezoelectric_tensors.json"
save_path = "/home/qygao/matten/datasets/di_pizeoelectric_tensor/piezoelectric_tensors_like_elastic.json"


# ===== 2. 读取两个数据集，看一下结构 =====
df_elastic = pd.read_json(elastic_path)
print("弹性张量列名:", df_elastic.columns.tolist())
print("弹性样本数:", len(df_elastic))

df_piezoelectric = pd.read_json(piezoelectric_path)
print("压电张量原始列名:", df_piezoelectric.columns.tolist())
print("压电样本数:", len(df_piezoelectric))

print("\n压电张量第一行（方便确认结构）：")
print(df_piezoelectric.iloc[0].to_dict())


# ===== 3. 从 structure 提取 formula_pretty 和 crystal_system =====
# structure 这一列现在是 pymatgen.Structure 的 dict 表示

def get_formula(struct_dict):
    """从 structure 字典提取漂亮化学式."""
    struct = Structure.from_dict(struct_dict)
    return struct.composition.reduced_formula


def get_crystal_system(struct_dict):
    """从 structure 字典提取晶系（triclinic, monoclinic, ...）."""
    struct = Structure.from_dict(struct_dict)
    try:
        sga = SpacegroupAnalyzer(struct, symprec=1e-3)
        return sga.get_crystal_system()
    except Exception as e:
        print("get_crystal_system 出错, 返回 None:", e)
        return None


print("\n=== 生成 formula_pretty 和 crystal_system 列 ===")
df_piezoelectric["formula_pretty"] = df_piezoelectric["structure"].apply(get_formula)
df_piezoelectric["crystal_system"] = df_piezoelectric["structure"].apply(get_crystal_system)

print("增加列后:", df_piezoelectric.columns.tolist())


# ===== 4. 把压电张量列 rename 成和弹性张量一样的名字 =====
# 这里选择 total 作为主张量，对应 elastic_tensor_full
# 注意：这只是“名字对齐”，物理意义上不是弹性张量

rename_dict = {
    "piezoelectric_tensor_total": "elastic_tensor_full",
}
df_piezoelectric = df_piezoelectric.rename(columns=rename_dict)

# elastic_tensor_voigt 在压电问题里其实没有“规范”的 6x6 Voigt 形式，
# 为了对齐示例数据，我们先造一个占位列：
#   方案 1：干脆设成 None
#   方案 2：直接复制 total (3x3)，后续模型只用 elastic_tensor_full 这一列
df_piezoelectric["elastic_tensor_voigt"] = None

print("重命名后列名:", df_piezoelectric.columns.tolist())


# ===== 5. 划分 train / val / test，并加 split 列 =====
# 用你之前写的 train_val_test_split_dataframe
print("\n=== 划分 train/val/test 并添加 split 列 ===")

train_df, val_df, test_df = matten_split.train_val_test_split_dataframe(
    df_piezoelectric,
    val_size=0.1,   # 验证集 10%
    test_size=0.1,  # 测试集 10%
    stratify=None,
    random_state=42,
)

train_df["split"] = "train"
val_df["split"] = "val"
test_df["split"] = "test"

df_all = pd.concat([train_df, val_df, test_df]).sort_index()

print("各子集样本数：",
      "train =", (df_all["split"] == "train").sum(),
      "val =", (df_all["split"] == "val").sum(),
      "test =", (df_all["split"] == "test").sum())


# ===== 6. 做一个“完全对齐”的视图，并保存 =====
# 和 example_crystal_elasticity_tensor_n100.json 一样的列顺序
aligned_cols = [
    "structure",
    "formula_pretty",
    "crystal_system",
    "elastic_tensor_full",
    "elastic_tensor_voigt",
    "split",
]

df_aligned = df_all[aligned_cols]
df_aligned = df_aligned.rename(columns={
    "elastic_tensor_full": "piezoelectric_tensor",
    "elastic_tensor_voigt": "piezoelectric_tensor_voigt"
})
df_aligned.to_json(save_path)
print("\n已保存对齐后的压电数据到：", save_path)
print("最终列名:", df_aligned.columns.tolist())
print("最终样本数:", len(df_aligned))


弹性张量列名: ['structure', 'formula_pretty', 'crystal_system', 'elastic_tensor_full', 'elastic_tensor_voigt', 'split']
弹性样本数: 100
介电张量原始列名: ['structure', 'mpid', 'piezoelectric_tensor_total', 'piezoelectric_tensor_ionic', 'piezoelectric_tensor_electronic']
介电样本数: 3292

介电张量第一行（方便确认结构）：
{'structure': {'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0, 'lattice': {'matrix': [[2.84165008, -1.64062857, 0.0], [-1e-08, 3.28125511, 0.0], [0.0, 0.0, 5.28227338]], 'pbc': [True, True, True], 'a': 3.2812554429, 'b': 3.28125511, 'c': 5.28227338, 'alpha': 90.0, 'beta': 90.0, 'gamma': 120.0000172833, 'volume': 49.252861621}, 'properties': {}, 'sites': [{'species': [{'element': 'Zr', 'occu': 1}], 'abc': [0.0, 0.0, 0.0021306800000000002], 'properties': {'magmom': 0.0}, 'label': 'Zr', 'xyz': [0.0, 0.0, 0.011254834200000001]}, {'species': [{'element': 'Zn', 'occu': 1}], 'abc': [0.666667, 0.333333, 0.45002818], 'properties': {'magmom': 0.0}, 'label': 'Zn', 'xyz': [1.8944343306, -2.31730

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["split"] = "test"


# 把前面生成的 “对齐后的总表（带 split 列）” 再进一步处理成 MatTen 更常用的三份文件：train/val/test 三个 JSON，而且保存成 list of dict（records） 的格式，通常更容易被数据加载器直接读取。

In [6]:
import json
import pandas as pd
from pathlib import Path

root = Path("/home/qygao/matten/datasets/di_pizeoelectric_tensor")

# 刚才保存的“总文件”，里面已经有 split 列
like_path = root / "piezoelectric_tensors_like_elastic.json"

df = pd.read_json(like_path)
print("读取对齐数据：", like_path)
print("列名:", df.columns.tolist())
print("样本数:", len(df))
print("split 分布:\n", df["split"].value_counts())

# 按 split 划分
df_train = df[df["split"] == "train"].copy()
df_val   = df[df["split"] == "val"].copy()
df_test  = df[df["split"] == "test"].copy()

print("\n划分结果：",
      "train =", len(df_train),
      "val =", len(df_val),
      "test =", len(df_test))

# 保存成 MatTen 常规用的“记录列表”格式（list of dict）
def save_records(df_sub, path):
    records = df_sub.to_dict(orient="records")
    with open(path, "w") as f:
        json.dump(records, f)
    print("已保存:", path, "样本数:", len(records))

train_path = root / "piezoelectric_tensor_train2.json"
val_path   = root / "piezoelectric_tensor_val2.json"
test_path  = root / "piezoelectric_tensor_test2.json"

save_records(df_train, train_path)
save_records(df_val,   val_path)
save_records(df_test,  test_path)

print("\n完成：三个子集 json 已生成。")


读取对齐数据： /home/qygao/matten/datasets/di_pizeoelectric_tensor/piezoelectric_tensors_like_elastic.json
列名: ['structure', 'formula_pretty', 'crystal_system', 'piezoelectric_tensor', 'piezoelectric_tensor_voigt', 'split']
样本数: 3292
split 分布:
 split
train    2635
val       329
test      328
Name: count, dtype: int64

划分结果： train = 2635 val = 329 test = 328
已保存: /home/qygao/matten/datasets/di_pizeoelectric_tensor/piezoelectric_tensor_train2.json 样本数: 2635
已保存: /home/qygao/matten/datasets/di_pizeoelectric_tensor/piezoelectric_tensor_val2.json 样本数: 329
已保存: /home/qygao/matten/datasets/di_pizeoelectric_tensor/piezoelectric_tensor_test2.json 样本数: 328

完成：三个子集 json 已生成。


## 把dielectric_tensor_val.json 里的每个 3×3 介电张量做对称化,并且可选生成一个 Voigt 6 维表示，然后写到新文件里。

In [None]:
import json
from pathlib import Path
import numpy as np

# ====== 你只需要改这里 ======
in_file = "/home/qygao/matten/datasets/di_pizeoelectric_tensor/dielectric_tensor_val.json"
out_dir = "/home/qygao/matten/datasets"
suffix = "_symmetric"     # 输出文件名后缀；想覆盖原文件就设为 "" 然后自己确认覆盖逻辑
add_voigt = True          # 是否生成 dielectric_tensor_voigt
# ==========================

in_path = Path(in_file)
out_path = Path(out_dir) / f"{in_path.stem}{suffix}{in_path.suffix}"
Path(out_dir).mkdir(parents=True, exist_ok=True)
s
with in_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

dt = data.get("dielectric_tensor")
if dt is None:
    raise KeyError("缺少 key: dielectric_tensor")
if not isinstance(dt, dict):
    raise TypeError("dielectric_tensor 不是 dict")

# 对称化：A <- (A + A^T)/2
for k, A in dt.items():
    A = np.array(A, dtype=float)
    if A.shape != (3, 3):
        raise ValueError(f"{k} 不是 3x3，而是 {A.shape}")
    dt[k] = (0.5 * (A + A.T)).tolist()

data["dielectric_tensor"] = dt

# 可选：生成 Voigt（[xx, yy, zz, yz, xz, xy]）
if add_voigt:
    data["dielectric_tensor_voigt"] = {
        k: [v[0][0], v[1][1], v[2][2], v[1][2], v[0][2], v[0][1]]
        for k, v in dt.items()
    }

with out_path.open("w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("输入:", str(in_path))
print("输出:", str(out_path))


输入: /home/qygao/matten/datasets/di_pizeoelectric_tensor/dielectric_tensor_val.json
输出: /home/qygao/matten/datasets/dielectric_tensor_val_symmetric.json


## 检查一个 JSON 数据集里每个 3×3 介电张量是不是对称矩阵（在容差 tol 内），并统计有多少对称、多少不对称，同时找出“最不对称”的那条样本和输出一些 bad 样本信息。

In [None]:
import json
from pathlib import Path
import numpy as np

# ====== 你改这里 ======
in_file = "/home/qygao/matten/datasets/dielectric_tensor_train_symmetric.json"  # 原始数据集文件
key = "dielectric_tensor"  # 介电张量所在 key
tol = 1e-8                # 判定对称的容差
# ======================

data = json.load(open(in_file, "r", encoding="utf-8"))
dt = data.get(key)
if dt is None:
    raise KeyError(f"缺少 key: {key}")
if not isinstance(dt, dict):
    raise TypeError(f"{key} 不是 dict")

max_asym = -1.0
worst_id = None
n_total = 0
n_symmetric = 0
bad = []

for mid, A in dt.items():
    A = np.array(A, dtype=float)
    if A.shape != (3, 3):
        bad.append((mid, "not_3x3", A.shape))
        continue

    asym = np.max(np.abs(A - A.T))
    n_total += 1
    if asym <= tol:
        n_symmetric += 1
    else:
        bad.append((mid, "asym", float(asym)))

    if asym > max_asym:
        max_asym = float(asym)
        worst_id = mid

print("file:", in_file)
print("tol :", tol)
print("total_3x3:", n_total)
print("symmetric:", n_symmetric)
print("not_symmetric:", n_total - n_symmetric)
print("max_asym:", max_asym, "worst_id:", worst_id)

# 打印前几个不对称样本（可调）
print("\nfirst_bad (up to 10):")
for x in bad[:10]:
    print(x)

# 如果你想把所有不对称的 id 单独列出来：
# bad_ids = [mid for (mid, t, *rest) in bad if t == "asym"]
# print("bad_ids:", bad_ids[:50], "...")


FileNotFoundError: [Errno 2] No such file or directory: 'YOUR_PATH.json'