# Tutorial on xtalmet package
This notebook provides a tutorial on the two primary features of the `xtalmet` package:
- Calculating distances between crystals
- Evaluating the uniqueness and novelty of a given set of generated crystals.

### Setup

In [None]:
import gzip
import pickle

from huggingface_hub import hf_hub_download
from pymatgen.core import Lattice, Structure

from xtalmet.constants import HF_VERSION
from xtalmet.distance import distance
from xtalmet.evaluator import Evaluator

### Distance calculation

In [2]:
# Load the four reference crystals from their CIF files.
wzZnO = Structure.from_file("data/wz-ZnO.cif")
rsZnO = Structure.from_file("data/rs-ZnO.cif")
wzGaN = Structure.from_file("data/wz-GaN.cif")
Bi2Te3 = Structure.from_file("data/Bi2Te3.cif")

# The fifth crystal is the 2x2x2 supercell of wz-ZnO, built by hand: every site is
# copied once per (i, j, k) cell offset, and its fractional coordinates are shifted
# by that offset and halved to express them in the doubled cell.
cell_offsets = [(i, j, k) for i in range(2) for j in range(2) for k in range(2)]
frac_coords_super = [
	[(x + i) / 2, (y + j) / 2, (z + k) / 2]
	for x, y, z in wzZnO.frac_coords
	for i, j, k in cell_offsets
]
# Each species is repeated once per offset, in the same order as the coordinates.
species_super = [specie for specie in wzZnO.species for _ in cell_offsets]

# Double the three edge lengths and keep the cell angles unchanged.
base_lattice = wzZnO.lattice
wzZnO_2x2x2 = Structure(
	lattice=Lattice.from_dict(
		{
			"a": 2 * base_lattice.a,
			"b": 2 * base_lattice.b,
			"c": 2 * base_lattice.c,
			"alpha": base_lattice.alpha,
			"beta": base_lattice.beta,
			"gamma": base_lattice.gamma,
		}
	),
	species=species_super,
	coords=frac_coords_super,
)



In [3]:
# d_smat: a binary distance based on pymatgen's StructureMatcher
# wz-ZnO is compared against each of the other four crystals in turn.
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_smat(wz-ZnO, {label}) = {distance('smat', wzZnO, other)}")

d_smat(wz-ZnO, wz-ZnO supercell) = 0.0
d_smat(wz-ZnO, rs-ZnO) = 1.0
d_smat(wz-ZnO, wz-GaN) = 1.0
d_smat(wz-ZnO, Bi2Te3) = 1.0


In [4]:
# Arguments for StructureMatcher can optionally be supplied through kwargs["args_dist"].
# ref: https://pymatgen.org/pymatgen.analysis.html#pymatgen.analysis.structure_matcher.StructureMatcher
args_dist = {"ltol": 0.3, "stol": 0.5, "angle_tol": 6}
kwargs = {"args_dist": args_dist}
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_smat(wz-ZnO, {label}) = {distance('smat', wzZnO, other, **kwargs)}")

d_smat(wz-ZnO, wz-ZnO supercell) = 0.0
d_smat(wz-ZnO, rs-ZnO) = 1.0
d_smat(wz-ZnO, wz-GaN) = 1.0
d_smat(wz-ZnO, Bi2Te3) = 1.0


In [5]:
# d_comp: a binary distance based on the match of compositions
# wz-ZnO is compared against each of the other four crystals in turn.
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_comp(wz-ZnO, {label}) = {distance('comp', wzZnO, other)}")

d_comp(wz-ZnO, wz-ZnO supercell) = 0.0
d_comp(wz-ZnO, rs-ZnO) = 0.0
d_comp(wz-ZnO, wz-GaN) = 1.0
d_comp(wz-ZnO, Bi2Te3) = 1.0


In [6]:
# d_wyckoff: a binary distance based on the match of space groups and Wyckoff letters
# wz-ZnO is compared against each of the other four crystals in turn.
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_wyckoff(wz-ZnO, {label}) = {distance('wyckoff', wzZnO, other)}")

d_wyckoff(wz-ZnO, wz-ZnO supercell) = 0.0
d_wyckoff(wz-ZnO, rs-ZnO) = 1.0
d_wyckoff(wz-ZnO, wz-GaN) = 0.0
d_wyckoff(wz-ZnO, Bi2Te3) = 1.0


In [7]:
# d_magpie: a continuous distance using compositional Magpie fingerprints
# wz-ZnO is compared against each of the other four crystals in turn.
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_magpie(wz-ZnO, {label}) = {distance('magpie', wzZnO, other)}")

d_magpie(wz-ZnO, wz-ZnO supercell) = 0.0
d_magpie(wz-ZnO, rs-ZnO) = 0.0
d_magpie(wz-ZnO, wz-GaN) = 629.7824086796237
d_magpie(wz-ZnO, Bi2Te3) = 1069.5654971383722


In [8]:
# d_pdd: a continuous structural distance using the pointwise distance distribution.
# wz-ZnO is compared against each of the other four crystals in turn.
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_pdd(wz-ZnO, {label}) = {distance('pdd', wzZnO, other)}")

d_pdd(wz-ZnO, wz-ZnO supercell) = 0.0
d_pdd(wz-ZnO, rs-ZnO) = 1.0416266
d_pdd(wz-ZnO, wz-GaN) = 0.09684124
d_pdd(wz-ZnO, Bi2Te3) = 3.2760717080000004


In [9]:
# Arguments for amd.PDD and amd.PDD_cdist can optionally be supplied:
# kwargs["args_emb"] holds the arguments for amd.PDD, and kwargs["args_dist"] holds
# the arguments for amd.PDD_cdist.
# ref: https://average-minimum-distance.readthedocs.io/en/stable/Using_PDDs.html
args_emb = {"k": 200}
args_dist = {
	"metric": "chebyshev",
	"backend": "multiprocessing",
	"n_jobs": 2,
	"verbose": False,
}
kwargs = {"args_emb": args_emb, "args_dist": args_dist}
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_pdd(wz-ZnO, {label}) = {distance('pdd', wzZnO, other, **kwargs)}")

d_pdd(wz-ZnO, wz-ZnO supercell) = 0.0
d_pdd(wz-ZnO, rs-ZnO) = 1.0416266
d_pdd(wz-ZnO, wz-GaN) = 0.12082958
d_pdd(wz-ZnO, Bi2Te3) = 3.6986502420000003


In [10]:
# d_amd: a continuous structural distance using the average minimum distance.
# wz-ZnO is compared against each of the other four crystals in turn.
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_amd(wz-ZnO, {label}) = {distance('amd', wzZnO, other)}")

d_amd(wz-ZnO, wz-ZnO supercell) = 4.440892098500626e-15
d_amd(wz-ZnO, rs-ZnO) = 1.0416266067835895
d_amd(wz-ZnO, wz-GaN) = 0.0968412400000016
d_amd(wz-ZnO, Bi2Te3) = 3.239856771319114


In [11]:
# Arguments for amd.AMD and amd.AMD_cdist can optionally be supplied:
# kwargs["args_emb"] holds the arguments for amd.AMD, and kwargs["args_dist"] holds
# the arguments for amd.AMD_cdist.
# ref: https://average-minimum-distance.readthedocs.io/en/stable/Using_AMDs.html
args_emb = {"k": 200}
args_dist = {"metric": "chebyshev", "low_memory": False}
kwargs = {"args_emb": args_emb, "args_dist": args_dist}
for label, other in [
	("wz-ZnO supercell", wzZnO_2x2x2),
	("rs-ZnO", rsZnO),
	("wz-GaN", wzGaN),
	("Bi2Te3", Bi2Te3),
]:
	print(f"d_amd(wz-ZnO, {label}) = {distance('amd', wzZnO, other, **kwargs)}")

d_amd(wz-ZnO, wz-ZnO supercell) = 7.105427357601002e-15
d_amd(wz-ZnO, rs-ZnO) = 1.0416266067835895
d_amd(wz-ZnO, wz-GaN) = 0.1208295822553982
d_amd(wz-ZnO, Bi2Te3) = 3.59863379947711


### Uniqueness and Novelty evaluation

The `Evaluator` class computes the uniqueness and novelty of a set of generated crystals.

In [12]:
# Load a set of generated crystals from the Hugging Face Hub.
#
# To evaluate your own crystals instead, prepare a list[xtalmet.crystal.Crystal] or a
# list[pymatgen.core.Structure] and assign it to gen_xtals.
archive_path = hf_hub_download(
	repo_id="masahiro-negishi/xtalmet",
	filename="mp20/model/adit.pkl.gz",
	repo_type="dataset",
	revision=HF_VERSION,
)

# NOTE(security): pickle.load can execute arbitrary code while unpickling, so only
# load archives from a source you trust.
# Only the first 10 crystals are kept for this tutorial.
with gzip.open(archive_path, "rb") as archive:
	gen_xtals = pickle.load(archive)[:10]

In [13]:
# Prepare an evaluator for the generated crystals. This single instance is reused
# by every uniqueness and novelty call in the rest of the notebook.
evaluator = Evaluator(gen_xtals)

In [14]:
# Evaluate uniqueness
# The metric name is stored in `distance_name` rather than `distance` so that the
# `distance` function imported from xtalmet.distance is not overwritten. Rebinding
# `distance` to a string would make every distance-calculation cell above fail with
# "'str' object is not callable" when re-run.
distance_name = "amd"  # Options: "smat", "comp", "wyckoff", "magpie", "pdd", "amd"
normalize = True
validity = None  # Options: None, ["smact"], ["structure"], ["smact", "structure"]
stability = None  # Options: None, "binary", "continuous"
uni, uni_times = evaluator.uniqueness(
	distance=distance_name,
	normalize=normalize,
	validity=validity,
	stability=stability,
	dir_intermediate_gen=None,
	multiprocessing=False,
	return_time=True,
)
# Note: If you calculate uniqueness using the same distance metric several times (e.g.,
# when trying out different validity screening or stability criteria), we strongly
# recommend specifying "dir_intermediate_gen". The distance matrix computed in the
# first run is then saved in that directory and reused thereafter, significantly
# reducing the computation cost.
# When "normalize" is set to True, the distance value is normalized to lie between 0
# and 1 using d' = d / (1 + d), where d is the original distance value. This argument
# is only considered when d is an unnormalized continuous distance metric listed in
# CONTINUOUS_UNNORMALIZED_DISTANCES in constants.py. To keep the final uniqueness score
# in [0, 1], we recommend setting this argument to True. Default is True.

print(f"Uniqueness: {uni:.4f}")
print(f"Computation times (s): {uni_times}")

Uniqueness: 0.5160
Computation times (s): {'uni_emb': 0.0027086734771728516, 'uni_d_mtx': 0.0011179447174072266, 'uni_metric': 4.863739013671875e-05, 'uni_total': 0.003875255584716797}


In [15]:
# Evaluate novelty
# The metric name is stored in `distance_name` rather than `distance` so that the
# `distance` function imported from xtalmet.distance is not overwritten. Rebinding
# `distance` to a string would make every distance-calculation cell above fail with
# "'str' object is not callable" when re-run.
train_xtals = "mp20"  # Options: "mp20", list[xtalmet.crystal.Crystal], list[pymatgen.core.Structure]
distance_name = "smat"  # Options: "smat", "comp", "wyckoff", "magpie", "pdd", "amd"
validity = None  # Options: None, ["smact"], ["structure"], ["smact", "structure"]
stability = None  # Options: None, "binary", "continuous"
nov, nov_times = evaluator.novelty(
	train_xtals=train_xtals,
	distance=distance_name,
	validity=validity,
	stability=stability,
	dir_intermediate_gen=None,
	multiprocessing=True,
	n_processes=10,
	return_time=True,
)
# Note: Again, if you calculate novelty using the same distance metric several times, we
# strongly recommend specifying "dir_intermediate_gen".
# Multiprocessing is recommended especially for the "smat" distance to speed up the
# computation.
# "normalize" is not specified here because "smat" is not an unnormalized continuous
# distance metric. You can specify any value for "normalize", but it will be ignored.

print(f"Novelty: {nov:.4f}")
print(f"Computation times (s): {nov_times}")


Novelty: 0.6000
Computation times (s): {'nov_emb_train': 0.0, 'nov_emb_gen': 0.0, 'nov_d_mtx': 29.793761491775513, 'nov_metric': 0.0003662109375, 'nov_total': 29.794127702713013}


### More details on uniqueness and novelty evaluation
For the `uniqueness` and `novelty` methods, you can pass additional arguments via `kwargs`. `kwargs` can have four keys: `"args_emb"`, `"args_dist"`, `"args_validity"`, and `"args_stability"`. Below, we explain which arguments you can specify under each key. The possible arguments are the same for both `uniqueness` and `novelty`.

#### args_emb and args_dist
These two dictionaries are used to pass arguments to the computation of embeddings and the distance matrix between embeddings, respectively. Depending on the distance function used, you can specify different arguments.

In [16]:
# "smat" distance

# The "smat" distance does not allow embeddings to be pre-computed, so there is no
# kwargs["args_emb"] for it.

# kwargs["args_dist"] is passed to pymatgen's StructureMatcher. See
# https://pymatgen.org/pymatgen.analysis.html#pymatgen.analysis.structure_matcher.StructureMatcher
# for the complete list of arguments you can specify.

args_dist = {"ltol": 0.2, "stol": 0.3, "angle_tol": 5}
kwargs = {"args_dist": args_dist}

uni, uni_times = evaluator.uniqueness(
	distance="smat",
	validity=None,
	stability=None,
	dir_intermediate_gen=None,
	multiprocessing=False,
	return_time=True,
	**kwargs,
)

# Report the score and the timing breakdown, one per line.
print(
	f"Uniqueness: {uni:.4f}",
	f"Computation times (s): {uni_times}",
	sep="\n",
)

Uniqueness: 1.0000
Computation times (s): {'uni_emb': 0.0, 'uni_d_mtx': 0.06013941764831543, 'uni_metric': 9.846687316894531e-05, 'uni_total': 0.060237884521484375}


In [17]:
# For the "comp", "wyckoff", and "magpie" distances, neither kwargs["args_emb"] nor
# kwargs["args_dist"] exists.

In [18]:
# "pdd" distance

# kwargs["args_emb"] will be passed to amd.PDD. See
# https://average-minimum-distance.readthedocs.io/en/latest/Using_PDDs.html#calculation-options
# for the complete list of arguments you can specify.

# kwargs["args_dist"] will be passed to amd.PDD_pdist when evaluating uniqueness and to
# amd.PDD_cdist when evaluating novelty. Both functions share the same arguments. See
# https://average-minimum-distance.readthedocs.io/en/latest/Using_PDDs.html#comparison-options-and-multiprocessing
# for the complete list of arguments you can specify.

kwargs = {
	"args_emb": {"k": 100},
	"args_dist": {
		# The metric is spelled out in full ("chebyshev") to match the other cells of
		# this notebook instead of relying on the abbreviated alias "cheb".
		"metric": "chebyshev",
		"backend": "multiprocessing",
		# If n_jobs is not specified, and if multiprocessing=True is given when calling
		# the uniqueness or novelty methods, then n_jobs will be set using the
		# _set_n_processes function in the xtalmet/distance.py file.
		"n_jobs": None,
		"verbose": False,
	},
}

uni, uni_times = evaluator.uniqueness(
	distance="pdd",
	validity=None,
	stability=None,
	dir_intermediate_gen=None,
	multiprocessing=False,
	return_time=True,
	**kwargs,
)

print(f"Uniqueness: {uni:.4f}")
print(f"Computation times (s): {uni_times}")

Uniqueness: 0.5374
Computation times (s): {'uni_emb': 0.0031206607818603516, 'uni_d_mtx': 0.03785562515258789, 'uni_metric': 2.5510787963867188e-05, 'uni_total': 0.04100179672241211}


In [19]:
# "amd" distance

# kwargs["args_emb"] is passed to amd.AMD; the integer k can be specified. See
# https://average-minimum-distance.readthedocs.io/en/latest/amd.calculate.html#amd.calculate.AMD
# for the details.

# kwargs["args_dist"] is passed to amd.AMD_pdist when evaluating uniqueness and to
# amd.AMD_cdist when evaluating novelty. Both functions share the same arguments. See
# https://average-minimum-distance.readthedocs.io/en/latest/Using_AMDs.html#comparison-options
# for the complete list of arguments you can specify.

args_emb = {"k": 100}
args_dist = {"metric": "chebyshev", "low_memory": False}
kwargs = {"args_emb": args_emb, "args_dist": args_dist}

uni, uni_times = evaluator.uniqueness(
	distance="amd",
	validity=None,
	stability=None,
	dir_intermediate_gen=None,
	multiprocessing=False,
	return_time=True,
	**kwargs,
)

# Report the score and the timing breakdown, one per line.
print(
	f"Uniqueness: {uni:.4f}",
	f"Computation times (s): {uni_times}",
	sep="\n",
)

Uniqueness: 0.5160
Computation times (s): {'uni_emb': 0.0027959346771240234, 'uni_d_mtx': 0.001020669937133789, 'uni_metric': 2.09808349609375e-05, 'uni_total': 0.00383758544921875}


#### args_validity
This dictionary is used to pass arguments to the validity screening functions in xtalmet/validity.py. The possible arguments depend on the validity screening methods you choose.

In [20]:
# Two validity screening functions are currently available: "validity_smact" and
# "validity_structure". The former has no user-specifiable parameters, while the
# latter has two:
#   - "threshold_distance" (float): the minimum allowed distance between atoms
#     (default 0.5 [Å]).
#   - "threshold_volume" (float): the minimum allowed volume of the unit cell
#     (default 0.1 [Å³]).

structure_args = {
	"threshold_distance": 0.5,
	"threshold_volume": 0.1,
}
kwargs = {"args_validity": {"structure": structure_args}}

uni, uni_times = evaluator.uniqueness(
	distance="amd",
	validity=["structure"],
	stability=None,
	dir_intermediate_gen=None,
	multiprocessing=False,
	return_time=True,
	**kwargs,
)

# Report the score and the timing breakdown, one per line.
print(
	f"Uniqueness: {uni:.4f}",
	f"Computation times (s): {uni_times}",
	sep="\n",
)

Uniqueness: 0.5160
Computation times (s): {'uni_emb': 0.0024302005767822266, 'uni_d_mtx': 0.0007011890411376953, 'uni_metric': 1.6450881958007812e-05, 'uni_total': 0.0031478404998779297}


#### args_stability
This dictionary is used to pass arguments to the stability evaluation function (compute_stability_scores function in xtalmet/stability.py). The possible arguments depend on the type of stability evaluation chosen (binary or continuous).

In [21]:
# binary stability

# kwargs["args_stability"] can have "diagram", "mace_model", and "threshold" arguments.

# "diagram" (Literal["mp_250618", "mp"] | PatchedPhaseDiagram | str) is the phase
# diagram to use. If "mp_250618" is specified, the diagram constructed using the
# compute_ehull function from the MP entries on June 18, 2025, will be used. If "mp" is
# specified, the diagram will be constructed on the spot. You can also pass your own
# diagram or a path to it. If the pre-computed results (ehull.pkl.gz) exist in
# dir_intermediate, this argument will be ignored. Default is "mp_250618".

# "mace_model" (str) is the MACE model to use for energy prediction. See
# https://github.com/ACEsuit/mace-foundations/releases for available models. Default is
# "mace-mh-1".

# "threshold" (float) is the energy threshold for binary stability evaluation. Crystals
# whose energy above hull exceeds this threshold are considered unstable and are
# ignored in the uniqueness or novelty evaluation. The default threshold is
# 0.1 eV/atom. See the docstrings of xtalmet.evaluator.Evaluator.uniqueness and novelty
# for how the binary stability works.

args_stability = {
	"diagram": "mp_250618",
	"mace_model": "mace-mh-1",
	"threshold": 0.1,
}
kwargs = {"args_stability": args_stability}

uni, uni_times = evaluator.uniqueness(
	distance="smat",
	validity=None,
	stability="binary",
	dir_intermediate_gen=None,
	multiprocessing=True,
	n_processes=5,
	return_time=True,
	**kwargs,
)

# Report the score and the timing breakdown, one per line.
print(
	f"Uniqueness: {uni:.4f}",
	f"Computation times (s): {uni_times}",
	sep="\n",
)

Using Materials Project MACE for MACECalculator with ~/.cache/mace/macemh1model
Using float64 for MACECalculator, which is slower but more accurate. Recommended for geometry optimization.




Using head omat_pbe out of ['matpes_r2scan', 'mp_pbe_refit_add', 'spice_wB97M', 'oc20_usemppbe', 'omol', 'omat_pbe']




Uniqueness: 0.0000
Computation times (s): {'uni_emb': 0.0, 'uni_d_mtx': 0.13453960418701172, 'uni_metric': 0.0001728534698486328, 'uni_total': 0.13471245765686035}


In [22]:
# continuous stability

# kwargs["args_stability"] can have "diagram", "mace_model", and "intercept" arguments.

# "diagram" and "mace_model" have the same meanings as in the binary stability case.

# "intercept" (float) sets the x-axis intercept of the stability score, which
# determines how much a crystal contributes to the uniqueness or novelty score. The
# stability score is calculated as 1 - (energy above hull) / intercept, clipped between
# 0 and 1. The default intercept is 1.215 eV/atom, which is the 99th percentile of the
# energy above hull values for the MP data with theoretical=False. See the docstrings
# of xtalmet.evaluator.Evaluator.uniqueness and novelty for how the continuous
# stability works.

args_stability = {
	"diagram": "mp_250618",
	"mace_model": "mace-mh-1",
	"intercept": 1.215,
}
kwargs = {"args_stability": args_stability}

uni, uni_times = evaluator.uniqueness(
	distance="amd",
	validity=None,
	stability="continuous",
	dir_intermediate_gen=None,
	multiprocessing=False,
	return_time=True,
	**kwargs,
)

# Report the score and the timing breakdown, one per line.
print(
	f"Uniqueness: {uni:.4f}",
	f"Computation times (s): {uni_times}",
	sep="\n",
)

Using Materials Project MACE for MACECalculator with ~/.cache/mace/macemh1model
Using float64 for MACECalculator, which is slower but more accurate. Recommended for geometry optimization.
Using head omat_pbe out of ['matpes_r2scan', 'mp_pbe_refit_add', 'spice_wB97M', 'oc20_usemppbe', 'omol', 'omat_pbe']




Uniqueness: 0.0707
Computation times (s): {'uni_emb': 0.0034377574920654297, 'uni_d_mtx': 0.0012557506561279297, 'uni_metric': 4.410743713378906e-05, 'uni_total': 0.0047376155853271484}


Please note that you can specify all of "args_emb", "args_dist", "args_validity", and "args_stability" in kwargs when calling either uniqueness or novelty methods.