# Tutorial on xtalmet package
This notebook provides a tutorial on the two primary features of the `xtalmet` package:
- Calculating distances between crystals
- Evaluating the uniqueness and novelty of a given set of generated crystals.

### Setup

In [None]:
import gzip
import pickle

from huggingface_hub import hf_hub_download
from pymatgen.core import Lattice, Structure

from xtalmet.constants import HF_VERSION
from xtalmet.distance import distance
from xtalmet.evaluator import Evaluator

### Distance calculation

In [2]:
# Load the four reference crystals from their CIF files.
wzZnO = Structure.from_file("data/wz-ZnO.cif")
rsZnO = Structure.from_file("data/rs-ZnO.cif")
wzGaN = Structure.from_file("data/wz-GaN.cif")
Bi2Te3 = Structure.from_file("data/Bi2Te3.cif")

# The fifth crystal is a hand-built 2x2x2 supercell of wz-ZnO: every site is
# copied into each of the eight sub-cells and its fractional coordinates are
# rescaled to the doubled lattice.
cell_shifts = [(i, j, k) for i in range(2) for j in range(2) for k in range(2)]
frac_coords_super = [
	[(frac_coord[axis] + shift[axis]) / 2 for axis in range(3)]
	for frac_coord in wzZnO.frac_coords
	for shift in cell_shifts
]
# Same site-major ordering as the coordinates above: each species is repeated
# once per sub-cell.
species_super = [specie for specie in wzZnO.species for _ in cell_shifts]

# Double the cell lengths while keeping the cell angles of the original lattice.
base_lattice = wzZnO.lattice
wzZnO_2x2x2 = Structure(
	lattice=Lattice.from_parameters(
		a=base_lattice.a * 2,
		b=base_lattice.b * 2,
		c=base_lattice.c * 2,
		alpha=base_lattice.alpha,
		beta=base_lattice.beta,
		gamma=base_lattice.gamma,
	),
	species=species_super,
	coords=frac_coords_super,
)



In [3]:
# d_smat: a binary distance based on pymatgen's StructureMatcher.
# Each entry is compared against wz-ZnO, in the order listed.
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_smat(wz-ZnO, {label}) = {distance('smat', wzZnO, other)}")

d_smat(wz-ZnO, wz-ZnO supercell) = 0.0
d_smat(wz-ZnO, rs-ZnO) = 1.0
d_smat(wz-ZnO, wz-GaN) = 1.0
d_smat(wz-ZnO, Bi2Te3) = 1.0


In [4]:
# Arguments for StructureMatcher can optionally be supplied via kwargs["args_dist"].
# ref: https://pymatgen.org/pymatgen.analysis.html#pymatgen.analysis.structure_matcher.StructureMatcher
kwargs = {"args_dist": {"ltol": 0.3, "stol": 0.5, "angle_tol": 6}}
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_smat(wz-ZnO, {label}) = {distance('smat', wzZnO, other, **kwargs)}")

d_smat(wz-ZnO, wz-ZnO supercell) = 0.0
d_smat(wz-ZnO, rs-ZnO) = 1.0
d_smat(wz-ZnO, wz-GaN) = 1.0
d_smat(wz-ZnO, Bi2Te3) = 1.0


In [5]:
# d_comp: a binary distance based on whether the compositions match.
# Each entry is compared against wz-ZnO, in the order listed.
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_comp(wz-ZnO, {label}) = {distance('comp', wzZnO, other)}")

d_comp(wz-ZnO, wz-ZnO supercell) = 0.0
d_comp(wz-ZnO, rs-ZnO) = 0.0
d_comp(wz-ZnO, wz-GaN) = 1.0
d_comp(wz-ZnO, Bi2Te3) = 1.0


In [6]:
# d_wyckoff: a binary distance based on whether the space groups and Wyckoff
# letters match. Each entry is compared against wz-ZnO, in the order listed.
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_wyckoff(wz-ZnO, {label}) = {distance('wyckoff', wzZnO, other)}")

d_wyckoff(wz-ZnO, wz-ZnO supercell) = 0.0
d_wyckoff(wz-ZnO, rs-ZnO) = 1.0
d_wyckoff(wz-ZnO, wz-GaN) = 0.0
d_wyckoff(wz-ZnO, Bi2Te3) = 1.0


In [7]:
# d_magpie: a continuous distance computed from compositional Magpie fingerprints.
# Each entry is compared against wz-ZnO, in the order listed.
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_magpie(wz-ZnO, {label}) = {distance('magpie', wzZnO, other)}")

d_magpie(wz-ZnO, wz-ZnO supercell) = 0.0
d_magpie(wz-ZnO, rs-ZnO) = 0.0
d_magpie(wz-ZnO, wz-GaN) = 629.7824086796237
d_magpie(wz-ZnO, Bi2Te3) = 1069.5654971383722


In [8]:
# d_pdd: a continuous structural distance based on the pointwise distance
# distribution. Each entry is compared against wz-ZnO, in the order listed.
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_pdd(wz-ZnO, {label}) = {distance('pdd', wzZnO, other)}")

d_pdd(wz-ZnO, wz-ZnO supercell) = 0.0
d_pdd(wz-ZnO, rs-ZnO) = 1.0416266
d_pdd(wz-ZnO, wz-GaN) = 0.09684124
d_pdd(wz-ZnO, Bi2Te3) = 3.2760717080000004


In [9]:
# Arguments for amd.PDD and amd.PDD_cdist can optionally be supplied:
# kwargs["args_emb"] goes to amd.PDD and kwargs["args_dist"] goes to amd.PDD_cdist.
# ref: https://average-minimum-distance.readthedocs.io/en/stable/Using_PDDs.html
kwargs = {
	"args_emb": {"k": 200},
	"args_dist": {
		"metric": "chebyshev",
		"backend": "multiprocessing",
		"n_jobs": 2,
		"verbose": False,
	},
}
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_pdd(wz-ZnO, {label}) = {distance('pdd', wzZnO, other, **kwargs)}")

d_pdd(wz-ZnO, wz-ZnO supercell) = 0.0
d_pdd(wz-ZnO, rs-ZnO) = 1.0416266
d_pdd(wz-ZnO, wz-GaN) = 0.12082958
d_pdd(wz-ZnO, Bi2Te3) = 3.6986502420000003


In [10]:
# d_amd: a continuous structural distance based on the average minimum distance.
# Each entry is compared against wz-ZnO, in the order listed.
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_amd(wz-ZnO, {label}) = {distance('amd', wzZnO, other)}")

d_amd(wz-ZnO, wz-ZnO supercell) = 4.440892098500626e-15
d_amd(wz-ZnO, rs-ZnO) = 1.0416266067835895
d_amd(wz-ZnO, wz-GaN) = 0.0968412400000016
d_amd(wz-ZnO, Bi2Te3) = 3.239856771319113


In [11]:
# Arguments for amd.AMD and amd.AMD_cdist can optionally be supplied:
# kwargs["args_emb"] goes to amd.AMD and kwargs["args_dist"] goes to amd.AMD_cdist.
# ref: https://average-minimum-distance.readthedocs.io/en/stable/Using_AMDs.html
kwargs = {
	"args_emb": {"k": 200},
	"args_dist": {"metric": "chebyshev", "low_memory": False},
}
others = {
	"wz-ZnO supercell": wzZnO_2x2x2,
	"rs-ZnO": rsZnO,
	"wz-GaN": wzGaN,
	"Bi2Te3": Bi2Te3,
}
for label, other in others.items():
	print(f"d_amd(wz-ZnO, {label}) = {distance('amd', wzZnO, other, **kwargs)}")

d_amd(wz-ZnO, wz-ZnO supercell) = 7.105427357601002e-15
d_amd(wz-ZnO, rs-ZnO) = 1.0416266067835895
d_amd(wz-ZnO, wz-GaN) = 0.1208295822553982
d_amd(wz-ZnO, Bi2Te3) = 3.59863379947711


### Uniqueness and Novelty evaluation

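This section evaluates the uniqueness and novelty of a set of generated crystals with `xtalmet.evaluator.Evaluator`.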

In [12]:
# Load a set of crystals from the xtalmet dataset repository on the Hugging Face Hub.
#
# Alternatively, prepare your own set of crystals (list[xtalmet.crystal.Crystal]
# or list[pymatgen.core.Structure]).
gen_xtals_path = hf_hub_download(
	repo_id="masahiro-negishi/xtalmet",
	repo_type="dataset",
	filename="mp20/model/cdvae.pkl.gz",
	revision=HF_VERSION,
)

# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# unpickle data from a source you trust.
with gzip.open(gen_xtals_path, "rb") as pkl_file:
	all_gen_xtals = pickle.load(pkl_file)

# Keep only the first 100 crystals for this tutorial.
gen_xtals = all_gen_xtals[:100]

In [13]:
# Prepare an evaluator for the generated crystals loaded above. The uniqueness
# and novelty evaluations in the following cells are methods of this object.
evaluator = Evaluator(gen_xtals)

In [14]:
# Evaluate uniqueness
# The metric name is kept in `distance_name` (not `distance`) so that the
# distance() function imported in the setup cell is not overwritten by a string;
# otherwise re-running any of the distance-calculation cells would fail.
distance_name = "amd"  # Options: "smat", "comp", "wyckoff", "magpie", "pdd", "amd"
screen = None  # Options: None, "smact", "ehull"
uni, uni_times = evaluator.uniqueness(
	distance=distance_name, screen=screen, dir_intermediate_gen=None, return_time=True
)
# Note: If you calculate uniqueness using the same distance metric several times (e.g.,
# when trying out different screening methods), we strongly recommend specifying
# dir_intermediate_gen. This means that the distance matrix computed in the first run
# will be saved in the directory and used thereafter, significantly reducing the
# computation cost.

print(f"Uniqueness: {uni:.4f}")
print(f"Computation times (s): {uni_times}")

Uniqueness: 1.2808
Computation times (s): {'uni_emb': 0.02069830894470215, 'uni_d_mtx': 0.00404667854309082, 'uni_metric': 2.1219253540039062e-05, 'uni_total': 0.024766206741333008}


In [15]:
# Evaluate novelty
# The metric name is kept in `distance_name` (not `distance`) so that the
# distance() function imported in the setup cell is not overwritten by a string;
# otherwise re-running any of the distance-calculation cells would fail.
train_xtals = "mp20"  # Options: "mp20", list[xtalmet.crystal.Crystal], list[pymatgen.core.Structure]
distance_name = "amd"  # Options: "smat", "comp", "wyckoff", "magpie", "pdd", "amd"
screen = None  # Options: None, "smact", "ehull"
nov, nov_times = evaluator.novelty(
	train_xtals=train_xtals,
	distance=distance_name,
	screen=screen,
	dir_intermediate_gen=None,
	return_time=True,
)
# Note: Again, if you calculate novelty using the same distance metric several times, we
# strongly recommend specifying dir_intermediate_gen.

print(f"Novelty: {nov:.4f}")
print(f"Computation times (s): {nov_times}")


Novelty: 0.2009
Computation times (s): {'nov_emb_train': 0.0, 'nov_emb_gen': 0.021158933639526367, 'nov_d_mtx': 0.34051036834716797, 'nov_metric': 0.0020437240600585938, 'nov_total': 0.36371302604675293}
