In [1]:
from typing import Tuple
import pandas as pd
from tqdm.notebook import tqdm
import sys
sys.path.append('..')
from evaluation.generated_dataset import GeneratedDataset, load_all_from_config
from evaluation.novelty import NoveltyFilter, filter_by_unique_structure
from evaluation.statistical_evaluator import StatisticalEvaluator

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [2]:
dataset_names = ("mp_20", "perov_5", "carbon_24")

In [3]:
config_names = {
    #"WyFormer_CrySPR": ("WyckoffTransformer", "CrySPR", "CHGNet_fix"),
    #"WyForDiffCSP++": ("WyckoffTransformer", "DiffCSP++"),
    "MiAD": ("MiAD",),
    "DiffCSP": ("DiffCSP",)
}

In [4]:
all_datasets = {}
for dataset_name in dataset_names:
    all_datasets[dataset_name] = load_all_from_config(
        datasets=list(config_names.values()) + \
            [("split", "train"), ("split", "val"), ("split", "test")],
        dataset_name=dataset_name)

In [None]:
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen.core.composition",
                        message=r"No Pauling electronegativity for.*")
cdvae_datasets = {"mp_20": "mp20", "perov_5": "perovskite", "carbon_24": "carbon"}
results = defaultdict(lambda: defaultdict(dict))

for dataset_name, these_dataset in all_datasets.items():
    print(f"Processing dataset: {dataset_name}")
    novelty_reference = these_dataset[('split', 'train')].data
    novelty_filter = NoveltyFilter(novelty_reference, reference_index_type="reduced_composition")
    test_evaluator = StatisticalEvaluator(these_dataset[('split', 'test')].data, cdvae_eval_model_name=cdvae_datasets[dataset_name])
    for name, transformations in tqdm(config_names.items()):
        dataset = these_dataset[transformations]
        results[dataset_name]["no_filter"][name] = \
            test_evaluator.compute_cdvae_metrics(dataset.data, novelty_filter=None, sample_size_for_precision=500)
        if dataset_name == "carbon_24":
            # It's super slow, as we have to compare each structure with all others
            continue
        results[dataset_name]["only_novel"][name] = \
            test_evaluator.compute_cdvae_metrics(
                dataset.data, novelty_filter=novelty_filter, sample_size_for_precision=500, compute_novelty=True)

Processing dataset: mp_20


  0%|          | 0/2 [00:00<?, ?it/s]

Ignoring 2 generated samples without composition fingerprints.
Ignoring 2 generated samples without composition fingerprints.


Processing dataset: perov_5


  0%|          | 0/2 [00:00<?, ?it/s]

Ignoring 4 generated samples without composition fingerprints.
Ignoring 1 generated samples without composition fingerprints.
Ignoring 4 generated samples without composition fingerprints.
Ignoring 8 generated samples without composition fingerprints.
Ignoring 8 generated samples without composition fingerprints.


Processing dataset: carbon_24


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
for dataset_name, dataset_results in results.items():
    print(f"Results for {dataset_name}:")
    for filer_name, metrics in dataset_results.items():
        print(f"  {filer_name}:")    
        dataset_results_df = pd.DataFrame(metrics).T
        dataset_results_df.index.name = "Model"
        dataset_results_df.reset_index(inplace=True)
        display(dataset_results_df)

Results for mp_20:
  no_filter:


Unnamed: 0,Model,Compositional,Structural,Recall,Precision,EMD_rho,EMD_E,EMD_Elements
0,MiAD,84.863946,99.2497,99.859916,91.664824,0.326899,0.041995,0.019799
1,DiffCSP,83.22,100.0,99.76,93.743091,0.351324,0.094645,0.346054


  only_novel:


Unnamed: 0,Model,Compositional,Structural,Recall,Precision,EMD_rho,EMD_E,EMD_Elements
0,MiAD,81.75657,98.962656,99.820144,91.664824,0.324488,0.08229,0.021135
1,DiffCSP,82.136932,100.0,99.734543,92.582357,0.433102,0.077322,0.38208


Results for perov_5:
  no_filter:


Unnamed: 0,Model,Compositional,Structural,Recall,Precision,EMD_rho,EMD_E,EMD_Elements
0,MiAD,98.309562,94.823032,92.913802,73.606341,0.104099,,0.073767
1,DiffCSP,98.75,99.99,98.248599,70.937913,0.077988,,0.035128


  only_novel:


Unnamed: 0,Model,Compositional,Structural,Recall,Precision,EMD_rho,EMD_E,EMD_Elements
0,MiAD,98.147221,90.185278,88.258906,75.138705,0.098551,,0.08406
1,DiffCSP,98.721921,99.980635,98.312645,72.892999,0.053555,,0.059145


Results for carbon_24:
  no_filter:


Unnamed: 0,Model,Compositional,Structural,Recall,Precision,EMD_rho,EMD_E,EMD_Elements
0,MiAD,100.0,99.852217,99.458128,98.916256,0.060508,0.034357,0.0
1,DiffCSP,100.0,100.0,98.41,99.507389,0.076489,0.044992,0.0


In [7]:
import matplotlib.pyplot as plt
carbon_24 = pd.concat([
    all_datasets[('split', 'train')].data,
    all_datasets[('split', 'val')].data,
    all_datasets[('split', 'test')].data], axis=0, verify_integrity=True)
fig, ax = plt.subplots()
all_datasets[('WyckoffTransformer', 'CrySPR', 'CHGNet_fix')].data.density.hist(
    ax=ax, bins=100, alpha=0.5, label="WyFormer", density=True)
carbon_24.density.hist(ax=ax, bins=100, alpha=0.5, label="Data", density=True)
ax.set_xlabel("Density")
ax.set_ylabel("Histogram Density")
ax.legend();

KeyError: ('split', 'train')