# Supplementary Information for Element Similarity

In [None]:
# Imports
import matplotlib.pyplot as plt
from typing import List, Optional, Tuple
from elementembeddings.core import Embedding, data_directory
from elementembeddings.plotter import dimension_plotter, heatmap_plotter
import pandas as pd
import os
import seaborn as sns

# Seaborn "paper" context with enlarged fonts for every figure in the notebook.
sns.set_context("paper", font_scale=1.5)

# Single seed shared by the dimensionality reducers (passed as `reducer_params`).
random_state = 42
reducer_params = dict(random_state=random_state)

# Keyword arguments handed to `dimension_plotter` as `scatter_params`.
scatter_params = dict(s=80)

In [None]:
# Names of the element representations compared in this notebook
cbfvs = [
    "magpie",
    "matscholar",
    "mat2vec",
    "megnet16",
    "oliynyk",
    "random_200",
    "skipatom",
]

# Load every representation into a dictionary keyed by its name
element_embedddings = {}
for cbfv in cbfvs:
    element_embedddings[cbfv] = Embedding.load_data(cbfv)

# Standardise each embedding in place, printing the flag before and after
for embedding in element_embedddings.values():
    print(f"Attempting to standardise {embedding.embedding_name}...")
    was_standardised = embedding.is_standardised
    print(f" Already standardised: {was_standardised}")
    embedding.standardise(inplace=True)
    print(f"Now standardised: {embedding.is_standardised}")

In [None]:
# Read the ordered list of element symbols shipped with the package data.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
symbols_path = os.path.join(data_directory, "element_data", "ordered_periodic.txt")
with open(symbols_path, encoding="utf-8") as f:
    symbols = f.read().splitlines()

# Keep only the first 83 entries of the ordered list
symbols = symbols[:83]

# Built once: the allowed symbols are the same for every embedding
el_symbols_set = set(symbols)

for cbfv, cbfv_embedding in element_embedddings.items():
    # Symbols currently present in this embedding
    elements = set(cbfv_embedding.element_list)

    # Symbols that are not among the 83 retained ones
    els_to_remove = sorted(elements - el_symbols_set)

    # Delete every element outside the retained list from the embedding
    for el in els_to_remove:
        del cbfv_embedding.embeddings[el]

    # Report how many elements remain; this can be fewer than 83 when the
    # embedding does not provide a vector for every retained symbol
    print(len(cbfv_embedding.element_list))

# Remove Xe and Kr from SkipAtom
# del element_embedddings["skipatom"].embeddings["Xe"]
# del element_embedddings["skipatom"].embeddings["Kr"]

In [None]:
# Elements present in the magpie embedding but absent from skipatom
magpie_elements = set(element_embedddings["magpie"].element_list)
magpie_elements.difference(element_embedddings["skipatom"].element_list)

## Similarity measures


### Euclidean distance


In [None]:
# Euclidean-distance heatmaps: one panel per embedding on a 4x2 grid.
fig, axes = plt.subplots(4, 2, figsize=(20, 20))
panels = axes.flatten()

for ax, cbfv in zip(panels, element_embedddings.values()):
    heatmap_plotter(
        embedding=cbfv,
        metric="euclidean",
        sortaxisby="atomic_number",
        show_axislabels=False,
        ax=ax,
    )
    print(cbfv.embedding_name)

# Remove only the panels that received no heatmap. Unconditionally removing
# the last panel would discard a populated heatmap if this cell is re-run
# after an eighth embedding has been added to the dictionary.
for unused_ax in panels[len(element_embedddings):]:
    unused_ax.remove()

fig.tight_layout()
# NOTE(review): a later cell in this notebook saves to the same filename and
# will overwrite this file - confirm that is intended.
fig.savefig("SI_euclidean.pdf", bbox_inches="tight")
fig.show()

#### Skipatom

From the above plot, we can observe that two element vectors cause anomalous behaviour in the SkipAtom heatmap. We re-plot the SkipAtom heatmap with the axes labelled to determine which elements are responsible.

In [None]:
# One large SkipAtom heatmap with element labels shown on the axes, so the
# individual elements can be read off.
skipatom_heatmap_kwargs = dict(
    metric="euclidean",
    sortaxisby="atomic_number",
    show_axislabels=True,
)

fig, ax = plt.subplots(figsize=(20, 20))
heatmap_plotter(
    embedding=element_embedddings["skipatom"], ax=ax, **skipatom_heatmap_kwargs
)

fig.show()

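Kr and Xe are the two elements responsible for the distorted SkipAtom heatmap. Below, we reload the SkipAtom embedding, remove these two elements, standardise it, and plot the heatmaps again.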

In [None]:
# Build a SkipAtom variant without the two outlier elements.
skipatom_no_nobles = Embedding.load_data("skipatom")

# Drop Xe and Kr before standardising, so they do not take part in it
for el in ["Xe", "Kr"]:
    del skipatom_no_nobles.embeddings[el]
skipatom_no_nobles.standardise(inplace=True)

# Restrict the freshly loaded embedding to the same `symbols` list used for
# the other embeddings. Without this step the variant keeps any elements
# outside that list and is compared on a different element set.
for el in sorted(set(skipatom_no_nobles.element_list) - set(symbols)):
    del skipatom_no_nobles.embeddings[el]

skipatom_no_nobles.embedding_name = "skipatom (Xe,Kr removed)"
element_embedddings["skipatom_no_nobles"] = skipatom_no_nobles

In [None]:
# Labelled Euclidean heatmap for the SkipAtom variant stored under the
# "skipatom_no_nobles" key.
skipatom_variant = element_embedddings["skipatom_no_nobles"]

fig, ax = plt.subplots(figsize=(20, 20))
heatmap_plotter(
    ax=ax,
    embedding=skipatom_variant,
    show_axislabels=True,
    sortaxisby="atomic_number",
    metric="euclidean",
)

fig.show()

In [None]:
# Euclidean-distance heatmaps for every entry currently in the dictionary,
# one panel each on a 4x2 grid.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))

euclidean_kwargs = dict(
    metric="euclidean",
    sortaxisby="atomic_number",
    show_axislabels=False,
)
for ax, cbfv in zip(axes.ravel(), element_embedddings.values()):
    heatmap_plotter(embedding=cbfv, ax=ax, **euclidean_kwargs)
    print(cbfv.embedding_name)

fig.tight_layout()
fig.savefig("SI_euclidean.pdf", bbox_inches="tight")
fig.show()

### Manhattan distance

In [None]:
# Manhattan-distance heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))

manhattan_kwargs = dict(
    metric="manhattan",
    sortaxisby="atomic_number",
    show_axislabels=False,
)
for ax, cbfv in zip(axes.ravel(), element_embedddings.values()):
    heatmap_plotter(embedding=cbfv, ax=ax, **manhattan_kwargs)

fig.tight_layout()
fig.savefig("SI_manhattan.pdf", bbox_inches="tight")
fig.show()

### Chebyshev

In [None]:
# Chebyshev-distance heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
panels = axes.flatten()

for ax, cbfv in zip(panels, element_embedddings.values()):
    heatmap_plotter(
        ax=ax,
        embedding=cbfv,
        metric="chebyshev",
        show_axislabels=False,
        sortaxisby="atomic_number",
    )

fig.tight_layout()
fig.savefig("SI_chebyshev.pdf", bbox_inches="tight")
fig.show()

### Wasserstein distance

In [None]:
# Wasserstein-distance heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))

wasserstein_kwargs = {
    "metric": "wasserstein",
    "sortaxisby": "atomic_number",
    "show_axislabels": False,
}
for ax, cbfv in zip(axes.ravel(), element_embedddings.values()):
    heatmap_plotter(embedding=cbfv, ax=ax, **wasserstein_kwargs)

fig.tight_layout()
fig.savefig("SI_wasserstein.pdf", bbox_inches="tight")
fig.show()

### Cosine distance

In [None]:
# Cosine-distance heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
panels = axes.flatten()

for ax, cbfv in zip(panels, element_embedddings.values()):
    heatmap_plotter(
        ax=ax,
        embedding=cbfv,
        metric="cosine_distance",
        show_axislabels=False,
        sortaxisby="atomic_number",
    )

fig.tight_layout()
fig.savefig("SI_cosdistance.pdf", bbox_inches="tight")
fig.show()

### Pearson correlation

In [None]:
# Pearson-correlation heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))

# vmin/vmax are forwarded unchanged to `heatmap_plotter`
heatmap_params = dict(vmin=-1, vmax=1)
pearson_kwargs = dict(
    metric="pearson",
    cmap="Blues_r",
    sortaxisby="atomic_number",
    show_axislabels=False,
)
for ax, cbfv in zip(axes.ravel(), element_embedddings.values()):
    heatmap_plotter(embedding=cbfv, ax=ax, **pearson_kwargs, **heatmap_params)

fig.tight_layout()
fig.savefig("SI_pearson.pdf", bbox_inches="tight")
fig.show()

### Spearman correlation

In [None]:
# Spearman-correlation heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
panels = axes.flatten()

# vmin/vmax are forwarded unchanged to `heatmap_plotter`
heatmap_params = dict(vmin=-1, vmax=1)
for ax, cbfv in zip(panels, element_embedddings.values()):
    heatmap_plotter(
        ax=ax,
        embedding=cbfv,
        metric="spearman",
        cmap="Blues_r",
        show_axislabels=False,
        sortaxisby="atomic_number",
        **heatmap_params,
    )

fig.tight_layout()
fig.savefig("SI_spearman.pdf", bbox_inches="tight")
fig.show()

### Cosine similarity

In [None]:
# Cosine-similarity heatmaps, one panel per embedding.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))

# vmin/vmax are forwarded unchanged to `heatmap_plotter`
heatmap_params = dict(vmin=-1, vmax=1)
cosine_similarity_kwargs = {
    "metric": "cosine_similarity",
    "cmap": "Blues_r",
    "sortaxisby": "atomic_number",
    "show_axislabels": False,
}
for ax, cbfv in zip(axes.ravel(), element_embedddings.values()):
    heatmap_plotter(
        embedding=cbfv, ax=ax, **cosine_similarity_kwargs, **heatmap_params
    )

fig.tight_layout()
fig.savefig("SI_cosinesimilarity.pdf", bbox_inches="tight")
fig.show()

## Two-dimensional projections

### Principal Component Analysis (PCA)

In [None]:
# Two-dimensional PCA projection of every embedding, one panel each.
fig, axes = plt.subplots(4, 2, figsize=(20, 20))
panels = axes.flatten()

for ax, cbfv in zip(panels, element_embedddings.values()):
    dimension_plotter(
        embedding=cbfv,
        reducer="pca",
        n_components=2,
        ax=ax,
        adjusttext=True,
        reducer_params=reducer_params,
        scatter_params=scatter_params,
    )
    # Per-panel legends are dropped in favour of one figure-level legend
    ax.legend().remove()

# Remove any panel that received no projection, so no empty axes are saved
# when the dictionary holds fewer embeddings than there are panels
for unused_ax in panels[len(element_embedddings):]:
    unused_ax.remove()

# Build the shared legend from the last populated panel
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.54, 1.06), loc="upper center", ncol=5)
fig.tight_layout()
# Save through the figure object rather than the pyplot "current figure"
fig.savefig("SI_pca.pdf", bbox_inches="tight")
fig.show()

### t-distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
# Two-dimensional t-SNE projection of every embedding, one panel each.
# `adjusttext` is not passed for this figure.
fig, axes = plt.subplots(4, 2, figsize=(20, 20))
panels = axes.flatten()

for ax, cbfv in zip(panels, element_embedddings.values()):
    dimension_plotter(
        embedding=cbfv,
        reducer="tsne",
        n_components=2,
        ax=ax,
        reducer_params=reducer_params,
        scatter_params=scatter_params,
    )
    # Per-panel legends are dropped in favour of one figure-level legend
    ax.legend().remove()

# Remove any panel that received no projection, so no empty axes are saved
# when the dictionary holds fewer embeddings than there are panels
for unused_ax in panels[len(element_embedddings):]:
    unused_ax.remove()

# Build the shared legend from the last populated panel
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.54, 1.06), loc="upper center", ncol=5)
fig.tight_layout()
# Save through the figure object rather than the pyplot "current figure"
fig.savefig("SI_tsne.pdf", bbox_inches="tight")
fig.show()

### Uniform Manifold Approximation and Projection (UMAP)

In [None]:
# Two-dimensional UMAP projection of every embedding, one panel each.
fig, axes = plt.subplots(4, 2, figsize=(20, 20))
panels = axes.flatten()

for ax, cbfv in zip(panels, element_embedddings.values()):
    dimension_plotter(
        embedding=cbfv,
        reducer="umap",
        n_components=2,
        ax=ax,
        adjusttext=True,
        reducer_params=reducer_params,
        scatter_params=scatter_params,
    )
    # Per-panel legends are dropped in favour of one figure-level legend
    ax.legend().remove()

# Remove any panel that received no projection, so no empty axes are saved
# when the dictionary holds fewer embeddings than there are panels
for unused_ax in panels[len(element_embedddings):]:
    unused_ax.remove()

# Build the shared legend from the last populated panel
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.54, 1.06), loc="upper center", ncol=5)
fig.tight_layout()
# Save through the figure object rather than the pyplot "current figure"
fig.savefig("SI_umap.pdf", bbox_inches="tight")
fig.show()

## Distribution of similarity measures

### Pearson correlation

In [None]:
# For each embedding, store the data frames returned by `correlation_df`:
# one from the default call and one requested with metric="cosine_similarity".
correlation_metrics = ["pearson", "cosine_similarity"]
correlation_dfs = {}
for rep, rep_embedding in element_embedddings.items():
    pearson_df = rep_embedding.correlation_df()
    cosine_similarity_df = rep_embedding.correlation_df(metric="cosine_similarity")
    correlation_dfs[rep] = dict(
        pearson=pearson_df,
        cosine_similarity=cosine_similarity_df,
    )

In [None]:
# Histogram of the "pearson" column for each embedding, one panel each.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
for ax, cbfv in zip(axes.ravel(), element_embedddings):
    pearson_values = correlation_dfs[cbfv]["pearson"]
    sns.histplot(data=pearson_values, x="pearson", ax=ax)
    # Panels are titled with the dictionary key and share the [-1, 1] x-range
    ax.set(
        title=cbfv,
        xlim=(-1, 1),
        xlabel="Pearson correlation",
        ylabel="Count",
    )

plt.tight_layout()
plt.savefig("SI_pearson_distribution.pdf")

### Cosine similarity

In [None]:
# Histogram of the "cosine_similarity" column for each embedding, one panel each.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
panels = axes.flatten()
for ax, cbfv in zip(panels, element_embedddings):
    cosine_values = correlation_dfs[cbfv]["cosine_similarity"]
    sns.histplot(data=cosine_values, x="cosine_similarity", ax=ax)
    # Panels are titled with the dictionary key and share the [-1, 1] x-range
    ax.set_title(cbfv)
    ax.set_xlim(-1, 1)
    ax.set_xlabel("Cosine similarity")
    ax.set_ylabel("Count")

plt.tight_layout()
plt.savefig("SI_cosine_similarity_distribution.pdf")