# Element Similarity

This notebook is used to reproduce the plots shown in the paper.

In [None]:
# Imports
# Embedding.load_data is used below to load each named representation;
# data_directory is used below to locate the ordered element-symbols file.
from AtomicEmbeddings.core import Embedding, data_directory
# Draws the grids of pairwise-distance heatmaps in the plotting cells.
from AtomicEmbeddings.plotter import multi_heatmap_plotter
import pandas as pd
import os
import seaborn as sns

# Global seaborn styling: scale font sizes by 1.5 for the plots that follow.
sns.set(font_scale=1.5)

## Introduction

Let's set up the `Embedding` objects and load the data.

In [None]:
# Names of the elemental representations (CBFVs) to load
cbfvs = [
    "magpie",
    "mat2vec",
    "megnet16",
    "oliynyk",
    "mod_petti",
    "random_200",
    "skipatom",
]

# Map each representation name to the object returned by Embedding.load_data,
# preserving the order of `cbfvs`
element_embedddings = dict(zip(cbfvs, map(Embedding.load_data, cbfvs)))

We can reproduce some of the information in Table I of the paper by running the following code:

In [None]:
# Tabulate the dimensionality (`.dim`) of every loaded CBFV

# Each value is wrapped in a one-element list so that
# from_dict(orient="index") produces one row per CBFV with a single column.
element_embedddings_dim = dict(
    (name, [element_embedddings[name].dim]) for name in cbfvs
)

dim_df = pd.DataFrame.from_dict(
    data=element_embedddings_dim,
    orient="index",
    columns=["dimension"],
)
print(dim_df)

## II.B Similarity measures

Let's select the four embeddings used in our analysis and restrict each of them to the first 83 elements.

In [None]:
# Get our four embeddings to compare
cbfvs_to_keep = ["magpie", "mat2vec", "megnet16", "random_200"]
# NOTE: no copy is made here -- these are the same objects stored in
# `element_embedddings`, so the deletions below are visible through both dicts.
element_vectors = {cbfv: element_embedddings[cbfv] for cbfv in cbfvs_to_keep}

# Number of leading entries of the ordered symbols file to keep
N_ELEMENTS_TO_KEEP = 83

# Read the ordered element symbols, one symbol per line
# (presumably ordered by atomic number -- confirm against the data file)
symbols_path = os.path.join(data_directory, "element_data", "ordered_periodic.txt")
with open(symbols_path, encoding="utf-8") as f:
    symbols = f.read().splitlines()

# Keep only the first N_ELEMENTS_TO_KEEP symbols
symbols = symbols[:N_ELEMENTS_TO_KEEP]

# The set of symbols to keep is identical for every embedding, so build it
# once instead of on every loop iteration
el_symbols_set = set(symbols)

for cbfv in cbfvs_to_keep:
    # Element symbols currently present in this embedding
    elements = set(element_vectors[cbfv].element_list)

    # Symbols present in the embedding but not among the ones we keep
    els_to_remove = list(elements - el_symbols_set)

    # Delete those elements from the embedding (in place)
    for el in els_to_remove:
        del element_vectors[cbfv].embeddings[el]

    # Report how many elements remain; this should equal N_ELEMENTS_TO_KEEP
    # when the embedding covers every retained symbol
    # (assumes `element_list` reflects the current `embeddings` -- confirm)
    print(len(element_vectors[cbfv].element_list))

### Euclidean distances


\begin{equation}
d_E(\textbf{A,B}) = 
\sqrt{
(A_1 - B_1)^2 
+ \cdots
+ (A_n - B_n)^2 }
\end{equation}

We can use the Euclidean distance to compare the similarity of two elements. The following code plots, for each embedding, a heatmap of the pairwise Euclidean distances between all elements in the embedding space.

In [None]:
# Pairwise Euclidean-distance heatmaps for the selected embeddings
multi_heatmap_plotter(
    element_vectors.values(),
    # Distance measure and axis ordering
    metric="euclidean",
    sortaxisby="atomic_number",
    # Figure layout: a 2 x 2 grid (there are four embeddings in element_vectors)
    figsize=(10, 10),
    nrows=2,
    ncols=2,
    # Display options
    show_plot=True,
    show_axislabels=False,
    # Presumably the file the figure is written to -- confirm in the plotter
    filename="1_euclidean.pdf",
)

### Manhattan distances

\begin{equation}
d_M(\textbf{A,B}) = 
\sum_{i=1}^n |A_i - B_i|
\end{equation}

We can use the Manhattan distance to compare the similarity of two elements. The following code plots, for each embedding, a heatmap of the pairwise Manhattan distances between all elements in the embedding space.


In [None]:
# Pairwise Manhattan-distance heatmaps for the selected embeddings
multi_heatmap_plotter(
    element_vectors.values(),
    # Distance measure and axis ordering
    metric="manhattan",
    sortaxisby="atomic_number",
    # Figure layout: a 2 x 2 grid (there are four embeddings in element_vectors)
    figsize=(10, 10),
    nrows=2,
    ncols=2,
    # Display options
    show_plot=True,
    show_axislabels=False,
    # Presumably the file the figure is written to -- confirm in the plotter
    filename="2_manhattan.pdf",
)

In [None]:
# Pairwise Manhattan-distance heatmaps for the selected embeddings.
# NOTE(review): this repeats the Manhattan heatmap call that passes
# filename="2_manhattan.pdf", except that no filename is given here
# (presumably display-only) -- confirm whether this cell is still needed.
multi_heatmap_plotter(
    element_vectors.values(),
    # Distance measure and axis ordering
    metric="manhattan",
    sortaxisby="atomic_number",
    # Figure layout: a 2 x 2 grid (there are four embeddings in element_vectors)
    figsize=(10, 10),
    nrows=2,
    ncols=2,
    # Display options
    show_plot=True,
    show_axislabels=False,
)