# Similarity measures for the atomic representations

This notebook provides the code used to generate the distance and correlation plots for the chemical representations featured in our publication.

TODO: Create a plotting module to handle building subplots of multiple AtomicEmbedding outputs.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sns
import smact
import umap
from matplotlib.ticker import FixedLocator, FormatStrFormatter
from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

from AtomicEmbeddings.core import Embedding

# Global seaborn configuration: every figure drawn below uses font_scale=2.
sns.set(font_scale=2)

## Pre-processing


In [None]:
# Choose the atomic embeddings to analyse and load each of them once.

# Names of the CBFVs we are interested in
cbfvs = [
    "mat2vec",
    "random_200",
    "magpie_sc",
    "megnet16",
]

# Map each CBFV name to the object returned by Embedding.load_data
AtomEmbeds = {name: Embedding.load_data(name) for name in cbfvs}

In [None]:
# Only keep the first 83 elements for the Atomic Embeddings

# For now Smact is used, but a future update will aim to remove this dependency
# NOTE(review): presumably this yields the element symbols for atomic numbers
# 1 to 83 inclusive — confirm against the smact documentation.
el_symbols = smact.ordered_elements(1, 83)

In [None]:
# Drop every element that is not in el_symbols from each loaded embedding.

# The set of symbols to keep is the same for every embedding
allowed_symbols = set(el_symbols)

for cbfv in cbfvs:
    atomic_embedding = AtomEmbeds[cbfv]

    # Symbols present in this embedding but outside the allowed set
    unwanted = set(atomic_embedding.element_list) - allowed_symbols

    # Remove them in place from the embeddings mapping
    for el in unwanted:
        del atomic_embedding.embeddings[el]

    # Report how many elements remain (83 is the expected count)
    print(len(atomic_embedding.element_list))

In [None]:
# Call pearson_pivot_table() on the megnet16 embedding and preview the first rows
p = AtomEmbeds["megnet16"].pearson_pivot_table()
p.head()

In [None]:
# Sanity check: dimensions of the pivot table once converted to a NumPy array
p.to_numpy().shape

In [None]:
# Elements listed for megnet16 that are missing from random_200 (set difference)
m16_els = AtomEmbeds["megnet16"].element_list
r200_els = AtomEmbeds["random_200"].element_list
set(m16_els) - set(r200_els)

In [None]:
# Display the result of correlation_df() for the magpie_sc embedding
AtomEmbeds["magpie_sc"].correlation_df()

## Distance and correlation plots for the high-dimensional representations

In [None]:
# # Let's start generating the plots

# # Plotting pearson correlations

# fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(
#     2,
#     2,
#     sharex="col",
#     sharey="row",
#     figsize=(24, 24),
# )
# axes = [ax1, ax2, ax3, ax4]
# fig.suptitle("Pearson Correlation")
# for i, cbfv in enumerate(cbfvs):
#     df = AtomEmbeds[cbfv]._correlation_df()
#     xlabels = df["ele_1"].tolist()

#     p = AtomEmbeds[cbfv].pearson_pivot_table()
#     g = sns.heatmap(p, cmap="bwr", square="True", linecolor="k", ax=axes[i - 1])
#     axes[i - 1].title.set_text(cbfv)
#     # axes[i-1].set_xticklabels(xlabels)


# fig.tight_layout()
# plt.show()

In [None]:
# # Let's start generating the plots

# # Plotting pearson correlations

# fig, (ax1, ax2) = plt.subplots(1, 2, sharey="row", figsize=(36, 24))
# axes = [ax1, ax2]
# fig.suptitle("Pearson Correlation")
# for i, cbfv in enumerate(cbfvs[:2]):

#     p = AtomEmbeds[cbfv].pearson_pivot_table()
#     xlabels = [i[1] for i in p.index]
#     g = sns.heatmap(p, cmap="bwr", square="True", linecolor="k", ax=axes[i - 1])
#     axes[i - 1].title.set_text(cbfv)
#     axes[i - 1].set_xticklabels(xlabels)


# fig.tight_layout()
# plt.show()

In [None]:
# Pearson-correlation heatmap for the first embedding listed in cbfvs

fig, ax = plt.subplots(figsize=(36, 24))

# fig.suptitle('Pearson Correlation')
for cbfv in cbfvs[:1]:
    p = AtomEmbeds[cbfv].pearson_pivot_table()
    # seaborn draws the columns along x and the index along y, so the tick
    # text for each axis is taken from the matching side of the table.
    # Entry [1] of every label is used as the displayed text.
    xlabels = [label[1] for label in p.columns]
    ylabels = [label[1] for label in p.index]
    sns.heatmap(
        p,
        cmap="bwr",
        square=True,
        linecolor="k",
        ax=ax,
        # True asks seaborn for a tick on every row/column so the explicit
        # label lists below line up one-to-one with the ticks
        xticklabels=True,
        yticklabels=True,
    )
    ax.title.set_text(cbfv)
    ax.set_xticklabels(xlabels, fontsize="medium")
    ax.set_yticklabels(ylabels)
    # ax.xaxis.set_major_locator(FixedLocator())
    ax.set_xlabel("")
    ax.set_ylabel("")


fig.tight_layout()
# plt.savefig('mat2vecpearson.svg')
plt.show()

In [None]:
def heatmap_plotter(
    embedding,
    metric=False,
    distance=True,
    correlation=False,
    figsize=(36, 24),
    filename=False,
):
    """Plot a heatmap of a pivot table for one entry of ``AtomEmbeds``.

    Args:
        embedding: Key into the module-level ``AtomEmbeds`` dictionary; also
            used as the plot title.
        metric: Forwarded unchanged as ``metric=`` to
            ``distance_pivot_table`` when a distance plot is drawn. Ignored
            for correlation plots.
        distance: If true (and ``correlation`` is false), plot the table
            returned by ``distance_pivot_table``.
        correlation: If true, plot the table returned by
            ``pearson_pivot_table``. Takes precedence over ``distance``.
        figsize: Figure size passed to ``plt.subplots``.
        filename: If truthy, the figure is also saved to
            ``"plots/" + filename`` before being shown.

    Raises:
        ValueError: If both ``correlation`` and ``distance`` are false, since
            there is then no table to plot.
    """
    # Validate first so that a bad call does not leave an empty figure behind
    if not (correlation or distance):
        raise ValueError(
            "heatmap_plotter needs correlation=True or distance=True"
        )

    atomic_embedding = AtomEmbeds[embedding]
    if correlation:
        p = atomic_embedding.pearson_pivot_table()
    else:
        p = atomic_embedding.distance_pivot_table(metric=metric)

    # seaborn draws the columns along x and the index along y; entry [1] of
    # every label is used as the displayed tick text.
    xlabels = [label[1] for label in p.columns]
    ylabels = [label[1] for label in p.index]

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        p,
        cmap="bwr",
        square=True,
        linecolor="k",
        ax=ax,
        # One tick per row/column so the explicit labels below line up
        xticklabels=True,
        yticklabels=True,
    )
    ax.title.set_text(embedding)
    ax.set_xticklabels(
        xlabels,
        # fontsize='medium'
    )
    ax.set_yticklabels(ylabels)
    # ax.xaxis.set_major_locator(FixedLocator())
    ax.set_xlabel("")
    ax.set_ylabel("")

    fig.tight_layout()
    if filename:
        plt.savefig("plots/" + filename)
    plt.show()

In [None]:
# Pearson-correlation heatmap for every embedding in cbfvs
# (pass filename=f"{name}pearson.svg" to also save each figure)
for name in cbfvs:
    heatmap_plotter(name, correlation=True)

In [None]:
# Euclidean-distance heatmap for every embedding in cbfvs
# (pass filename=f"{name}_euclid.svg" to also save each figure)
for name in cbfvs:
    heatmap_plotter(name, metric="euclidean", distance=True, correlation=False)

In [None]:
# Manhattan-distance heatmap for every embedding in cbfvs
# (pass filename=f"{name}_manhattan.svg" to also save each figure)
for name in cbfvs:
    heatmap_plotter(name, metric="manhattan", distance=True, correlation=False)

In [None]:
# Chebyshev-distance heatmap for every embedding in cbfvs
# (pass filename=f"{name}_chebyshev.svg" to also save each figure)
for name in cbfvs:
    heatmap_plotter(name, metric="chebyshev", distance=True, correlation=False)

In [None]:
# Euclidean distance heatmaps: one panel per embedding, in the order of cbfvs

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(24, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("Euclidean distance")
for i, cbfv in enumerate(cbfvs):
    # enumerate starts at 0, so index with i; i - 1 would wrap the first
    # embedding round to the last panel and shift all the others.
    panel = axes[i]
    p = AtomEmbeds[cbfv].distance_pivot_table(metric="euclidean")
    sns.heatmap(p, cmap="bwr", square=True, linecolor="k", ax=panel)
    panel.title.set_text(cbfv)


fig.tight_layout()
plt.show()

In [None]:
# Manhattan distance heatmaps: one panel per embedding, in the order of cbfvs

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(24, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("Manhattan distance")
for i, cbfv in enumerate(cbfvs):
    # enumerate starts at 0, so index with i; i - 1 would wrap the first
    # embedding round to the last panel and shift all the others.
    panel = axes[i]
    p = AtomEmbeds[cbfv].distance_pivot_table(metric="manhattan")
    sns.heatmap(p, cmap="bwr", square=True, linecolor="k", ax=panel)
    panel.title.set_text(cbfv)


fig.tight_layout()
plt.show()

In [None]:
# Chebyshev distance heatmaps: one panel per embedding, in the order of cbfvs

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(24, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("Chebyshev distance")
for i, cbfv in enumerate(cbfvs):
    # enumerate starts at 0, so index with i; i - 1 would wrap the first
    # embedding round to the last panel and shift all the others.
    panel = axes[i]
    p = AtomEmbeds[cbfv].distance_pivot_table(metric="chebyshev")
    sns.heatmap(p, cmap="bwr", square=True, linecolor="k", ax=panel)
    panel.title.set_text(cbfv)


# Same layout pass as the other distance grids in this notebook
fig.tight_layout()
plt.show()

In [None]:
# Wasserstein distance heatmaps: one panel per embedding, in the order of cbfvs

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(24, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("Wasserstein distance")
for i, cbfv in enumerate(cbfvs):
    # enumerate starts at 0, so index with i; i - 1 would wrap the first
    # embedding round to the last panel and shift all the others.
    panel = axes[i]
    p = AtomEmbeds[cbfv].distance_pivot_table(metric="wasserstein")
    sns.heatmap(p, cmap="bwr", square=True, linecolor="k", ax=panel)
    panel.title.set_text(cbfv)


fig.tight_layout()
plt.show()

## Clustering analysis (Dimensionality reduction)

In this section we show a clustering analysis of the different chemical representations.

TODO: Add a hue for the group of the elements to show better clustering

In [None]:
# PCA of the raw (unscaled) embeddings: one panel per embedding, in cbfvs order

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(36, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("Principal component analysis")
for i, cbfv in enumerate(cbfvs):
    # enumerate starts at 0, so index with i; i - 1 would wrap the first
    # embedding round to the last panel and shift all the others.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Fit the PCA and project onto the first two components
    pca = decomposition.PCA(n_components=2)
    pca.fit(embeddings_array)
    X = pca.transform(embeddings_array)

    pca_dim1 = X[:, 0]
    pca_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting
    pca_df = pd.DataFrame(
        {
            "pca_dim1": pca_dim1,
            "pca_dim2": pca_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="pca_dim1",
        y="pca_dim2",
        data=pca_df,
        hue="group",
        s=200,
        ax=panel,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point with its element label
    for x_pos, y_pos, symbol in zip(pca_dim1, pca_dim2, element_array):
        panel.text(x=x_pos, y=y_pos, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)


plt.show()

In [None]:
# PCA of the standardised embeddings for the first two entries of cbfvs

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12))
axes = [ax1, ax2]
fig.suptitle("Principal component analysis")
for i, cbfv in enumerate(cbfvs[:2]):
    # enumerate starts at 0, so index with i; i - 1 would put the first
    # embedding on the right-hand panel and the second on the left.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Standardise the features before the projection
    scaler = StandardScaler()
    e_scaled = scaler.fit_transform(embeddings_array)

    # Fit the PCA and project onto the first two components
    pca = decomposition.PCA(n_components=2)
    X = pca.fit_transform(e_scaled)

    pca_dim1 = X[:, 0]
    pca_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting
    pca_df = pd.DataFrame(
        {
            "pca_dim1": pca_dim1,
            "pca_dim2": pca_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="pca_dim1",
        y="pca_dim2",
        data=pca_df,
        hue="group",
        s=300,
        ax=panel,
        legend=False,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(pca_dim1, pca_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)

# The relative "plots/" directory is expected to exist already
plt.savefig("plots/PCA_1_scaled.svg")

plt.show()

In [None]:
# PCA of the standardised embeddings for the entries of cbfvs from index 2 on

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12))
axes = [ax1, ax2]
fig.suptitle("Principal component analysis")
for i, cbfv in enumerate(cbfvs[2:]):
    # enumerate starts at 0, so index with i; i - 1 would put the first
    # embedding on the right-hand panel and the second on the left.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Standardise the features before the projection
    scaler = StandardScaler()
    e_scaled = scaler.fit_transform(embeddings_array)

    # Fit the PCA and project onto the first two components
    pca = decomposition.PCA(n_components=2)
    X = pca.fit_transform(e_scaled)

    pca_dim1 = X[:, 0]
    pca_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting
    pca_df = pd.DataFrame(
        {
            "pca_dim1": pca_dim1,
            "pca_dim2": pca_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="pca_dim1",
        y="pca_dim2",
        data=pca_df,
        hue="group",
        s=300,
        ax=panel,
        legend=False,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(pca_dim1, pca_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)

# The relative "plots/" directory is expected to exist already
plt.savefig("plots/PCA_2_scaled.svg")

plt.show()

In [None]:
# t-SNE of the raw (unscaled) embeddings: one panel per embedding, in cbfvs order

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(24, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("t-SNE")
for i, cbfv in enumerate(cbfvs):
    # enumerate starts at 0, so index with i; i - 1 would wrap the first
    # embedding round to the last panel and shift all the others.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Run t-SNE down to 2 dimensions; the fixed random_state makes the
    # layout repeatable across Restart & Run All.
    tsne = TSNE(n_components=2, random_state=42)
    X = tsne.fit_transform(embeddings_array)

    tsne_dim1 = X[:, 0]
    tsne_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting
    tsne_df = pd.DataFrame(
        {
            "tsne_dim1": tsne_dim1,
            "tsne_dim2": tsne_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="tsne_dim1", y="tsne_dim2", data=tsne_df, hue="group", s=200, ax=panel
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(tsne_dim1, tsne_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)


plt.show()

In [None]:
# t-SNE of the standardised embeddings for the first two entries of cbfvs

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12))
axes = [ax1, ax2]
fig.suptitle("t-SNE")
for i, cbfv in enumerate(cbfvs[:2]):
    # enumerate starts at 0, so index with i; i - 1 would put the first
    # embedding on the right-hand panel and the second on the left.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Standardise the features before the projection
    scaler = StandardScaler()
    e_scaled = scaler.fit_transform(embeddings_array)

    # Run t-SNE down to 2 dimensions; the fixed random_state makes the
    # layout (and the saved SVG) repeatable across Restart & Run All.
    tsne = TSNE(n_components=2, random_state=42)
    X = tsne.fit_transform(e_scaled)

    tsne_dim1 = X[:, 0]
    tsne_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting
    tsne_df = pd.DataFrame(
        {
            "tsne_dim1": tsne_dim1,
            "tsne_dim2": tsne_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="tsne_dim1",
        y="tsne_dim2",
        data=tsne_df,
        hue="group",
        s=200,
        ax=panel,
        legend=False,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(tsne_dim1, tsne_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)


# The relative "plots/" directory is expected to exist already
plt.savefig(
    "plots/t-SNE_1_scaled.svg",
    # transparent=True
)
plt.show()

In [None]:
# t-SNE of the standardised embeddings for the entries of cbfvs from index 2 on

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12))
axes = [ax1, ax2]
fig.suptitle("t-SNE")
for i, cbfv in enumerate(cbfvs[2:]):
    # enumerate starts at 0, so index with i; i - 1 would put the first
    # embedding on the right-hand panel and the second on the left.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Standardise the features before the projection
    scaler = StandardScaler()
    e_scaled = scaler.fit_transform(embeddings_array)

    # Run t-SNE down to 2 dimensions; the fixed random_state makes the
    # layout (and the saved SVG) repeatable across Restart & Run All.
    tsne = TSNE(n_components=2, random_state=42)
    X = tsne.fit_transform(e_scaled)

    tsne_dim1 = X[:, 0]
    tsne_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting
    tsne_df = pd.DataFrame(
        {
            "tsne_dim1": tsne_dim1,
            "tsne_dim2": tsne_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="tsne_dim1",
        y="tsne_dim2",
        data=tsne_df,
        hue="group",
        s=200,
        ax=panel,
        legend=False,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(tsne_dim1, tsne_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)

# The relative "plots/" directory is expected to exist already
plt.savefig(
    "plots/t-SNE_2_scaled.svg",
    # transparent=True
)

plt.show()

## UMAP

In [None]:
# UMAP of the standardised embeddings for the first two entries of cbfvs

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12))
axes = [ax1, ax2]
fig.suptitle("UMAP projection")
for i, cbfv in enumerate(cbfvs[:2]):
    # enumerate starts at 0, so index with i; i - 1 would put the first
    # embedding on the right-hand panel and the second on the left.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Standardise the features before the projection
    scaler = StandardScaler()
    e_scaled = scaler.fit_transform(embeddings_array)

    # Run UMAP with its default output dimensionality (the first two columns
    # are plotted below); the fixed random_state makes the layout and the
    # saved SVG repeatable across Restart & Run All.
    reducer = umap.UMAP(random_state=42)
    X = reducer.fit_transform(e_scaled)

    umap_dim1 = X[:, 0]
    umap_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting.
    # The "tsne_*" column names are kept from the original cell.
    umap_df = pd.DataFrame(
        {
            "tsne_dim1": umap_dim1,
            "tsne_dim2": umap_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="tsne_dim1",
        y="tsne_dim2",
        data=umap_df,
        hue="group",
        s=200,
        ax=panel,
        legend=False,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(umap_dim1, umap_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)

# The relative "plots/" directory is expected to exist already
plt.savefig(
    "plots/UMAP_1_scaled.svg",
    # transparent=True
)

plt.show()

In [None]:
# UMAP of the standardised embeddings for the entries of cbfvs from index 2 on

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12))
axes = [ax1, ax2]
fig.suptitle("UMAP projection")
for i, cbfv in enumerate(cbfvs[2:]):
    # enumerate starts at 0, so index with i; i - 1 would put the first
    # embedding on the right-hand panel and the second on the left.
    panel = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # NOTE(review): assumes embeddings, element_list and element_groups_dict
    # all iterate in the same element order — confirm in AtomicEmbeddings.
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    element_array = np.array(AtomicEmbed.element_list)

    # Standardise the features before the projection
    scaler = StandardScaler()
    e_scaled = scaler.fit_transform(embeddings_array)

    # Run UMAP with its default output dimensionality (the first two columns
    # are plotted below); the fixed random_state makes the layout and the
    # saved SVG repeatable across Restart & Run All.
    reducer = umap.UMAP(random_state=42)
    X = reducer.fit_transform(e_scaled)

    umap_dim1 = X[:, 0]
    umap_dim2 = X[:, 1]
    # Collect coordinates, element labels and group info for plotting.
    # The "tsne_*" column names are kept from the original cell.
    umap_df = pd.DataFrame(
        {
            "tsne_dim1": umap_dim1,
            "tsne_dim2": umap_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    sns.scatterplot(
        x="tsne_dim1",
        y="tsne_dim2",
        data=umap_df,
        hue="group",
        s=200,
        ax=panel,
        legend=False,
    )

    panel.set_xlabel("Dimension 1")
    panel.set_ylabel("Dimension 2")

    # Annotate every point; the 1.01 factor nudges the text off the marker
    for x_pos, y_pos, symbol in zip(umap_dim1, umap_dim2, element_array):
        panel.text(x=x_pos * 1.01, y=y_pos * 1.01, s=symbol)

    # Title the plots
    panel.title.set_text(cbfv)

# The relative "plots/" directory is expected to exist already
plt.savefig(
    "plots/UMAP_2_scaled.svg",
    # transparent=True
)

plt.show()