In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sns
import smact
import umap
from matplotlib.ticker import FixedLocator, FormatStrFormatter
from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

from AtomicEmbeddings.core import Embedding

# Scale every seaborn font by a factor of 2
sns.set(font_scale=2)

# ---- Atomic Embedding features ----

# Names of the composition-based feature vectors (CBFVs) to compare
cbfvs = ["mat2vec", "random_200", "magpie_sc", "megnet16"]

# Map each CBFV name to its loaded Embedding object
AtomEmbeds = {}
for cbfv in cbfvs:
    AtomEmbeds[cbfv] = Embedding.load_data(cbfv)

# Restrict every embedding to the first 83 elements.
# smact supplies the symbols for now; a future update aims to drop this dependency.
el_symbols = smact.ordered_elements(1, 83)
el_symbols_set = set(el_symbols)

for cbfv, atomic_embedding in AtomEmbeds.items():
    # Elements currently present in this embedding
    elements = set(atomic_embedding.element_list)

    # Symbols that are not among the 83 we keep
    els_to_remove = list(elements.difference(el_symbols_set))

    # Drop each unwanted element from the embedding in place
    for el in els_to_remove:
        del atomic_embedding.embeddings[el]

    # Sanity check: print how many elements remain (expected: 83)
    print(len(atomic_embedding.element_list))

In [None]:
# Standalone PCA of the mat2vec embedding
AtomicEmbed = AtomEmbeds["mat2vec"]
# Stack the embedding vectors into a single array
embeddings_array = np.array([*AtomicEmbed.embeddings.values()])
# Matching array of element symbols
element_array = np.array(AtomicEmbed.element_list)

# Fit a PCA with no n_components given (so no components are discarded),
# then project the embeddings onto the fitted components
pca = decomposition.PCA()
X = pca.fit(embeddings_array).transform(embeddings_array)
print(embeddings_array.shape, X.shape, sep="\n")
# Explained variance ratio of each component, expressed as a percentage
a = 100 * pca.explained_variance_ratio_
a.sum()

In [None]:
# Number of entries along the first axis of the embedding array (one per embedding vector)
len(embeddings_array)

In [None]:
# Standalone PCA of the magpie_sc embedding
AtomicEmbed = AtomEmbeds["magpie_sc"]
# Stack the embedding vectors into a single array
embeddings_array = np.array([*AtomicEmbed.embeddings.values()])
# Matching array of element symbols
element_array = np.array(AtomicEmbed.element_list)

# Fit a PCA with no n_components given (so no components are discarded),
# then project the embeddings onto the fitted components
pca = decomposition.PCA()
X = pca.fit(embeddings_array).transform(embeddings_array)
print(X.shape)
# Explained variance ratio of each component, expressed as a percentage
a = 100 * pca.explained_variance_ratio_
a.sum()

In [None]:
# Evenly spaced positions 1..n_points rescaled to percentages, ending at 100.
# NOTE(review): 22 is hard-coded - presumably the number of principal
# components of the embedding analysed above; confirm before reusing.
n_points = 22
x = np.linspace(1, n_points, n_points) * 100 / n_points
x.shape

In [None]:
# Scatter the per-component explained variance (``a``, in percent) against the
# component position expressed as a percentage of the total component count.
# The x-axis is derived from ``a`` itself so the two arrays always have the
# same length, whichever embedding ``a`` was computed from.
n_components = len(a)
percent_of_components = np.arange(1, n_components + 1) * 100 / n_components

fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(percent_of_components, a)
ax.set_xlabel("Percentage of total components")
# ``a`` is explained_variance_ratio_ * 100, i.e. a percentage rather than a ratio
ax.set_ylabel("Explained variance (%)")

plt.show()

In [None]:
# First row of X, the PCA-transformed embedding array
X[0]

In [None]:
# Shape of X, the PCA-transformed embedding array
X.shape

In [None]:
# Explained variance of each component of the most recently fitted PCA
# (the absolute values, not the normalised explained_variance_ratio_)
pca.explained_variance_

In [None]:
# Make the PCA plots: one panel per embedding, showing the elements projected
# onto the first two principal components and coloured by element group.
# The explained-variance data of every embedding is collected in ``var_list``.
var_list = []
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(36, 24))
axes = [ax1, ax2, ax3, ax4]
fig.suptitle("Principal component analysis")
for i, cbfv in enumerate(cbfvs):
    # ``enumerate`` is 0-based, so panel ``i`` belongs to embedding ``i``.
    # (Indexing with ``i - 1`` would put the first embedding in the last
    # panel and shift every other embedding by one.)
    ax = axes[i]
    AtomicEmbed = AtomEmbeds[cbfv]

    # Create an array of the embedding vectors
    embeddings_array = np.array(list(AtomicEmbed.embeddings.values()))
    # Create an array of the elements
    element_array = np.array(AtomicEmbed.element_list)

    # Fit a PCA with no n_components given (no components are discarded) and
    # project the embeddings; only the first two columns are plotted below.
    pca = decomposition.PCA()
    X = pca.fit(embeddings_array).transform(embeddings_array)
    exp_var = pca.explained_variance_ratio_

    pca_dim1 = X[:, 0]
    pca_dim2 = X[:, 1]
    # Create a dataframe to store the dimensions,
    # labels and group info for the PCA
    pca_df = pd.DataFrame(
        {
            "pca_dim1": pca_dim1,
            "pca_dim2": pca_dim2,
            "element": element_array,
            "group": list(AtomicEmbed.element_groups_dict.values()),
        }
    )

    # Record the explained variance (in percent), its cumulative sum and the
    # position of each component as a percentage of the component count.
    n_components = X.shape[1]
    var_dict = {
        "atom_embed": cbfv,
        "exp_var": exp_var * 100,
        "percent_component": np.linspace(1, n_components, n_components)
        * 100
        / n_components,
        "exp_var_cumsum": np.cumsum(exp_var * 100),
    }
    var_list.append(var_dict)

    sns.scatterplot(
        x="pca_dim1",
        y="pca_dim2",
        data=pca_df,
        hue="group",
        s=200,
        ax=ax,
        legend=False,
    )

    ax.set_xlabel(f"Dimension 1 ({exp_var[0]*100:.0f}%)")
    ax.set_ylabel(f"Dimension 2 ({exp_var[1]*100:.0f}%)")

    # Annotate every point with its element symbol
    for x_pos, y_pos, symbol in zip(pca_dim1, pca_dim2, element_array):
        ax.text(x=x_pos, y=y_pos, s=symbol)

    # Title the plots
    ax.set_title(cbfv)


plt.show()

In [None]:
# Build a DataFrame from the per-embedding variance records in var_list
# (one row per record) and display it
var_df = pd.DataFrame(var_list)
var_df

In [None]:
# Explained variance per component (top) and cumulative explained variance
# (bottom) for every embedding in ``var_df``. The shared x-axis is the
# component position as a percentage of each embedding's component count.
fig, ax = plt.subplots(2, 1, figsize=(16, 12), constrained_layout=True, sharex=True)
for _idx, row in var_df.iterrows():
    percent_component = row["percent_component"]

    # Markers carry the legend label; the line only joins them up
    ax[0].scatter(y=row["exp_var"], x=percent_component, label=row["atom_embed"])
    ax[0].plot(percent_component, row["exp_var"])

    ax[1].step(x=percent_component, y=row["exp_var_cumsum"])

# Build the legend once, after all series exist, with one column per embedding
# so the entries sit in a single row above the top panel.
ax[0].legend(
    loc="lower left",
    bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
    mode="expand",
    ncols=max(1, len(var_df)),
    borderaxespad=0.0,
)

ax[1].set_xlabel("Percentage of Principal components (%)")
ax[1].set_ylabel("Cumulative explained variance (%)")
ax[0].set_ylabel("Explained variance (%)")
fig.suptitle("Explained variance (%) of elemental representations (%)", fontsize=24)

# Uncomment to save the figure to disk
# plt.savefig("Variance.jpg", transparent=False)
plt.show()

In [None]:
# Explained variance per component (top) and cumulative explained variance
# (bottom) for every embedding in ``var_df``. The shared x-axis is the
# 0-based index of the principal component.
fig, ax = plt.subplots(2, 1, figsize=(16, 12), constrained_layout=True, sharex=True)
for _idx, row in var_df.iterrows():
    # 0-based component indices, computed once and reused for all three artists
    component_index = range(len(row["exp_var"]))

    # Markers carry the legend label; the line only joins them up
    ax[0].scatter(y=row["exp_var"], x=component_index, label=row["atom_embed"])
    ax[0].plot(component_index, row["exp_var"])

    ax[1].step(x=component_index, y=row["exp_var_cumsum"])

# Build the legend once, after all series exist, with one column per embedding
# so the entries sit in a single row above the top panel.
ax[0].legend(
    loc="lower left",
    bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
    mode="expand",
    ncols=max(1, len(var_df)),
    borderaxespad=0.0,
)

ax[1].set_xlabel("Principal component index")
ax[1].set_ylabel("Cumulative explained variance (%)")
ax[0].set_ylabel("Explained variance (%)")
fig.suptitle("Explained variance (indices) of elemental representations", fontsize=24)

# Uncomment to save the figure to disk
# plt.savefig("Variance_index.jpg", transparent=False)
plt.show()