# Dimension Reduction

In this notebook, we use dimension reduction techniques to project high-dimensional compositional embeddings onto two dimensions. We will use the following techniques:

- Principal Component Analysis (PCA)
- t-distributed Stochastic Neighbor Embedding (t-SNE)
- Uniform Manifold Approximation and Projection (UMAP)

We will build the compositional embeddings from the following element embeddings:

(Please refer to [ElementEmbeddings](https://wmd-group.github.io/ElementEmbeddings/0.4/reference/) for the details of element embedding)

- Magpie
- Mat2Vec
- Megnet16
- Skipatom
- Oliynyk
- random_200

## 1. Element Embeddings

To begin, we will build a compositional embedding for each formula from element embeddings, using the ElementEmbeddings package.

In [None]:
# pip install ElementEmbeddings

In [None]:
from typing import Iterable
from pathlib import Path

from tqdm import tqdm
import numpy as np
import pandas as pd

from elementembeddings.composition import CompositionalEmbedding

In [None]:
# Element-embedding schemes used to build the compositional embeddings.
embedding_names = ["magpie", "mat2vec", "megnet16", "skipatom", "oliynyk", "random_200"]

# Dimension-reduction methods applied to every embedding.
reducers = ["pca", "tsne", "umap"]

# Directory holding the inputs and outputs of this notebook; created if absent.
save_dir = Path("data") / "binary"
save_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the labelled table of binary compounds from the save directory.
# Pickle files can execute code on load, so only read files you produced yourself.
category_pickle = save_dir / "df_binary_category.pkl"
df_category = pd.read_pickle(category_pickle)
df_category

In [None]:
# Draw at most `n_samples` rows per label so every class is represented
# without one of them dominating the sample.
n_samples = 3000

# label name -> integer code
dict_label = dict(standard=0, missing=1, interesting=2, unlikely=3)
labels = ["standard", "missing", "interesting", "unlikely"]

list_df_sample = []
for label in labels:
    subset = df_category.loc[df_category["label"] == label]
    # a label with fewer rows than n_samples is taken in full
    n_draw = min(n_samples, len(subset))
    list_df_sample.append(subset.sample(n=n_draw, random_state=42))
df_sample = pd.concat(list_df_sample)

# persist the sample so later steps work on the same rows
df_sample.to_pickle(save_dir / "df_binary_sample.pkl")

In [None]:
def get_embedding(formula, embedding="magpie", stats="mean"):
    """
    Computes a compositional embedding for a given chemical formula or a list of chemical formulas.

    A formula whose embedding cannot be computed is not dropped: its row is
    filled with NaN so the output stays aligned with the order of the input.

    Parameters:
    -----------
    formula : str or iterable
        A single chemical formula, or an iterable of formula strings
        (for example a list or a pandas Index).
    embedding : str, optional
        The type of embedding to compute. Must be one of ['magpie', 'mat2vec', 'megnet16', 'skipatom', 'oliynyk', 'random_200'].
        Default is 'magpie'.
    stats : str, optional
        The type of statistics to compute for the embedding. Must be one of
        ["mean", "variance", "minpool", "maxpool", "range", "sum", "geometric_mean", "harmonic_mean"].
        Default is 'mean'.
    Returns:
    --------
    numpy.ndarray
        1D array when formula is a string. 2D array with one row per formula
        when formula is an iterable, including an iterable with a single
        formula (shape (1, dim)) or with no formula (shape (0, dim)).

    Raises:
    -------
    TypeError
        If formula is neither a string nor an iterable.
    """
    # Remember whether the caller passed one formula: this alone decides the
    # rank of the result (a blanket squeeze would also flatten 1-item lists).
    is_single = isinstance(formula, str)
    if is_single:
        formula = [formula]
    elif not isinstance(formula, Iterable):
        raise TypeError("formula must be a string or a list of strings")

    # get embedding dimension (width of the NaN placeholder rows)
    embedding_dim = CompositionalEmbedding(
        "", embedding=embedding
    ).embedding_dim

    # compute embedding
    rows = []
    for f in tqdm(formula):
        try:
            vector = CompositionalEmbedding(
                f, embedding=embedding
            ).feature_vector(stats=stats)
            # flatten so every row is 1D regardless of how the vector is shaped
            rows.append(np.ravel(vector))
        except Exception:
            # the exception is raised when the embedding doesn't support the element
            rows.append(np.full(embedding_dim, np.nan))

    # np.stack rejects an empty list, so return an empty 2D array explicitly
    if not rows:
        return np.empty((0, embedding_dim))

    # stack the embedded vectors into (n_formulas, dim)
    embeddings = np.stack(rows, axis=0)
    return embeddings[0] if is_single else embeddings

In [None]:
# Directory that receives one pickle of compositional embeddings per scheme.
raw_embedding_dir = save_dir / "embeddings"
raw_embedding_dir.mkdir(parents=True, exist_ok=True)

for name in embedding_names:
    print(f"Computing {name} embeddings")
    out_path = raw_embedding_dir / f"embeddings_{name}.pkl"

    # one row per sampled formula, indexed by the formula itself
    embeddings = get_embedding(df_sample.index, embedding=name)
    df_embeddings = pd.DataFrame(embeddings, index=df_sample.index)

    # rows containing NaN come from formulas the scheme could not embed
    df_embeddings = df_embeddings.dropna(axis=0)
    df_embeddings.to_pickle(out_path)
    print(
        f"Saved {name} embeddings with shape {df_embeddings.shape} to {out_path}"
    )

## 2. Dimension Reduction

Next, we apply each dimension reduction technique (PCA, t-SNE and UMAP) to every compositional embedding and save the two-dimensional result.

In [None]:
# pip install umap-learn==0.5.3

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

In [None]:
def dimension_reduction(
    embeddings,
    reducer="pca",
    n_components=2,
    save_dir=None,
    file_name=None,
    **kwargs,
):
    """
    Performs dimensionality reduction on the given embeddings.

    Parameters:
    -----------
    embeddings : pandas.DataFrame
        The embeddings to reduce. Its index is reused as the index of the
        saved result.
    reducer : str, optional
        The dimensionality reduction algorithm to use. Must be one of ['pca', 'tsne', 'umap'].
        Default is 'pca'.
    n_components : int, optional
        The number of components to reduce to. Default is 2.
    save_dir : str or pathlib.Path, optional
        The directory to save the reduced embeddings. It is created, together
        with any missing parent directories, if it does not exist.
        Default is None, in which case nothing is written.
    file_name : str, optional
        The file name (without the '.pkl' suffix) to save the reduced embeddings.
        Default is None, which gives 'reduced_embeddings_<ReducerClass>.pkl'.
    **kwargs : dict, optional
        Extra keyword arguments forwarded unchanged to the constructor of the
        selected reducer (for example random_state).

    Returns:
    --------
    numpy.ndarray
        The reduced embeddings.

    Raises:
    -------
    ValueError
        If reducer is not one of 'pca', 'tsne' or 'umap'.
    """
    # Keep the estimator under its own name instead of rebinding `reducer`,
    # so the argument keeps meaning "the requested algorithm name".
    if reducer == "pca":
        model = PCA(n_components=n_components, **kwargs)
    elif reducer == "tsne":
        model = TSNE(n_components=n_components, **kwargs)
    elif reducer == "umap":
        model = UMAP(n_components=n_components, **kwargs)
    else:
        raise ValueError("reducer must be one of ['pca', 'tsne', 'umap']")

    reduced_embeddings = model.fit_transform(embeddings.to_numpy())

    if save_dir is not None:
        save_dir = Path(save_dir)
        # parents=True: a nested target must not fail after the costly fit
        save_dir.mkdir(parents=True, exist_ok=True)
        if file_name is None:
            file_name = f"reduced_embeddings_{type(model).__name__}.pkl"
        else:
            file_name = f"{file_name}.pkl"
        pd.DataFrame(reduced_embeddings, index=embeddings.index).to_pickle(
            save_dir / file_name
        )
        print(f"Saved reduced embeddings to {save_dir / file_name}")
    return reduced_embeddings

In [None]:
# make the directory to save the reduced embeddings
(save_dir / "reduced_embeddings_2d").mkdir(parents=True, exist_ok=True)
# calculate the reduced embeddings
# NOTE(review): silhouette_scores is never filled in this cell; presumably
# kept for a later scoring step. Confirm before removing.
silhouette_scores = {}
for name in embedding_names:
    # Load each embedding table once: it is the same input for every reducer,
    # so re-reading it inside the inner loop only repeats disk I/O.
    embeddings = pd.read_pickle(
        save_dir / "embeddings" / f"embeddings_{name}.pkl"
    )

    for reducer in reducers:
        print(f"Computing {name} {reducer} embeddings")

        # the result is also written to <save_dir>/reduced_embeddings_2d/<reducer>_<name>.pkl
        reduced_embeddings = dimension_reduction(
            embeddings,
            reducer=reducer,
            n_components=2,
            save_dir=save_dir / "reduced_embeddings_2d",
            file_name=f"{reducer}_{name}",
            random_state=42,
        )

## 3. Visualization of the Reduced Embeddings

In [None]:
# pip install -U kaleido

In [None]:
from plot_embedding import plot_reducers_embeddings

In [None]:
# Reload the category table and expose its index (the formulas) as a regular
# column named "formula" for the plotting helper.
df_category = pd.read_pickle(save_dir / "df_binary_category.pkl")
df_category = df_category.assign(formula=df_category.index)

# where the 2D embeddings were written, and where the figure goes
embedding_dir = save_dir / "reduced_embeddings_2d"
save_path = save_dir / "plot_binary.jpg"

fig = plot_reducers_embeddings(
    df_category, reducers, embedding_names, embedding_dir,
    symbol="circle",
    title="Compositional space for binary compounds",
    save_path=save_path,
)
# check the plot in save_path