# Preprocessing

This notebook shows how the embedding files are processed prior to any calculations and explains the design decisions behind that processing.

In [None]:
# Imports
from elementembeddings.core import Embedding, data_directory
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer

# Show every column when a DataFrame is displayed (None removes pandas' column limit)
pd.set_option("display.max_columns", None)

The following code block loads all the embedding files currently packaged within the code.

In [None]:
# Collect the name of every entry in the packaged representations folder
rep_folder = os.listdir(f"{data_directory}/element_representations")

# Keep only the csv and json files; every other entry is ignored
rep_files = [filename for filename in rep_folder if filename.endswith((".csv", ".json"))]

# Show which files will be loaded
print(rep_files)

In [None]:
# Load the embeddings

# For each supported extension: the token the filename is split on to obtain
# the embedding name, and the Embedding constructor that reads that format.
# csv names are cut at the first dot, json names at the ".json" suffix.
loaders = {
    ".csv": (".", Embedding.from_csv),
    ".json": (".json", Embedding.from_json),
}

embedding_dict = {}

# Use functions of the Embedding class to load these raw data.
for rep in rep_files:
    for extension, (separator, load) in loaders.items():
        if not rep.endswith(extension):
            continue
        name = rep.split(separator)[0]
        embedding_dict[name] = {
            "embedding": load(f"{data_directory}/element_representations/{rep}", name)
        }
        break

We can check to see if any of the embedding files have missing values. To do this, we will load each embedding as a dataframe and verify if any of the columns of the dataframe (i.e. the individual vector components) have missing values.

In [None]:
# Checking for missing values

for entry in embedding_dict.values():
    # Store the tabular view next to the Embedding object so later cells can reuse it
    entry["dataframe"] = entry["embedding"].as_dataframe()
    # Number of columns (vector components) that contain at least one NaN
    n_incomplete = entry["dataframe"].isna().any().sum()
    print(
        f"For {entry['embedding'].embedding_name} there are {n_incomplete} features with missing values"
    )

In [None]:
# Summary statistics (pandas `describe`) of the oliynyk dataframe
embedding_dict["oliynyk"]["dataframe"].describe()

In [None]:
# Boolean Series indexed by column name: True where the column holds a NaN
olinyk_columns_bool = embedding_dict["oliynyk"]["dataframe"].isna().any()

# Use the Series as its own mask to keep only the flagged columns, then print them
missing_val_cols = olinyk_columns_bool.loc[olinyk_columns_bool]
print(missing_val_cols)

In [None]:
# Visualise the distribution of the missing-value columns:
oliynyk_frame = embedding_dict["oliynyk"]["dataframe"]
fig, axes = plt.subplots(nrows=2, ncols=2)

# One histogram per panel; zip stops at the shorter sequence, so at most
# four columns are drawn on the 2x2 grid.
for col, ax in zip(missing_val_cols.index, axes.flat):
    sns.histplot(data=oliynyk_frame, x=col, ax=ax)

plt.tight_layout()
plt.show()

## Imputing missing values
We will try different strategies to impute the missing values while trying to keep the distributions the same.

In [None]:
# Simple Imputing
oliynyk_dfs = {"original": embedding_dict["oliynyk"]["dataframe"].copy()}


def impute_dataframe(imputer, frame):
    """Fit ``imputer`` on ``frame`` and return the imputed data as a DataFrame.

    Args:
        imputer: An unfitted scikit-learn imputer exposing ``fit_transform``.
        frame: DataFrame containing the missing values. The imputers used in
            this notebook keep their default ``copy=True``, so ``frame`` is
            left untouched.

    Returns:
        A new DataFrame with the index and columns of ``frame`` holding the
        imputed values.
    """
    imputed_values = imputer.fit_transform(frame.values)
    return pd.DataFrame(data=imputed_values, index=frame.index, columns=frame.columns)


# The constant value is zero by default
strategies = ["mean", "median", "most_frequent", "constant"]

# Every strategy starts from the same un-imputed copy of the data
df = oliynyk_dfs["original"].copy()
# `isna().any().sum()` counts COLUMNS containing a NaN, not individual cells
n_missing_cols = df.isna().any().sum()

for strat in strategies:
    df_imp = impute_dataframe(SimpleImputer(strategy=strat), df)
    oliynyk_dfs[strat] = df_imp

    # Verify that no column is left with missing values
    print(
        f"The original dataframe had {n_missing_cols} columns with missing values. "
        f"Using {strat}-imputing, the new dataframe now has "
        f"{df_imp.isna().any().sum()} columns with missing values"
    )


# knn imputing
df_imp = impute_dataframe(KNNImputer(), df)
oliynyk_dfs["knn"] = df_imp
print(
    f"Using knn-imputing, the new dataframe now has {df_imp.isna().any().sum()} columns with missing values."
)

In [None]:
# Visualise the distribution of the missing-value columns:

# One figure per affected column, one panel per entry of `oliynyk_dfs`
# (the original data followed by each imputation strategy).
for col in missing_val_cols.index:
    fig, axes = plt.subplots(nrows=3, ncols=2)
    for ax, (label, frame) in zip(axes.flat, oliynyk_dfs.items()):
        sns.histplot(data=frame, x=col, ax=ax)
        ax.set_title("Original" if label == "original" else f"{label} imputing")
    plt.tight_layout()
    plt.show()

In [None]:
# Give the oliynyk_sc dataframe the same column labels as the oliynyk one,
# then display the oliynyk dataframe as the cell output.
oliynyk_frame = embedding_dict["oliynyk"]["dataframe"]
embedding_dict["oliynyk_sc"]["dataframe"].columns = oliynyk_frame.columns
oliynyk_frame

From the above graphs, we can see that for `Mulliken_EN`, `MB_electronegativity` and `crystal_radius`, KNN imputation leaves the overall distribution unchanged, whereas for `Miracle_Radius_[pm]` it is mode imputation that keeps the overall distribution unchanged.

For this particular work, we will create a new embedding file from the Oliynyk file.

In [None]:
# Mode impute the miracle radius
oliynyk_df = oliynyk_dfs["original"].copy()
imp = SimpleImputer(strategy="most_frequent")
# Double brackets keep the column 2-D, the shape scikit-learn imputers expect
X = oliynyk_df[["Miracle_Radius_[pm]"]].to_numpy()
X_imp = imp.fit_transform(X)

# Flatten the (n, 1) result back to one value per row before assigning
oliynyk_df["Miracle_Radius_[pm]"] = X_imp.ravel()

# knn impute the other 3 variables
knn_imp = KNNImputer()
index, columns = oliynyk_df.index, oliynyk_df.columns
# Impute from `oliynyk_df` (which already carries the mode-imputed radius),
# not from a frame left over from an earlier cell; otherwise the mode
# imputation above would be discarded and KNN would fill all four columns.
X = oliynyk_df.values
X_imp = knn_imp.fit_transform(X)
oliynyk_df = pd.DataFrame(data=X_imp, index=index, columns=columns)
print(
    f"The new dataframe has {oliynyk_df.isna().any().sum()} columns with missing values"
)
oliynyk_df.head()

In [None]:
# Export the new dataframe.

preprocessed_path = f"{data_directory}/element_representations/oliynyk_preprocessed.csv"

# Write the index as the first column, under the header "element"
oliynyk_df.to_csv(preprocessed_path, index=True, index_label="element")