Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature labels #73

Merged
merged 9 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions examples/composition.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from elementembeddings.composition import composition_featuriser\n",
"from elementembeddings.composition import CompositionalEmbedding\n",
"import numpy as np\n",
Expand Down Expand Up @@ -178,6 +179,16 @@
"composition_featuriser(formulas, embedding=\"magpie\", stats=\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"formula\": formulas})\n",
"composition_featuriser(df, embedding=\"magpie\", stats=[\"mean\", \"sum\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
9 changes: 6 additions & 3 deletions examples/usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,13 @@
"# Let's find the dimensionality of all of the CBFVs that we have loaded\n",
"\n",
"\n",
"AtomEmbeds_dim = {cbfv: [AtomEmbeds[cbfv].dim] for cbfv in cbfvs}\n",
"AtomEmbeds_dim = {\n",
" cbfv: {\"dim\": AtomEmbeds[cbfv].dim, \"type\": AtomEmbeds[cbfv].embedding_type}\n",
" for cbfv in cbfvs\n",
"}\n",
"\n",
"dim_df = pd.DataFrame.from_dict(AtomEmbeds_dim, orient=\"index\", columns=[\"dimension\"])\n",
"print(dim_df)"
"dim_df = pd.DataFrame.from_dict(AtomEmbeds_dim)\n",
"dim_df.T"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
packages=find_namespace_packages(where="src"),
package_dir={"": "src"},
package_data={
"elementembeddings.data": ["*.json", "*.csv"],
"elementembeddings.data.element_representations": ["*.json", "*.csv"],
"elementembeddings.data.element_data": ["*.json", "*.txt"],
},
test_suite="elementembeddings.tests.test",
Expand Down
36 changes: 17 additions & 19 deletions src/elementembeddings/composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,32 +378,30 @@
Union[pd.DataFrame, list]: A pandas DataFrame containing the feature vectors,
or a list of feature vectors
"""
if isinstance(stats, str):
stats = [stats]
if isinstance(data, pd.Series):
data = data.to_frame(name="formula")

Check warning on line 384 in src/elementembeddings/composition.py

View check run for this annotation

Codecov / codecov/patch

src/elementembeddings/composition.py#L384

Added line #L384 was not covered by tests
if isinstance(data, pd.DataFrame):
if not inplace:
data = data.copy()
if "formula" not in data.columns:
raise ValueError(
"The data must contain a column named 'formula' to featurise."
)
data["composition"] = data["formula"].progress_apply(
lambda x: CompositionalEmbedding(x, embedding)
)
data["feature_vector"] = data["composition"].progress_apply(
lambda x: x.feature_vector(stats)
)
data.drop("composition", axis=1, inplace=True)
return data
elif isinstance(data, pd.Series):
if not inplace:
data = data.copy()
data["composition"] = data["formula"].progress_apply(
lambda x: CompositionalEmbedding(x, embedding)
)
data["feature_vector"] = data["composition"].progress_apply(
lambda x: x.feature_vector(stats)
)
data.drop("composition", axis=1, inplace=True)
return data
print("Featurising compositions...")
comps = [
CompositionalEmbedding(x, embedding) for x in tqdm(data["formula"].tolist())
]
print("Computing feature vectors...")
fvs = [x.feature_vector(stats) for x in tqdm(comps)]
feature_names = comps[0].embedding.feature_labels
feature_names = [
f"{stat}_{feature}" for stat in stats for feature in feature_names
]
data_new = pd.concat([data, pd.DataFrame(fvs, columns=feature_names)], axis=1)
# data.columns = []
return data_new
elif isinstance(data, list):
comps = [CompositionalEmbedding(x, embedding) for x in data]
return [x.feature_vector(stats) for x in tqdm(comps)]
Expand Down
102 changes: 56 additions & 46 deletions src/elementembeddings/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,22 @@ class Embedding:
Adds a few convenience methods related to elemental representations.
"""

def __init__(self, embeddings: dict, embedding_name: Optional[str] = None):
def __init__(
self,
embeddings: dict,
embedding_name: Optional[str] = None,
feature_labels: Optional[List[str]] = None,
):
"""Initialise the Embedding class.

Args:
embeddings (dict): A {element_symbol: vector} dictionary
embedding_name (str): The name of the elemental representation
feature_labels (list(str)): A list of feature labels
"""
self.embeddings = embeddings
self.embedding_name = embedding_name
self.feature_labels = feature_labels

# Grab a random value from the embedding vector
_rand_embed = random.choice(list(self.embeddings.values()))
Expand All @@ -67,9 +74,11 @@ def __init__(self, embeddings: dict, embedding_name: Optional[str] = None):
self.embeddings = {
ele: np.array(self.embeddings[ele]) for ele in self.embeddings
}

# Determines if the embedding vector has a length attribute
# (i.e. is not a scalar int or float)
# If the 'vector' is a scalar/float, the representation is linear (dim=1)
# If the 'vector' is a scalar/float, the representation is linear
# A linear representation gets converted to a one-hot vector
if hasattr(_rand_embed, "__len__") and (not isinstance(_rand_embed, str)):
self.embedding_type: str = "vector"
self.dim: int = len(random.choice(list(self.embeddings.values())))
Expand All @@ -88,13 +97,20 @@ def __init__(self, embeddings: dict, embedding_name: Optional[str] = None):
}
else:
sorted_embedding = {
el: num for el, num in sorted_embedding if el in elements
el: num for el, num in sorted_embedding if el in elements[:118]
}
self.feature_labels = list(sorted_embedding.keys())
self.embeddings = {}

for el, num in sorted_embedding.items():
self.embeddings[el] = np.zeros(len(sorted_embedding))
self.embeddings[el][num] = 1
self.dim = len(self.embeddings["H"])
self.dim = len(random.choice(list(self.embeddings.values())))

if not self.feature_labels:
self.feature_labels = list(range(self.dim))
else:
self.feature_labels = self.feature_labels

# Dummy initialisation for results
self._data = []
Expand Down Expand Up @@ -143,77 +159,71 @@ def load_data(embedding_name: Optional[str] = None):
"skipatom": "skipatom_20201009_induced.csv",
"atomic": "atomic.json",
}
_cbfv_names = list(_cbfv_files.keys())
_cbfv_names_others = [
i
for i in _cbfv_names
if i not in ["skipatom", "random_200", "megnet16", "magpie", "mat2vec"]
]

# Get the embeddings
if embedding_name in _cbfv_files:
if embedding_name in ["skipatom", "random_200", "magpie", "mat2vec"]:
_csv = path.join(data_directory, _cbfv_files[embedding_name])
df = pd.read_csv(_csv)
# Convert df to a dictionary of (ele:embeddings) pairs
elements = list(df["element"])
df.drop(["element"], axis=1, inplace=True)
embeds_array = df.to_numpy()
embedding_data = {
elements[i]: embeds_array[i] for i in range(len(embeds_array))
}

elif embedding_name == "megnet16":
megnet16_json = path.join(data_directory, _cbfv_files["megnet16"])
with open(megnet16_json) as f:
embedding_data = json.load(f)
# Remove 'Null' key from megnet embedding
del embedding_data["Null"]

elif embedding_name in _cbfv_names_others:
_json = path.join(data_directory, _cbfv_files[embedding_name])
with open(_json) as f:
embedding_data = json.load(f)
else:
raise (
ValueError(
f"{embedding_name} not in the data directory or not in directory."
)
if _cbfv_files[embedding_name].endswith(".csv"):
return Embedding.from_csv(
path.join(
data_directory,
"element_representations",
_cbfv_files[embedding_name],
),
embedding_name,
)
elif "megnet" in _cbfv_files[embedding_name]:
return Embedding.from_json(
path.join(
data_directory,
"element_representations",
_cbfv_files[embedding_name],
),
embedding_name,
).remove_elements(["Null"])
elif _cbfv_files[embedding_name].endswith(".json"):
return Embedding.from_json(
path.join(
data_directory,
"element_representations",
_cbfv_files[embedding_name],
),
embedding_name,
)
return Embedding(embedding_data, embedding_name)

@staticmethod
def from_json(embedding_json, embedding_name: Optional[str] = None):
    """
    Create an instance of the Embedding class from a json file.

    Args:
        embedding_json (str): Filepath of the json file
        embedding_name (str): The name of the elemental representation

    Returns:
        Embedding: An instance of the Embedding class
    """
    # TODO: add validation handling for JSONs in different formats
    with open(embedding_json) as f:
        embedding_data = json.load(f)
    return Embedding(embedding_data, embedding_name)

@staticmethod
def from_csv(embedding_csv, embedding_name: Optional[str] = None):
    """
    Create an instance of the Embedding class from a csv file.

    The first column of the csv file must contain the elements and be named element.

    Args:
        embedding_csv (str): Filepath of the csv file
        embedding_name (str): The name of the elemental representation

    Returns:
        Embedding: An instance of the Embedding class
    """
    # TODO: add validation handling for csv files
    df = pd.read_csv(embedding_csv)
    elements = list(df["element"])
    df.drop(["element"], axis=1, inplace=True)
    # Remaining column names become the per-dimension feature labels
    feature_labels = list(df.columns)
    embeds_array = df.to_numpy()
    embedding_data = {
        elements[i]: embeds_array[i] for i in range(len(embeds_array))
    }
    return Embedding(embedding_data, embedding_name, feature_labels)

def as_dataframe(self, columns: str = "components") -> pd.DataFrame:
"""
Expand All @@ -224,7 +234,7 @@ def as_dataframe(self, columns: str = "components") -> pd.DataFrame:

Args:
columns (str): A string to specify if the columns are the vector components
and the index is the elements (`columns='components')
and the index is the elements (`columns='components'`)
or the columns are the elements (`columns='elements'`).

Returns:
Expand All @@ -233,7 +243,7 @@ def as_dataframe(self, columns: str = "components") -> pd.DataFrame:

"""
embedding = self.embeddings
df = pd.DataFrame(embedding)
df = pd.DataFrame(embedding, index=self.feature_labels)
if columns == "components":
return df.T
elif columns == "elements":
Expand Down
2 changes: 1 addition & 1 deletion src/elementembeddings/tests/test_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_composition_featuriser(self):
formulas = self.formulas[:3]
formula_df = pd.DataFrame(formulas, columns=["formula"])
assert isinstance(composition.composition_featuriser(formula_df), pd.DataFrame)
assert composition.composition_featuriser(formula_df).shape == (3, 2)
assert composition.composition_featuriser(formula_df).shape == (3, 23)
assert isinstance(composition.composition_featuriser(formulas), list)
assert len(composition.composition_featuriser(formulas)) == 3

Expand Down
6 changes: 3 additions & 3 deletions src/elementembeddings/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,17 @@ def test_Embedding_class_atomic(self):
# Check if H is present in the embedding keys
assert "H" in atomic.embeddings.keys()
# Check dimensions
assert atomic.dim == 119
assert atomic.dim == 118
# Check embedding type
assert atomic.embedding_type == "linear"
# Check that a list is returned
assert isinstance(atomic.element_list, list)
# Check the dimensions of the embedding vector
assert atomic.embeddings["H"].shape == (119,)
assert atomic.embeddings["H"].shape == (118,)
# Check that the embedding vector is not all zeros
assert not np.all(atomic.embeddings["H"] == 0)
# Check that the embedding vector for H is correct
test_H = np.zeros(119)
test_H = np.zeros(118)
test_H[0] = 1
assert np.all(atomic.embeddings["H"] == test_H)

Expand Down