Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature labels #73

Merged
merged 9 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions examples/composition.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from elementembeddings.composition import composition_featuriser\n",
"from elementembeddings.composition import CompositionalEmbedding\n",
"import numpy as np\n",
Expand Down Expand Up @@ -178,6 +179,16 @@
"composition_featuriser(formulas, embedding=\"magpie\", stats=\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"formula\": formulas})\n",
"composition_featuriser(df, embedding=\"magpie\", stats=[\"mean\", \"sum\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
9 changes: 6 additions & 3 deletions examples/usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,13 @@
"# Let's find the dimensionality of all of the CBFVs that we have loaded\n",
"\n",
"\n",
"AtomEmbeds_dim = {cbfv: [AtomEmbeds[cbfv].dim] for cbfv in cbfvs}\n",
"AtomEmbeds_dim = {\n",
" cbfv: {\"dim\": AtomEmbeds[cbfv].dim, \"type\": AtomEmbeds[cbfv].embedding_type}\n",
" for cbfv in cbfvs\n",
"}\n",
"\n",
"dim_df = pd.DataFrame.from_dict(AtomEmbeds_dim, orient=\"index\", columns=[\"dimension\"])\n",
"print(dim_df)"
"dim_df = pd.DataFrame.from_dict(AtomEmbeds_dim)\n",
"dim_df.T"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
packages=find_namespace_packages(where="src"),
package_dir={"": "src"},
package_data={
"elementembeddings.data": ["*.json", "*.csv"],
"elementembeddings.data.element_representations": ["*.json", "*.csv"],
"elementembeddings.data.element_data": ["*.json", "*.txt"],
},
test_suite="elementembeddings.tests.test",
Expand Down
36 changes: 17 additions & 19 deletions src/elementembeddings/composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,32 +378,30 @@
Union[pd.DataFrame, list]: A pandas DataFrame containing the feature vectors,
or a list of feature vectors
"""
if isinstance(stats, str):
stats = [stats]
if isinstance(data, pd.Series):
data = data.to_frame(name="formula")

Check warning on line 384 in src/elementembeddings/composition.py

View check run for this annotation

Codecov / codecov/patch

src/elementembeddings/composition.py#L384

Added line #L384 was not covered by tests
if isinstance(data, pd.DataFrame):
if not inplace:
data = data.copy()
if "formula" not in data.columns:
raise ValueError(
"The data must contain a column named 'formula' to featurise."
)
data["composition"] = data["formula"].progress_apply(
lambda x: CompositionalEmbedding(x, embedding)
)
data["feature_vector"] = data["composition"].progress_apply(
lambda x: x.feature_vector(stats)
)
data.drop("composition", axis=1, inplace=True)
return data
elif isinstance(data, pd.Series):
if not inplace:
data = data.copy()
data["composition"] = data["formula"].progress_apply(
lambda x: CompositionalEmbedding(x, embedding)
)
data["feature_vector"] = data["composition"].progress_apply(
lambda x: x.feature_vector(stats)
)
data.drop("composition", axis=1, inplace=True)
return data
print("Featurising compositions...")
comps = [
CompositionalEmbedding(x, embedding) for x in tqdm(data["formula"].tolist())
]
print("Computing feature vectors...")
fvs = [x.feature_vector(stats) for x in tqdm(comps)]
feature_names = comps[0].embedding.feature_labels
feature_names = [
f"{stat}_{feature}" for stat in stats for feature in feature_names
]
data_new = pd.concat([data, pd.DataFrame(fvs, columns=feature_names)], axis=1)
# data.columns = []
return data_new
elif isinstance(data, list):
comps = [CompositionalEmbedding(x, embedding) for x in data]
return [x.feature_vector(stats) for x in tqdm(comps)]
Expand Down
102 changes: 56 additions & 46 deletions src/elementembeddings/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,22 @@ class Embedding:
Adds a few convenience methods related to elemental representations.
"""

def __init__(self, embeddings: dict, embedding_name: Optional[str] = None):
def __init__(
self,
embeddings: dict,
embedding_name: Optional[str] = None,
feature_labels: Optional[List[str]] = None,
):
"""Initialise the Embedding class.

Args:
embeddings (dict): A {element_symbol: vector} dictionary
embedding_name (str): The name of the elemental representation
feature_labels (list(str)): A list of feature labels
"""
self.embeddings = embeddings
self.embedding_name = embedding_name
self.feature_labels = feature_labels

# Grab a random value from the embedding vector
_rand_embed = random.choice(list(self.embeddings.values()))
Expand All @@ -67,9 +74,11 @@ def __init__(self, embeddings: dict, embedding_name: Optional[str] = None):
self.embeddings = {
ele: np.array(self.embeddings[ele]) for ele in self.embeddings
}

# Determines if the embedding vector has a length attribute
# (i.e. is not a scalar int or float)
# If the 'vector' is a scalar/float, the representation is linear (dim=1)
# If the 'vector' is a scalar/float, the representation is linear
# A linear representation gets converted to a one-hot vector
if hasattr(_rand_embed, "__len__") and (not isinstance(_rand_embed, str)):
self.embedding_type: str = "vector"
self.dim: int = len(random.choice(list(self.embeddings.values())))
Expand All @@ -88,13 +97,20 @@ def __init__(self, embeddings: dict, embedding_name: Optional[str] = None):
}
else:
sorted_embedding = {
el: num for el, num in sorted_embedding if el in elements
el: num for el, num in sorted_embedding if el in elements[:118]
}
self.feature_labels = list(sorted_embedding.keys())
self.embeddings = {}

for el, num in sorted_embedding.items():
self.embeddings[el] = np.zeros(len(sorted_embedding))
self.embeddings[el][num] = 1
self.dim = len(self.embeddings["H"])
self.dim = len(random.choice(list(self.embeddings.values())))

if not self.feature_labels:
self.feature_labels = list(range(self.dim))
else:
self.feature_labels = self.feature_labels

# Dummy initialisation for results
self._data = []
Expand Down Expand Up @@ -143,77 +159,71 @@ def load_data(embedding_name: Optional[str] = None):
"skipatom": "skipatom_20201009_induced.csv",
"atomic": "atomic.json",
}
_cbfv_names = list(_cbfv_files.keys())
_cbfv_names_others = [
i
for i in _cbfv_names
if i not in ["skipatom", "random_200", "megnet16", "magpie", "mat2vec"]
]

# Get the embeddings
if embedding_name in _cbfv_files:
if embedding_name in ["skipatom", "random_200", "magpie", "mat2vec"]:
_csv = path.join(data_directory, _cbfv_files[embedding_name])
df = pd.read_csv(_csv)
# Convert df to a dictionary of (ele:embeddings) pairs
elements = list(df["element"])
df.drop(["element"], axis=1, inplace=True)
embeds_array = df.to_numpy()
embedding_data = {
elements[i]: embeds_array[i] for i in range(len(embeds_array))
}

elif embedding_name == "megnet16":
megnet16_json = path.join(data_directory, _cbfv_files["megnet16"])
with open(megnet16_json) as f:
embedding_data = json.load(f)
# Remove 'Null' key from megnet embedding
del embedding_data["Null"]

elif embedding_name in _cbfv_names_others:
_json = path.join(data_directory, _cbfv_files[embedding_name])
with open(_json) as f:
embedding_data = json.load(f)
else:
raise (
ValueError(
f"{embedding_name} not in the data directory or not in directory."
)
if _cbfv_files[embedding_name].endswith(".csv"):
return Embedding.from_csv(
path.join(
data_directory,
"element_representations",
_cbfv_files[embedding_name],
),
embedding_name,
)
elif "megnet" in _cbfv_files[embedding_name]:
return Embedding.from_json(
path.join(
data_directory,
"element_representations",
_cbfv_files[embedding_name],
),
embedding_name,
).remove_elements(["Null"])
elif _cbfv_files[embedding_name].endswith(".json"):
return Embedding.from_json(
path.join(
data_directory,
"element_representations",
_cbfv_files[embedding_name],
),
embedding_name,
)
return Embedding(embedding_data, embedding_name)

@staticmethod
def from_json(embedding_json, embedding_name: Optional[str] = None):
    """
    Create an instance of the Embedding class from a json file.

    Args:
        embedding_json (str): Filepath of the json file
        embedding_name (str): The name of the elemental representation

    Returns:
        Embedding: An instance of the Embedding class
    """
    # TODO: add validation handling for JSONs in different formats
    with open(embedding_json) as f:
        embedding_data = json.load(f)
    return Embedding(embedding_data, embedding_name)

@staticmethod
def from_csv(embedding_csv, embedding_name: Optional[str] = None):
    """
    Create an instance of the Embedding class from a csv file.

    The first column of the csv file must contain the elements and be named element.

    Args:
        embedding_csv (str): Filepath of the csv file
        embedding_name (str): The name of the elemental representation

    Returns:
        Embedding: An instance of the Embedding class
    """
    # TODO: add validation handling for csv files
    df = pd.read_csv(embedding_csv)
    elements = list(df["element"])
    df.drop(["element"], axis=1, inplace=True)
    # Remaining column names become the per-dimension feature labels
    feature_labels = list(df.columns)
    embeds_array = df.to_numpy()
    embedding_data = {
        elements[i]: embeds_array[i] for i in range(len(embeds_array))
    }
    return Embedding(embedding_data, embedding_name, feature_labels)

def as_dataframe(self, columns: str = "components") -> pd.DataFrame:
"""
Expand All @@ -224,7 +234,7 @@ def as_dataframe(self, columns: str = "components") -> pd.DataFrame:

Args:
columns (str): A string to specify if the columns are the vector components
and the index is the elements (`columns='components')
and the index is the elements (`columns='components'`)
or the columns are the elements (`columns='elements'`).

Returns:
Expand All @@ -233,7 +243,7 @@ def as_dataframe(self, columns: str = "components") -> pd.DataFrame:

"""
embedding = self.embeddings
df = pd.DataFrame(embedding)
df = pd.DataFrame(embedding, index=self.feature_labels)
if columns == "components":
return df.T
elif columns == "elements":
Expand Down
2 changes: 1 addition & 1 deletion src/elementembeddings/tests/test_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_composition_featuriser(self):
formulas = self.formulas[:3]
formula_df = pd.DataFrame(formulas, columns=["formula"])
assert isinstance(composition.composition_featuriser(formula_df), pd.DataFrame)
assert composition.composition_featuriser(formula_df).shape == (3, 2)
assert composition.composition_featuriser(formula_df).shape == (3, 23)
assert isinstance(composition.composition_featuriser(formulas), list)
assert len(composition.composition_featuriser(formulas)) == 3

Expand Down
6 changes: 3 additions & 3 deletions src/elementembeddings/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,17 @@ def test_Embedding_class_atomic(self):
# Check if H is present in the embedding keys
assert "H" in atomic.embeddings.keys()
# Check dimensions
assert atomic.dim == 119
assert atomic.dim == 118
# Check embedding type
assert atomic.embedding_type == "linear"
# Check that a list is returned
assert isinstance(atomic.element_list, list)
# Check the dimensions of the embedding vector
assert atomic.embeddings["H"].shape == (119,)
assert atomic.embeddings["H"].shape == (118,)
# Check that the embedding vector is not all zeros
assert not np.all(atomic.embeddings["H"] == 0)
# Check that the embedding vector for H is correct
test_H = np.zeros(119)
test_H = np.zeros(118)
test_H[0] = 1
assert np.all(atomic.embeddings["H"] == test_H)

Expand Down