In [1]:
import os
import pandas as pd
from pymatgen.core import composition

In [5]:
icsd_true = pd.read_csv(os.path.join('Materials', 'icsd_materials.csv'))
print(icsd_true.head())

  material_id pretty_formula  icsd_ids stoichiometry_list
0  mp-1002206            SiC  [182362]         [1.0, 1.0]
1    mp-10164        NaGaTe2   [44702]    [1.0, 1.0, 2.0]
2  mp-1025377      Cd(AgI2)2  [190587]    [1.0, 2.0, 4.0]
3  mp-1057273             Pb  [105158]              [1.0]
4  mp-1065918        Mn2CuGe  [184949]    [2.0, 1.0, 1.0]


In [4]:
# Calculate stoichiometry for all compositions
def calculate_stoichiometry(formula):
    comp = composition.Composition(formula)  # Use the Composition class
    return {el.symbol: amt for el, amt in comp.items()}

icsd_true['stoichiometry'] = icsd_true['pretty_formula'].apply(calculate_stoichiometry)

print(icsd_true[['pretty_formula', 'stoichiometry']].head())

  pretty_formula                      stoichiometry
0            SiC              {'Si': 1.0, 'C': 1.0}
1        NaGaTe2  {'Na': 1.0, 'Ga': 1.0, 'Te': 2.0}
2      Cd(AgI2)2   {'Cd': 1.0, 'Ag': 2.0, 'I': 4.0}
3             Pb                        {'Pb': 1.0}
4        Mn2CuGe  {'Mn': 2.0, 'Cu': 1.0, 'Ge': 1.0}


In [5]:
icsd_true['stoichiometry_list'] = icsd_true['stoichiometry'].apply(lambda x: list(x.values()))
print(icsd_true[['pretty_formula', 'stoichiometry', 'stoichiometry_list']].head())

  pretty_formula                      stoichiometry stoichiometry_list
0            SiC              {'Si': 1.0, 'C': 1.0}         [1.0, 1.0]
1        NaGaTe2  {'Na': 1.0, 'Ga': 1.0, 'Te': 2.0}    [1.0, 1.0, 2.0]
2      Cd(AgI2)2   {'Cd': 1.0, 'Ag': 2.0, 'I': 4.0}    [1.0, 2.0, 4.0]
3             Pb                        {'Pb': 1.0}              [1.0]
4        Mn2CuGe  {'Mn': 2.0, 'Cu': 1.0, 'Ge': 1.0}    [2.0, 1.0, 1.0]


In [7]:
# Select the required columns
icsd_csv = icsd_true[['material_id', 'pretty_formula', 'icsd_ids', 'stoichiometry_list']]

# Display the updated dataframe
print(icsd_csv.head())
icsd_csv.to_csv(os.path.join('Materials', 'icsd_valid_true.csv'), index=False)


  material_id pretty_formula  icsd_ids stoichiometry_list
0  mp-1002206            SiC  [182362]         [1.0, 1.0]
1    mp-10164        NaGaTe2   [44702]    [1.0, 1.0, 2.0]
2  mp-1025377      Cd(AgI2)2  [190587]    [1.0, 2.0, 4.0]
3  mp-1057273             Pb  [105158]              [1.0]
4  mp-1065918        Mn2CuGe  [184949]    [2.0, 1.0, 1.0]


In [3]:
from pymatgen.core import Composition

# Function to check if a composition is valid
def is_valid_composition(formula):
    try:
        Composition(formula)
        return True
    except:
        return False

# Find invalid compositions
icsd_true['is_valid'] = icsd_true['pretty_formula'].apply(lambda x: is_valid_composition(x) if pd.notnull(x) else False)
invalid_compositions = icsd_true[~icsd_true['is_valid']]

print(invalid_compositions)

Empty DataFrame
Columns: [material_id, pretty_formula, icsd_ids, is_valid]
Index: []


In [20]:
# Filter the dataframe to include only valid compositions
icsd_true = icsd_true[['material_id', 'pretty_formula', 'icsd_ids']][icsd_true['is_valid']]
print(icsd_true.head())
# Save the updated dataframe back to the CSV file
icsd_true.to_csv(os.path.join('Materials', 'icsd_materials.csv'), index=False)

  material_id pretty_formula  icsd_ids
0  mp-1002206            SiC  [182362]
1    mp-10164        NaGaTe2   [44702]
2  mp-1025377      Cd(AgI2)2  [190587]
3  mp-1057273             Pb  [105158]
4  mp-1065918        Mn2CuGe  [184949]


In [5]:
# Calculate stoichiometry for valid compositions
def calculate_stoichiometry(formula):
    comp = Composition(formula)
    return {el.symbol: amt for el, amt in comp.items()}

icsd_true['stoichiometry'] = icsd_true.apply(
    lambda row: calculate_stoichiometry(row['pretty_formula']) if row['is_valid'] else None, axis=1
)

print(icsd_true[['pretty_formula', 'stoichiometry']].head())

  pretty_formula                      stoichiometry
0            SiC              {'Si': 1.0, 'C': 1.0}
1        NaGaTe2  {'Na': 1.0, 'Ga': 1.0, 'Te': 2.0}
2      Cd(AgI2)2   {'Cd': 1.0, 'Ag': 2.0, 'I': 4.0}
3             Pb                        {'Pb': 1.0}
4        Mn2CuGe  {'Mn': 2.0, 'Cu': 1.0, 'Ge': 1.0}


In [7]:
icsd_true['stoichiometry_list'] = icsd_true['stoichiometry'].apply(
    lambda x: list(x.values()) if pd.notnull(x) else None
)

print(icsd_true[['pretty_formula', 'stoichiometry', 'stoichiometry_list']].head())

  pretty_formula                      stoichiometry stoichiometry_list
0            SiC              {'Si': 1.0, 'C': 1.0}         [1.0, 1.0]
1        NaGaTe2  {'Na': 1.0, 'Ga': 1.0, 'Te': 2.0}    [1.0, 1.0, 2.0]
2      Cd(AgI2)2   {'Cd': 1.0, 'Ag': 2.0, 'I': 4.0}    [1.0, 2.0, 4.0]
3             Pb                        {'Pb': 1.0}              [1.0]
4        Mn2CuGe  {'Mn': 2.0, 'Cu': 1.0, 'Ge': 1.0}    [2.0, 1.0, 1.0]


In [9]:
# Filter the dataframe to include only valid compositions
valid_compositions = icsd_true[icsd_true['is_valid']]

valid_compositions['stoichiometry_list'] = valid_compositions['stoichiometry'].apply(
    lambda x: list(x.values()) if pd.notnull(x) else None
)
print(valid_compositions)
# Save the filtered dataframe to a new CSV file
#valid_compositions.to_csv(os.path.join('Materials', 'icsd_valid_materials.csv'), index=False)

      material_id pretty_formula          icsd_ids  is_valid  \
0      mp-1002206            SiC          [182362]      True   
1        mp-10164        NaGaTe2           [44702]      True   
2      mp-1025377      Cd(AgI2)2          [190587]      True   
3      mp-1057273             Pb          [105158]      True   
4      mp-1065918        Mn2CuGe          [184949]      True   
...           ...            ...               ...       ...   
48639     mp-9413      Ca2TiSiO6           [83455]      True   
48640   mp-973926    KCoH3(CO2)3          [181923]      True   
48641   mp-980205          YB4Os          [615147]      True   
48642   mp-999473            ZnN  [236817, 236818]      True   
48643   mp-999550          Mn3Ga          [188332]      True   

                                           stoichiometry  total_amt  \
0                                  {'Si': 1.0, 'C': 1.0}        2.0   
1                      {'Na': 1.0, 'Ga': 1.0, 'Te': 2.0}        4.0   
2                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_compositions['stoichiometry_list'] = valid_compositions['stoichiometry'].apply(
