In [109]:
import pandas as pd
import numpy as np
import plotly as pt
import seaborn as sns
!pip install pymatgen
!pip install mp_api
import requests
import json
!pip install pymatgen nglview



In [110]:
from google.colab import output
output.enable_custom_widget_manager()

In [111]:
df = pd.read_excel("/content/drive/MyDrive/University/Artificial intelligence in chemistry/Perovskite project/Perovskite-liked-oxides-bandgap-prediction/Data/Perovskite dataset export.xlsx",sheet_name='Photocatalytic dataset')

In [112]:
df.columns

Index(['Perovskite', 'Hill formula', 'Interlayer space composition',
       'Bandgap, eV', 'DOI', 'Materials Project ID', 'COD_ID', 'Springer_ID',
       'MP_CIF_modifier', 'COD_CIF_modifier', 'Springer_CIF_modifier', 'Z',
       'Z_MP', 'Z_COD', 'Z_Springer', 'a, A', 'b, A', 'c, A', 'Symmetry group',
       'd,A', 'a_MP', 'b_MP', 'c_MP', 'a_COD', 'b_COD', 'c_COD', 'a_Springer',
       'b_Springer', 'c_Springer', 'Number of octahedrons on a layer',
       'Valence electrons', 'Volume', 'Volume_MP', 'Volume_COD',
       'Volume_Springer', 'Valence Electrons Density',
       'Valence Electrons Density_MP', 'Valence Electrons Density_COD',
       'Springer_Valence Electrons Density', 'avg s valence electrons',
       'avg p valence electrons', 'avg d valence electrons',
       'avg f valence electrons', 'frac s valence electrons',
       'frac p valence electrons', 'frac d valence electrons',
       'frac f valence electrons', 'MagpieData minimum Electronegativity',
       'MagpieData max

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089 entries, 0 to 1088
Data columns (total 81 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Perovskite                            1089 non-null   object 
 1   Hill formula                          730 non-null    object 
 2   Interlayer space composition          5 non-null      object 
 3   Bandgap, eV                           1053 non-null   float64
 4   DOI                                   1084 non-null   object 
 5   Materials Project ID                  971 non-null    object 
 6   COD_ID                                895 non-null    object 
 7   Springer_ID                           195 non-null    object 
 8   MP_CIF_modifier                       105 non-null    object 
 9   COD_CIF_modifier                      46 non-null     object 
 10  Springer_CIF_modifier                 18 non-null     object 
 11  Z                

In [114]:
from pymatgen.core.structure import Structure
from pymatgen.core import Composition
from pymatgen.core.periodic_table import Element
import os
import re
import nglview as nv
from pymatgen.io.ase import AseAtomsAdaptor


In [115]:
subs_map = {
    "Ph": "C6H5",
    "Bn": "C7H7",
    "Pr": "C3H7",
    "Bu": "C4H9",
    "Hx": "C6H13",
    "Me": "CH3",
    "Et": "C2H5",
    "Oc": "C8H17",
    "Dc": "C10H21",
}

import re

def expand_substituents(formula):
    if pd.isna(formula):
        return formula

    for abbr, full in subs_map.items():
        formula = re.sub(rf'{abbr}', full, formula)
    return formula

In [116]:
print(df.shape[0])
df = df[~df['Perovskite'].str.contains("Nx", na=False)]
df = df[~df['Perovskite'].str.contains("Ox", na=False)]
print(df.shape[0])
df['Perovskite'] = df['Perovskite'].apply(expand_substituents)

1089
1068


In [117]:
def getStructureFromCIF(cif_file_name):
  if(cif_file_name==-1):
    return 0
  file_path=f"/content/drive/MyDrive/University/Artificial intelligence in chemistry/Perovskite project/Perovskite-liked-oxides-bandgap-prediction/Data/CIF/{cif_file_name}.cif"
  if os.path.exists(file_path):
    try:
      structure = Structure.from_file(file_path)
    except:
      print('ERROR: Invalid structure for ',cif_file_name)
      return None
  else:
    return None

  if(structure == None):
    return None
  return structure

In [118]:
def eliminateAsterisksDromFormula(formula):
  parts = formula.split('*')
  main_formula = parts[0]
  if len(parts) == 1:
    return main_formula
  print("------------")
  print(formula)
  hydrate_part = parts[1]
  print(hydrate_part)
  #match = re.match(r'(\d+)([A-Za-z0-9]+)', hydrate_part)
  #match = re.match(r'([0-9]*\.?[0-9]+)([A-Za-z0-9]+)', hydrate_part)
  match = re.match(r'([0-9]*\.?[0-9]*)?([A-Za-z0-9]+)', hydrate_part)
  if not match:
     raise ValueError(f"Cannot parse hydrate: {hydrate_part}")
  number_str = match.group(1)
  n = float(number_str) if number_str else 1.0
  molecule = match.group(2)
  print(n)
  print(molecule)
  comp = Composition(molecule)
  comp *= n
  total_formula = Composition(main_formula) + comp
  print(total_formula)
  print("------------")
  return total_formula.reduced_composition

In [119]:
print(eliminateAsterisksDromFormula("CuSO4*5H2O"))
print(eliminateAsterisksDromFormula("CuSO4*0.25H2O"))

------------
CuSO4*5H2O
5H2O
5.0
H2O
Cu1 S1 O9 H10
------------
Cu1 S1 O9 H10
------------
CuSO4*0.25H2O
0.25H2O
0.25
H2O
Cu1 S1 O4.25 H0.5
------------
Cu1 S1 O4.25 H0.5


In [138]:
def checkCompositionStructureMatching(formula,cif_file_name):
  structure = getStructureFromCIF(cif_file_name)
  #print(structure)
  if(structure == None or structure==0):
    return False
  composition = structure.composition
  formula = eliminateAsterisksDromFormula(formula)
  try:
    composition_formula = Composition(formula)
  except:
    return False
  #print(type(composition))
  #print(type(composition_formula))
  #print(composition," || ", composition_formula, " = ")
  #print(composition_formula)
  same = composition.reduced_composition == composition_formula.reduced_composition
  #print(same)
  print("CIF file:",cif_file_name," || " ,composition," || ", composition_formula, " = ",same)
  return same

In [121]:
checkCompositionStructureMatching("Nb6K4O15OO","mp-560692")

K16 Nb24 O68  ||  Nb6 K4 O17  =  True


True

#CIF modifier


In [122]:
def parse_stoichiometric_replacement(expr):
  expr = expr.replace(" ", "")
  if "->" not in expr:
        raise ValueError(f"Invalid expression (missing ->): {expr}")
  lhs, rhs = expr.split("->")
  print("LHS: ",lhs," RHS: ", rhs)

  # --- Parse LHS ---
  m = re.fullmatch(r"(?:(\d+(?:\.\d+)?))?([A-Z][a-z]?)", lhs)
  if not m:
      raise ValueError(f"Invalid LHS: {lhs}")

  lhs_coeff = float(m.group(1)) if m.group(1) else 1.0
  lhs_elem = m.group(2)

  # --- Parse RHS ---
  terms = rhs.split(",")
  rhs_counts = {}
  for term in terms:
        print('Term: ', term)
        #m = re.fullmatch(r"(\d+(?:\.\d+)?)([A-Z][a-z]?)", term)
        m = re.fullmatch(r"(?:(\d+(?:\.\d+)?))?([A-Z][a-z]?)", term)
        if not m:
            raise ValueError(f"Invalid RHS term: {term}")
        print("Term goups: ", m.group(1), "  ; ", m.group(2))
        coeff = 1
        if(m.group(1) is not None):
          coeff = float(m.group(1))
        elem = m.group(2)

        rhs_counts[elem] = rhs_counts.get(elem, 0.0) + coeff
  # --- Normalize RHS ---
  total_rhs = sum(rhs_counts.values())
  if total_rhs == 0:
      raise ValueError("RHS total stoichiometry is zero")

  rhs_fractions = {
      elem: coeff / total_rhs
      for elem, coeff in rhs_counts.items()
  }

  return {
      "from": lhs_elem,
      "total": lhs_coeff,
      "to": rhs_fractions
  }

In [123]:
inp = "Ta->Nb"
com = parse_stoichiometric_replacement(inp)
print(com)

LHS:  Ta  RHS:  Nb
Term:  Nb
Term goups:  None   ;  Nb
{'from': 'Ta', 'total': 1.0, 'to': {'Nb': 1.0}}


In [124]:
def replace_element(comp, from_el, to_dict):
    print("Element replacement start: From ",from_el," To: ",to_dict)
    new_dict = {}

    for el, amt in comp.items():
        if el.symbol == from_el:
            for new_el, frac in to_dict.items():
                new_dict[Element(new_el)] = amt * frac
        else:
            new_dict[el] = amt
    output = Composition(new_dict)
    print("New comp: ", output)
    print("Element replacement is done!")
    return output

def modify_structure(structure, instruction):
  print("Start structure modification!")
  if(structure is None):
    print("Null structure")
    return None
  try:
    instructions = [cmd.strip() for cmd in instruction.split(";") if cmd.strip()]
        #old, new = instruction.split("->")
        #old = old.strip()
        #new = new.strip()
  except ValueError:
    raise ValueError("Failed to separte instructinos")
        #raise ValueError("Instruction must be of the form 'A->B', e.g. 'K->H'")
  output = structure
  for command in instructions:
    parsed_command = parse_stoichiometric_replacement(command)
    print("Parsed command: ", parsed_command)
    for site in structure:
      if site.is_ordered:
        print("Ordered site:", site.specie)
        if site.specie.symbol == parsed_command["from"]:
          site.species = {
              Element(el): frac
              for el, frac in parsed_command["to"].items()  #[TO DO]: not always 1:1 replacement
          }
      else:
        print("Disordered site:", site.species)
        print(site.species)
        print(type(site.species))
        species_comp = site.species;
        new_species_comp = replace_element(species_comp, parsed_command["from"], parsed_command["to"])
        site.species = new_species_comp
  print("Finish structure modification!")
  print("-------------------------------")
  print("-------------------------------")
  print("-------------------------------")
  print("-------------------------------")
  print("-------------------------------")
  return output


In [125]:
#s = getStructureFromCIF("sd_1810747")
#s
#s_new = modify_structure(s, "K->H")
s = getStructureFromCIF("sd_1958942")
s
s_new = modify_structure(s, "2Sr->Sr,Pb")
s_new.to("new_cif.cif","cif")

Start structure modification!
LHS:  2Sr  RHS:  Sr,Pb
Term:  Sr
Term goups:  None   ;  Sr
Term:  Pb
Term goups:  None   ;  Pb
Parsed command:  {'from': 'Sr', 'total': 2.0, 'to': {'Sr': 0.5, 'Pb': 0.5}}
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Sr': 0.5, 'Pb': 0.5}
New comp:  Bi0.5 Sr0.25 Pb0.25
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Sr': 0.5, 'Pb': 0.5}
New comp:  Bi0.5 Sr0.25 Pb0.25
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Sr': 0.5, 'Pb': 0.5}
New comp:  Bi0.5 Sr0.25 Pb0.25
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Sr': 0.5, 'Pb': 0.5}
New

'_atom_site_label'
  struct = parser.parse_structures(primitive=primitive)[0]
  self.symmetry_operations = self.get_symops(data)  # type:ignore[assignment]
  if struct := self._get_structure(data, primitive, symmetrized, check_occu=check_occu):
  self.symmetry_operations = self.get_symops(data)  # type:ignore[assignment]
No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)


"# generated using pymatgen\ndata_SrTa2Bi3PbClO11\n_symmetry_space_group_name_H-M   'P 1'\n_cell_length_a   3.91000000\n_cell_length_b   3.91000000\n_cell_length_c   18.49600000\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   1\n_chemical_formula_structural   SrTa2Bi3PbClO11\n_chemical_formula_sum   'Sr1 Ta2 Bi3 Pb1 Cl1 O11'\n_cell_volume   282.76869760\n_cell_formula_units_Z   1\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Sr  Sr_fix1  1  0.50000000  0.50000000  0.73255000  0.25\n  Bi  Sr_fix1  1  0.50000000  0.50000000  0.73255000  0.5\n  Pb  Sr_fix1  1  0.50000000  0.50000000  0.73255000  0.25\n  Sr  Sr_fix1  1  0.50000000  0.50000000  0.26745000  0.25\n  Bi  Sr_fix1  1  0.50000000  0.50000000  

In [126]:
def modify_CIF(cif_file_name, instruction):
  structure = getStructureFromCIF(cif_file_name)
  if(structure == None or structure==0):
    return None
  new_structure = modify_structure(structure, instruction)
  return new_structure

def modify_all_CIFs(cif_input_column, instruction_column, cif_output_column, prefix):
  results = []
  counter=0
  col_idx = {name: i for i, name in enumerate(df.columns)}
  cif_i = col_idx[cif_input_column]
  instr_i = col_idx[instruction_column]

  for i, row in enumerate(df.itertuples(index=False, name=None), start=1):
    cif_input = row[cif_i]
    instruction = row[instr_i]
    print("CIF input: ", cif_input, " instruction: ", instruction)
    if pd.isna(instruction):
      print("No instruction")
      results.append(cif_input)
      continue
    new_CIF = modify_CIF(cif_input,instruction)
    new_CIF_name = "M_"+ prefix +str(counter)
    counter = counter +1
    file_path=f"/content/drive/MyDrive/University/Artificial intelligence in chemistry/Perovskite project/Perovskite-liked-oxides-bandgap-prediction/Data/CIF/{new_CIF_name}.cif"
    new_CIF.to(file_path,"cif")
    results.append(new_CIF_name)

  df[cif_output_column] = results
  print("Modified CIFs: ", counter)


In [127]:
modify_all_CIFs("Materials Project ID", "MP_CIF_modifier", "MP_CIF_modified", "MP")

CIF input:  mp-560692  instruction:  nan
No instruction
CIF input:  mp-1223501  instruction:  nan
No instruction
CIF input:  mp-553965  instruction:  nan
No instruction
CIF input:  mp-553248  instruction:  nan
No instruction
CIF input:  mp-557195  instruction:  nan
No instruction
CIF input:  mp-20396  instruction:  nan
No instruction
CIF input:  mp-581330  instruction:  nan
No instruction
CIF input:  mp-557195  instruction:  Ca->Sr
Start structure modification!
LHS:  Ca  RHS:  Sr
Term:  Sr
Term goups:  None   ;  Sr
Parsed command:  {'from': 'Ca', 'total': 1.0, 'to': {'Sr': 1.0}}
Ordered site: K
Ordered site: K
Ordered site: K
Ordered site: K
Ordered site: Ca
Ordered site: Ca
Ordered site: Ca
Ordered site: Ca
Ordered site: Ca
Ordered site: Ca
Ordered site: Ca
Ordered site: Ca
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered si

  writer: Any = CifWriter(self, **kwargs)
  writer: Any = CifWriter(self, **kwargs)


CIF input:  mp-1217452  instruction:  nan
No instruction
CIF input:  mp-1217452  instruction:  nan
No instruction
CIF input:  mp-1217452  instruction:  nan
No instruction
CIF input:  mp-1104930  instruction:  nan
No instruction
CIF input:  mp-1104930  instruction:  nan
No instruction
CIF input:  mp-1104930  instruction:  nan
No instruction
CIF input:  mp-1104930  instruction:  nan
No instruction
CIF input:  mp-1104930  instruction:  nan
No instruction
CIF input:  mp-1104930  instruction:  Ti->Zr
Start structure modification!
LHS:  Ti  RHS:  Zr
Term:  Zr
Term goups:  None   ;  Zr
Parsed command:  {'from': 'Ti', 'total': 1.0, 'to': {'Zr': 1.0}}
Ordered site: K
Ordered site: K
Ordered site: La
Ordered site: La
Ordered site: Ti
Ordered site: Ti
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Finish structure modification!
-------------------------------
-------------------------------
--------------------------

  writer: Any = CifWriter(self, **kwargs)


CIF input:  mp-1179025  instruction:  3Ti->2.6Ti, 0.4Cr
Start structure modification!
LHS:  3Ti  RHS:  2.6Ti,0.4Cr
Term:  2.6Ti
Term goups:  2.6   ;  Ti
Term:  0.4Cr
Term goups:  0.4   ;  Cr
Parsed command:  {'from': 'Ti', 'total': 3.0, 'to': {'Ti': 0.8666666666666667, 'Cr': 0.13333333333333333}}
Ordered site: Ti
Ordered site: Ti
Ordered site: Ti
Ordered site: Bi
Ordered site: Bi
Ordered site: Bi
Ordered site: Bi
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Finish structure modification!
-------------------------------
-------------------------------
-------------------------------
-------------------------------
-------------------------------
CIF input:  mp-1179025  instruction:  3Ti->2.6Ti, 0.4Cr
Start structure modification!
LHS:  3Ti  RHS:  2.6Ti,0.4Cr
Term:  2.6Ti
Term goups:  2.6   ;  Ti
Term:  0.4Cr
Term goups:  0.4   ;  Cr
Parsed co

In [128]:
modify_all_CIFs("COD_ID", "COD_CIF_modifier", "COD_CIF_modified","COD")

CIF input:  1001842  instruction:  nan
No instruction
CIF input:  1545643  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF input:  2004917  instruction:  nan
No instruction
CIF input:  1521061  instruction:  nan
No instruction
CIF input:  2238958  instruction:  nan
No instruction
CIF input:  1518045  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF input:  1010942  instruction:  nan
No instruction
CIF input:  1001842  instruction:  nan
No instruction
CIF input:  1545643  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF input:  2004917  instruction:  nan
No instruction
CIF input:  1521061  instruction:  nan
No instruction
CIF input:  2238958  instruction:  nan
No instruction
CIF input:  1518045  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF 

  writer: Any = CifWriter(self, **kwargs)


Start structure modification!
LHS:  Cs  RHS:  H
Term:  H
Term goups:  None   ;  H
Parsed command:  {'from': 'Cs', 'total': 1.0, 'to': {'H': 1.0}}
Ordered site: Cs
Ordered site: Ca
Ordered site: Ca
Ordered site: Ta
Ordered site: Ta
Ordered site: Ta
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
LHS:  Ta  RHS:  Nb
Term:  Nb
Term goups:  None   ;  Nb
Parsed command:  {'from': 'Ta', 'total': 1.0, 'to': {'Nb': 1.0}}
Ordered site: H
Ordered site: Ca
Ordered site: Ca
Ordered site: Ta
Ordered site: Ta
Ordered site: Ta
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Finish structure modification!
-------------------------------
-------------------------------
-------------------------------
-------------------------------
-------------------------------
CIF input:  1522043 

  writer: Any = CifWriter(self, **kwargs)


CIF input:  1522043  instruction:  Cs->H; Ca->Sr; Ta->Nb
Start structure modification!
LHS:  Cs  RHS:  H
Term:  H
Term goups:  None   ;  H
Parsed command:  {'from': 'Cs', 'total': 1.0, 'to': {'H': 1.0}}
Ordered site: Cs
Ordered site: Ca
Ordered site: Ca
Ordered site: Ta
Ordered site: Ta
Ordered site: Ta
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
LHS:  Ca  RHS:  Sr
Term:  Sr
Term goups:  None   ;  Sr
Parsed command:  {'from': 'Ca', 'total': 1.0, 'to': {'Sr': 1.0}}
Ordered site: H
Ordered site: Ca
Ordered site: Ca
Ordered site: Ta
Ordered site: Ta
Ordered site: Ta
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
LHS:  Ta  RHS:  Nb
Term:  Nb
Term goups:  None   ;  Nb
Parsed command:  {'from': 'Ta', 'total': 1.0, 'to': {'Nb': 1.0}}
Ordered site: H
Ordered site: Sr
O

  writer: Any = CifWriter(self, **kwargs)


CIF input:  1526803  instruction:  nan
No instruction
CIF input:  2238958  instruction:  Rb->Ag
Start structure modification!
LHS:  Rb  RHS:  Ag
Term:  Ag
Term goups:  None   ;  Ag
Parsed command:  {'from': 'Rb', 'total': 1.0, 'to': {'Ag': 1.0}}
Ordered site: Rb
Ordered site: Ca
Ordered site: Ca
Ordered site: Nb
Ordered site: Nb
Ordered site: Nb
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Finish structure modification!
-------------------------------
-------------------------------
-------------------------------
-------------------------------
-------------------------------
CIF input:  -1  instruction:  nan
No instruction
CIF input:  1509663  instruction:  nan
No instruction
CIF input:  -1  instruction:  nan
No instruction
CIF input:  1509430  instruction:  nan
No instruction
CIF input:  2238958  instruction:  nan
No instruction
CIF input:  2238958  instruction:  Rb->Ag

In [129]:
modify_all_CIFs("Springer_ID", "Springer_CIF_modifier", "Springer_CIF_modified","Springer")

CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction


'_atom_site_label'
  struct = parser.parse_structures(primitive=primitive)[0]
  self.symmetry_operations = self.get_symops(data)  # type:ignore[assignment]
  if struct := self._get_structure(data, primitive, symmetrized, check_occu=check_occu):
  self.symmetry_operations = self.get_symops(data)  # type:ignore[assignment]
Pauling file corrections applied.
No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)


Start structure modification!
LHS:  K  RHS:  H
Term:  H
Term goups:  None   ;  H
Parsed command:  {'from': 'K', 'total': 1.0, 'to': {'H': 1.0}}
Disordered site: K0.833
K0.833
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  K  To:  {'H': 1.0}
New comp:  H0.833
Element replacement is done!
Disordered site: K0.833
K0.833
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  K  To:  {'H': 1.0}
New comp:  H0.833
Element replacement is done!
Disordered site: K0.833
K0.833
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  K  To:  {'H': 1.0}
New comp:  H0.833
Element replacement is done!
Disordered site: K0.833
K0.833
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  K  To:  {'H': 1.0}
New comp:  H0.833
Element replacement is done!
Disordered site: La0.665 K0.335
La0.665 K0.335
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  K  To:  {'H': 1.

'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)


Start structure modification!
LHS:  Nd  RHS:  Pr
Term:  Pr
Term goups:  None   ;  Pr
Parsed command:  {'from': 'Nd', 'total': 1.0, 'to': {'Pr': 1.0}}
Ordered site: Cs
Ordered site: Nd
Ordered site: Ta
Ordered site: Ta
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Ordered site: O
Finish structure modification!
-------------------------------
-------------------------------
-------------------------------
-------------------------------
-------------------------------
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  sd_1150217  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  nan  instruction:  nan
No instruction
CIF input:  sd_1150217  instruction:  Nd->Sm
Start structure modification!
LHS:  Nd  RHS:  Sm
Term:  Sm
Term goups:  None 

No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]
  writer: Any = CifWriter(self, **kwargs)
  writer: Any = CifWriter(self, **kwargs)


Start structure modification!
LHS:  Sr  RHS:  Ba
Term:  Ba
Term goups:  None   ;  Ba
Parsed command:  {'from': 'Sr', 'total': 1.0, 'to': {'Ba': 1.0}}
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Ba': 1.0}
New comp:  Bi0.5 Ba0.5
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Ba': 1.0}
New comp:  Bi0.5 Ba0.5
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Ba': 1.0}
New comp:  Bi0.5 Ba0.5
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Sr  To:  {'Ba': 1.0}
New comp:  Bi0.5 Ba0.5
Element replacement is done!
Ordered site: Ta
Ordered site: Ta
Ordered site: Bi
Ordered site: Cl
Or

  writer: Any = CifWriter(self, **kwargs)


Start structure modification!
LHS:  Ta  RHS:  Nb
Term:  Nb
Term goups:  None   ;  Nb
Parsed command:  {'from': 'Ta', 'total': 1.0, 'to': {'Nb': 1.0}}
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Ta  To:  {'Nb': 1.0}
New comp:  Bi0.5 Sr0.5
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Ta  To:  {'Nb': 1.0}
New comp:  Bi0.5 Sr0.5
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Ta  To:  {'Nb': 1.0}
New comp:  Bi0.5 Sr0.5
Element replacement is done!
Disordered site: Bi0.5 Sr0.5
Bi0.5 Sr0.5
<class 'pymatgen.core.composition.Composition'>
Element replacement start: From  Ta  To:  {'Nb': 1.0}
New comp:  Bi0.5 Sr0.5
Element replacement is done!
Ordered site: Ta
Ordered site: Ta
Ordered site: Bi
Ordered site: Cl
Or

In [130]:
df.to_excel("checkpoint_CIF_modification.xlsx")

#CIF Verification

In [139]:
#def verifyCIFFilesColumn(column):
#df["Materials Project verification"] = df.apply(lambda row: checkCompositionStructureMatching(row['Perovskite'], row['Materials Project ID']), axis=1)
df["Materials Project verification"] = df.apply(lambda row: checkCompositionStructureMatching(row['Perovskite'], row['MP_CIF_modified']), axis=1)

CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-1223501  ||  K1 La1 Nb2 O7  ||  K1 La1 Nb2 O7  =  True
CIF file: mp-553965  ||  Rb2 La2 Nb4 O14  ||  Rb1 La1 Nb2 O7  =  True
CIF file: mp-553248  ||  Cs1 La1 Nb2 O7  ||  Cs1 La1 Nb2 O7  =  True
CIF file: mp-557195  ||  K4 Ca8 Nb12 O40  ||  K1 Ca2 Nb3 O10  =  True
CIF file: mp-20396  ||  Rb1 Ca2 Nb3 O10  ||  Rb1 Ca2 Nb3 O10  =  True
CIF file: mp-581330  ||  Cs8 Ca16 Nb24 O80  ||  Cs1 Ca2 Nb3 O10  =  True
CIF file: M_MP0  ||  K4 Sr8 Nb12 O40  ||  K1 Sr2 Nb3 O10  =  True
CIF file: mp-1245098  ||  Ti30 O60  ||  Ti1 O2  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-1223501  ||  K1 La1 Nb2 O7  ||  K1 La1 Nb2 O7  =  True
CIF file: mp-553965  ||  Rb2 La2 Nb4 O14  ||  Rb1 La1 Nb2 O7  =  True
CIF file: mp-553248  ||  Cs1 La1 Nb2 O7  ||  Cs1 La1 Nb2 O7  =  True
CIF file: mp-557195  ||  K4 Ca8 Nb12 O40  ||  K1 Ca2 Nb3 O10  =  True
CIF file: mp-20396  ||  Rb1 Ca2 Nb3 O10  ||  Rb1 Ca2 Nb

  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: mp-1228150  ||  Ba3 La1 Nb3 O12  ||  Ba3 La1 Nb3 O12  =  True
CIF file: mp-561133  ||  Sr5 Nb4 O15  ||  Sr5 Nb4 O15  =  True
CIF file: mp-3563  ||  Ba5 Nb4 O15  ||  Ba5 Nb4 O15  =  True
CIF file: mp-3249  ||  La4 Ti3 O12  ||  La4 Ti3 O12  =  True
CIF file: mp-1228245  ||  Ba2 La8 Ti8 O30  ||  Ca1 La4 Ti4 O15  =  False
CIF file: mp-13664  ||  Sr4 Ta4 O14  ||  Sr2 Ta2 O7  =  True
CIF file: mp-15590  ||  Sr16 Nb16 O56  ||  Sr2 Nb2 O7  =  True
CIF file: mp-13664  ||  Sr4 Ta4 O14  ||  Sr2 Ta2 O7  =  True
CIF file: mp-15590  ||  Sr16 Nb16 O56  ||  Sr2 Nb2 O7  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: M_MP12  ||  K16 Ta24 O68  ||  K4 Ta6 O17  =  True
CIF file: mp-3614  ||  K1 Ta1 O3  ||  K1 Ta1 O3  =  True
CIF file: mp-5532  ||  Sr2 Ti1 O4  ||  Sr2 Ti1 O4  =  True
CIF file: mp-5532  ||  Sr2 Ti1 O4  ||  Sr1.9 La0.1 Ti1 O4  =  False
CIF file: mp-5532  ||  Sr2 Ti1 O4  ||  Sr1.8 La0.2 Ti1 O4  =  False
CIF file: mp-5532  ||  Sr2 Ti1 O4  ||  Sr1.7 La

  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: mp-13664  ||  Sr4 Ta4 O14  ||  Sr2 Ta2 O7  =  True
CIF file: mp-769246  ||  Sr16 Ta8 O36  ||  Sr4 Ta2 O9  =  True
CIF file: mp-769297  ||  Sr5 Ta4 O15  ||  Sr5 Ta4 O15  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  =  True
CIF file: mp-560692  ||  K16 Nb24 O68  ||  K4 Nb6 O17  = 

In [140]:
#df["COD verification"] = df.apply(lambda row: checkCompositionStructureMatching(row['Perovskite'], row['COD_ID']), axis=1)
df["COD verification"] = df.apply(lambda row: checkCompositionStructureMatching(row['Perovskite'], row['COD_CIF_modified']), axis=1)

CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1545643  ||  K2.188 La1.936 Nb4 O14  ||  K1 La1 Nb2 O7  =  False
CIF file: 2004917  ||  Cs1 La1 Nb2 O7  ||  Cs1 La1 Nb2 O7  =  True
CIF file: 1521061  ||  K4 Ca8 Nb12 O40  ||  K1 Ca2 Nb3 O10  =  True
CIF file: 2238958  ||  Rb1 Ca2 Nb3 O10  ||  Rb1 Ca2 Nb3 O10  =  True
CIF file: 1518045  ||  Cs8 Ca16 Nb24 O80  ||  Cs1 Ca2 Nb3 O10  =  True
CIF file: 1010942  ||  Ti4+4 O2-8  ||  Ti1 O2  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False


  CIF={'K': 1.0, 'La': 1.0, 'Nb': 2.0, 'O': 7.0}
  PMG={'K': 2.188, 'La': 1.936, 'Nb': 4.0, 'O': 14.0}
  ratios={'Nb': 2.0, 'O': 2.0, 'La': 1.936, 'K': 2.188}
  if struct := self._get_structure(data, primitive, symmetrized, check_occu=check_occu):


CIF file: 1545643  ||  K2.188 La1.936 Nb4 O14  ||  K1 La1 Nb2 O7  =  False
CIF file: 2004917  ||  Cs1 La1 Nb2 O7  ||  Cs1 La1 Nb2 O7  =  True
CIF file: 1521061  ||  K4 Ca8 Nb12 O40  ||  K1 Ca2 Nb3 O10  =  True
CIF file: 2238958  ||  Rb1 Ca2 Nb3 O10  ||  Rb1 Ca2 Nb3 O10  =  True
CIF file: 1518045  ||  Cs8 Ca16 Nb24 O80  ||  Cs1 Ca2 Nb3 O10  =  True
CIF file: 1010942  ||  Ti4+4 O2-8  ||  Ti1 O2  =  False
CIF file: M_COD0  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD1  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD2  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD3  ||  Sr2 Nb3 H1 O10  ||  H1 Sr2 Nb3 O10  =  True
CIF file: M_COD4  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD5  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD6  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD7  ||  Sr2 Nb3 H1 O10  ||  H1 Sr2 Nb3 O10  =  True
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False

  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: 2106523  ||  Sr8 Ta8 O28  ||  Sr2 Ta2 O7  =  True
CIF file: 2002850  ||  Sr2+16 Nb5+16 O2-56  ||  Sr2 Nb2 O7  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: M_COD8  ||  K16 Ta24 O68  ||  K4 Ta6 O17  =  True
CIF file: 2102087  ||  K1 Ta1 O3  ||  K1 Ta1 O3  =  True
CIF file: 1517788  ||  Sr2+4 Ti4+2 O2-8  ||  Sr2 Ti1 O4  =  False
CIF file: 1517788  ||  Sr2+4 Ti4+2 O2-8  ||  Sr1.9 La0.1 Ti1 O4  =  False
CIF file: 1517788  ||  Sr2+4 Ti4+2 O2-8  ||  Sr1.8 La0.2 Ti1 O4  =  False
CIF file: 1517788  ||  Sr2+4 Ti4+2 O2-8  ||  Sr1.7 La0.3 Ti1 O4  =  False
CIF file: 1517788  ||  Sr2+4 Ti4+2 O2-8  ||  Sr1.6 La0.4 Ti1 O4  =  False
CIF file: 1517788  ||  Sr2+4 Ti4+2 O2-8  ||  Sr1.5 La0.5 Ti1 O4  =  False
CIF file: 1522043  ||  Cs1 Ca2 Ta3 O10  ||  Cs1 Ca2 Ta3 O10  =  True
CIF file: 7221084  ||  Na2 Ca4 Ta6 O20  ||  Cs0.03 Na0.97 Ca2 Ta3 O10  =  False
CIF file: 1522043  ||  Cs1 Ca2 Ta3 O10  ||  Cs1 Ca2 Ta3 O10  =  True
CIF file: 7221084  ||  Na2 Ca4 Ta

  CIF={'Ba': 1.0, 'Bi': 4.0, 'O': 15.0, 'Ti': 4.0}
  PMG={'Ti': 16.0, 'Ba': 3.76, 'Bi': 16.239999999999995, 'O': 60.0}
  ratios={'Bi': 4.059999999999999, 'O': 4.0, 'Ti': 4.0, 'Ba': 3.76}
  if struct := self._get_structure(data, primitive, symmetrized, check_occu=check_occu):


CIF file: 1010942  ||  Ti4+4 O2-8  ||  Ti1 O2  =  False
CIF file: 1529527  ||  Nb4 Bi16 Cl4 O32  ||  Bi4 Nb1 O8 Cl1  =  True
CIF file: 7221321  ||  Ta4 Bi16 Cl4 O32  ||  Bi4 Ta1 O8 Cl1  =  True
CIF file: 1544432  ||  Na2 La2 Ta4 O14  ||  Na1 La1 Ta2 O7  =  True
CIF file: 1544432  ||  Na2 La2 Ta4 O14  ||  Na1 La1 Ta2 O7  =  True
CIF file: 1522041  ||  Li2 Ca4 Ta6 O20  ||  Li1 Ca2 Ta3 O10  =  True
CIF file: 1522039  ||  K2 Ca4 Ta6 O20  ||  K1 Ca2 Ta3 O10  =  True
CIF file: 2238958  ||  Rb1 Ca2 Nb3 O10  ||  Rb1 Ca2 Ta3 O10  =  False
CIF file: 1522043  ||  Cs1 Ca2 Ta3 O10  ||  Cs1 Ca2 Ta3 O10  =  True
CIF file: 1522041  ||  Li2 Ca4 Ta6 O20  ||  Li1 Ca2 Ta3 O10  =  True
CIF file: M_COD9  ||  Ca2 Nb3 H1 O10  ||  H1 Ca2 Nb3 O10  =  True
CIF file: M_COD10  ||  Sr2 Nb3 H1 O10  ||  H1 Sr2 Nb3 O10  =  True
CIF file: M_COD11  ||  Sr2 Nb3 H1 O10  ||  H1 Sr2 Nb3 O10  =  True
CIF file: M_COD12  ||  Sr2 Ta3 H1 O10  ||  H1 Sr2 Ta3 O10  =  True
CIF file: M_COD13  ||  Sr2 Nb3 H1 O10  ||  H1 Sr2 Nb3 O10  

'_atom_site_label'
  struct = parser.parse_structures(primitive=primitive)[0]
  self.symmetry_operations = self.get_symops(data)  # type:ignore[assignment]
  if struct := self._get_structure(data, primitive, symmetrized, check_occu=check_occu):
  self.symmetry_operations = self.get_symops(data)  # type:ignore[assignment]
Pauling file corrections applied.
No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]
No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No

CIF file: sd_0548135  ||  Nb8 Bi8.04 Sr3.96 O36  ||  Sr1 Bi2 Nb2 O9  =  False
CIF file: sd_1045310  ||  Ca4 Bi8 Nb8 O36  ||  Ca1 Bi2 Nb2 O9  =  True
CIF file: sd_0548135  ||  Nb8 Bi8.04 Sr3.96 O36  ||  Sr1 Bi2 Nb2 O9  =  False
CIF file: 1001030  ||  Na+16 Ta5+16 O2-47.984  ||  Na2 Ta2 O6  =  False
CIF file: 1001177  ||  Ca2+16 Ta5+16 O2-56  ||  Ca2 Ta2 O7  =  False
CIF file: 1001177  ||  Ca2+16 Ta5+16 O2-56  ||  Ca2 Ta2 O7  =  False
CIF file: 1001177  ||  Ca2+16 Ta5+16 O2-56  ||  Ca2 Ta2 O7  =  False
CIF file: 1001177  ||  Ca2+16 Ta5+16 O2-56  ||  Ca2 Ta2 O7  =  False
CIF file: 1001177  ||  Ca2+16 Ta5+16 O2-56  ||  Ca2 Ta2 O7  =  False
CIF file: 1011128  ||  Ca2+16 Nb5+16 O2-56  ||  Ca2 Nb2 O7  =  False
CIF file: 1001022  ||  La3+8 Ti4+8 O2-28  ||  La2 Ti2 O7  =  False
CIF file: 1001022  ||  La3+8 Ti4+8 O2-28  ||  La2 Ti2 O7  =  False
CIF file: 1011054  ||  Cd2+2 S2-2  ||  Cd1 S1  =  False
CIF file: 1011195  ||  Zn2+2 S2-2  ||  Zn1 S1  =  False
CIF file: 1534928  ||  Sr4 Ta8 O24  ||  S

  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1001842  ||  K+16 Nb5+24 O2-68  ||  K4 Nb6 O17  =  False
CIF file: 1522043  ||  Cs1 Ca2 Ta3 O10  ||  H1 Ca2 Ta3 O9.7 N0

In [141]:
#df["Springer verification"] = df.apply(lambda row: checkCompositionStructureMatching(row['Perovskite'], row['Springer_ID']), axis=1)
df["Springer verification"] = df.apply(lambda row: checkCompositionStructureMatching(row['Perovskite'], row['Springer_CIF_modified']), axis=1)

ERROR: Invalid structure for  sd_1614775
ERROR: Invalid structure for  sd_1614775
ERROR: Invalid structure for  sd_1614775


  block = CifBlock.from_str(f"data_{block_str}")
  block = CifBlock.from_str(f"data_{block_str}")
  symbol = self._parse_symbol(label)
could not convert string to float: 't'
  struct = parser.parse_structures(primitive=primitive)[0]
could not convert string to float: 't'
  struct = parser.parse_structures(primitive=primitive)[0]
No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
t parsed as T
No structure parsed for section 3 in CIF.
could not convert string to float: 't'
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
t parsed as T
No structure parsed for section 4 in CIF.
could not convert string to float: 't'
  struct = parser.parse_structures(primitive=primitive)[0]
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spa

CIF file: sd_0308503  ||  K1 Nb1 O3  ||  K1 Nb1 O3  =  True
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 C3 H7 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 Nd1 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 Sm1 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 C3 H7 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 Nd1 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 Sm1 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 C3 H7 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 Nd1 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 Sm1 Ta2 O7  =  False
CIF file: sd_1232960  ||  Ta8 O20  ||  Ta2 O5  =  True
CIF file: sd_0314261  ||  Nb28 O70  ||  Nb2 O5  =  True
CIF file: sd_0551678  ||  Ti16 Bi16 Pb4 O60  ||  Pb1 Bi4 Ti4 O15  =  True
CIF file: sd_0306542  ||  La4 Ca1 Ti4 O15  ||  Ca1 La4 Ti4 O15  =  True
CIF file: sd_0306445  ||  La4 Sr1 Ti4 

10 fractional coordinates rounded to ideal values to avoid issues with finite precision.
No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: sd_0306445  ||  La4 Sr1 Ti4 O15  ||  Sr1 La4 Ti4 O15  =  True
CIF file: sd_0306542  ||  La4 Ca1 Ti4 O15  ||  Ca1 La4 Ti4 O15  =  True
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  K4 Nb6 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  K4 Nb4 Ta2 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  K4 Nb3 Ta3 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  K4 Nb2 Ta4 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  K4 Ta6 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  Rb4 Nb6 O17  =  True
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  Rb4 Nb4 Ta2 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  Rb4 Nb3 Ta3 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  Rb4 Nb2 Ta4 O17  =  False
CIF file: sd_0376709  ||  Rb16 Nb24 O68  ||  Rb4 Ta6 O17  =  False
CIF file: sd_0314261  ||  Nb28 O70  ||  Nb2 O5  =  True
CIF file: sd_0308503  ||  K1 Nb1 O3  ||  K1 Nb1 O3  =  True
CIF file: sd_1127671  ||  K4 Nb12 O32  ||  K1 Nb3 O8

'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]


ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412


  symbol = self._parse_symbol(data["_atom_site_type_symbol"][idx])
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
OH parsed as 
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
OH parsed as 
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: sd_1200159  ||  Sr2 Ta4 O10  ||  H2 Sr1 Ta2 O7  =  False
CIF file: sd_1200159  ||  Sr2 Ta4 O10  ||  H2 Sr1 Ta2 O7  =  False
CIF file: sd_1810747  ||  K4.002 La1.33 Ta4 O14  ||  K6 La2 Ta6 O21  =  False
CIF file: M_Springer0  ||  La1.33 H4.002 Ta4 O14  ||  H6 La2 Ta6 O21  =  False
CIF file: sd_1810747  ||  K4.002 La1.33 Ta4 O14  ||  K6 La2 Ta6 O21  =  False
CIF file: M_Springer1  ||  La1.33 H4.002 Ta4 O14  ||  H6 La2 Ta6 O21  =  False
CIF file: M_Springer2  ||  La1.33 H4.002 Ta4 O14  ||  H6 La2 Ta6 O21  =  False
CIF file: sd_1810747  ||  K4.002 La1.33 Ta4 O14  ||  K2 Sr1.5 Ta3 O10  =  False
CIF file: sd_1810747  ||  K4.002 La1.33 Ta4 O14  ||  K2 Sr1.5 Ta3 O10  =  False
CIF file: sd_1430726  ||  K4.64 Sr1.36 Nb10 O28.64  ||  K2.33 Sr0.67 Nb5 O14.335  =  False
CIF file: sd_1430726  ||  K4.64 Sr1.36 Nb10 O28.64  ||  H2.33 Sr0.67 Nb5 O14.335  =  False
CIF file: sd_1430726  ||  K4.64 Sr1.36 Nb10 O28.64  ||  H2.33 Sr0.67 Nb5 O14.335  =  False
ERROR: Invalid structure for  sd_1922152

'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
t parsed as T
No structure parsed for section 3 in CIF.
could not convert string to float: 't'
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
t parsed as T
No structure parsed for section 4 in CIF.
could not convert string to float: 't'
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: sd_1003951  ||  Nb4 Bi16 Cl4 O32  ||  Bi4 Nb1 O8 Cl1  =  True
CIF file: sd_1241784  ||  La1 Ta2 O7  ||  H1 La1 Ta2 O7  =  False
CIF file: M_Springer3  ||  Cs1 La1 Ta2 O7  ||  Cs1 La1 Ta2 O7  =  True
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  Rb1 C3 H7 Ta2 O7  =  False
CIF file: M_Springer4  ||  Cs1 Pr1 Ta2 O7  ||  Cs1 C3 H7 Ta2 O7  =  False
CIF file: sd_1150217  ||  Cs1 Nd1 Ta2 O7  ||  Cs1 Nd1 Ta2 O7  =  True
CIF file: M_Springer5  ||  Cs1 Sm1 Ta2 O7  ||  Cs1 Sm1 Ta2 O7  =  True
CIF file: sd_1241784  ||  La1 Ta2 O7  ||  H1 La1 Ta2 O7  =  False
CIF file: sd_1050391  ||  Rb1 Pr1 Ta2 O7  ||  H1 Ca2 Nb3 O10  =  False
CIF file: sd_1241782  ||  La1 Nb2 O7  ||  H1 La1 Nb2 O7  =  False
CIF file: sd_1835408  ||  La8 Ca2 Ti10 O34  ||  La4 Ca1 Ti5 O17  =  True
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structu

No structure parsed for section 1 in CIF.
'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: sd_1000988  ||  Cs1 Sr2 Ta3 O10  ||  Cs1 Sr1.5 Ba0.5 Ta3 O10  =  False
CIF file: sd_1000988  ||  Cs1 Sr2 Ta3 O10  ||  H1 Sr1.5 Ba0.5 Ta3 O10  =  False
ERROR: Invalid structure for  sd_1210175
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1614775
ERROR: Invalid structure for  sd_1614775
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1925412
ERROR: Invalid structure for  sd_1241787


'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
t parsed as T
No structure parsed for section 3 in CIF.
could not convert string to float: 't'
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
t parsed as T
No structure parsed for section 4 in CIF.
could not convert string to float: 't'
  struct = parser.parse_structures(primitive=primitive)[0]


CIF file: sd_1241784  ||  La1 Ta2 O7  ||  H1 Nd1 Ta2 O7  =  False
CIF file: sd_1955780  ||  Li4 Nd4 Ta8 O28  ||  Li1 Nd1 Ta2 O7  =  True
CIF file: sd_1955782  ||  Na4 Nd4 Ta8 O28  ||  Na1 Nd1 Ta2 O7  =  True
CIF file: sd_1538044  ||  K4 Nd4 Ta8 O28  ||  K1 Nd1 Ta2 O7  =  True
CIF file: sd_1050392  ||  Rb1 Nd1 Ta2 O7  ||  Rb1 Nd1 Ta2 O7  =  True
CIF file: sd_1150217  ||  Cs1 Nd1 Ta2 O7  ||  Cs1 Nd1 Ta2 O7  =  True
ERROR: Invalid structure for  sd_1241787
------------
H2K0.5Bi2.5Ti4O13*H2O
H2O
1.0
H2O
H4 K0.5 Bi2.5 Ti4 O14
------------
CIF file: sd_1530646  ||  K5 Ti8 Bi5 H4 O28  ||  H4 K0.5 Bi2.5 Ti4 O14  =  False
CIF file: sd_1241784  ||  La1 Ta2 O7  ||  H1 La0.7 Tb0.3 Ta2 O7  =  False
CIF file: sd_1958942  ||  Bi3 Sr2 Ta2 Cl1 O11  ||  Sr2 Bi3 Ta2 O11 Cl1  =  True
CIF file: M_Springer6  ||  Ta2 Sr1 Bi3 Pb1 Cl1 O11  ||  Sr1 Pb1 Bi3 Ta2 O11 Cl1  =  True
CIF file: M_Springer7  ||  Ba2 Bi3 Ta2 Cl1 O11  ||  Ba2 Bi3 Ta2 O11 Cl1  =  True
CIF file: M_Springer8  ||  Ta2 Ba1 Bi3 Pb1 Cl1 O11  || 

In [142]:
def markEntriesWithoutVerifiedCIF(ver1, ver2, ver3):
  if(ver1 or ver2 or ver3):
    return False
  return True

In [143]:
df["General verification"] = df.apply(lambda row: markEntriesWithoutVerifiedCIF(row['Materials Project verification'], row['COD verification'],row['Springer verification']), axis=1)
df_filtered = df[df['General verification'] != True]
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 445 entries, 0 to 1088
Data columns (total 88 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Perovskite                            445 non-null    object 
 1   Hill formula                          324 non-null    object 
 2   Interlayer space composition          4 non-null      object 
 3   Bandgap, eV                           438 non-null    float64
 4   DOI                                   442 non-null    object 
 5   Materials Project ID                  437 non-null    object 
 6   COD_ID                                402 non-null    object 
 7   Springer_ID                           94 non-null     object 
 8   MP_CIF_modifier                       102 non-null    object 
 9   COD_CIF_modifier                      44 non-null     object 
 10  Springer_CIF_modifier                 12 non-null     object 
 11  Z                      

In [144]:
df_filtered.to_excel("checkpoint_CIF_verification.xlsx")

In [145]:
df.to_excel("checkpoint_CIF_verification_labels.xlsx")