In [1]:
import os
from pathlib import Path
import re
import pandas as pd

In [2]:
# Root folder that is scanned one level deep: every entry in it is treated as a
# sub-folder containing "...CollCode<digits>.cif" files.
# The CIF_DATA_LOC environment variable overrides the machine-specific default
# so the notebook can run on another machine without editing this cell.
data_loc = os.environ.get("CIF_DATA_LOC", "C:/Users/joe/Downloads/CIFs")
# Output folder receiving one flat copy of each accepted CIF, named "<ICSD code>.cif".
cif_dataset_loc = "cifs"
# Output folder receiving one sub-folder per ICSD code with felix.cif / felix.hkl / felix.inp.
felix_input_dataset_loc = "felix_input"

In [3]:
# Make sure both output folders exist before anything is written into them;
# folders that already exist are left as they are.
for output_dir in map(Path, (cif_dataset_loc, felix_input_dataset_loc)):
    output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
def _extract_keyinfo(cif_text):
    """Extract the key fields from the text of one CIF file.

    Parameters
    ----------
    cif_text : str
        Full text of a CIF file.

    Returns
    -------
    dict or None
        ``{"chemical_formula_sum": str, "cell_length": float, "space_group": str}``
        or ``None`` when any of the three tags is not found, or when the
        ``_cell_length_a`` value is not a positive number.
    """
    formula_match = re.search(r"_chemical_formula_sum\s*([\s\S]*?)\s*_", cif_text)
    length_match = re.search(r"_cell_length_a\s+([\d.]+)", cif_text)
    group_match = re.search(r"_space_group_name_H-M_alt '(.+)'", cif_text)
    if formula_match is None or length_match is None or group_match is None:
        return None

    try:
        cell_length = float(length_match.group(1))
    except ValueError:
        # "[\d.]+" also matches text such as "." or "1.2.3" that is not a number.
        return None
    if cell_length <= 0:
        # cell_length is used as a divisor below, so zero must be rejected.
        return None

    # Strip surrounding blanks, newlines, semicolons and quotes, then join a
    # multi-line value into one line. The strip set is a normal (non-raw)
    # string so that "\n" really means newline; as a raw string it stripped a
    # literal backslash and the letter "n" instead.
    chemical_formula_sum = formula_match.group(1).strip(" \n;'").replace("\n", "")

    return {
        "chemical_formula_sum": chemical_formula_sum,
        "cell_length": cell_length,
        "space_group": group_match.group(1),
    }


# One record per accepted CIF file; turned into keyinfo_df at the end.
keyinfo = []

# felix.hkl is written verbatim from hkl.txt for every structure; inp.txt is a
# template whose {{ROuterConvergenceAngle}} placeholder is filled in per structure.
with open("hkl.txt", "r", encoding="utf8") as hkl_file:
    hkl_content = hkl_file.read()

with open("inp.txt", "r", encoding="utf8") as inp_file:
    inp_content = inp_file.read()

for folder in os.listdir(data_loc):
    folder_path = os.path.join(data_loc, folder)
    if not os.path.isdir(folder_path):
        # Stray files next to the sub-folders would make os.listdir fail.
        continue
    print(folder)

    for cif_filename in os.listdir(folder_path):
        code_match = re.search(r"CollCode(\d+)\.cif", cif_filename)
        if code_match is None:
            # Not a "...CollCode<digits>.cif" file; skip it instead of crashing.
            continue
        ICSD_code = int(code_match.group(1))

        with open(os.path.join(folder_path, cif_filename), "r", encoding="utf8") as cif_file:
            cif_text = cif_file.read()

        fields = _extract_keyinfo(cif_text)
        if fields is None:
            # Incomplete or unparsable CIF: nothing is recorded or written for it.
            continue
        cell_length = fields["cell_length"]

        keyinfo.append({"ICSD_code": ICSD_code, **fields})

        # cif dataset
        with open(os.path.join(cif_dataset_loc, str(ICSD_code) + ".cif"), "w", encoding="utf8") as dest_file:
            dest_file.write(cif_text)

        # felix input dataset
        felix_dir = os.path.join(felix_input_dataset_loc, str(ICSD_code))
        Path(felix_dir).mkdir(parents=True, exist_ok=True)
        # .cif
        with open(os.path.join(felix_dir, "felix.cif"), "w", encoding="utf8") as dest_file:
            dest_file.write(cif_text)
        # .hkl
        with open(os.path.join(felix_dir, "felix.hkl"), "w", encoding="utf8") as dest_file:
            dest_file.write(hkl_content)
        # .inp
        # Plain string replacement: the placeholder is literal text, not a regex.
        # NOTE(review): the origin and units of the 7.266 factor are not shown here -- confirm.
        new_inp_content = inp_content.replace(
            "{{ROuterConvergenceAngle}}", f"{(7.266 / cell_length):.5f}"
        )
        with open(os.path.join(felix_dir, "felix.inp"), "w", encoding="utf8") as dest_file:
            dest_file.write(new_inp_content)

# Explicit columns keep the frame's schema stable even when no CIF was accepted.
keyinfo_df = pd.DataFrame(
    keyinfo,
    columns=["ICSD_code", "chemical_formula_sum", "cell_length", "space_group"],
)

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011-1
2011-2
2012-1
2012-2
2013-1
2013-2
2014-1
2014-2
2015-1
2015-2
2016
2017-1
2017-2
2018
2019
2020-1
2020-2
2021-1
2021-2
2022-1
2022-2
2023
2024


In [5]:
# Rows are kept in the order they were collected; no sort by ICSD_code is applied.
formula_column = keyinfo_df["chemical_formula_sum"]
print(formula_column.nunique())  # number of distinct formula strings
# Persist the table; the DataFrame index is written as a column named "index".
keyinfo_df.to_csv("keyinfo.csv", index_label="index")
keyinfo_df

14291


Unnamed: 0,ICSD_code,chemical_formula_sum,cell_length,space_group
0,106924,Al1 Np3,4.26900,P m -3 m
1,106925,In1 Np3,4.61500,P m -3 m
2,106935,Pd0.9 Rh0.1,3.88120,F m -3 m
3,106936,Ni0.1 Pd0.9,3.86600,F m -3 m
4,106937,Co0.05 Pd0.9 Rh0.05,3.87600,F m -3 m
...,...,...,...,...
21596,149274,Ba1 Na1 O12 P3 Y2,10.19120,P 21 3
21597,149277,N0.58 O0.42 Ti1,4.22990,F m -3 m
21598,149278,N0.75 O0.25 Ti1,4.23500,F m -3 m
21599,149279,N0.72 O0.28 Ti1,4.23489,F m -3 m
