In [6]:
import numpy as np
import pandas as pd
import os

# cif2vec

## cell params

In [7]:
import pymatgen
from pymatgen.io.cif import CifParser
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
import fnmatch

In [41]:
main_folder = r"../pipeline_test/added_cifs/"

cif_data = pd.DataFrame(columns=["a", "b", "c", "alpha", "beta", "gamma", "volume", "sg_number"])

In [42]:
bad_parse  = []
bad_get_sg = []

In [43]:
for file in os.listdir(main_folder):
    if fnmatch.fnmatch(file, '*.cif'):
        try:
            print(os.path.join(file))
            struct = CifParser(os.path.join(main_folder, file)).get_structures()[0]
            try:
                d_lattice = struct.as_dict()['lattice']
                property = [d_lattice["a"], d_lattice["b"], d_lattice["b"],
                            d_lattice["alpha"], d_lattice["beta"], d_lattice["gamma"], d_lattice["volume"],
                            struct.get_space_group_info()[1]]
                cif_data.loc[file[:-4]] = property
            except ValueError:
                print(f"Error with get sg: {os.path.join(file)}")
                bad_get_sg.append(os.path.join(file))
        except ValueError:
            print(f"Error with prase: no structure in {file}")
            bad_parse.append(file)
        except KeyError:
            print(f"Error with prase: no parameters in {file}")
            bad_parse.append(file)


1-final.cif
2-final.cif
2080270.cif




2080334.cif
CSV157 initial.cif
CSV158 initial.cif


In [44]:
cif_data # all good

Unnamed: 0,a,b,c,alpha,beta,gamma,volume,sg_number
1-final,9.30808,13.6588,13.6588,95.8388,90.0,90.0,3649.562351,4.0
2-final,9.2669,13.5294,13.5294,95.68,90.0,90.0,3589.395825,4.0
2080270,24.7507,24.7507,24.7507,90.0,90.0,90.0,24068.819523,92.0
2080334,17.5111,17.5111,17.5111,90.0,90.0,90.0,12273.73218,91.0
CSV157 initial,9.5062,9.7578,9.7578,90.0,90.0,90.0,1384.390626,33.0
CSV158 initial,7.6061,8.9119,8.9119,90.0,90.0,114.965,1081.833153,14.0


## Zeo++ 
notes that zeo++ data was extracted with `-ha` flag

In [87]:
zeo_data = pd.DataFrame(columns=["lcd", "pld"])
for file in os.listdir(r"../pipeline_test/zeo/"):
    if fnmatch.fnmatch(file, '*.res'):
        with open("../pipeline_test/zeo/" + file) as f:
            line = f.readline()
            zeo_data.loc[file[:-4]] = line.split()[1:-1]

In [88]:
zeo_data

Unnamed: 0,lcd,pld
1-final,2.88964,1.23051
2-final,2.84798,1.19852
2080270,9.71973,8.88093
2080334,28.46977,25.67396
CSV157 initial,3.22228,1.32818
CSV158 initial,1.65324,0.93324


## mofid

```python
from mofid.id_constructor  import extract_fragments
import fnmatch
import os
import pandas as pd
mofid_data = pd.DataFrame(columns=["node", "linker", "cat", "mofkey"])
for file in os.listdir('added_cifs/'):
    if fnmatch.fnmatch(file, '*.cif'):
        mofid_data.loc[file[:-4]] = extract_fragments(os.path.join('added_cifs/', file), "o.csv")
```

In [57]:
mofid_data = pd.read_csv("./mofid/mofid_data.csv", index_col=0)

In [81]:
mofid_data

Unnamed: 0,node,linker,cat,mofkey
2080334,[O][Co][Co][O],O=C1NCc2ccc(cc2)CNC(=O)c2cc(cc(c2)C(=O)NCc2ccc...,3,Co.OPWILJLZIKYBTD.MOFkey-v1
2-final,[OH2][Ni],n1ccc(cc1)C=Cc1ccncc1,3,Ni.MGFJDEHFNMWYBD.MOFkey-v1
CSV158 initial,[Cu],C(Cn1cncc1)CCn1cncc1,0,Cu.LGCPYQSYWVJQCJ.MOFkey-v1
1-final,[OH2][Co],n1ccc(cc1)C=Cc1ccncc1,3,Co.MGFJDEHFNMWYBD.MOFkey-v1
CSV157 initial,Cl[Cu]Cl,C(Cn1cncc1)CCn1cncc1,1,Cu.LGCPYQSYWVJQCJ.MOFkey-v1
2080270,[O][Co][Co][O],O=C1NCc2ccc(cc2)CNC(=O)c2cc(cc(c2)C(=O)NCc2ccc...,3,Co.OPWILJLZIKYBTD.MOFkey-v1


## Mordred

In [82]:
from mordred import Calculator, descriptors
from rdkit import Chem

linkers = [[smi] for smi in mofid_data['linker'].values]
mols = [[Chem.MolFromSmiles(smi) for smi in smi_list] for smi_list in linkers]
calc = Calculator(descriptors, ignore_3D=False)
def f(mof):
    try: return calc.pandas(mof)
    except TypeError:
        return None
    
dfs = [f(mof) for mof in mols]
data_mordred = pd.DataFrame(columns=dfs[0].columns)

for i, filename in enumerate(mofid_data.index):
    try:
        if linkers[i] != [""]:
            data_mordred.loc[filename] = dfs[i].mean()
    except AttributeError:
        print(f"{filename:_^20}")
#data_mordred.to_csv("../data/all_f_main_dataset_mordred_V2.csv")

100%|██████████| 1/1 [00:02<00:00,  2.23s/it]
100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
100%|██████████| 1/1 [00:01<00:00,  1.93s/it]
100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


In [83]:
data_mordred

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
2080334,35.987817,21.201554,2.0,0.0,59.662424,2.385873,4.771746,59.662424,1.297009,4.748596,...,10.634436,83.654352,618.176161,8.58578,8007.0,76.0,240.0,278.0,15.277778,10.0
2-final,10.606602,8.731144,0.0,0.0,18.877841,2.210509,4.421017,18.877841,1.348417,3.551969,...,9.049115,44.690407,182.084398,7.58685,343.0,15.0,66.0,72.0,3.222222,3.25
CSV158 initial,10.606602,9.598735,0.0,0.0,18.383629,2.198691,4.255477,18.383629,1.313116,3.556652,...,8.859505,59.440193,190.121846,6.790066,373.0,11.0,66.0,72.0,3.222222,3.25
1-final,10.606602,8.731144,0.0,0.0,18.877841,2.210509,4.421017,18.877841,1.348417,3.551969,...,9.049115,44.690407,182.084398,7.58685,343.0,15.0,66.0,72.0,3.222222,3.25
CSV157 initial,10.606602,9.598735,0.0,0.0,18.383629,2.198691,4.255477,18.383629,1.313116,3.556652,...,8.859505,59.440193,190.121846,6.790066,373.0,11.0,66.0,72.0,3.222222,3.25
2080270,35.987817,21.201554,2.0,0.0,59.662424,2.385873,4.771746,59.662424,1.297009,4.748596,...,10.634436,83.654352,618.176161,8.58578,8007.0,76.0,240.0,278.0,15.277778,10.0


## Node

In [77]:
def metal_from_node(node: str):
    import re
    """
    input: smilesNodes: str
    return: 
    'metals' in node: list
    'unique' types of metals: np.array, dtype='<U2'
    'count' of unique: np.array, dtype=int
    """
    # "O[Zr]123(O)[OH]4[Zr]56([O]3[Zr]37([OH]2[Zr]28([O]1[Zr]14([O]6[Zr]([OH]53)([OH]21)([O]78)(O)O)([OH2])([OH2])O)[OH2])([OH2])([OH2])O)[OH2]"
    node = node.replace("OH", "").replace("O", "")
    node = node.replace("[", "").replace("]", "").replace(")", "").replace("(", "").replace(",", "")
    node = re.sub(r"\d", "", node) # replace numbers
    #print(node)
    # "ZrZrZrZrZrZr"
    start_cut = 0
    metals = []
    for i, char in enumerate(node[1:]):
        if not char.islower():
            metals.append(node[start_cut:i+1])
            start_cut = i+1
    metals.append(node[start_cut:])
    unique, counts = np.unique(np.array(metals), return_counts=True)
    return metals, unique, counts

In [78]:
elemental_descriptors = pd.read_csv("../data/elemental_descriptors.csv")

In [85]:
node_descriptors = pd.DataFrame(columns=("n_metals", 'n_types_metals', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)'))

d = ['Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']

for filename, node in zip(mofid_data.index, mofid_data["node"]):
    try:
       metals, unique, count = metal_from_node(node.replace("'", "").replace(" ", "").replace("%", ""))
       #print(metals)
       n_metals = count.sum()
       n_metals_types = count.__len__()
       df = pd.DataFrame(columns=d, index=[range(count.sum())])
       for metal in metals:
          #print(metal)
          df.loc[len(df)] = elemental_descriptors.loc[elemental_descriptors["Symbol"] == metal].loc[:,d].iloc[0]
       node_descriptors.loc[filename] = n_metals, n_metals_types, *df.mean().array
    except IndexError:
       print(f"Error with {node}")

In [86]:
node_descriptors

Unnamed: 0,n_metals,n_types_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
2080334,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
2-final,1.0,1.0,28.0,58.6934,1.49,4.4,6.8,112.0
CSV158 initial,1.0,1.0,29.0,63.546,1.45,4.48,6.7,119.0
1-final,1.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
CSV157 initial,3.0,2.0,21.0,44.817133,1.01,7.026667,3.7,272.333333
2080270,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0


# concatenate

In [94]:
data = pd.concat([zeo_data, cif_data, data_mordred, node_descriptors], axis=1)

In [95]:
data

Unnamed: 0,lcd,pld,a,b,c,alpha,beta,gamma,volume,sg_number,...,mZagreb1,mZagreb2,n_metals,n_types_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
1-final,2.88964,1.23051,9.30808,13.6588,13.6588,95.8388,90.0,90.0,3649.562351,4.0,...,3.222222,3.25,1.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
2-final,2.84798,1.19852,9.2669,13.5294,13.5294,95.68,90.0,90.0,3589.395825,4.0,...,3.222222,3.25,1.0,1.0,28.0,58.6934,1.49,4.4,6.8,112.0
2080270,9.71973,8.88093,24.7507,24.7507,24.7507,90.0,90.0,90.0,24068.819523,92.0,...,15.277778,10.0,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
2080334,28.46977,25.67396,17.5111,17.5111,17.5111,90.0,90.0,90.0,12273.73218,91.0,...,15.277778,10.0,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
CSV157 initial,3.22228,1.32818,9.5062,9.7578,9.7578,90.0,90.0,90.0,1384.390626,33.0,...,3.222222,3.25,3.0,2.0,21.0,44.817133,1.01,7.026667,3.7,272.333333
CSV158 initial,1.65324,0.93324,7.6061,8.9119,8.9119,90.0,90.0,114.965,1081.833153,14.0,...,3.222222,3.25,1.0,1.0,29.0,63.546,1.45,4.48,6.7,119.0


# Preprocessing

In [91]:
import joblib

In [93]:
from pteproc_model import PreprocessingModel
model = joblib.load("../qmof_datasets/scaler.pkl")

In [97]:
x = model.transform(data.rename({ "sg_number": "spacegroupNumber"}, axis=1))

In [100]:
x.to_csv("../main_datasets/test_cifs.csv")