In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
data_property_cell = pd.read_csv("data/qmof_property_cell.csv")

In [3]:
elemental_descriptors = pd.read_csv("data/elemental_descriptors.csv")

In [4]:
elemental_descriptors.columns

Index(['Name', 'Symbol', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)'],
      dtype='object')

In [5]:
def metal_from_node(node: str):
    """
    input: smilesNodes: str
    return: 
    'metals' in node: list
    'unique' types of metals: np.array, dtype='<U2'
    'count' of unique: np.array, dtype=int
    """
    # "O[Zr]123(O)[OH]4[Zr]56([O]3[Zr]37([OH]2[Zr]28([O]1[Zr]14([O]6[Zr]([OH]53)([OH]21)([O]78)(O)O)([OH2])([OH2])O)[OH2])([OH2])([OH2])O)[OH2]"
    node = node.replace("OH", "").replace("O", "")
    node = node.replace("[", "").replace("]", "").replace(")", "").replace("(", "").replace(",", "")
    node = re.sub(r"\d", "", node) # replace numbers
    #print(node)
    # "ZrZrZrZrZrZr"
    start_cut = 0
    metals = []
    for i, char in enumerate(node[1:]):
        if not char.islower():
            metals.append(node[start_cut:i+1])
            start_cut = i+1
    metals.append(node[start_cut:])
    unique, counts = np.unique(np.array(metals), return_counts=True)
    return metals, unique, counts

None


In [25]:
node_descriptors = pd.DataFrame(columns=("n_metals", 'n_types_metals', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)'))

d = ['Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']

for filename, node in zip(data_property_cell["filename"], data_property_cell["smilesNodes"]):
    try:
       metals, unique, count = metal_from_node(node)
       n_metals = count.sum()
       n_metals_types = count.__len__()
       df = pd.DataFrame(columns=d, index=[range(count.sum())])
       for metal in metals:
          print(metal)
          df.loc[len(df)] = elemental_descriptors.loc[elemental_descriptors["Symbol"] == metal].loc[:,d].iloc[0]
       node_descriptors.loc[filename] = n_metals, n_metals_types, *df.mean().array
    except IndexError:
       print(node)


Zn
Zn
Al
Zr
Zr
Zr
Zr
Zr
Zr
Ag
Zn
Sr
Cd
Co
Co
Cd
Y
Al
Al
Zn
Zn
Cd
Al
Cl
Zn
Al
Zn
Zn
Zn
Zn
Al
Zn
Mg
Mn
Ag
Cu
Cu
Cu
Zn
Zn
Li
Co
Zn
Zn
Zn
Zn
Zn
Cl
Cu
S
Cu
W
S
Cu
S
Cu
S
Cl
Zn
Al
Zn
Ni
Cu
Cu
Zn
Zn
Co
Al
Zn
Zn
Cd
Cu
Co
Cd
Cd
Cd
Mn
Cu
Al
Al
Al
Cd
Zn
Zn
Zn
Zn
Cd
Ni
Zn
Zn
Zn
Zn
Cu
Cu
Cu
Cu
Zn
Zn
Zr
Zr
Zr
Zr
Zr
Zr
Cd
Ni
Al
Zn
Zn
Al
Zn
Zn
Cu
Cs
Zn
Zn
Zn
Zn
Cd
Zn
Zn
Cu
Zr
Zr
Zr
Zr
Zr
Zr
Zn
Cd
Li
Zn
Zn
Zn
Zn
Al
Cd
Zn
Zn
Zn
Co
Cu
Cu
Fe
Pt
Tb
Tb
Al
Al
Al
Cu
Cu
Cu
Zn
Zn
Zn
Zn
K
Zn
Zn
Zn
Zn
Cd
Zn
Zn
Zn
Cd
Zn
Cd
Zn
Zn
Zn
Zn
Al
Al
Zn
Al
Co
Gd
Hf
Hf
Hf
Hf
Hf
Hf
Zn
Zn
Zn
Zn
Ba
Co
Co
Zn
Co
Cd
Zn
Zr
Zr
Zr
Zr
Zr
Zr
Zn
Zn
Zn
Cu
Zn
Cu
Cu
Al
Zn
Zn
Zn
Zn
Zr
Zr
Zr
Zr
Zr
Zr
Ag
Ag
Fe
Cd
Cd
Al
Er
Er
Zn
Zn
Zn
I
Cu
Cu
I
Tb
Cd
Cu
Co
Zn
Zn
Co
Zn
Zn
Cu
Zn
Zn
Zn
Co
Cu
Zn
Ag
Ag
Al
Al
Al
Mn
Zn
Zn
Zn
Zn
Al
Ag
Zn
Zn
Zn
Zn
Zn
Zn
Cu
Cu
Zn
Ag
Ag
Ag
Ag
Cu
Cu
Al
Zn
Zn
Al
Al
Al
K
Zn
Zn
Al
Zn
Al
Zn
Al
K
Zn
Zn
Zn
Zn
Al
Mo
Cu
S
Cu
S
Cu
S
Zn
Zn
Zn
Zn
Zn
Zn
Cd
Zn
Zn
Cu
Cu
Al
Al
Al
Al
Ho
Co
Zn
Zn
Zn
Zn
Zn
Mn
Al
Cu
Al
Zn
Zn
Zn
Cd
Zn
Zn
Zn


In [27]:
node_descriptors.to_csv("data/node_descriptors.csv")