In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
mofid = pd.read_csv("../data/main_dataset_mofid_data_V2.csv", index_col=0)
mofid = mofid[["smiles_linkers", "smiles_nodes", "cifname"]].set_index("cifname")
mordred = pd.read_csv("../data/all_f_main_dataset_mordred_V2.csv", index_col=0)
zeo = pd.read_csv("../data/main_dataset_zeo_V2.csv", index_col=0)
cif = pd.read_csv("../data/main_dataset_cif_property_V2.csv", index_col=0)



In [3]:
mordred.shape, zeo.shape, cif.shape

((151, 1826), (162, 3), (165, 8))

In [4]:
def make_mordred():
    from mordred import Calculator, descriptors
    from rdkit import Chem
    mofid = pd.read_csv("../data/main_dataset_mofid_data_V2.csv", index_col=0)
    mofid = mofid[["smiles_linkers", "smiles_nodes", "cifname"]].set_index("cifname")
    linkers = [mofid.smiles_linkers[i].replace("'", "")[1:-1].split(", ") for i in range(mofid.__len__())]
    mols = [[Chem.MolFromSmiles(smi) for smi in smi_list] for smi_list in linkers]
    calc = Calculator(descriptors, ignore_3D=False)
    def f(mof):
        try: return calc.pandas(mof)
        except TypeError:
            return None
    dfs = [f(mof) for mof in mols]
    data_mordred = pd.DataFrame(columns=dfs[0].columns)

    for i, filename in enumerate(mofid.index):
        try:
            if linkers[i] != [""]:
                data_mordred.loc[filename] = dfs[i].mean()
        except AttributeError:
            print(f"{filename:_^20}")
    data_mordred = data_mordred.set_index(data_mordred.index.map(lambda name: name.replace(" ", "")))
    #data_mordred.to_csv("../data/all_f_main_dataset_mordred_V2.csv")
    return data_mordred



In [5]:
def metal_from_node(node: str):
    import re
    """
    input: smilesNodes: str
    return: 
    'metals' in node: list
    'unique' types of metals: np.array, dtype='<U2'
    'count' of unique: np.array, dtype=int
    """
    # "O[Zr]123(O)[OH]4[Zr]56([O]3[Zr]37([OH]2[Zr]28([O]1[Zr]14([O]6[Zr]([OH]53)([OH]21)([O]78)(O)O)([OH2])([OH2])O)[OH2])([OH2])([OH2])O)[OH2]"
    node = node.replace("OH", "").replace("O", "")
    node = node.replace("[", "").replace("]", "").replace(")", "").replace("(", "").replace(",", "")
    node = re.sub(r"\d", "", node) # replace numbers
    #print(node)
    # "ZrZrZrZrZrZr"
    start_cut = 0
    metals = []
    for i, char in enumerate(node[1:]):
        if not char.islower():
            metals.append(node[start_cut:i+1])
            start_cut = i+1
    metals.append(node[start_cut:])
    unique, counts = np.unique(np.array(metals), return_counts=True)
    return metals, unique, counts

In [6]:
elemental_descriptors = pd.read_csv("../data/elemental_descriptors.csv")

In [7]:
node_descriptors = pd.DataFrame(columns=("n_metals", 'n_types_metals', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)'))

d = ['Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']

for filename, node in zip(mofid.index.map(lambda name: name.replace(" ", "").replace(".cif", "")), mofid["smiles_nodes"]):
    try:
       metals, unique, count = metal_from_node(node.replace("'", "").replace(" ", "").replace("%", ""))
       #print(metals)
       n_metals = count.sum()
       n_metals_types = count.__len__()
       df = pd.DataFrame(columns=d, index=[range(count.sum())])
       for metal in metals:
          #print(metal)
          df.loc[len(df)] = elemental_descriptors.loc[elemental_descriptors["Symbol"] == metal].loc[:,d].iloc[0]
       node_descriptors.loc[filename] = n_metals, n_metals_types, *df.mean().array
    except IndexError:
       print(f"Error with {node}")

Error with []
Error with []
Error with []
Error with []
Error with ['*']
Error with ['*']
Error with []
Error with []
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']


In [8]:
node_descriptors = node_descriptors.loc[node_descriptors["n_types_metals"] == 1.]


In [9]:
zeo.index = zeo.index.map(lambda s: s.replace(".res", ""))

In [10]:
data_pld_lcd = zeo[["num1", "num2"]].rename({"num1": "lcd", "num2": "pld"}, axis=1)

In [11]:
cif = cif.rename({ "sg_number": "spacegroupNumber"}, axis=1)

In [12]:
cif.index = cif.index.map(lambda name: name.replace(" ", "").replace(".cif", ""))

In [13]:
cif = cif.set_index(cif.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [14]:
node_descriptors = node_descriptors.set_index(node_descriptors.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [15]:
data_pld_lcd = data_pld_lcd.set_index(data_pld_lcd.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [16]:
mordred = mordred.set_index(mordred.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [17]:
data_pld_lcd.shape, cif.shape, mordred.shape, node_descriptors.shape

((162, 2), (165, 8), (151, 1826), (145, 8))

In [18]:
data_main = pd.concat([data_pld_lcd, cif, mordred, node_descriptors], axis=1)

In [19]:
data_main = data_main.loc[list({*list(data_pld_lcd.index)} & {*list(
    cif.index)} & {*list(mordred.index)} & {*list(node_descriptors.index)})]


In [20]:
data_main.shape

(138, 1844)

In [21]:
# data_main.to_csv("../data/main_no_marked.csv")

In [22]:
db_from_to = pd.read_csv("../data/DB_main_with_SG_for_EDA.csv", sep=';')

In [23]:
db_from_to.CIF_init = db_from_to.CIF_init.map(lambda name: name.replace("(", "").replace(")", "").replace(" ", "").replace(".cif", ""))
db_from_to.CIF_final = db_from_to.CIF_final.map(lambda name: name.replace("(", "").replace(")", "").replace(" ", "").replace(".cif", ""))


In [24]:
marked = pd.DataFrame(columns=["target"])

for ind in db_from_to.index:
    if db_from_to.loc[ind, "Stimuli"].find("solvent") != -1:
        marked.loc[db_from_to.loc[ind, "CIF_init"]] = [1]
        if db_from_to.loc[ind, "Reversible"] == "yes":
            marked.loc[db_from_to.loc[ind, "CIF_final"]] = [1]
        else:
            marked.loc[db_from_to.loc[ind, "CIF_final"]] = [0]
    else:
        marked.loc[db_from_to.loc[ind, "CIF_init"]] = [0]
        marked.loc[db_from_to.loc[ind, "CIF_final"]] = [0]

In [25]:
from pteproc_model import PreprocessingModel
model = joblib.load("../qmof_datasets/normalizer/small_normalizer.pkl")

In [26]:
data_main.shape

(138, 1844)

In [27]:
main_dataset = data_main[model.cols]

In [28]:
marked_dataset = pd.concat([data_main, marked], axis=1)

In [29]:
marked_dataset = marked_dataset.loc[list({*list(main_dataset.index)} & {*list(marked.index)})]

In [30]:
marked_dataset.shape

(102, 1845)

In [31]:
target, counts = np.unique(marked_dataset.target, return_counts=True)


print(f"|{'target':^10}|{'counts':^10}|")
print(f"{'-':-^23}")
print(f"|{'yes':^10}|{counts[0]:^10}|")
print(f"|{'no':^10}|{counts[1]:^10}|")
print(f"total count:{marked_dataset.__len__()}")

|  target  |  counts  |
-----------------------
|   yes    |    65    |
|    no    |    37    |
total count:102


In [32]:
marked_dataset.shape

(102, 1845)

In [33]:
x, y = model.transform(marked_dataset.drop(['target'], axis=1)), marked_dataset['target']

In [34]:
x.shape, x.dropna().shape

((102, 993), (102, 993))

In [35]:
x

Unnamed: 0,pld,lcd,a,b,c,alpha,beta,gamma,volume,spacegroupNumber,...,Zagreb2,mZagreb1,mZagreb2,n_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
toDUT-30Znafterphasetransition804771,5.310226e-05,6.594762e-05,1.004883e-04,1.519500e-04,1.519500e-04,0.000869,0.000734,0.000734,0.019167,0.001443,...,2.222021e-03,9.691059e-05,7.210609e-05,2.076656e-05,3.114983e-04,0.000679,1.474425e-05,4.620559e-05,6.645298e-05,-0.000623
to6cm6b05277_si_005-Copy,6.851157e-05,1.683577e-04,5.808509e-04,7.344746e-04,7.344746e-04,0.005291,0.003968,0.003968,0.139569,0.006482,...,3.571618e-03,3.270309e-04,1.469802e-04,4.409405e-05,1.234633e-03,0.002588,6.570014e-05,1.940138e-04,2.998395e-04,0.004939
2from2924308-Copy,8.421515e-05,1.162928e-04,1.849040e-04,3.578139e-04,3.578139e-04,0.002594,0.002309,0.002333,0.037257,0.000050,...,2.775961e-03,1.229592e-04,9.308778e-05,2.500866e-05,7.502597e-04,0.001635,3.551229e-05,1.112885e-04,1.600554e-04,-0.001501
to3D854461,3.950769e-05,4.985666e-05,5.167944e-05,1.928568e-04,1.928568e-04,0.000643,0.000643,0.000643,0.043597,0.000129,...,2.858811e-03,1.306814e-04,9.449958e-05,1.429405e-05,2.144108e-04,0.000467,1.014878e-05,3.180427e-05,4.574097e-05,-0.000429
toCo-MOF1912233,6.729968e-05,1.027378e-04,3.040749e-04,3.581234e-04,3.581234e-04,0.003506,0.003357,0.003213,0.028690,0.000043,...,3.085894e-03,1.702397e-04,1.132933e-04,8.631871e-05,1.165303e-03,0.002544,6.560222e-05,1.855852e-04,3.236952e-04,0.002762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
toDUT-49Mn100K2014971,6.567838e-05,1.199719e-04,2.656982e-04,2.656982e-04,2.656982e-04,0.000522,0.000522,0.000522,0.557339,0.001189,...,2.047848e-03,8.959739e-05,6.123563e-05,1.160254e-05,1.450317e-04,0.000319,9.340044e-06,2.158072e-05,5.453194e-05,-0.000290
to5cm6b05277_si_004,1.123555e-04,1.427779e-04,5.158970e-04,7.725375e-04,7.725375e-04,0.005507,0.004130,0.004130,0.126595,0.008858,...,3.350278e-03,2.326582e-04,1.402324e-04,4.589422e-05,1.285038e-03,0.002694,6.838239e-05,2.019346e-04,3.120807e-04,0.005140
toVMOP-+1590348,1.092856e-07,2.343842e-07,4.987963e-07,4.987963e-07,4.987963e-07,0.000003,0.000003,0.000003,0.000179,0.000005,...,7.332323e-07,1.601334e-07,3.736726e-08,2.309393e-08,5.311604e-07,0.000001,3.949062e-08,8.313815e-08,2.863647e-07,0.000001
toSION-11503702,7.525768e-05,1.630270e-04,4.148402e-04,8.368563e-04,8.368563e-04,0.006163,0.005626,0.005626,0.085485,0.000875,...,4.375556e-03,3.941473e-04,1.823148e-04,6.250794e-05,4.063016e-03,0.009934,1.406429e-04,1.937746e-04,1.593952e-03,0.007001


In [37]:
x.to_csv("../main_datasets/dataset.csv")

In [None]:
y.to_csv("../main_datasets/target.csv")