In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
mofid = pd.read_csv("../data/main_dataset_mofid_data_V2.csv", index_col=0)
mofid = mofid[["smiles_linkers", "smiles_nodes", "cifname"]].set_index("cifname")
mordred = pd.read_csv("../data/all_f_main_dataset_mordred_V2.csv", index_col=0)
zeo = pd.read_csv("../data/main_dataset_zeo_V2.csv", index_col=0)
cif = pd.read_csv("../data/main_dataset_cif_property_V2.csv", index_col=0)



In [3]:
mordred.shape, zeo.shape, cif.shape

((151, 1826), (162, 3), (165, 8))

In [4]:
def make_mordred():
    from mordred import Calculator, descriptors
    from rdkit import Chem
    mofid = pd.read_csv("../data/main_dataset_mofid_data_V2.csv", index_col=0)
    mofid = mofid[["smiles_linkers", "smiles_nodes", "cifname"]].set_index("cifname")
    linkers = [mofid.smiles_linkers[i].replace("'", "")[1:-1].split(", ") for i in range(mofid.__len__())]
    mols = [[Chem.MolFromSmiles(smi) for smi in smi_list] for smi_list in linkers]
    calc = Calculator(descriptors, ignore_3D=False)
    def f(mof):
        try: return calc.pandas(mof)
        except TypeError:
            return None
    dfs = [f(mof) for mof in mols]
    data_mordred = pd.DataFrame(columns=dfs[0].columns)

    for i, filename in enumerate(mofid.index):
        try:
            if linkers[i] != [""]:
                data_mordred.loc[filename] = dfs[i].mean()
        except AttributeError:
            print(f"{filename:_^20}")
    data_mordred = data_mordred.set_index(data_mordred.index.map(lambda name: name.replace(" ", "")))
    #data_mordred.to_csv("../data/all_f_main_dataset_mordred_V2.csv")
    return data_mordred



In [5]:
def metal_from_node(node: str):
    import re
    """
    input: smilesNodes: str
    return: 
    'metals' in node: list
    'unique' types of metals: np.array, dtype='<U2'
    'count' of unique: np.array, dtype=int
    """
    # "O[Zr]123(O)[OH]4[Zr]56([O]3[Zr]37([OH]2[Zr]28([O]1[Zr]14([O]6[Zr]([OH]53)([OH]21)([O]78)(O)O)([OH2])([OH2])O)[OH2])([OH2])([OH2])O)[OH2]"
    node = node.replace("OH", "").replace("O", "")
    node = node.replace("[", "").replace("]", "").replace(")", "").replace("(", "").replace(",", "")
    node = re.sub(r"\d", "", node) # replace numbers
    #print(node)
    # "ZrZrZrZrZrZr"
    start_cut = 0
    metals = []
    for i, char in enumerate(node[1:]):
        if not char.islower():
            metals.append(node[start_cut:i+1])
            start_cut = i+1
    metals.append(node[start_cut:])
    unique, counts = np.unique(np.array(metals), return_counts=True)
    return metals, unique, counts

In [6]:
elemental_descriptors = pd.read_csv("../data/elemental_descriptors.csv")

In [7]:
node_descriptors = pd.DataFrame(columns=("n_metals", 'n_types_metals', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)'))

d = ['Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']

for filename, node in zip(mofid.index.map(lambda name: name.replace(" ", "").replace(".cif", "")), mofid["smiles_nodes"]):
    try:
       metals, unique, count = metal_from_node(node.replace("'", "").replace(" ", "").replace("%", ""))
       #print(metals)
       n_metals = count.sum()
       n_metals_types = count.__len__()
       df = pd.DataFrame(columns=d, index=[range(count.sum())])
       for metal in metals:
          #print(metal)
          df.loc[len(df)] = elemental_descriptors.loc[elemental_descriptors["Symbol"] == metal].loc[:,d].iloc[0]
       node_descriptors.loc[filename] = n_metals, n_metals_types, *df.mean().array
    except IndexError:
       print(f"Error with {node}")

Error with []
Error with []
Error with []
Error with []
Error with ['*']
Error with ['*']
Error with []
Error with []
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']
Error with ['*']


In [8]:
node_descriptors = node_descriptors.loc[node_descriptors["n_types_metals"] == 1.]


In [9]:
zeo.index = zeo.index.map(lambda s: s.replace(".res", ""))

In [10]:
data_pld_lcd = zeo[["num1", "num2"]].rename({"num1": "lcd", "num2": "pld"}, axis=1)

In [11]:
cif = cif.rename({ "sg_number": "spacegroupNumber"}, axis=1)

In [12]:
cif.index = cif.index.map(lambda name: name.replace(" ", "").replace(".cif", ""))

In [13]:
cif = cif.set_index(cif.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [14]:
node_descriptors = node_descriptors.set_index(node_descriptors.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [15]:
data_pld_lcd = data_pld_lcd.set_index(data_pld_lcd.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [16]:
mordred = mordred.set_index(mordred.index.map(lambda name: name.replace("(", "").replace(")", "")))

In [17]:
data_pld_lcd.shape, cif.shape, mordred.shape, node_descriptors.shape

((162, 2), (165, 8), (151, 1826), (145, 8))

In [18]:
data_main = pd.concat([data_pld_lcd, cif, mordred, node_descriptors], axis=1)

In [19]:
data_main = data_main.loc[list({*list(data_pld_lcd.index)} & {*list(
    cif.index)} & {*list(mordred.index)} & {*list(node_descriptors.index)})]


In [20]:
data_main.shape

(138, 1844)

In [21]:
# data_main.to_csv("../data/main_no_marked.csv")

In [22]:
db_from_to = pd.read_csv("../data/DB_main_with_SG_for_EDA.csv", sep=';')

In [23]:
db_from_to.CIF_init = db_from_to.CIF_init.map(lambda name: name.replace("(", "").replace(")", "").replace(" ", "").replace(".cif", ""))
db_from_to.CIF_final = db_from_to.CIF_final.map(lambda name: name.replace("(", "").replace(")", "").replace(" ", "").replace(".cif", ""))


In [24]:
marked = pd.DataFrame(columns=["target"])

for ind in db_from_to.index:
    if db_from_to.loc[ind, "Stimuli"].find("solvent") != -1:
        marked.loc[db_from_to.loc[ind, "CIF_init"]] = [1]
        if db_from_to.loc[ind, "Reversible"] == "yes":
            marked.loc[db_from_to.loc[ind, "CIF_final"]] = [1]
        else:
            marked.loc[db_from_to.loc[ind, "CIF_final"]] = [0]
    else:
        marked.loc[db_from_to.loc[ind, "CIF_init"]] = [0]
        marked.loc[db_from_to.loc[ind, "CIF_final"]] = [0]

In [25]:
from pteproc_model import PreprocessingModel
model = joblib.load("../qmof_datasets/normalizer/small_normalizer.pkl")

In [26]:
data_main.shape

(138, 1844)

In [27]:
main_dataset = data_main[model.cols]

In [28]:
marked_dataset = pd.concat([data_main, marked], axis=1)

In [29]:
marked_dataset = marked_dataset.loc[list({*list(main_dataset.index)} & {*list(marked.index)})]

In [30]:
marked_dataset.shape

(102, 1845)

In [31]:
target, counts = np.unique(marked_dataset.target, return_counts=True)


print(f"|{'target':^10}|{'counts':^10}|")
print(f"{'-':-^23}")
print(f"|{'yes':^10}|{counts[0]:^10}|")
print(f"|{'no':^10}|{counts[1]:^10}|")
print(f"total count:{marked_dataset.__len__()}")

|  target  |  counts  |
-----------------------
|   yes    |    65    |
|    no    |    37    |
total count:102


In [32]:
marked_dataset.shape

(102, 1845)

In [33]:
x, y = model.transform(marked_dataset.drop(['target'], axis=1)), marked_dataset['target']

In [34]:
x.shape, x.dropna().shape

((102, 990), (102, 990))

In [35]:
x

Unnamed: 0,pld,lcd,a,b,c,alpha,beta,gamma,volume,spacegroupNumber,...,Zagreb2,mZagreb1,mZagreb2,n_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
tocompoundwithmethanol1403845,0.000038,0.000109,0.000422,0.000422,0.000422,0.001597,0.001597,0.001397,0.149036,0.003889,...,0.002678,0.000181,0.000103,0.000233,0.000699,0.001523,0.000033,0.000104,0.000149,-0.001397
toanie202202073-sup-0001-nku-128_thf,0.000062,0.000080,0.000176,0.000292,0.000292,0.001212,0.001145,0.001093,0.097999,0.000027,...,0.002401,0.000098,0.000086,0.000027,0.000405,0.000882,0.000019,0.000060,0.000086,-0.000809
toZn-1withDMF238859,0.000088,0.000224,0.000857,0.000857,0.000857,0.005719,0.006966,0.006966,0.130337,0.008357,...,0.003701,0.000325,0.000162,0.000119,0.001791,0.003903,0.000085,0.000266,0.000382,-0.003581
toDMFsolvatedFebdp298K1058448,0.000322,0.000335,0.000459,0.000459,0.000459,0.003078,0.003078,0.003396,0.084970,0.000684,...,0.003352,0.000118,0.000122,0.000034,0.000889,0.001910,0.000053,0.000139,0.000287,0.000513
toDUT-49Mn100K2014971,0.000066,0.000120,0.000266,0.000266,0.000266,0.000522,0.000522,0.000522,0.557339,0.001189,...,0.002048,0.000090,0.000061,0.000012,0.000145,0.000319,0.000009,0.000022,0.000055,-0.000290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1from61055570,0.000091,0.000150,0.000552,0.000576,0.000576,0.006004,0.006004,0.006161,0.096855,0.000934,...,0.004269,0.000182,0.000183,0.000067,0.001935,0.004239,0.000097,0.000299,0.000447,0.007938
toCCDC_2065041_JUK-20-noh,0.000176,0.000209,0.000387,0.000399,0.000399,0.002301,0.002200,0.002200,0.130362,0.000318,...,0.002200,0.000098,0.000079,0.000024,0.001173,0.002748,0.000039,0.000106,0.000176,-0.001711
initialMOFDMF@DUT-8Ni1989709,0.000345,0.000387,0.000335,0.000657,0.000657,0.003208,0.003208,0.003208,0.113662,0.004598,...,0.003386,0.000220,0.000125,0.000071,0.000998,0.002092,0.000053,0.000157,0.000242,0.003992
fromHIMS-741884101,0.000059,0.000132,0.000713,0.000804,0.000804,0.004571,0.004571,0.005760,0.244822,0.000711,...,0.003962,0.000364,0.000158,0.000051,0.001524,0.003321,0.000072,0.000226,0.000325,-0.003048


In [36]:
x.to_csv("../main_datasets/dataset.csv")

In [37]:
y.to_csv("../main_datasets/target.csv")