In [1]:
import pandas as pd
import joblib

from models.classification_model import ClassifierModel # class with classifiers
from models.reduce_model import ReduceModel # class with autoencoder

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt 
from cycler import cycler

In [18]:
files = pd.read_csv("database/t_solvent.csv", sep=";")

In [19]:
files

Unnamed: 0,Folder num,CIF name,Stimuli
0,3,from part_2.cif,"T, solvent"
1,3,to part_4.cif,"T, solvent"
2,58,from (1) 1974527.cif,"T, solvent"
3,58,to (3) 1974529.cif,"T, solvent"


In [5]:
import json
mapping_ = json.load(open("names_mapping.json"))

In [20]:
files["CIF name"] = [mapping_[name] for name in files["CIF name"]]

In [30]:
files["CIF name"]

0       frompart2.cif
1         topart4.cif
2    from11974527.cif
3      to31974529.cif
Name: CIF name, dtype: object

In [8]:
from pymatgen.io.cif import CifParser
import fnmatch

In [35]:
import os
cif_data = pd.DataFrame(columns=["a", "b", "c", "alpha", "beta", "gamma", "volume", "sg_number"])
bad_parse  = []
bad_get_sg = []
main_folder = r"cifs/"

for file in os.listdir(main_folder):
    # print(file)
    if fnmatch.fnmatch(file, '*.cif') and file in files["CIF name"].values:
        try:
            # print(os.path.join(file))
            struct = CifParser(os.path.join(main_folder, file)).get_structures()[0]
            try:
                d_lattice = struct.as_dict()['lattice']
                property = [d_lattice["a"], d_lattice["b"], d_lattice["b"],
                            d_lattice["alpha"], d_lattice["beta"], d_lattice["gamma"], d_lattice["volume"],
                            struct.get_space_group_info()[1]]
                cif_data.loc[file[:-4]] = property
            except ValueError:
                print(f"Error with get sg: {os.path.join(file)}")
                bad_get_sg.append(os.path.join(file))
        except ValueError:
            print(f"Error with prase: no structure in {file}")
            bad_parse.append(file)
        except KeyError:
            print(f"Error with prase: no parameters in {file}")
            bad_parse.append(file)

In [36]:
cif_data

Unnamed: 0,a,b,c,alpha,beta,gamma,volume,sg_number
from11974527,13.842833,13.842833,13.842833,90.0,90.0,93.101835,2967.61936,41.0
frompart2,17.5111,17.5111,17.5111,90.0,90.0,90.0,12273.73218,91.0
to31974529,8.8467,10.058974,10.058974,108.460712,105.930959,93.057028,1214.122775,9.0
topart4,17.554724,17.554724,17.554724,90.0,90.0,93.289645,12339.49539,20.0


In [43]:
zeo_data = pd.read_csv("preprocessing/zeopp/zeo_data_.csv", index_col=0)
zeo_data = zeo_data.loc[list(v[:-4] for v in files["CIF name"].values)]

In [44]:
zeo_data

Unnamed: 0,pld,lcd
frompart2,25.67396,28.46977
topart4,27.54718,30.28158
from11974527,4.07961,5.55567
to31974529,1.5011,2.93504


In [45]:
mofid_data = pd.read_csv("preprocessing/mofid/mofid_data_.csv", index_col=0)
mofid_data = mofid_data.loc[list(v[:-4] for v in files["CIF name"].values)]

In [46]:
from mordred import Calculator, descriptors
from rdkit import Chem

linkers = [smi.strip('"')[1:-1].replace("'", "").split(", ") for smi in mofid_data['linker'].values]
mols = [[Chem.MolFromSmiles(smi) for smi in smi_list] for smi_list in linkers]
calc = Calculator(descriptors, ignore_3D=False)
def f(mof):
    try: return calc.pandas(mof)
    except TypeError:
        return None
    
dfs = [f(mof) for mof in mols]
data_mordred = pd.DataFrame(columns=dfs[0].columns)

for i, filename in enumerate(mofid_data.index):
    try:
        if linkers[i] != [""]:
            data_mordred.loc[filename] = dfs[i].mean()
    except AttributeError:
        print(f"{filename:_^20}")
#data_mordred.to_csv("../data/all_f_main_dataset_mordred_V2.csv")

100%|██████████| 1/1 [00:02<00:00,  2.47s/it]
100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
100%|██████████| 1/1 [00:01<00:00,  1.87s/it]
100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


  return avec - avec.mean()
  ret = ret.dtype.type(ret / rcount)
  return S / self.mol.GetNumAtoms()


In [47]:
def metal_from_node(node: str):
    import re
    """
    input: smilesNodes: str
    return: 
    'metals' in node: list
    'unique' types of metals: np.array, dtype='<U2'
    'count' of unique: np.array, dtype=int
    """
    # "O[Zr]123(O)[OH]4[Zr]56([O]3[Zr]37([OH]2[Zr]28([O]1[Zr]14([O]6[Zr]([OH]53)([OH]21)([O]78)(O)O)([OH2])([OH2])O)[OH2])([OH2])([OH2])O)[OH2]"
    node = node.replace("OH", "").replace("O", "")
    node = node.replace("[", "").replace("]", "").replace(")", "").replace("(", "").replace(",", "")
    node = re.sub(r"\d", "", node) # replace numbers
    #print(node)
    # "ZrZrZrZrZrZr"
    start_cut = 0
    metals = []
    for i, char in enumerate(node[1:]):
        if not char.islower():
            metals.append(node[start_cut:i+1])
            start_cut = i+1
    metals.append(node[start_cut:])
    unique, counts = np.unique(np.array(metals), return_counts=True)
    return metals, unique, counts

In [48]:
elemental_descriptors = pd.read_csv("preprocessing/qmof/data/elemental_descriptors.csv")

In [49]:
node_descriptors = pd.DataFrame(columns=("n_metals", 'n_types_metals', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)'))

d = ['Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
       'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']

for filename, node in zip(mofid_data.index, mofid_data["node"]):
    try:
       metals, unique, count = metal_from_node(node.replace("'", "").replace(" ", "").replace("%", ""))
       #print(metals)
       n_metals = count.sum()
       n_metals_types = count.__len__()
       df = pd.DataFrame(columns=d, index=[range(count.sum())])
       for metal in metals:
          #print(metal)
          df.loc[len(df)] = elemental_descriptors.loc[elemental_descriptors["Symbol"] == metal].loc[:,d].iloc[0]
       node_descriptors.loc[filename] = n_metals, n_metals_types, *df.mean().array
    except IndexError:
       print(f"Error with {node}")

Error with ['*']


In [50]:
node_descriptors

Unnamed: 0,n_metals,n_types_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
frompart2,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
topart4,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
from11974527,1.0,1.0,48.0,112.411,1.61,4.33,7.2,-70.0


In [53]:
index = list({*list(zeo_data.index)} & {*list(cif_data.index)} & {*list(data_mordred.index)} & {*list(node_descriptors.index)})

In [54]:
t_solvent = pd.concat([zeo_data.loc[index], cif_data.loc[index], data_mordred.loc[index], node_descriptors.loc[index]], axis=1)

In [55]:
t_solvent

Unnamed: 0,pld,lcd,a,b,c,alpha,beta,gamma,volume,sg_number,...,mZagreb1,mZagreb2,n_metals,n_types_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
topart4,27.54718,30.28158,17.554724,17.554724,17.554724,90.0,90.0,93.289645,12339.49539,20.0,...,15.277778,10.0,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0
from11974527,4.07961,5.55567,13.842833,13.842833,13.842833,90.0,90.0,93.101835,2967.61936,41.0,...,5.555556,3.666667,1.0,1.0,48.0,112.411,1.61,4.33,7.2,-70.0
frompart2,25.67396,28.46977,17.5111,17.5111,17.5111,90.0,90.0,90.0,12273.73218,91.0,...,15.277778,10.0,2.0,1.0,27.0,58.9332,1.52,4.3,7.5,64.0


In [61]:
from preproc_model import PreprocessingModel
preproc = joblib.load("preprocessing/preproc_m.pkl")
scaler = joblib.load("models/best/scaler.pkl")
reduce = joblib.load("models/best/reduce_model.pkl")
c_model = joblib.load("models/best/c_model.pkl")

In [72]:
x = preproc.transform(t_solvent.rename({"sg_number": "spacegroupNumber"}, axis=1))

In [73]:
x = reduce.transform(x)

In [74]:
np.save("preprocessing/datasets/t_solvent.npy", x)