In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Columns kept from the property/cell table. "betta" is the spelling used in the
# CSV header; it is renamed to "beta" once the columns are selected.
property_cell_cols = ["pld", "lcd", "a", "b", "c", "alpha", "betta", "gamma", "volume", "spacegroupNumber"]

# index_col=1 is the "filename" column and becomes the index;
# "Unnamed: 0" only holds the saved row numbers, so it is discarded.
data_property_cell = (
    pd.read_csv("qmof/data/qmof_property_cell.csv", index_col=1)
    .drop(columns="Unnamed: 0")
    .loc[:, property_cell_cols]
    .rename(columns={"betta": "beta"})
)

# Mordred descriptor table, indexed by the first CSV column.
# NOTE(review): the doubled slash in the path is kept exactly as written;
# presumably a typo for "qmof/data/..." — confirm and normalize.
data_linker_mordred = pd.read_csv("qmof//data/mordred_descriptors.csv", index_col=0)

In [4]:
# Share of missing cells in the whole Mordred descriptor table.
# DataFrame.isna() flags missing values cell by cell for any column dtype
# (np.isnan raises on non-numeric columns), and the mean of the resulting
# boolean array equals (missing cells) / (rows * columns).
p = data_linker_mordred.isna().to_numpy().mean()
# ".2n" prints the percentage with two significant digits.
print(f"Mordred data has {p*100:.2n} % of nan")

Mordred data has 25 % of nan


In [5]:
# Node descriptor columns that are carried into the combined table.
node_feature_cols = [
    'n_metals',
    'Atomic_Number',
    'Atomic_Weight',
    'Atomic Radius',
    'Mulliken EN',
    'polarizability(A^3)',
    'electron affinity(kJ/mol)',
]

data_node = pd.read_csv("qmof/data/node_descriptors.csv", index_col=0)
# Keep only rows with exactly one metal type, restricted to the columns above.
single_metal_mask = data_node["n_types_metals"] == 1.
data_node = data_node.loc[single_metal_mask, node_feature_cols]


In [6]:
# Place the three tables side by side, aligned on their index labels.
# NOTE(review): pd.concat uses an outer join by default, so rows filtered out of
# data_node presumably reappear here with NaN node features — confirm this is intended.
feature_tables = [data_property_cell, data_linker_mordred, data_node]
data_qmof = pd.concat(feature_tables, axis="columns")
data_qmof.shape

(7463, 1843)

In [7]:
from preproc_model import PreprocessingModel

In [8]:
# p_drop is the NaN-share cutoff above which a column is dropped. The value passed
# here is 0.1, i.e. presumably "more than 10% NaN" (the earlier note said 5%, which
# did not match the code).
# NOTE(review): the exact meaning of p_drop, threshold and normalizer is defined in
# preproc_model.PreprocessingModel — confirm there.
model = PreprocessingModel(p_drop=0.1, threshold=1e-6, normalizer="minmax")

In [9]:
# Fit the preprocessing model on the combined table and transform it in one step.
# NOTE(review): the model is fitted on all rows before the train/test split below,
# so any statistics it learns (e.g. for the "minmax" normalizer) presumably include
# the test rows — consider fitting on the training split only.
data_transformed = model.fit_transform(data_qmof)

In [10]:
# Display the preprocessed table (the notebook shows a truncated preview).
data_transformed

Unnamed: 0,pld,lcd,a,b,c,alpha,beta,gamma,volume,spacegroupNumber,...,Zagreb2,mZagreb1,mZagreb2,n_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
boydwoo_str_m3_o3_o28_pcu_sym_255,0.410619,0.479118,0.450473,0.464028,0.365010,0.583960,0.527963,0.441757,0.169915,0.000000,...,0.312528,0.358272,0.298246,0.090909,0.303371,0.252930,0.133333,0.841924,0.029197,0.014184
gmof_Al2O6-BDC_B-irmof7_B_No138,0.082168,0.118327,0.098236,0.197062,0.095802,0.487432,0.274687,0.215255,0.011610,0.000000,...,0.126162,0.169511,0.114662,0.000000,0.112360,0.086723,0.000000,0.422680,0.063869,0.158865
gmof_Uio66Zr-irmof20_A-irmof8_A_No139,0.117807,0.226642,0.396015,0.365802,0.209176,0.274848,0.096726,0.058777,0.061655,0.000000,...,0.169987,0.225764,0.150376,0.454545,0.415730,0.364723,0.488889,0.563574,0.239051,0.157447
YOSLIB01_FSR,0.011109,0.010565,0.108731,0.063569,0.105109,0.578144,0.526960,0.441639,0.007752,0.140969,...,0.055777,0.060004,0.050125,0.000000,0.494382,0.436748,0.261111,0.838488,0.056569,0.278014
SACCAD_FSR,0.139065,0.130483,0.206300,0.309442,0.215572,0.578142,0.526962,0.438559,0.047573,0.026432,...,0.104914,0.086255,0.096491,0.000000,0.303371,0.252930,0.133333,0.841924,0.029197,0.014184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
QOBZUD_FSR,0.159228,0.128743,0.332295,0.297183,0.195961,0.759244,0.850456,0.634459,0.049665,0.061674,...,0.233732,0.179261,0.255639,0.000000,0.505618,0.456406,0.238889,0.800687,0.043796,0.000000
boydwoo_str_m5_Al_o13_o27_sra_sym_16,0.270484,0.231007,0.097786,0.553784,0.354804,0.343260,0.516240,0.429053,0.061837,0.000000,...,0.233068,0.294956,0.239662,0.000000,0.112360,0.086723,0.000000,0.422680,0.063869,0.158865
NIMWIR_FSR,0.024456,0.041811,0.064723,0.142504,0.173167,0.578143,0.615112,0.441633,0.011946,0.057269,...,0.065073,0.083255,0.057644,0.000000,0.505618,0.456406,0.238889,0.800687,0.043796,0.000000
OSUNAT_FSR,0.027635,0.029027,0.101021,0.265624,0.177919,0.583864,0.526959,0.441636,0.023728,0.057269,...,0.081009,0.095443,0.058480,0.000000,0.719101,0.683676,0.600000,0.378007,0.343066,0.146099


In [11]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is reproducible.
train, test = train_test_split(data_transformed, test_size=0.2, random_state=42)

In [12]:
# (rows, columns) of the train and test splits.
train.shape, test.shape

((5970, 1018), (1493, 1018))

In [13]:
# (rows, columns) of the full preprocessed table, for comparison with the splits.
data_transformed.shape

(7463, 1018)

In [14]:
# Persist both splits and the fitted preprocessing model.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (a fresh checkout may not contain it).
Path("datasets").mkdir(parents=True, exist_ok=True)
train.to_csv("datasets/qmof_train.csv")
test.to_csv("datasets/qmof_test.csv")
# joblib.dump is the last expression, so its return value is shown as the cell output.
joblib.dump(model, "preproc_m.pkl")

['preproc_m.pkl']