In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import joblib

In [2]:
data_property_cell = pd.read_csv("../data/qmof_property_cell.csv", index_col=1) # index_col=1 id "filename"
data_property_cell = data_property_cell.drop("Unnamed: 0", axis=1) # drop numbers of rows
property_cell_cols = ["pld", "lcd", "a", "b", "c", "alpha", "betta", "gamma", "volume", "spacegroupNumber"]
data_property_cell = data_property_cell.loc[:,property_cell_cols].rename({"betta": "beta"}, axis=1)

In [3]:
data_linker_mordred = pd.read_csv("../data/mordred_descriptors.csv", index_col=0)

In [4]:
p = len(data_linker_mordred.values[np.isnan(data_linker_mordred)]) / len(data_linker_mordred) / len(data_linker_mordred.iloc[0])
print(f"Mordred data has {p*100:.2n} % of nan")

Mordred data has 25 % of nan


In [5]:
data_linker_mordred.shape

(7109, 1826)

Load node data

In [6]:
data_node = pd.read_csv("../data/node_descriptors.csv", index_col=0)
data_node = data_node.loc[data_node["n_types_metals"] == 1.]
data_node = data_node.loc[:, ['n_metals', 'Atomic_Number', 'Atomic_Weight',
                              'Atomic Radius', 'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']]


In [7]:
data_qmof = pd.concat([data_property_cell, data_linker_mordred, data_node], axis=1)

In [8]:
data_qmof.shape

(7463, 1843)

In [9]:
data_qmof = data_qmof.loc[list({*list(data_linker_mordred.index)} & {*list(data_node.index)})]

In [10]:
data_qmof.shape

(6797, 1843)

In [11]:
data_qmof.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6797 entries, POPCII_FSR to core_AFIXAO_freeONLY
Columns: 1843 entries, pld to electron affinity(kJ/mol)
dtypes: float64(1842), int64(1)
memory usage: 95.6+ MB


In [12]:
1/2

0.5

In [13]:
train, test = train_test_split(data_qmof, test_size=0.2, random_state=0)

In [14]:
train.shape

(5437, 1843)

In [15]:
len(train["beta"].dropna())

5437

In [16]:
train["pld"].std()

4.319156985366193

In [17]:
for col in train.columns:
    print(f"{col} has {((5437 - len(train[col].dropna()))/5437)*100:.0f} % of nan and {train[col].std():.4f} std")

pld has 0 % of nan and 4.3192 std
lcd has 0 % of nan and 5.1560 std
a has 0 % of nan and 4.7980 std
b has 0 % of nan and 4.8305 std
c has 0 % of nan and 5.9688 std
alpha has 0 % of nan and 14.2684 std
beta has 0 % of nan and 12.2784 std
gamma has 0 % of nan and 14.4533 std
volume has 0 % of nan and 3322.2799 std
spacegroupNumber has 0 % of nan and 43.2527 std
ABC has 0 % of nan and 6.6541 std
ABCGG has 0 % of nan and 4.9142 std
nAcid has 0 % of nan and 1.0233 std
nBase has 0 % of nan and 0.6560 std
SpAbs_A has 0 % of nan and 10.6961 std
SpMax_A has 0 % of nan and 0.2262 std
SpDiam_A has 0 % of nan and 0.4574 std
SpAD_A has 0 % of nan and 10.6961 std
SpMAD_A has 0 % of nan and 0.0783 std
LogEE_A has 0 % of nan and 0.5065 std
VE1_A has 0 % of nan and 0.7018 std
VE2_A has 0 % of nan and 0.0774 std
VE3_A has 0 % of nan and 0.6820 std
VR1_A has 0 % of nan and 245.9761 std
VR2_A has 0 % of nan and 5.9020 std
VR3_A has 0 % of nan and 1.4213 std
nAromAtom has 0 % of nan and 6.0097 std
nAromBon

In [18]:
from pteproc_model import PreprocessingModel

In [19]:
# p_dorp means we drop columns with more than 5% of nan values
model = PreprocessingModel(p_drop=0.1, threshold=1e-10, normalizer="normalizer")

In [20]:
transformed_train = model.fit_transform(train)

In [21]:
for col in transformed_train.columns:
    print(f"{col} has {((5437 - len(train[col].dropna()))/5437)*100:.0f} % of nan and {train[col].std():.4f} std")

pld has 0 % of nan and 4.3192 std
lcd has 0 % of nan and 5.1560 std
a has 0 % of nan and 4.7980 std
b has 0 % of nan and 4.8305 std
c has 0 % of nan and 5.9688 std
alpha has 0 % of nan and 14.2684 std
beta has 0 % of nan and 12.2784 std
gamma has 0 % of nan and 14.4533 std
volume has 0 % of nan and 3322.2799 std
spacegroupNumber has 0 % of nan and 43.2527 std
ABC has 0 % of nan and 6.6541 std
ABCGG has 0 % of nan and 4.9142 std
nAcid has 0 % of nan and 1.0233 std
nBase has 0 % of nan and 0.6560 std
SpAbs_A has 0 % of nan and 10.6961 std
SpMax_A has 0 % of nan and 0.2262 std
SpDiam_A has 0 % of nan and 0.4574 std
SpAD_A has 0 % of nan and 10.6961 std
SpMAD_A has 0 % of nan and 0.0783 std
LogEE_A has 0 % of nan and 0.5065 std
VE1_A has 0 % of nan and 0.7018 std
VE2_A has 0 % of nan and 0.0774 std
VE3_A has 0 % of nan and 0.6820 std
VR1_A has 0 % of nan and 245.9761 std
VR2_A has 0 % of nan and 5.9020 std
VR3_A has 0 % of nan and 1.4213 std
nAromAtom has 0 % of nan and 6.0097 std
nAromBon

In [22]:
transformed_train.shape

(5437, 990)

In [23]:
transformed_test = model.transform(test)

In [24]:
transformed_train.shape, transformed_test.shape

((5437, 990), (1360, 990))

In [27]:
transformed_train.to_csv("../qmof_datasets/normalizer/small_train.csv")
transformed_test.to_csv("../qmof_datasets/normalizer/small_test.csv")
joblib.dump(model, "../qmof_datasets/normalizer/small_normalizer.pkl")

['../qmof_datasets/normalizer/small_normalizer.pkl']