In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer

In [4]:
# Pore/cell properties kept for the final feature table ("betta" is the spelling used in the CSV).
property_cell_cols = ["pld", "lcd", "a", "b", "c", "alpha", "betta", "gamma", "volume", "spacegroupNumber"]

# Column 1 of the CSV becomes the index (presumably the structure "filename" — verify against the file);
# "Unnamed: 0" is dropped before the property columns are selected and "betta" is renamed to "beta".
data_property_cell = (
    pd.read_csv("../data/qmof_property_cell.csv", index_col=1)
    .drop(columns="Unnamed: 0")
    [property_cell_cols]
    .rename(columns={"betta": "beta"})
)

In [5]:
# Linker descriptors; the first CSV column is used as the row index.
data_linker_mordred = pd.read_csv(
    "../data/mordred_descriptors.csv",
    index_col=0,
)

In [6]:
# Share of missing cells across the whole Mordred descriptor table.
# DataFrame.isna() also works when a column is non-numeric (np.isnan would raise there),
# and taking the mean of the boolean mask avoids building a throwaway array of NaN values.
nan_mask = data_linker_mordred.isna().to_numpy()
# An empty table has no cells; report 0 rather than dividing by zero or indexing row 0.
p = nan_mask.mean() if nan_mask.size else 0.0
print(f"Mordred data has {p*100:.2n} % of nan")

Mordred data has 25 % of nan


In [7]:
# Display the (rows, columns) size of the Mordred descriptor table.
data_linker_mordred.shape

(7109, 1826)

Load node data

In [8]:
# Node (metal) descriptors retained as model features.
node_feature_cols = [
    'n_metals', 'Atomic_Number', 'Atomic_Weight',
    'Atomic Radius', 'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)',
]

data_node = pd.read_csv("../data/node_descriptors.csv", index_col=0)
# Keep only rows whose "n_types_metals" equals 1, restricted to the feature columns above.
single_metal_mask = data_node["n_types_metals"] == 1.
data_node = data_node.loc[single_metal_mask, node_feature_cols]


In [9]:
# Join the three tables side by side, aligning rows on their index labels.
qmof_parts = [data_property_cell, data_linker_mordred, data_node]
data_qmof = pd.concat(qmof_parts, axis="columns")

In [10]:
# Display the size of the combined table right after the column-wise concatenation.
data_qmof.shape

(7463, 1843)

In [11]:
# Keep only the structures that have both linker (Mordred) and node descriptors.
# Index.intersection is used instead of a Python set: iterating a set of strings follows
# hash order, which changes from one interpreter session to the next, so the rows would be
# reordered — and the seeded train/test split below would differ — after every kernel restart.
common_ids = data_linker_mordred.index.intersection(data_node.index)
data_qmof = data_qmof.loc[common_ids]

In [12]:
# Display the size after restricting rows to labels present in both the Mordred and node tables.
data_qmof.shape

(6797, 1843)

In [13]:
# Print the index range, column count, dtypes and memory usage of the combined table.
data_qmof.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6797 entries, gmof_Zn2O8N2-AZO_A-DPAC_A_No34 to XOTXUB_FSR
Columns: 1843 entries, pld to electron affinity(kJ/mol)
dtypes: float64(1842), int64(1)
memory usage: 95.6+ MB


In [14]:
# NOTE(review): leftover scratch expression — the value is displayed but never assigned; consider deleting this cell.
1/2

0.5

In [15]:
# Hold out 20 % of the rows as a test set; random_state is fixed so the shuffle is seeded.
train, test = train_test_split(
    data_qmof,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Display the (rows, columns) size of the training split.
train.shape

(5437, 1843)

In [17]:
# Number of non-missing "beta" values in the training split.
int(train["beta"].count())

5437

In [18]:
# Display the standard deviation of the "pld" column in the training split.
train["pld"].std()

4.309500082813449

In [19]:
# Per-column quality report for the training split: percentage of missing values and
# standard deviation. The NaN share is computed from the column itself instead of a
# hard-coded row count, so the report stays correct if the split size ever changes.
for col in train.columns:
    column = train[col]
    nan_pct = column.isna().mean() * 100
    print(f"{col} has {nan_pct:.0f} % of nan and {column.std():.4f} std")

pld has 0 % of nan and 4.3095 std
lcd has 0 % of nan and 5.1588 std
a has 0 % of nan and 4.8018 std
b has 0 % of nan and 4.8323 std
c has 0 % of nan and 5.9884 std
alpha has 0 % of nan and 14.2829 std
beta has 0 % of nan and 12.2009 std
gamma has 0 % of nan and 14.3936 std
volume has 0 % of nan and 3254.8853 std
spacegroupNumber has 0 % of nan and 43.3403 std
ABC has 0 % of nan and 6.6795 std
ABCGG has 0 % of nan and 4.9250 std
nAcid has 0 % of nan and 1.0269 std
nBase has 0 % of nan and 0.6916 std
SpAbs_A has 0 % of nan and 10.7567 std
SpMax_A has 0 % of nan and 0.2248 std
SpDiam_A has 0 % of nan and 0.4549 std
SpAD_A has 0 % of nan and 10.7567 std
SpMAD_A has 0 % of nan and 0.0779 std
LogEE_A has 0 % of nan and 0.5058 std
VE1_A has 0 % of nan and 0.7048 std
VE2_A has 0 % of nan and 0.0773 std
VE3_A has 0 % of nan and 0.6822 std
VR1_A has 0 % of nan and 260.2599 std
VR2_A has 0 % of nan and 6.2537 std
VR3_A has 0 % of nan and 1.4160 std
nAromAtom has 0 % of nan and 6.0275 std
nAromBon

ATS6p has 0 % of nan and 70.4140 std
ATS7p has 0 % of nan and 73.0929 std
ATS8p has 0 % of nan and 74.1542 std
ATS0i has 0 % of nan and 1865.8304 std
ATS1i has 0 % of nan and 1837.0116 std
ATS2i has 0 % of nan and 3180.4816 std
ATS3i has 0 % of nan and 4395.2228 std
ATS4i has 0 % of nan and 5523.1430 std
ATS5i has 0 % of nan and 6106.8434 std
ATS6i has 0 % of nan and 6467.1361 std
ATS7i has 0 % of nan and 6745.2558 std
ATS8i has 0 % of nan and 6737.9626 std
AATS0dv has 0 % of nan and 4.2362 std
AATS1dv has 0 % of nan and 3.0921 std
AATS2dv has 1 % of nan and 4.0177 std
AATS3dv has 5 % of nan and 4.0837 std
AATS4dv has 6 % of nan and 4.9135 std
AATS5dv has 8 % of nan and 6.1820 std
AATS6dv has 17 % of nan and 6.9332 std
AATS7dv has 28 % of nan and 11.0834 std
AATS8dv has 57 % of nan and 7.7450 std
AATS0d has 0 % of nan and 0.6282 std
AATS1d has 0 % of nan and 0.7934 std
AATS2d has 1 % of nan and 0.8165 std
AATS3d has 5 % of nan and 0.6628 std
AATS4d has 6 % of nan and 0.5993 std
AATS5d 

In [20]:
from pteproc_model import PreprocessingModel

In [21]:
# p_drop=0.1: presumably columns with more than 10 % NaN values are dropped.
# NOTE(review): the meaning of p_drop, threshold and normalizer is defined in
# pteproc_model.PreprocessingModel, which is not visible here — confirm there.
model = PreprocessingModel(p_drop=0.1, threshold=1e-10, normalizer="normalizer")

In [22]:
# fit_transform is called on the training split only; the test split goes through transform() later.
transformed_train = model.fit_transform(train)

In [23]:
# Report, for every column that survived preprocessing, the NaN percentage and standard
# deviation of that column in the ORIGINAL training split (the values come from `train`,
# not from `transformed_train`). The NaN share is derived from the column itself rather
# than a hard-coded row count.
# NOTE(review): if the intent was to inspect the transformed values, read the columns
# from `transformed_train` instead.
for col in transformed_train.columns:
    source_column = train[col]
    nan_pct = source_column.isna().mean() * 100
    print(f"{col} has {nan_pct:.0f} % of nan and {source_column.std():.4f} std")

pld has 0 % of nan and 4.3095 std
lcd has 0 % of nan and 5.1588 std
a has 0 % of nan and 4.8018 std
b has 0 % of nan and 4.8323 std
c has 0 % of nan and 5.9884 std
alpha has 0 % of nan and 14.2829 std
beta has 0 % of nan and 12.2009 std
gamma has 0 % of nan and 14.3936 std
volume has 0 % of nan and 3254.8853 std
spacegroupNumber has 0 % of nan and 43.3403 std
ABC has 0 % of nan and 6.6795 std
ABCGG has 0 % of nan and 4.9250 std
nAcid has 0 % of nan and 1.0269 std
nBase has 0 % of nan and 0.6916 std
SpAbs_A has 0 % of nan and 10.7567 std
SpMax_A has 0 % of nan and 0.2248 std
SpDiam_A has 0 % of nan and 0.4549 std
SpAD_A has 0 % of nan and 10.7567 std
SpMAD_A has 0 % of nan and 0.0779 std
LogEE_A has 0 % of nan and 0.5058 std
VE1_A has 0 % of nan and 0.7048 std
VE2_A has 0 % of nan and 0.0773 std
VE3_A has 0 % of nan and 0.6822 std
VR1_A has 0 % of nan and 260.2599 std
VR2_A has 0 % of nan and 6.2537 std
VR3_A has 0 % of nan and 1.4160 std
nAromAtom has 0 % of nan and 6.0275 std
nAromBon

In [24]:
# Display the size of the preprocessed training table.
transformed_train.shape

(5437, 993)

In [25]:
# Pass the held-out split through transform() on the same model object; fit is not called again here.
transformed_test = model.transform(test)

In [26]:
# Display both shapes together so the train and test column counts can be compared.
transformed_train.shape, transformed_test.shape

((5437, 993), (1360, 993))

In [29]:
# Persist the preprocessed splits and the model object used to produce them.
# The target directory is created first: to_csv and joblib.dump do not create missing
# parent directories and would otherwise fail on a fresh checkout.
output_dir = "../qmof_datasets/normalizer"
os.makedirs(output_dir, exist_ok=True)

transformed_train.to_csv(f"{output_dir}/small_train.csv")
transformed_test.to_csv(f"{output_dir}/small_test.csv")
# Kept as the last expression so the cell still displays the list of written file names.
joblib.dump(model, f"{output_dir}/small_normalizer.pkl")

['../qmof_datasets/normalizer/small_normalizer.pkl']