In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Columns kept from the property/cell table ("betta" is the spelling used in the CSV header).
property_cell_cols = ["pld", "lcd", "a", "b", "c", "alpha", "betta", "gamma", "volume", "spacegroupNumber"]
data_property_cell = (
    pd.read_csv("../data/qmof_property_cell.csv", index_col=1)  # index_col=1 is "filename"
    .drop(columns="Unnamed: 0")  # drop the stored row numbers
    .loc[:, property_cell_cols]
    .rename(columns={"betta": "beta"})  # expose the column under the corrected name
)

In [3]:
# Load the linker descriptor table; the first CSV column is used as the row index.
data_linker_mordred = pd.read_csv("../data/mordred_descriptors.csv", index_col=0)

In [4]:
# Fraction of NaN cells in the descriptor table: NaN count divided by rows, then by columns.
nan_cell_count = int(data_linker_mordred.isna().to_numpy().sum())
n_rows, n_cols = data_linker_mordred.shape
p = nan_cell_count / n_rows / n_cols
print(f"Mordred data has {p*100:.2n} % of nan")

Mordred data has 25 % of nan


In [5]:
# Display the (rows, columns) dimensions of the linker descriptor table.
data_linker_mordred.shape

(7109, 1826)

Load node data

In [6]:
# Descriptor columns retained from the node table.
node_feature_cols = [
    'n_metals', 'Atomic_Number', 'Atomic_Weight', 'Atomic Radius',
    'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)',
]
data_node = pd.read_csv("../data/node_descriptors.csv", index_col=0)
# Keep only the rows whose n_types_metals equals 1, restricted to the retained columns.
has_one_metal_type = data_node["n_types_metals"] == 1.
data_node = data_node.loc[has_one_metal_type, node_feature_cols]


In [7]:
# Place the three tables side by side, aligned on their row index
# (pd.concat keeps the union of the index labels by default).
descriptor_frames = [data_property_cell, data_linker_mordred, data_node]
data_qmof = pd.concat(descriptor_frames, axis="columns")

In [8]:
# Display the (rows, columns) dimensions of the merged table.
data_qmof.shape

(7463, 1843)

In [9]:
# Keep only the rows that are present in both the linker and the node tables.
# Index.intersection returns the common labels in a deterministic order. The
# previous set-based intersection iterated in hash order, which for string
# labels changes between interpreter runs, so the row order fed to the seeded
# train/test split further down was not reproducible across kernel restarts.
common_index = data_linker_mordred.index.intersection(data_node.index)
data_qmof = data_qmof.loc[common_index]

In [10]:
# Display the (rows, columns) dimensions of the table after the row filter.
data_qmof.shape

(6797, 1843)

In [11]:
# Print a summary of the table: index range, column count, dtypes and memory usage.
data_qmof.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6797 entries, BEMWIA_FSR to MULQOB_FSR
Columns: 1843 entries, pld to electron affinity(kJ/mol)
dtypes: float64(1842), int64(1)
memory usage: 95.6+ MB


In [12]:
# Hold out 20 % of the rows as the test split; the fixed random_state seeds the shuffle.
train, test = train_test_split(
    data_qmof,
    test_size=0.2,
    random_state=0,
)

In [13]:
from pteproc_model import PreprocessingModel

In [14]:
# p_drop means we drop columns with more than 5% of nan values
model = PreprocessingModel(
    p_drop=0.05,
    normalizer="normalizer",
)

In [15]:
# Fit the preprocessing model on the training split and keep the transformed result.
transformed_train = model.fit_transform(train)

In [16]:
# Apply the preprocessing that was fitted on the training split instead of
# re-fitting it on the test split. Re-fitting leaks test-set statistics into the
# features and leaves `model` (pickled below) fitted on the test data rather
# than on the training data.
# NOTE(review): assumes PreprocessingModel exposes a scikit-learn style
# `transform` method — confirm against pteproc_model.
transformed_test = model.transform(test)

In [17]:
# Display the (rows, columns) dimensions of the transformed train and test tables.
transformed_train.shape, transformed_test.shape

((5437, 1145), (1360, 1145))

In [18]:
# Write both transformed splits to CSV, then pickle the preprocessing model.
for split_frame, csv_path in (
    (transformed_train, "../qmof_datasets/normalizer/small_train.csv"),
    (transformed_test, "../qmof_datasets/normalizer/small_test.csv"),
):
    split_frame.to_csv(csv_path)
# joblib.dump stays the last expression so the cell still displays the written path.
joblib.dump(model, "../qmof_datasets/normalizer/small_normalizer.pkl")

['../qmof_datasets/normalizer/small_normalizer.pkl']