In [1]:
import pandas as pd

In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import joblib

In [3]:
data_property_cell = pd.read_csv("../data/qmof_property_cell.csv", index_col=1) # index_col=1 id "filename"
data_property_cell = data_property_cell.drop("Unnamed: 0", axis=1) # drop numbers of rows
property_cell_cols = ["pld", "lcd", "a", "b", "c", "alpha", "betta", "gamma", "volume", "spacegroupNumber"]
data_property_cell = data_property_cell.loc[:,property_cell_cols].rename({"betta": "beta"}, axis=1)

In [4]:
data_linker_mordred = pd.read_csv("../data/mordred_descriptors.csv", index_col=0)

In [5]:
p = len(data_linker_mordred.values[np.isnan(data_linker_mordred)]) / len(data_linker_mordred) / len(data_linker_mordred.iloc[0])
print(f"% of nan: {p*100:.2n}")

% of nan: 25


In [88]:
# data_linker_mordred = data_linker_mordred.fillna(data_linker_mordred.mean())
# data_linker_mordred = data_linker_mordred.fillna(0)

In [89]:
# selector = VarianceThreshold(threshold=0.0001)
# selector.fit(data_linker_mordred)
# kept = selector.get_support()

# data_linker_mordred = data_linker_mordred.loc[:,kept]

In [90]:
# p = len(data_linker_mordred.values[np.isnan(data_linker_mordred)]) / len(data_linker_mordred) / len(data_linker_mordred.iloc[0])
# print(f"% of nan after VarianceThreshold: {p*100:.2n}")

In [6]:
data_linker_mordred.shape

(7109, 1826)

In [7]:
data_node = pd.read_csv("../data/node_descriptors.csv", index_col=0)
data_node = data_node.loc[data_node["n_types_metals"] == 1.]
data_node = data_node.loc[:, ['n_metals', 'Atomic_Number', 'Atomic_Weight',
                              'Atomic Radius', 'Mulliken EN', 'polarizability(A^3)', 'electron affinity(kJ/mol)']]


In [8]:
data_qmof = pd.concat([data_property_cell, data_linker_mordred, data_node], axis=1)

In [9]:
data_qmof.shape

(7463, 1843)

In [10]:
data_qmof = data_qmof.loc[list({*list(data_linker_mordred.index)} & {*list(data_node.index)})]

In [11]:
data_qmof.shape

(6797, 1843)

In [12]:
data_qmof.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6797 entries, tobacco_SR_ttv_v1-3c_pyrrole_Ch_v2-12c_Zr_1_Ch_1DoU_Ch_2x2x2 to RINNAG_FSR
Columns: 1843 entries, pld to electron affinity(kJ/mol)
dtypes: float64(1842), int64(1)
memory usage: 95.6+ MB


In [13]:
train, test = train_test_split(data_qmof, test_size=0.2, random_state=0)

In [14]:
def do_drop(col: pd.Series, p: float = 0.05) -> bool:
    """return True if col has more than p proportion of nan values and False if not
    Args:
        col (pd.Series): column of DataFrame
        p (float, optional): cut off proportion. Defaults to 0.05.
    Returns:
        bool: 
    """
    tf, counts = np.unique(col.isna(), return_counts=True)
    if len(counts) == 1:
        if tf[0] == True:
            return True
        else:
            return False
    f_pos = 0 if tf[0] == False else 1
    if counts[(f_pos + 1) % 2]/counts[f_pos] > p:
        return True
    else:
        return False

In [14]:
means = train.mean()

In [14]:
means = train.mean()

train = train.fillna(means)
train = train.dropna(axis=1)
test = test.fillna(means)
test = test[train.columns]
train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5437 entries, GINTAA01_FSR to boydwoo_str_m4_Al_o6_o7_acs_sym_135
Columns: 1538 entries, pld to electron affinity(kJ/mol)
dtypes: float64(1537), int64(1)
memory usage: 63.8+ MB


In [15]:
train.shape

(5437, 1538)

In [16]:
test.shape == test.dropna().shape

True

In [17]:
selector = VarianceThreshold(threshold=0.0001)
selector.fit(train)
kept = selector.get_support()

train = train.loc[:,kept]
test = test.loc[:,kept]

In [18]:
train.shape, test.shape

((5437, 1378), (1360, 1378))

In [21]:
qmof_mm = MinMaxScaler()
train_mm = pd.DataFrame(qmof_mm.fit_transform(train), columns=train.columns, index=train.index)

test_mm = pd.DataFrame(qmof_mm.transform(test), columns=test.columns, index=test.index)

train_mm.to_csv("../qmof_datasets/minmax/train.csv")
test_mm.to_csv("../qmof_datasets/minmax/test.csv")
joblib.dump(qmof_mm, "../qmof_datasets/minmax/minmax.pkl")

['../qmof_datasets/minmax/minmax.pkl']

In [22]:
qmof_nm = Normalizer()
train_nm = pd.DataFrame(qmof_nm.fit_transform(train), columns=train.columns, index=train.index)

test_nm = pd.DataFrame(qmof_nm.transform(test), columns=test.columns, index=test.index)

train_mm.to_csv("../qmof_datasets/normalizer/train.csv")
test_mm.to_csv("../qmof_datasets/normalizer/test.csv")
joblib.dump(qmof_mm, "../qmof_datasets/normalizer/normalizer.pkl")

['../qmof_datasets/normalizer/normalizer.pkl']