In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer

In [3]:
# Columns kept from the property/cell table. The CSV spells the angle "betta";
# it is renamed to "beta" once the columns have been selected.
property_cell_cols = [
    "pld", "lcd",
    "a", "b", "c",
    "alpha", "betta", "gamma",
    "volume", "spacegroupNumber",
]
data_property_cell = (
    pd.read_csv("../data/qmof_property_cell.csv", index_col=1)  # second CSV column ("filename") is the row index
    .drop(columns="Unnamed: 0")  # leftover positional row counter
    .loc[:, property_cell_cols]
    .rename(columns={"betta": "beta"})
)

In [4]:
# Load the Mordred descriptor table; the first CSV column becomes the row index.
data_linker_mordred = pd.read_csv("../data/mordred_descriptors.csv", index_col=0)

In [5]:
# Fraction of missing cells over the whole descriptor table.
# isna() works for every dtype (np.isnan raises on object columns), and taking
# the mean of the boolean mask avoids indexing the first row, which fails on an
# empty frame.
p = data_linker_mordred.isna().to_numpy().mean()
print(f"Mordred data has {p*100:.2n} % of nan")

Mordred data has 25 % of nan


In [6]:
data_linker_mordred.shape

(7109, 1826)

Load node data

In [7]:
# Node descriptor columns carried forward into the merged table.
node_feature_cols = [
    "n_metals",
    "Atomic_Number",
    "Atomic_Weight",
    "Atomic Radius",
    "Mulliken EN",
    "polarizability(A^3)",
    "electron affinity(kJ/mol)",
]
node_raw = pd.read_csv("../data/node_descriptors.csv", index_col=0)
# Keep only rows whose n_types_metals equals 1, and only the columns listed above.
single_metal_mask = node_raw["n_types_metals"] == 1.0
data_node = node_raw.loc[single_metal_mask, node_feature_cols]


In [8]:
# Place the three tables side by side, aligned on their row index.
# pd.concat defaults to an outer join, so rows missing from one table are kept
# and filled with NaN in that table's columns.
frames_to_join = [data_property_cell, data_linker_mordred, data_node]
data_qmof = pd.concat(frames_to_join, axis="columns")

In [9]:
data_qmof.shape

(7463, 1843)

In [10]:
# Keep only the rows present in both the Mordred table and the node table.
# Index.intersection yields the same label order on every run. Iterating a Python
# set of strings does not (string hashing is randomized per interpreter start),
# which would reshuffle the rows and make the seeded train/test split below
# differ between kernel restarts.
common_structures = data_linker_mordred.index.intersection(data_node.index)
data_qmof = data_qmof.loc[common_structures]

In [11]:
data_qmof.shape

(6797, 1843)

In [12]:
data_qmof.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6797 entries, boydwoo_str_m5_Al_o4_o17_sra_sym_74 to WADDOX_FSR
Columns: 1843 entries, pld to electron affinity(kJ/mol)
dtypes: float64(1842), int64(1)
memory usage: 95.6+ MB


In [13]:
# Hold out 20 % of the rows as the test split; the fixed random_state keeps the
# shuffle repeatable for a given row order.
train, test = train_test_split(
    data_qmof,
    test_size=0.2,
    random_state=0,
)

In [14]:
train.shape

(5437, 1843)

In [15]:
# Sanity check: number of non-missing "beta" values in the training split
# (equal to the row count of train when the column has no NaN).
len(train["beta"].dropna())

5437

In [16]:
# Standard deviation of the "pld" column in the training split (pandas default, ddof=1).
train["pld"].std()

4.303728942107217

In [17]:
# Per-column report on the training split: share of missing values and standard deviation.
# The row count is read from the frame rather than hard-coded, so the percentages
# stay correct if the dataset or the split size changes.
n_rows = len(train)
for col in train.columns:
    nan_percent = (train[col].isna().sum() / n_rows) * 100
    print(f"{col} has {nan_percent:.0f} % of nan and {train[col].std():.4f} std")

pld has 0 % of nan and 4.3037 std
lcd has 0 % of nan and 5.1598 std
a has 0 % of nan and 4.8369 std
b has 0 % of nan and 4.8338 std
c has 0 % of nan and 6.0162 std
alpha has 0 % of nan and 14.2155 std
beta has 0 % of nan and 12.2899 std
gamma has 0 % of nan and 14.3865 std
volume has 0 % of nan and 3412.0627 std
spacegroupNumber has 0 % of nan and 43.0058 std
ABC has 0 % of nan and 6.7819 std
ABCGG has 0 % of nan and 4.9480 std
nAcid has 0 % of nan and 1.0135 std
nBase has 0 % of nan and 0.6974 std
SpAbs_A has 0 % of nan and 10.9278 std
SpMax_A has 0 % of nan and 0.2252 std
SpDiam_A has 0 % of nan and 0.4553 std
SpAD_A has 0 % of nan and 10.9278 std
SpMAD_A has 0 % of nan and 0.0786 std
LogEE_A has 0 % of nan and 0.5081 std
VE1_A has 0 % of nan and 0.7094 std
VE2_A has 0 % of nan and 0.0775 std
VE3_A has 0 % of nan and 0.6847 std
VR1_A has 0 % of nan and 303.1886 std
VR2_A has 0 % of nan and 7.1422 std
VR3_A has 0 % of nan and 1.4251 std
nAromAtom has 0 % of nan and 6.1421 std
nAromBon

In [18]:
from pteproc_model import PreprocessingModel

In [19]:
# Build the preprocessing model (PreprocessingModel is imported from pteproc_model; its code is not shown here).
# NOTE(review): the earlier note here said p_drop "means we drop columns with more than 5% of nan values",
# but the value passed is 0.9 — confirm what p_drop means and fix either this comment or the value.
model = PreprocessingModel(p_drop=0.9, threshold=1e-3, normalizer="minmax")

In [20]:
# Fit the preprocessing model on the training split and transform that same split.
transformed_train = model.fit_transform(train)

In [21]:
transformed_train.shape

(5437, 1169)

In [22]:
# Widen the notebook-wide column limit to 100 before previewing the frame.
# This setting persists for every later cell.
pd.set_option("display.max_columns", 100)
transformed_train

Unnamed: 0,pld,lcd,a,b,c,alpha,beta,gamma,volume,spacegroupNumber,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,VE3_A,VR3_A,nAromAtom,nAromBond,nAtom,nHeavyAtom,nHetero,nH,nC,nN,nO,nS,nP,nF,nCl,nBr,nI,nX,ATS0dv,ATS1dv,ATS2dv,ATS3dv,ATS4dv,ATS5dv,ATS6dv,ATS7dv,ATS8dv,ATS0d,...,JGI2,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,Diameter,Radius,TopoShapeIndex,PetitjeanIndex,Vabc,VAdjMat,MWC01,MWC02,MWC03,MWC04,MWC05,MWC06,MWC07,MWC08,MWC09,MWC10,TMWC10,SRW02,SRW04,SRW05,SRW06,SRW07,SRW08,SRW09,SRW10,TSRW10,MW,AMW,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,n_metals,Atomic_Number,Atomic_Weight,Atomic Radius,Mulliken EN,polarizability(A^3),electron affinity(kJ/mol)
boydwoo_str_m3_o15_o17_nbo_sym_15,0.270155,0.448419,0.562968,0.535403,0.327225,0.598806,0.947294,0.807056,0.151355,0.424779,0.215447,0.285505,0.200000,0.000000,0.192259,0.671013,0.671013,0.192259,0.753604,0.641229,0.379500,0.173225,0.615106,0.617473,0.192308,0.194444,0.162651,0.207547,0.162162,0.073529,0.177083,0.000000,0.153846,0.0,0.000000,0.0,0.0,0.000,0.5,0.150,0.207422,0.204662,0.232709,0.219959,0.165259,0.123769,0.073609,0.043917,0.022705,0.204748,...,0.383632,0.322790,0.372181,0.282140,0.430710,0.379780,0.301263,0.153846,0.333333,0.513409,8.000000e-08,4.500000e-08,0.616667,0.740260,0.138720,0.676692,0.204000,0.714934,0.759136,0.795275,0.822206,0.823053,0.818648,0.815094,0.811957,0.809606,0.371058,0.650781,0.739083,0.000000,0.803791,0.000000,0.805624,0.000000,0.796101,0.319821,0.280441,0.619536,0.256345,0.216612,0.223772,0.226116,0.186717,0.090909,0.303371,0.252930,0.133333,0.841924,0.018450,0.014184
boydwoo_str_m3_o3_o5_pcu_sym_50,0.205213,0.234178,0.179022,0.419676,0.271395,0.571950,0.530553,0.443976,0.063217,0.000000,0.217368,0.271941,0.133333,0.000000,0.209805,0.553953,0.553953,0.209805,0.766404,0.575269,0.333386,0.237499,0.555415,0.573176,0.256410,0.222222,0.216867,0.213836,0.063063,0.196078,0.222222,0.027778,0.102564,0.0,0.000000,0.0,0.0,0.000,0.0,0.000,0.191967,0.195880,0.200924,0.174566,0.135913,0.136482,0.127179,0.112580,0.106866,0.205737,...,0.333829,0.192347,0.288039,0.132273,0.200624,0.126198,0.084601,0.159512,0.162728,0.421721,9.333333e-08,4.666667e-08,0.791667,0.880808,0.206496,0.615530,0.208000,0.640554,0.672639,0.702646,0.722845,0.722872,0.717382,0.714024,0.710540,0.708399,0.348282,0.587563,0.665203,0.000000,0.720417,0.000000,0.717770,0.000000,0.705399,0.304126,0.131982,0.157498,0.197970,0.204126,0.196990,0.160040,0.203425,0.090909,0.303371,0.252930,0.133333,0.841924,0.018450,0.014184
ARUBIA_FSR,0.036872,0.036091,0.196981,0.203636,0.144698,0.571383,0.331739,0.172511,0.024125,0.004425,0.187235,0.226357,0.100000,0.000000,0.175257,0.550664,0.550664,0.175257,0.760848,0.579817,0.351214,0.226454,0.565600,0.568936,0.192308,0.166667,0.183735,0.183962,0.121622,0.161765,0.166667,0.083333,0.134615,0.0,0.000000,0.0,0.0,0.000,0.0,0.000,0.185588,0.179798,0.177637,0.143361,0.094494,0.073501,0.069547,0.037078,0.037079,0.172107,...,0.372794,0.208642,0.290811,0.325209,0.072963,0.182769,0.106324,0.122967,0.122920,0.471225,1.200000e-07,5.500000e-08,1.000000,1.000000,0.172602,0.621476,0.176000,0.649117,0.684820,0.714509,0.736154,0.734898,0.729383,0.724923,0.721066,0.718120,0.328327,0.592562,0.674345,0.000000,0.733029,0.000000,0.731816,0.000000,0.719669,0.287566,0.116635,0.133191,0.149746,0.171010,0.159363,0.150367,0.171679,0.000000,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071
BAXXAA_FSR,0.029944,0.047473,0.142598,0.173669,0.093549,0.808817,0.536711,0.462336,0.013987,0.004425,0.100270,0.149419,0.200000,0.000000,0.084711,0.536547,0.536547,0.084711,0.661951,0.464176,0.237214,0.312181,0.447613,0.448396,0.076923,0.066667,0.084337,0.094340,0.081081,0.058824,0.083333,0.000000,0.153846,0.0,0.000000,0.0,0.0,0.000,0.0,0.000,0.134554,0.107698,0.116243,0.080000,0.046627,0.025418,0.021242,0.031021,0.000000,0.086053,...,0.529412,0.256790,0.305882,0.590909,0.666667,0.000000,0.000000,0.000000,0.000000,0.624063,6.000000e-08,3.000000e-08,0.750000,0.857143,0.087158,0.513805,0.088000,0.552816,0.599574,0.638493,0.667134,0.673405,0.673870,0.674316,0.674348,0.674624,0.246258,0.478099,0.585500,0.000000,0.662122,0.000000,0.676190,0.000000,0.674535,0.213271,0.061656,0.168114,0.076142,0.087948,0.081009,0.113812,0.077694,0.181818,0.269663,0.224989,0.188889,0.790378,0.038745,0.190071
boydwoo_str_m3_o18_o19_pcu_sym_70,0.206228,0.241181,0.258480,0.307168,0.195000,0.666286,0.567819,0.657544,0.049229,0.000000,0.175428,0.223661,0.133333,0.000000,0.160627,0.569929,0.569929,0.160627,0.735780,0.566742,0.329232,0.237153,0.549611,0.555614,0.153846,0.155556,0.168675,0.169811,0.099099,0.147059,0.159722,0.055556,0.128205,0.0,0.000000,0.0,0.0,0.000,0.0,0.000,0.177082,0.175280,0.185527,0.159776,0.110615,0.088964,0.068083,0.055556,0.033708,0.167161,...,0.383564,0.235512,0.300158,0.220955,0.447244,0.258692,0.227405,0.263577,0.146278,0.494980,8.666667e-08,4.333333e-08,0.802381,0.889703,0.155252,0.608541,0.165333,0.638748,0.676193,0.709322,0.731935,0.733461,0.728682,0.726115,0.722799,0.721149,0.318848,0.579018,0.664830,0.000000,0.725308,0.000000,0.726655,0.000000,0.716876,0.276800,0.107249,0.160085,0.170897,0.167210,0.164232,0.155518,0.155806,0.090909,0.303371,0.252930,0.133333,0.841924,0.018450,0.014184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HICQIW_FSR,0.030652,0.054791,0.160234,0.199005,0.149666,0.482072,0.257170,0.154425,0.020039,0.017699,0.147141,0.211048,0.000000,0.000000,0.132696,0.566702,0.560063,0.132696,0.761734,0.552111,0.301644,0.233509,0.531487,0.547271,0.141026,0.122222,0.156627,0.141509,0.081081,0.161765,0.135417,0.083333,0.076923,0.0,0.000000,0.0,0.0,0.000,0.0,0.000,0.128706,0.132093,0.121247,0.096134,0.072917,0.050837,0.039011,0.027900,0.013858,0.137982,...,0.352941,0.190215,0.199230,0.227273,0.250370,0.477777,0.303819,0.481783,0.000000,0.462091,8.000000e-08,4.000000e-08,0.800000,0.888889,0.136384,0.597643,0.136000,0.627840,0.667728,0.701596,0.725755,0.727684,0.724175,0.721792,0.719396,0.717845,0.295998,0.566501,0.654881,0.545665,0.715879,0.666496,0.717767,0.724751,0.708687,0.336784,0.089833,0.088608,0.106599,0.133550,0.124834,0.123610,0.126566,0.000000,0.247191,0.207700,0.238889,0.591065,0.073801,0.028369
boydwoo_str_m3_o4_o10_pcu_sym_199,0.098374,0.144495,0.169922,0.153280,0.095521,0.574930,0.544767,0.554193,0.016019,0.000000,0.095790,0.142483,0.133333,0.000000,0.079145,0.545031,0.545031,0.079145,0.648128,0.437939,0.204987,0.346425,0.409501,0.434431,0.051282,0.044444,0.096386,0.088050,0.072072,0.098039,0.079861,0.027778,0.115385,0.0,0.000000,0.0,0.0,0.000,0.0,0.000,0.109746,0.094145,0.094432,0.064986,0.043568,0.036574,0.017157,0.015051,0.002871,0.088032,...,0.372353,0.330725,0.291478,0.138258,0.154963,0.103314,0.104686,0.000000,0.000000,0.479951,5.000000e-08,2.666667e-08,0.600000,0.729630,0.084595,0.488500,0.085333,0.527697,0.575535,0.618498,0.648222,0.657791,0.659704,0.662284,0.663414,0.665097,0.238935,0.452500,0.570829,0.000000,0.652640,0.000000,0.670744,0.000000,0.671979,0.207031,0.058094,0.124679,0.065990,0.085776,0.081452,0.090195,0.071846,0.090909,0.303371,0.252930,0.133333,0.841924,0.018450,0.014184
boydwoo_str_m5_Al_o23_o27_sra_sym_48,0.314291,0.278622,0.105075,0.549300,0.424652,0.697473,0.526964,0.441629,0.077698,0.013274,0.228235,0.303718,0.200000,0.034091,0.212064,0.651235,0.651235,0.212064,0.732435,0.653859,0.353700,0.137460,0.621639,0.678485,0.115385,0.100000,0.234940,0.231132,0.229730,0.213235,0.177083,0.166667,0.153846,0.0,0.000000,0.0,0.0,0.075,0.0,0.075,0.231093,0.219261,0.234012,0.204594,0.145737,0.115347,0.087793,0.070831,0.060678,0.223294,...,0.512677,0.283216,0.378434,0.285205,0.379206,0.476604,0.264456,0.362060,0.531814,0.659303,1.200000e-07,6.000000e-08,0.854167,0.921212,0.236706,0.685651,0.216000,0.717798,0.755103,0.787547,0.811595,0.810655,0.805221,0.800796,0.797197,0.794363,0.380766,0.660418,0.740519,0.000000,0.800650,0.000000,0.800058,0.000000,0.789163,0.334514,0.192542,0.206620,0.241117,0.221498,0.219124,0.261353,0.224624,0.000000,0.112360,0.086723,0.000000,0.422680,0.053506,0.158865
NECCUU_FSR,0.046234,0.031904,0.145662,0.133377,0.058624,0.740195,0.903711,0.441522,0.008729,0.017699,0.119364,0.151959,0.000000,0.000000,0.119133,0.294595,0.294595,0.119133,0.634435,0.332368,0.209701,0.579323,0.323845,0.325577,0.153846,0.133333,0.141566,0.117925,0.013514,0.161765,0.135417,0.020833,0.000000,0.0,0.333333,0.0,0.0,0.000,0.0,0.000,0.068002,0.071678,0.061132,0.048105,0.037202,0.044623,0.049700,0.041850,0.028652,0.117211,...,0.135747,0.080247,0.063235,0.071023,0.076410,0.031154,0.051220,0.109756,0.000000,0.125505,4.500000e-08,2.000000e-08,0.500000,0.500000,0.125879,0.351633,0.116000,0.360090,0.375594,0.387358,0.396971,0.394306,0.390301,0.386895,0.384251,0.382063,0.191489,0.339618,0.369716,0.000000,0.393441,0.000000,0.388302,0.000000,0.379240,0.166924,0.079995,0.174413,0.101523,0.110749,0.105578,0.073488,0.115288,0.000000,0.292135,0.244950,0.150000,0.852234,0.023985,0.268085


In [23]:
# Per-column report for the columns that survived preprocessing: share of missing
# values, standard deviation after scaling, and number of distinct values.
# The row count is read from the frame rather than hard-coded.
# NOTE(review): the NaN share is computed on the untransformed `train` column while
# the other two statistics come from `transformed_train` — confirm this is intended.
n_rows = len(train)
for col in transformed_train.columns:
    nan_percent = (train[col].isna().sum() / n_rows) * 100
    scaled_std = transformed_train[col].std()
    n_unique = len(np.unique(transformed_train[col]))
    print(f"{col} has {nan_percent:.0f} % of nan, {scaled_std:.4f} std, number of unique: {n_unique:.0f}")

pld has 0 % of nan, 0.1458 std, number of unique: 5408
lcd has 0 % of nan, 0.1610 std, number of unique: 5413
a has 0 % of nan, 0.1466 std, number of unique: 5437
b has 0 % of nan, 0.1513 std, number of unique: 5437
c has 0 % of nan, 0.1283 std, number of unique: 5436
alpha has 0 % of nan, 0.1525 std, number of unique: 5351
beta has 0 % of nan, 0.1788 std, number of unique: 5337
gamma has 0 % of nan, 0.1819 std, number of unique: 5365
volume has 0 % of nan, 0.0696 std, number of unique: 5417
spacegroupNumber has 0 % of nan, 0.1903 std, number of unique: 136
ABC has 0 % of nan, 0.0769 std, number of unique: 2763
ABCGG has 0 % of nan, 0.0902 std, number of unique: 3242
nAcid has 0 % of nan, 0.1014 std, number of unique: 32
nBase has 0 % of nan, 0.0475 std, number of unique: 28
SpAbs_A has 0 % of nan, 0.0747 std, number of unique: 3268
SpMax_A has 0 % of nan, 0.0944 std, number of unique: 3163
SpDiam_A has 0 % of nan, 0.0954 std, number of unique: 3173
SpAD_A has 0 % of nan, 0.0747 std, n

In [24]:
# Apply the model fitted above to the test split.
# Presumably transform() reuses what fit_transform() learned on train without refitting — verify in pteproc_model.
transformed_test = model.transform(test)

In [25]:
transformed_train.shape, transformed_test.shape

((5437, 1169), (1360, 1169))

In [26]:
# Create the output directory first: the writes below fail if it does not exist
# (for example on a fresh checkout).
Path("../qmof_datasets").mkdir(parents=True, exist_ok=True)

# Persist both transformed splits and the fitted preprocessing model.
transformed_train.to_csv("../qmof_datasets/train.csv")
transformed_test.to_csv("../qmof_datasets/test.csv")
# Last expression of the cell: joblib.dump returns the list of files it wrote.
joblib.dump(model, "../qmof_datasets/scaler.pkl")

['../qmof_datasets/scaler.pkl']