# Machine Learning CO<sub>2</sub> Working Capacity of MOFs

โดย รังสิมันต์ เกษแก้ว <br>
มหาวิทยาลัยแห่งซูริค <br>
E-mail: rangsiman1993@gmail.com

## 0. Import packages

In [1]:
# import standard scientific libraries
import os
import math
import numpy as np
import pandas as pd

# import ML models from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error

## 1. Import the data

In [2]:
# Display configuration: show every column of wide frames and print floats
# with 8 decimal places.
# The fully qualified key is used on purpose: the short form 'max_columns'
# is ambiguous on pandas versions that also define
# 'styler.render.max_columns' and raises OptionError there.
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 8)

# Directory prefix prepended to the CSV file names read by this notebook.
dataset = "./"

เนื่องจากในโปรเจ็คนี้เป็นการสาธิตการรัน Machine Learning ดังนั้นจึงจะขอดึงข้อมูล MOFs แค่ 20,000 โครงสร้างแรกเท่านั้น !!!

In [3]:
# Load the training table and keep only its first 20,000 rows for this demo.
train = pd.read_csv(dataset + "train.csv").head(20000)
train.shape

(20000, 14)

In [4]:
# Preview the first five rows of the loaded training frame.
train.head()

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_1,1116.667429,875.2406,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.86416611,6.786041,105.28450172
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678033,7.147286,101.22477418
2,mof_unit_3,1089.818728,773.68796,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.26372569,6.347967,118.98701075
3,mof_unit_4,2205.198301,1304.63872,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.70137692,6.190085,187.6260045
4,mof_unit_5,1137.800963,901.73612,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.00183795,6.478063,79.21000066


In [5]:
# Map each column position to its name so columns can be looked up by index.
feat = dict(enumerate(train.columns.tolist()))
feat

{0: 'MOFname',
 1: 'volume [A^3]',
 2: 'weight [u]',
 3: 'surface_area [m^2/g]',
 4: 'void_fraction',
 5: 'void_volume [cm^3/g]',
 6: 'functional_groups',
 7: 'metal_linker',
 8: 'organic_linker1',
 9: 'organic_linker2',
 10: 'topology',
 11: 'CO2/N2_selectivity',
 12: 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
 13: 'CO2_working_capacity [mL/g]'}

## 2. Clean data

In [6]:
# Keep the numeric descriptors plus the target (last column) and drop
# MOFname, functional_groups and topology.
# Columns are selected by name rather than by position so the cell can be
# re-run safely: positional selection on the already-reduced frame would
# index out of bounds.
selected_columns = [
    "volume [A^3]",
    "weight [u]",
    "surface_area [m^2/g]",
    "void_fraction",
    "void_volume [cm^3/g]",
    "metal_linker",
    "organic_linker1",
    "organic_linker2",
    "CO2/N2_selectivity",
    "heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]",
    "CO2_working_capacity [mL/g]",
]
train = train.loc[:, selected_columns]
train

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,1116.667429,875.240600,0.00,0.07899,0.0607,3,4,11,22.86416611,6.786041,105.28450172
1,2769.503842,2211.697211,603.61,0.13794,0.1040,10,44,57,33.61678033,7.147286,101.22477418
2,1089.818728,773.687960,788.50,0.14874,0.1262,2,22,24,19.26372569,6.347967,118.98701075
3,2205.198301,1304.638720,1441.53,0.21814,0.2220,9,17,24,25.70137692,6.190085,187.62600450
4,1137.800963,901.736120,0.00,0.07778,0.0591,2,1,22,30.00183795,6.478063,79.21000066
...,...,...,...,...,...,...,...,...,...,...,...
19995,5189.923599,2423.541440,2944.18,0.33672,0.4342,12,11,11,8.85242576,4.787996,63.35786482
19996,13710.436745,4426.378000,3208.43,0.44798,0.8356,3,3,3,14.80568930,4.909318,90.66697340
19997,1215.143789,691.014960,2102.05,0.33106,0.3506,3,1,19,18.93154465,5.574042,78.37355779
19998,1271.408263,713.463760,1818.66,0.22698,0.2436,2,12,13,19.42797408,5.772102,114.99909150


In [7]:
# Report, for each column, whether it contains at least one NaN
# (axis=0 reduces over the rows, so the result has one entry per column).
train.isnull().any(axis=0)

volume [A^3]                                     False
weight [u]                                       False
surface_area [m^2/g]                             False
void_fraction                                    False
void_volume [cm^3/g]                             False
metal_linker                                     False
organic_linker1                                  False
organic_linker2                                  False
CO2/N2_selectivity                               False
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    False
CO2_working_capacity [mL/g]                      False
dtype: bool

In [8]:
# Report, for each column, whether it contains at least one infinite value
# (axis=0 reduces over the rows, so the result has one entry per column).
np.isinf(train).any(axis=0)

volume [A^3]                                     False
weight [u]                                       False
surface_area [m^2/g]                             False
void_fraction                                    False
void_volume [cm^3/g]                             False
metal_linker                                     False
organic_linker1                                  False
organic_linker2                                  False
CO2/N2_selectivity                               False
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    False
CO2_working_capacity [mL/g]                      False
dtype: bool

## 3. Prepare training and test sets

Prepare input
- x_train = train input
- y_train = train output
- x_test = test input
- y_true = test output (ground truth used to score the predictions)

In [9]:
# Fraction of the rows used for training; the remainder is held out for testing.
ratio = 0.8
# Use `ratio` itself (it was previously defined but ignored in favour of a
# second hard-coded 0.8), so changing it actually changes the split.
train_size = math.floor(train.shape[0] * ratio)
test_size = train.shape[0] - train_size
print("dataset size:", train.shape[0])
print("train size:", train_size)
print("test size:", test_size)

dataset size: 20000
train size: 16000
test size: 4000


In [10]:
# Rebuild the position -> column-name map for the reduced frame.
feat = dict(enumerate(train.columns.tolist()))
feat

{0: 'volume [A^3]',
 1: 'weight [u]',
 2: 'surface_area [m^2/g]',
 3: 'void_fraction',
 4: 'void_volume [cm^3/g]',
 5: 'metal_linker',
 6: 'organic_linker1',
 7: 'organic_linker2',
 8: 'CO2/N2_selectivity',
 9: 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
 10: 'CO2_working_capacity [mL/g]'}

In [11]:
# Inspect the column dtypes before casting the inputs to float32.
train.dtypes

volume [A^3]                                     float64
weight [u]                                       float64
surface_area [m^2/g]                             float64
void_fraction                                    float64
void_volume [cm^3/g]                             float64
metal_linker                                       int64
organic_linker1                                    int64
organic_linker2                                    int64
CO2/N2_selectivity                               float64
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    float64
CO2_working_capacity [mL/g]                      float64
dtype: object

In [12]:
# Separate inputs (every column but the last) from the target (last column),
# cast both to float32, then cut them at `train_size` in file order.
features = train.iloc[:, :-1].astype(np.float32)
target = train.iloc[:, -1].astype(np.float32)
split_end = train_size + test_size

x_train, y_train = features.iloc[:train_size], target.iloc[:train_size]
x_test, y_true = features.iloc[train_size:split_end], target.iloc[train_size:split_end]

In [13]:
# Confirm that every input column was cast to float32.
x_train.dtypes


volume [A^3]                                     float32
weight [u]                                       float32
surface_area [m^2/g]                             float32
void_fraction                                    float32
void_volume [cm^3/g]                             float32
metal_linker                                     float32
organic_linker1                                  float32
organic_linker2                                  float32
CO2/N2_selectivity                               float32
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    float32
dtype: object

In [14]:
# Display the training inputs (the target column is no longer present).
x_train

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,1116.66748047,875.24060059,0.00000000,0.07899000,0.06070000,3.0,4.0,11.0,22.86416626,6.78604078
1,2769.50390625,2211.69726562,603.60998535,0.13794000,0.10400000,10.0,44.0,57.0,33.61677933,7.14728594
2,1089.81872559,773.68798828,788.50000000,0.14873999,0.12620001,2.0,22.0,24.0,19.26372528,6.34796715
3,2205.19824219,1304.63867188,1441.53002930,0.21814001,0.22200000,9.0,17.0,24.0,25.70137787,6.19008493
4,1137.80090332,901.73614502,0.00000000,0.07778000,0.05910000,2.0,1.0,22.0,30.00183868,6.47806311
...,...,...,...,...,...,...,...,...,...,...
15995,1118.13647461,879.33984375,724.85998535,0.15647000,0.11980000,2.0,1.0,12.0,19.48492050,6.61062479
15996,1042.06066895,715.28967285,571.57000732,0.10680000,0.09370000,3.0,10.0,23.0,32.47252655,7.10846806
15997,6099.76220703,3157.22558594,2195.46997070,0.36535999,0.42510000,12.0,25.0,25.0,21.63723183,5.80341291
15998,3866.07568359,2765.51123047,733.80999756,0.15019999,0.12639999,4.0,1.0,24.0,39.14509964,6.60240793


## 4. Neural network

### 4.1: Feed forward neural network

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2021-10-06 14:04:43.488154: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-06 14:04:43.488194: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [16]:
# Fully connected regressor: three ReLU hidden layers (12 -> 24 -> 12 units)
# followed by a single linear output unit.
n_features = x_train.shape[1]
model = Sequential([
    Dense(12, input_dim=n_features, activation='relu'),  # hidden 1 (also declares the input width)
    Dense(24, activation='relu'),                        # hidden 2
    Dense(12, activation='relu'),                        # hidden 3
    Dense(1, activation='linear'),                       # output
])

2021-10-06 14:04:47.571585: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-06 14:04:47.574211: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-06 14:04:47.574231: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-06 14:04:47.574249: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (p-7ac02acf-7324-4951-a962-19eb20207f2c): /proc/driver/nvidia/version does not exist
2021-10-06 14:04:47.574487: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operatio

In [17]:
# Train with mean absolute error as the loss.
# 'accuracy' is a classification metric and carries no meaning for this
# continuous target, so mean squared error is tracked instead.
model.compile(loss='mae', optimizer='adam', metrics=['mse'])
model.fit(x_train, y_train, epochs=88, batch_size=64)

2021-10-06 14:04:52.768353: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-06 14:04:52.776231: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2499995000 Hz
Epoch 1/88
Epoch 2/88
Epoch 3/88
Epoch 4/88
Epoch 5/88
Epoch 6/88
Epoch 7/88
Epoch 8/88
Epoch 9/88
Epoch 10/88
Epoch 11/88
Epoch 12/88
Epoch 13/88
Epoch 14/88
Epoch 15/88
Epoch 16/88
Epoch 17/88
Epoch 18/88
Epoch 19/88
Epoch 20/88
Epoch 21/88
Epoch 22/88
Epoch 23/88
Epoch 24/88
Epoch 25/88
Epoch 26/88
Epoch 27/88
Epoch 28/88
Epoch 29/88
Epoch 30/88
Epoch 31/88
Epoch 32/88
Epoch 33/88
Epoch 34/88
Epoch 35/88
Epoch 36/88
Epoch 37/88
Epoch 38/88
Epoch 39/88
Epoch 40/88
Epoch 41/88
Epoch 42/88
Epoch 43/88
Epoch 44/88
Epoch 45/88
Epoch 46/88
Epoch 47/88
Epoch 48/88
Epoch 49/88
Epoch 50/88
Epoch 51/88
Epoch 52/88
Epoch 53/88
Epoch 54/88
Epoch 55/88
Epoch 56/88
Epoch 57/88
Epoch 58/88
Epoch 59/88
Epoch 60/88
Epoch 61/88
Epoch

<tensorflow.python.keras.callbacks.History at 0x7f621d4c0690>

<details>
<summary> <font color='green'>Click here for some more information about the hyperparameters of the neural network</font></summary>
We use MAE as a loss function in the neural network but we use LMAE as a metric in our competition. Is this reasonable?
</details>

In [18]:
# Predict the target for the held-out rows; the result is displayed below.
y_pred = model.predict(x_test)
y_pred

array([[179.12062 ],
       [ 79.922264],
       [341.51205 ],
       ...,
       [105.396   ],
       [121.42011 ],
       [186.655   ]], dtype=float32)

In [19]:
# Natural log of the mean absolute error on the held-out rows.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; MAE is
# symmetric, so the value is unchanged, but the call now matches the API.
# NOTE(review): custom_loss in section 4.2 uses log10 while this uses the
# natural log - confirm which base the competition's LMAE metric defines.
log_mae = np.log(mean_absolute_error(y_true, y_pred))
log_mae

3.4628491

### 4.2: Customization

In [20]:
import tensorflow.keras.backend as kb

def custom_loss(y_actual, y_pred):
    """Return log10 of the batch mean absolute error.

    Computes ``log10(sum(|y_actual - y_pred|) / batch_size)``, where
    ``batch_size`` is the first dimension of ``y_pred``.

    Args:
        y_actual: Tensor of target values for the batch.
        y_pred: Tensor of model predictions for the batch.

    Returns:
        Scalar tensor holding the log10 of the mean absolute error.
    """
    # Align the targets with the predictions so a (batch,) target cannot
    # broadcast against a (batch, 1) prediction into a (batch, batch) matrix.
    y_actual = tf.reshape(tf.cast(y_actual, y_pred.dtype), tf.shape(y_pred))
    # Use the dynamic batch size: the static ``shape[0]`` can be None while
    # the loss is being traced, which makes the division fail.
    batch_size = tf.cast(tf.shape(y_pred)[0], y_pred.dtype)
    mean_abs_error = tf.reduce_sum(tf.abs(y_actual - y_pred)) / batch_size
    return tf.experimental.numpy.log10(mean_abs_error)

In [27]:
# Re-compile the same model with the log-MAE loss and keep training.
# NOTE: compile() does not re-initialise the layers, so this fit continues
# from the weights already learned with the MAE loss rather than from scratch.
# 'accuracy' is a classification metric; report plain MAE instead so the
# training progress stays interpretable next to the log-scaled loss.
model.compile(loss=custom_loss, optimizer='adam', metrics=['mae'])
model.fit(x_train, y_train, epochs=60, verbose=0, batch_size=64)

<tensorflow.python.keras.callbacks.History at 0x7f61cc6c98d0>

In [28]:
# Predict the held-out rows with the re-trained model.
y_pred = model.predict(x_test)
# Natural log of the mean absolute error, with the arguments in
# scikit-learn's documented (y_true, y_pred) order (the value is the same
# either way because MAE is symmetric).
log_mae = np.log(mean_absolute_error(y_true, y_pred))
log_mae

3.4239626

## 5. Predicting CO<sub>2</sub> WC 

### Import pretest and preprocessing

Today we will prepare a submission file for pretest set for the phase 1 (Development).

In [None]:
# Load the pretest table from the same directory prefix as the training data
# and show its (rows, columns) shape.
pretest = pd.read_csv(dataset + "pretest.csv")
pretest.shape

(2000, 13)

In [None]:
# Replace the two text columns with integer category codes.
# NOTE(review): the codes are derived from pretest alone, and neither column
# is among the columns passed to model.predict further down - confirm this
# encoding is still needed.
categorical_columns = ["functional_groups", "topology"]
for column in categorical_columns:
    encoded = pretest[column].astype("category").cat.codes
    pretest[column] = encoded

pretest.dtypes

MOFname                                           object
volume [A^3]                                     float64
weight [u]                                       float64
surface_area [m^2/g]                             float64
void_fraction                                    float64
void_volume [cm^3/g]                             float64
functional_groups                                  int16
metal_linker                                       int64
organic_linker1                                    int64
organic_linker2                                    int64
topology                                            int8
CO2/N2_selectivity                               float64
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    float64
dtype: object

In [None]:
# Display the pretest frame after the categorical columns were encoded.
pretest

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_pretest_1,6288.293858,2271.687140,4148.48,0.41225,0.6872,305,4,7,27,0,14.04879074,5.604779
1,mof_unit_pretest_2,1790.506437,887.747320,2191.34,0.30231,0.3672,149,2,4,26,5,20.21722169,6.148776
2,mof_unit_pretest_3,2348.969203,1239.765880,2030.88,0.28533,0.3256,371,3,18,22,5,33.10866151,6.164397
3,mof_unit_pretest_4,2941.571525,1147.951400,3587.13,0.41963,0.6475,91,2,8,15,5,12.80056168,5.164957
4,mof_unit_pretest_5,705.397601,643.270740,0.00,0.07060,0.0466,65,3,10,22,5,23.39561652,7.090687
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,mof_unit_pretest_1996,5111.109714,1578.082220,3630.31,0.58981,1.1504,49,2,1,14,4,4.77869779,3.675003
1996,mof_unit_pretest_1997,911.269336,481.279680,2546.02,0.36132,0.4120,26,2,10,20,5,6.88338750,3.732121
1997,mof_unit_pretest_1998,4236.596494,1127.792600,4296.42,0.60298,1.3641,143,2,7,20,5,5.27073403,3.354425
1998,mof_unit_pretest_1999,22861.645381,3492.712720,6252.01,0.75732,2.9852,186,2,6,11,4,3.17291438,2.643592


### Let's predict and create a submission file

Join the [Codalab competition](https://competitions.codalab.org/competitions/34540) for this course!

Create a `submission.csv` with your predictions to join the competition and upload it to the competition site.

In [None]:
# Select the pretest inputs by name, using the training feature columns in
# their training order, instead of hard-coded positions: this keeps the
# inputs aligned with what the model was fitted on even if the pretest
# column layout changes. The cast matches the float32 dtype used for training.
pretest_features = pretest.loc[:, x_train.columns].astype(np.float32)
pretest_pred = model.predict(pretest_features)
pretest_pred

array([[ 82.290665],
       [129.65016 ],
       [213.65913 ],
       ...,
       [ 37.44519 ],
       [ 42.51303 ],
       [ 63.10031 ]], dtype=float32)

In [None]:
# The model returns an (n, 1) array; take its single column as a flat vector.
predictions = pretest_pred[:, 0]

# Build the ids from the number of predictions rather than a hard-coded 2000,
# so the frame stays consistent if the pretest size changes.
submission = pd.DataFrame({
    "id": ["pretest_" + str(i) for i in range(1, len(predictions) + 1)],
    "CO2_working_capacity [mL/g]": predictions,
})

submission.to_csv("submission.csv", index=False)

In [None]:
# List the files in the working directory (submission.csv should now be present).
!ls

 1-neural-network-with-tensorflow.ipynb   test.csv
 2-neural-network-MOFs.ipynb		  TestSubmission01_0210.zip
 2-neural-network-MOFs-mon.ipynb	  TestSubmission01_0536.zip
 best_model_HDF5_format.h5		  TestSubmission03byBank_0210.zip
 best_model_SavedModel_format		  TestSubmission03byOshi_0683.zip
 init.ipynb				  train.csv
 pretest.csv				 'Zero -To-Hero-ML.ipynb'
 submission.csv				 'Zero -To-Hero-ML-Phase1-Recov.ipynb'


In [None]:
%%capture
!sudo apt-get update
!sudo apt-get install zip

In [None]:
# Package submission.csv into a zip archive (presumably the file that is
# uploaded to the competition site - confirm the expected archive name).
!zip TestSubmission01_0536.zip submission.csv

updating: submission.csv (deflated 66%)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=89ff0356-6e0b-40ab-922e-602a46fe5c41' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>