In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict

In [2]:
df_train = pd.read_csv('tmlcc-2021/train.csv')
df_test = pd.read_csv('tmlcc-2021/test.csv')
df_pre_train = pd.read_csv('tmlcc-2021/pretest.csv')

In [3]:
df_train.head()

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_1,1116.667429,875.2406,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678,7.147286,101.224774
2,mof_unit_3,1089.818728,773.68796,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.63872,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
4,mof_unit_5,1137.800963,901.73612,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68613 entries, 0 to 68612
Data columns (total 14 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   MOFname                                        68613 non-null  object 
 1   volume [A^3]                                   68613 non-null  float64
 2   weight [u]                                     68613 non-null  float64
 3   surface_area [m^2/g]                           68613 non-null  float64
 4   void_fraction                                  68613 non-null  float64
 5   void_volume [cm^3/g]                           68613 non-null  float64
 6   functional_groups                              68290 non-null  object 
 7   metal_linker                                   68613 non-null  int64  
 8   organic_linker1                                68613 non-null  int64  
 9   organic_linker2                                686

In [5]:
df_train = df_train.dropna()

In [6]:
df_train = df_train.loc[(df_train['void_volume [cm^3/g]'] > 0) & (df_train['void_fraction'] > 0) & (df_train['surface_area [m^2/g]'] > 0) & (df_train['volume [A^3]'] > 0)]

In [7]:
df_train, y = df_train.loc[:, 'volume [A^3]':'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'], df_train.loc[:,'CO2_working_capacity [mL/g]']

In [8]:
dummies_meatl = pd.get_dummies(df_train['metal_linker'], prefix='met')
dummies_topology = pd.get_dummies(df_train['metal_linker'], prefix='topo')
df_onehot = pd.concat([dummies_meatl, dummies_topology], axis = 1)

In [9]:
df_train = df_train.drop(['functional_groups', 'metal_linker', 'organic_linker1', 'organic_linker2', 'topology', 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'], axis=1)

In [10]:
df_train.head()

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],CO2/N2_selectivity
1,2769.503842,2211.697211,603.61,0.13794,0.104,33.61678
2,1089.818728,773.68796,788.5,0.14874,0.1262,19.263726
3,2205.198301,1304.63872,1441.53,0.21814,0.222,25.701377
5,3954.659761,1543.02768,2430.55,0.37094,0.5725,17.146541
6,3565.914939,1954.749656,1530.02,0.33337,0.3662,18.363791


In [11]:
df_train

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],CO2/N2_selectivity
1,2769.503842,2211.697211,603.61,0.13794,0.1040,33.616780
2,1089.818728,773.687960,788.50,0.14874,0.1262,19.263726
3,2205.198301,1304.638720,1441.53,0.21814,0.2220,25.701377
5,3954.659761,1543.027680,2430.55,0.37094,0.5725,17.146541
6,3565.914939,1954.749656,1530.02,0.33337,0.3662,18.363791
...,...,...,...,...,...,...
66519,1426.479810,1272.451540,1343.62,0.30190,0.2038,5.867674
66520,23943.701366,5497.752320,4182.24,0.66340,1.7399,4.060772
66521,14389.971556,4396.164320,4149.64,0.57051,1.1246,4.313411
66522,16997.806645,3932.703680,4326.62,0.66963,1.7430,3.447440


In [12]:
normalized_df = (df_train-df_train.mean()) / df_train.std()

In [13]:
normalized_df

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],CO2/N2_selectivity
1,-0.201702,0.419680,-1.268376,-1.171988,-0.705051,0.790893
2,-0.519554,-0.690281,-1.112749,-1.097363,-0.660725,-0.143390
3,-0.308487,-0.280455,-0.563078,-0.617827,-0.469444,0.275656
5,0.022570,-0.096448,0.269405,0.437981,0.230389,-0.281203
6,-0.050994,0.221349,-0.488593,0.178382,-0.181524,-0.201969
...,...,...,...,...,...,...
66519,-0.455847,-0.305299,-0.645491,-0.039067,-0.505783,-1.015378
66520,3.805167,2.956099,1.743847,2.458805,2.561303,-1.132995
66521,1.997281,2.105812,1.716406,1.816959,1.332752,-1.116550
66522,2.490770,1.748079,1.865375,2.501852,2.567493,-1.172918


In [14]:
df_train = pd.concat([normalized_df, df_onehot], axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y, test_size=0.33, random_state=42)

In [16]:
reg = LinearRegression()

In [17]:
reg.fit(X_train, y_train)

LinearRegression()

In [19]:
y_pred = reg.predict(X_test)

In [20]:
from sklearn.metrics import mean_absolute_error

In [21]:
np.log(mean_absolute_error(y_test, y_pred))

3.4717952373559036

In [22]:
df_test

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,pcu,36.639791,7.005640
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,acs,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,pcu,13.062850,5.045400
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,pcu,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,pcu,12.974954,5.287639
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,nbo,4.536626,3.146698
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,nbo,6.745508,3.658871
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,pcu,4.666206,3.593052
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,pcu,4.823305,3.454497


### Test

In [24]:
df_test = df_test.loc[:, 'volume [A^3]':'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]']

dummies_meatl = pd.get_dummies(df_test['metal_linker'], prefix='met')
dummies_topology = pd.get_dummies(df_test['metal_linker'], prefix='topo')
df_onehot = pd.concat([dummies_meatl, dummies_topology], axis = 1)

df_test = df_test.drop(['functional_groups', 'metal_linker', 'organic_linker1', 'organic_linker2', 'topology', 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'], axis=1)

In [25]:
normalized_df = (df_test-df_test.mean()) / df_test.std()

In [26]:
df_test = pd.concat([normalized_df, df_onehot], axis=1)

In [32]:
pred = reg.predict(df_test)

In [33]:
ID = np.arange(68614, 85614)

In [40]:
pd.DataFrame({'id':ID, 'CO2_working_capacity [mL/g]': pred}).set_index('id').to_csv('wonderland01.csv')