In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error
from mordred import Calculator, descriptors
import pandas as pd
import numpy as np
from rdkit import Chem
from sklearn.metrics import mean_absolute_error as mae
from hyperopt import hp, tpe, Trials, STATUS_OK, fmin
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge,ARDRegression,BayesianRidge,ElasticNet,HuberRegressor
from sklearn.linear_model import Lasso, LassoLars, LinearRegression, LogisticRegression, PassiveAggressiveRegressor,Ridge,SGDRegressor

In [None]:
# Read the train and validation splits of fold 1 and stack them into a single
# frame with a fresh 0..n-1 index; `data` feeds the descriptor calculation below.
traindata = pd.read_csv(f'data/train_{1}_group_co2.csv')
valdata = pd.read_csv(f'data/val_{1}_group_co2.csv')
data=pd.concat([traindata, valdata],ignore_index=True)
data.head()

In [None]:
# List the available columns (the cells below read 'new_cation', 'new_anion',
# 'T', 'P' and 'CO2-exp' from this frame).
data.columns

In [None]:
# --- Mordred descriptors for both ions -------------------------------------
# A single calculator serves cations and anions (ignore_3D=True is passed so
# only descriptors that do not need 3D coordinates are registered).
calc = Calculator(descriptors, ignore_3D=True)

smile = list(data['new_cation'])
mols = [Chem.MolFromSmiles(smi) for smi in smile]
data_md_C = calc.pandas(mols)

smile = list(data['new_anion'])
mols = [Chem.MolFromSmiles(smi) for smi in smile]
data_md_A = calc.pandas(mols)

data_md_C = data_md_C.astype('float64')
data_md_A = data_md_A.astype('float64')

# Treat +/-inf as missing, then drop every descriptor column that has any
# missing value.
data_md_C = data_md_C.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
data_md_A = data_md_A.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Suffix the names so cation and anion descriptors stay distinguishable.
data_md_C.columns = [i + '_C' for i in data_md_C.columns]
data_md_A.columns = [i + '_A' for i in data_md_A.columns]
data_md = pd.concat([data_md_C, data_md_A], axis=1)

# --- Correlation filter -----------------------------------------------------
# Column j is discarded when ANY earlier column i < j has corr[i, j] >= 0.9.
# Only the strict upper triangle is inspected; NaN correlations compare False
# and therefore never cause a drop. This is the vectorised equivalent of a
# nested loop over all (i, j) pairs.
# NOTE(review): the test is on the signed coefficient, so strongly negative
# correlations (<= -0.9) are kept. Confirm that this is intended.
corr = data_md.corr()
upper = np.triu(corr.to_numpy(), k=1)
columns = ~(upper >= 0.9).any(axis=0)
selected_columns = data_md.columns[columns]

# .copy() gives an independent frame so that adding columns below does not
# write into a view of the unfiltered table (avoids SettingWithCopyWarning).
data_md = data_md[selected_columns].copy()
data_md['T'] = data['T']
data_md['P'] = data['P']
data_md['CO2-exp'] = data['CO2-exp']

In [None]:
# Columns with fewer than two distinct non-NaN values carry no information for
# a regressor; collect them first, then remove them from data_md in one call.
constant_cols = [name for name in data_md.columns if data_md[name].nunique() < 2]
data_md.drop(columns=constant_cols, inplace=True)
selected_columns = data_md.columns
len(selected_columns)

In [None]:
# Preview the filtered descriptor table (descriptors plus T, P and CO2-exp).
data_md.head()

In [None]:
from BorutaShap import BorutaShap
# BorutaShap feature selection with a CatBoost regressor as the underlying
# model: SHAP-based importances, regression mode, 100 trials.
# Inputs are every column of data_md except the target 'CO2-exp'.
model =CatBoostRegressor(random_state=10, verbose=False)
x = data_md.drop(columns=['CO2-exp'])
y = data_md['CO2-exp']
Feature_Selector = BorutaShap(model=model,importance_measure='shap',classification=False)
Feature_Selector.fit(X=x, y=y, n_trials=100, train_or_test = 'test', normalize=True, verbose=True)

In [None]:
# Retrieve the feature subset produced by the fitted selector and preview it.
subset = Feature_Selector.Subset()
subset.head()

In [None]:
import pickle
# The selected feature names were written once with the snippet below and are
# reloaded here, so later cells do not depend on re-running feature selection:
#     with open('CO2_col.dump', 'wb') as f:
#         pickle.dump(list(subset.columns), f)
# NOTE: pickle executes arbitrary code on load; only load a dump file that
# this notebook produced itself.
with open("CO2_col.dump", "rb") as f:
    col_input = pickle.load(f)

In [None]:
def get_data_md(data):
    """Build the model-ready descriptor table for one data split.

    Mordred descriptors are computed for the SMILES in ``data['new_cation']``
    (column names suffixed ``_C``) and ``data['new_anion']`` (suffixed ``_A``),
    cast to float64 and placed side by side. ``T``, ``P`` and ``CO2-exp`` are
    copied over from ``data``, and the result is restricted to the feature
    names in the module-level ``col_input`` list plus the target ``CO2-exp``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'new_cation', 'new_anion', 'T', 'P' and
        'CO2-exp'.

    Returns
    -------
    pandas.DataFrame
        Columns ``col_input + ['CO2-exp']``, one row per row of ``data``.

    Raises
    ------
    KeyError
        If a required column is missing from ``data`` or a name in
        ``col_input`` is not among the computed columns.
    """
    # One calculator is enough for both ions.
    calc = Calculator(descriptors, ignore_3D=True)

    def _ion_descriptors(smiles_column, suffix):
        # Descriptors for one ion type, as float64 with suffixed column names.
        mols = [Chem.MolFromSmiles(smi) for smi in data[smiles_column]]
        frame = calc.pandas(mols).astype('float64')
        frame.columns = [name + suffix for name in frame.columns]
        return frame

    data_md = pd.concat(
        [_ion_descriptors('new_cation', '_C'), _ion_descriptors('new_anion', '_A')],
        axis=1,
    )

    # Attach by position rather than by index label, so a caller passing a
    # frame with a non-default index (e.g. a filtered slice) does not get
    # NaNs from index misalignment.
    for name in ('T', 'P', 'CO2-exp'):
        data_md[name] = data[name].to_numpy()

    col = col_input + ['CO2-exp']
    return data_md[col]

In [None]:
# For each of the five folds: read the raw train/validation CSVs, convert them
# to descriptor tables with get_data_md, and write the results next to the
# originals with an '_md' suffix. Later cells read these '_md' files.
for i in range(1, 6):
    traindata = pd.read_csv(f'data/train_{i}_group_co2.csv')
    valdata = pd.read_csv(f'data/val_{i}_group_co2.csv')
    traindata_md = get_data_md(traindata)
    valdata_md = get_data_md(valdata)
    traindata_md.to_csv(f'data/train_{i}_group_co2_md.csv',index=False)
    valdata_md.to_csv(f'data/val_{i}_group_co2_md.csv',index=False)

In [None]:
def conv_data_pd(data):
    """Separate a descriptor table into model inputs and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a 'CO2-exp' column; it is not modified.

    Returns
    -------
    tuple
        ``(x_pd, y)`` where ``x_pd`` is ``data`` without the 'CO2-exp' column
        (still a DataFrame) and ``y`` is the 'CO2-exp' column as an array.
    """
    features = data.drop(labels=['CO2-exp'], axis=1)
    target = data['CO2-exp'].values
    return features, target

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer, QuantileTransformer

# Candidate regressors and input scalers; every (model, scaler) pair is scored
# on the five pre-split folds.
models = [CatBoostRegressor(verbose =False,random_state=10),XGBRegressor(random_state=10), svm.SVR(),RandomForestRegressor(random_state=10),
          AdaBoostRegressor(random_state=10),BaggingRegressor(random_state=10),GradientBoostingRegressor(random_state=10),
         Lasso(), Ridge(random_state=10)]
scaler = [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(), PowerTransformer(), QuantileTransformer(random_state=40)]

# Load every fold once up front. The fold data does not depend on the model or
# scaler, so re-reading the CSVs inside the nested loops is redundant work.
folds = []
for i in range(1, 6):
    train_data = pd.read_csv(f'data/train_{i}_group_co2_md.csv')
    val_data = pd.read_csv(f'data/val_{i}_group_co2_md.csv')
    x_train_pd, y_train = conv_data_pd(train_data)
    x_val_pd, y_val = conv_data_pd(val_data)
    folds.append((x_train_pd.values, y_train, x_val_pd.values, y_val))

# Collect one record per (model, scaler) pair and build the frame once at the
# end, which also gives the metric columns a numeric dtype.
records = []
for model in models:
    for sc in scaler:
        t_rmse = []
        t_r2 = []
        v_rmse = []
        v_r2 = []
        for x_train_raw, y_train, x_val_raw, y_val in folds:
            # The scaler is fitted on the training part only and then applied
            # to the validation part.
            x_train = sc.fit_transform(x_train_raw)
            x_val = sc.transform(x_val_raw)
            model.fit(x_train, y_train)
            y_val_pred = model.predict(x_val)
            y_train_pred = model.predict(x_train)
            t_rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
            v_rmse.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
            t_r2.append(r2_score(y_train, y_train_pred))
            v_r2.append(r2_score(y_val, y_val_pred))
        records.append({
            'train_rmse': np.mean(t_rmse),
            'train_r2': np.mean(t_r2),
            # Validation RMSE is reported as mean + std across folds, which
            # penalises fold-to-fold variability.
            'test_rmse': np.mean(v_rmse) + np.std(v_rmse),
            # NOTE(review): for R2 (higher is better) adding the std rewards
            # variability instead of penalising it; mean - std would mirror
            # the RMSE treatment. Left unchanged pending confirmation.
            'test_r2': np.mean(v_r2) + np.std(v_r2),
            'name': model.__class__.__name__,
            'scaler': sc.__class__.__name__,
        })

results = pd.DataFrame(records, columns=['train_rmse','train_r2', 'test_rmse','test_r2', 'name', 'scaler'])


In [None]:
# Rank the (model, scaler) combinations from lowest to highest test_rmse.
results = results.sort_values(by='test_rmse')
results.head()

In [None]:
# Persist the model/scaler comparison table (index column omitted).
results.to_excel('data/co2_model_selection_MD.xlsx', index=False)

In [None]:
# Hyperopt search space for the CatBoostRegressor keyword arguments.
# 'depth' and 'iterations' are sampled with quniform (step 1) and cast to int
# in objective() before the model is built; 'learning_rate' is sampled
# log-uniformly between 0.0001 and 0.025; the remaining entries are uniform.
space = {'depth': hp.quniform('depth', 1,6,1),
         'l2_leaf_reg': hp.uniform('l2_leaf_reg', 3, 100.0),
          'learning_rate':hp.loguniform('learning_rate', np.log(0.0001), np.log(0.025)),
          'iterations':hp.quniform('iterations', 1, 1000, 1),
         'bagging_temperature':hp.uniform('bagging_temperature', 1, 200),
         'random_strength':hp.uniform('random_strength', 1, 200)}
def fit(params):
    """Score one CatBoost hyper-parameter set on the five pre-split folds.

    A single ``CatBoostRegressor(**params, random_state=10, verbose=False)``
    is re-fitted on each fold's training file and evaluated with RMSE on both
    the validation and the training data of that fold.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``CatBoostRegressor``.

    Returns
    -------
    tuple
        ``(mean + std of the validation RMSEs, mean of the training RMSEs)``.
    """
    def _rmse(y_true, y_pred):
        # Root-mean-squared error between observed and predicted values.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    regressor = CatBoostRegressor(**params, random_state=10, verbose=False)
    rmse_val_per_fold = []
    rmse_train_per_fold = []
    for fold in range(1, 6):
        fold_train = pd.read_csv(f'data/train_{fold}_group_co2_md.csv')
        fold_val = pd.read_csv(f'data/val_{fold}_group_co2_md.csv')
        train_frame, train_target = conv_data_pd(fold_train)
        val_frame, val_target = conv_data_pd(fold_val)
        train_inputs = train_frame.values
        val_inputs = val_frame.values

        regressor.fit(train_inputs, train_target)
        val_pred = regressor.predict(val_inputs)
        train_pred = regressor.predict(train_inputs)

        rmse_train_per_fold.append(_rmse(train_target, train_pred))
        rmse_val_per_fold.append(_rmse(val_target, val_pred))

    penalised_val = np.mean(rmse_val_per_fold) + np.std(rmse_val_per_fold)
    return penalised_val, np.mean(rmse_train_per_fold)

def objective(params):
    """Hyperopt objective: evaluate ``params`` with ``fit`` and log the result.

    Increments the module-level ``ITERATION`` counter, casts 'depth' and
    'iterations' to int in place (they are sampled with quniform in ``space``),
    scores the parameters with ``fit`` and appends one row
    ``[loss, train_loss, params, ITERATION]`` to the CSV at ``out_file``.

    Parameters
    ----------
    params : dict
        One sample from the search space; modified in place by the int casts.

    Returns
    -------
    dict
        Keys 'loss', 'train_loss', 'params', 'iteration' and 'status'
        (``STATUS_OK``).
    """
    global ITERATION
    ITERATION += 1
    for name in ['depth', 'iterations']:
        params[name] = int(params[name])
    loss, train_loss = fit(params)
    # Open/close per call so each trial's row is flushed to disk immediately
    # and no file handle is leaked; newline='' is what the csv module expects
    # for files handed to csv.writer.
    with open(out_file, 'a', newline='') as off_connection:
        writer = csv.writer(off_connection)
        writer.writerow([loss, train_loss, params, ITERATION])
    return {'loss':loss,'train_loss':train_loss, 'params': params, 'iteration':ITERATION, 'status':STATUS_OK}

import csv
# (Re)create the tuning log with only the header row; objective() appends one
# row per evaluated parameter set. The context manager guarantees the file is
# closed even if the write fails, and newline='' is what the csv module
# expects for files handed to csv.writer.
out_file ='data/CO2_MD_hyper.csv'
with open(out_file, 'w', newline='') as off_connection:
    writer = csv.writer(off_connection)
    writer.writerow(['loss','train_loss', 'params', 'iteration'])

# Search algorithm (hyperopt's TPE suggester) and the Trials object handed to
# fmin in the next cell.
tpe_algo = tpe.suggest
bayes_trial = Trials()

In [None]:
#%%capture
from hyperopt.early_stop import no_progress_loss
# Counter incremented by objective(); reset before every search run.
# (A `global` statement is a no-op at module level, so none is needed here.)
ITERATION = 0
# Up to 3000 evaluations, stopping early via no_progress_loss(100).
# The generator is seeded (10, matching the random_state used for the models
# in this notebook) so the sequence of sampled parameter sets is reproducible.
best = fmin(fn=objective, space=space, algo=tpe_algo, trials=bayes_trial,
            early_stop_fn=no_progress_loss(100), max_evals=3000,
            rstate=np.random.default_rng(10))

In [None]:
# Load the tuning log and order it so that row 0 holds the lowest loss.
result = (
    pd.read_csv('data/CO2_MD_hyper.csv')
    .sort_values('loss', ascending=True)
    .reset_index(drop=True)
)
result.head()

In [None]:
import ast
# The 'params' column stores each parameter dict as its text representation;
# parse the entry of the best row (index 0 after sorting) back into a dict.
best_params_text = result.loc[0, 'params']
params = ast.literal_eval(best_params_text)
params

In [None]:
# Read the test split and convert it to the same descriptor layout as the
# fold files (columns from col_input plus 'CO2-exp').
testdata = pd.read_csv(f'data/test_group_co2.csv')
testdata_md = get_data_md(testdata)

In [None]:
# Save the test descriptor table, then read it back from disk so the frame
# used below is the CSV round-tripped version.
testdata_md.to_csv(f'data/test_group_co2_md.csv', index=False)
testdata_md=pd.read_csv(f'data/test_group_co2_md.csv')
testdata_md.head()

In [None]:
# Stack the train and validation descriptor tables of fold 1 into one
# training frame (fresh 0..n-1 index) for the final model.
train_data = pd.read_csv(f'data/train_{1}_group_co2_md.csv')
val_data = pd.read_csv(f'data/val_{1}_group_co2_md.csv')
train_data_merge= pd.concat([train_data, val_data],ignore_index=True)
train_data_merge.head()

In [None]:
# Split the merged training frame and the test frame into inputs and targets.
x_train_pd, y_train = conv_data_pd(train_data_merge)
x_train = x_train_pd.values
x_test_pd, y_test = conv_data_pd(testdata_md)
x_test = x_test_pd.values

# Fit CatBoost with the tuned parameters, then predict on both sets.
model = CatBoostRegressor(**params, random_state=10, verbose=False)
model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)