In [None]:
import xgboost as xgb
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
from hyperopt import hp, tpe, Trials, STATUS_OK, fmin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from rdkit import DataStructs, Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge,ARDRegression,BayesianRidge,ElasticNet,HuberRegressor
from sklearn.linear_model import Lasso, LassoLars, LinearRegression, LogisticRegression, PassiveAggressiveRegressor,Ridge,SGDRegressor

In [None]:
class morgan_fp:
    """Callable featuriser turning a SMILES string into a hashed Morgan fingerprint vector.

    Parameters
    ----------
    radius : int
        Morgan radius passed to RDKit.
    length : int
        Number of entries in the hashed fingerprint, and so the length of the
        returned vector.
    """

    def __init__(self, radius, length):
        self.radius = radius
        self.length = length

    def __call__(self, smiles):
        """Return the fingerprint of ``smiles`` as a 1-D ``float32`` NumPy array.

        Raises
        ------
        ValueError
            If RDKit cannot parse ``smiles``.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; without this
            # check the failure shows up later as an opaque RDKit argument error
            # that does not say which input was bad.
            raise ValueError(f'Could not parse SMILES string: {smiles!r}')
        fp = AllChem.GetHashedMorganFingerprint(mol, self.radius, self.length)
        return np.array(list(fp)).astype('float32')

In [None]:
def conv_data_pd(data, fp):
    """Build the model feature table and target vector from a raw data frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'new_cation'`` and ``'new_anion'`` (inputs to
        ``fp``), ``'T'``, ``'P'`` and ``'CO2-exp'``.
    fp : callable
        Maps one entry of the cation/anion columns to a fixed-length 1-D array.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Features with columns ``c_fp_0..``, ``a_fp_0..``, ``T``, ``P`` (indexed
        like ``data``) and the ``'CO2-exp'`` values.

    Notes
    -----
    ``data`` is left unmodified; no helper fingerprint columns are written back
    into the caller's frame.
    """
    def _fingerprint_frame(column, prefix):
        # One row per sample, one column per fingerprint entry.
        matrix = np.array([fp(entry) for entry in data[column]])
        names = [f'{prefix}_fp_{i}' for i in range(matrix.shape[1])]
        # Reuse data's index: pd.concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex here would misalign with data[['T', 'P']] (NaN-filled
        # extra rows) whenever data has been filtered or otherwise re-indexed.
        return pd.DataFrame(data=matrix, index=data.index, columns=names)

    hh_final = pd.concat(
        [_fingerprint_frame('new_cation', 'c'),
         _fingerprint_frame('new_anion', 'a'),
         data[['T', 'P']]],
        axis=1,
    )
    y = data['CO2-exp'].values

    return hh_final, y

In [None]:
# Fingerprint featuriser shared by the later cells (radius 1, 1024 hashed entries).
fp = morgan_fp(1, 1024)

# Candidate regressors with default hyper-parameters; seeded wherever a seed is passed.
models = [
    CatBoostRegressor(verbose=False, random_state=10),
    XGBRegressor(random_state=10),
    svm.SVR(),
    RandomForestRegressor(random_state=10),
    AdaBoostRegressor(random_state=10),
    BaggingRegressor(random_state=10),
    GradientBoostingRegressor(random_state=10),
    Lasso(),
    Ridge(random_state=10),
]

# Per-model lists of per-fold scores, filled in the loop below.
metric_names = ['train_rmse', 'train_r2', 'test_rmse', 'test_r2']
fold_scores = [{metric: [] for metric in metric_names} for _ in models]

# Folds are the OUTER loop so each CSV pair is read and fingerprinted once
# instead of once per model. Every estimator is refit from scratch on each
# fold, so the scores do not depend on the loop order.
for i in range(1, 6):
    traindata = pd.read_csv(f'data/train_{i}_group_co2.csv')
    valdata = pd.read_csv(f'data/val_{i}_group_co2.csv')
    x_train_pd, y_train = conv_data_pd(traindata, fp)
    x_val_pd, y_val = conv_data_pd(valdata, fp)
    x_train = x_train_pd.values
    x_val = x_val_pd.values

    for model, scores in zip(models, fold_scores):
        model.fit(x_train, y_train)
        y_val_pred = model.predict(x_val)
        y_train_pred = model.predict(x_train)
        scores['train_rmse'].append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        scores['test_rmse'].append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
        scores['train_r2'].append(r2_score(y_train, y_train_pred))
        scores['test_r2'].append(r2_score(y_val, y_val_pred))

# The 'test_*' columns are computed on the validation folds and are pessimistic
# summaries across folds: mean + std for RMSE (lower is better) and mean - std
# for R2 (higher is better, so the spread must be subtracted, not added).
records = [
    {
        'train_rmse': np.mean(scores['train_rmse']),
        'train_r2': np.mean(scores['train_r2']),
        'test_rmse': np.mean(scores['test_rmse']) + np.std(scores['test_rmse']),
        'test_r2': np.mean(scores['test_r2']) - np.std(scores['test_r2']),
        'name': model.__class__.__name__,
    }
    for model, scores in zip(models, fold_scores)
]
# Build the frame in one go; row-by-row .loc assignment on an empty frame
# leaves every column with object dtype.
results = pd.DataFrame(records, columns=['train_rmse', 'train_r2', 'test_rmse', 'test_r2', 'name'])

In [None]:
# Rank the candidate models by their validation RMSE summary, best (lowest) first.
results = results.sort_values(by='test_rmse')
results.head()

In [None]:
# Hyperopt search space for the CatBoost hyper-parameters tuned below.
# The quniform entries are drawn as floats; `objective` casts 'depth' and
# 'iterations' to int before they reach CatBoost.
space = dict(
    depth=hp.quniform('depth', 1, 6, 1),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 3, 100.0),
    learning_rate=hp.loguniform('learning_rate', np.log(0.0001), np.log(0.025)),
    iterations=hp.quniform('iterations', 1, 1000, 1),
    bagging_temperature=hp.uniform('bagging_temperature', 1, 200),
    random_strength=hp.uniform('random_strength', 1, 200),
)

In [None]:
# fold number -> (fingerprint object used, (x_train, y_train, x_val, y_val)).
# Re-running this cell resets the cache.
_FOLD_CACHE = {}


def _load_fold(fold):
    """Return the featurised (x_train, y_train, x_val, y_val) of one CV fold, cached."""
    cached = _FOLD_CACHE.get(fold)
    # Recompute if the fold is new or the global `fp` was replaced since caching.
    if cached is None or cached[0] is not fp:
        traindata = pd.read_csv(f'data/train_{fold}_group_co2.csv')
        valdata = pd.read_csv(f'data/val_{fold}_group_co2.csv')
        x_train, y_train = conv_data_pd(traindata, fp)
        x_val, y_val = conv_data_pd(valdata, fp)
        cached = (fp, (x_train, y_train, x_val, y_val))
        _FOLD_CACHE[fold] = cached
    return cached[1]


def fit(params):
    """Cross-validate a CatBoostRegressor built from ``params`` on the five folds.

    Parameters
    ----------
    params : dict
        Must provide 'depth', 'l2_leaf_reg', 'learning_rate', 'iterations',
        'bagging_temperature' and 'random_strength'.

    Returns
    -------
    (float, float)
        ``mean + std`` of the per-fold validation RMSE (the quantity being
        minimised) and the mean per-fold training RMSE.

    Notes
    -----
    The featurised folds do not depend on ``params``, so they are loaded through
    ``_load_fold`` and reused across calls instead of being re-read and
    re-fingerprinted on every hyperopt trial. This keeps all five folds in
    memory; files changed on disk are only picked up after the cache is reset.
    """
    model = CatBoostRegressor(depth=params['depth'], l2_leaf_reg=params['l2_leaf_reg'],
                              learning_rate=params['learning_rate'],
                              iterations=params['iterations'],
                              bagging_temperature=params['bagging_temperature'],
                              random_strength=params['random_strength'],
                              random_state=10, verbose=False)
    val_loss = []
    train_loss = []
    for i in range(1, 6):
        x_train, y_train, x_val, y_val = _load_fold(i)
        model.fit(x_train, y_train)
        y_val_pred = model.predict(x_val)
        y_train_pred = model.predict(x_train)
        train_loss.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        val_loss.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
    return np.mean(val_loss) + np.std(val_loss), np.mean(train_loss)

def objective(params):
    """Hyperopt objective: evaluate one parameter set and log it to ``out_file``.

    Increments the global ``ITERATION`` counter, scores ``params`` with ``fit``,
    appends ``[loss, train_loss, params, ITERATION]`` as one CSV row to
    ``out_file`` and returns the result dictionary expected by ``fmin``.

    Parameters
    ----------
    params : dict
        One sample from the search space. The caller's dict is not modified.

    Returns
    -------
    dict
        Keys 'loss', 'train_loss', 'params', 'iteration' and 'status'.
    """
    global ITERATION
    ITERATION += 1
    # Values may arrive as NumPy scalars. Convert them to built-in numbers so the
    # str(params) written to the CSV can be read back with ast.literal_eval
    # whatever repr NumPy uses for its scalars (e.g. 'np.float64(0.1)').
    params = {name: (value.item() if isinstance(value, np.generic) else value)
              for name, value in params.items()}
    for name in ['depth', 'iterations']:
        params[name] = int(params[name])
    loss, train_loss = fit(params)
    # Open, append and close on every call so the handle is not leaked and the
    # row is flushed to disk before the next trial starts.
    with open(out_file, 'a', newline='') as off_connection:
        csv.writer(off_connection).writerow([loss, train_loss, params, ITERATION])
    return {'loss': loss, 'train_loss': train_loss, 'params': params,
            'iteration': ITERATION, 'status': STATUS_OK}

import csv

# CSV log of every hyperopt trial; (re)created here with just the header row.
out_file = 'data/co2_MF_hyper.csv'
# `with` closes the file even if the write fails; newline='' is what the csv
# module asks for so no extra blank lines appear on platforms that translate '\n'.
with open(out_file, 'w', newline='') as off_connection:
    writer = csv.writer(off_connection)
    writer.writerow(['loss', 'train_loss', 'params', 'iteration'])

# Search algorithm and trial history handed to fmin.
tpe_algo = tpe.suggest
bayes_trial = Trials()

In [None]:
from hyperopt.early_stop import no_progress_loss

# Trial counter incremented by `objective`; reset before every search.
ITERATION = 0

# Seed the sampler so the search is reproducible on Restart & Run All
# (10 matches the random_state used for the models in this notebook).
# The search stops early once the best loss has not improved for 100 trials.
best = fmin(fn=objective, space=space, algo=tpe_algo, trials=bayes_trial,
            early_stop_fn=no_progress_loss(100), max_evals=3000,
            rstate=np.random.default_rng(10))

In [None]:
# Reload the trial log written during the search and order it best (lowest loss) first.
result = (
    pd.read_csv('data/co2_MF_hyper.csv')
    .sort_values('loss')
    .reset_index(drop=True)
)
result.head()

In [None]:
import ast

# Row 0 is the best trial after sorting; its 'params' cell holds the dict as text.
params = ast.literal_eval(result.loc[0, 'params'])
params

In [None]:
# Test set used for the final model in the cells below.
testdata = pd.read_csv('data/test_group_co2.csv')
testdata.head()

In [None]:
# Recombine the train and validation parts of fold 1 into one training set.
# NOTE(review): presumably every fold partitions the same pool of rows, so any
# fold would yield the same merged set - confirm against how the folds were made.
traindata = pd.read_csv('data/train_1_group_co2.csv')
valdata = pd.read_csv('data/val_1_group_co2.csv')
train_data_merge = pd.concat((traindata, valdata), ignore_index=True)
train_data_merge.head()

In [None]:
# Refit CatBoost with the best hyper-parameters found by the search and predict
# on both the merged training set and the test set.
tuned_keys = ('depth', 'l2_leaf_reg', 'learning_rate', 'iterations',
              'bagging_temperature', 'random_strength')
model = CatBoostRegressor(**{key: params[key] for key in tuned_keys},
                          random_state=10, verbose=False)

x_test, y_test = conv_data_pd(testdata, fp)
x_train, y_train = conv_data_pd(train_data_merge, fp)
model.fit(x_train, y_train)
y_test_pred = model.predict(x_test)
y_train_pred = model.predict(x_train)