In [None]:
def compute_metrics_by_method(csvpath, substrate='implicit'):
    """Print MAE, RMSE and R^2 for every method label of one substrate type.

    The CSV at ``csvpath`` is read, rows whose ``substrate`` column equals
    ``substrate`` are kept, and for each distinct value of the ``label``
    column (in order of first appearance) the ``pred`` column is scored
    against the ``target`` column.

    Parameters
    ----------
    csvpath
        Path of a CSV file with the columns ``substrate``, ``label``,
        ``target`` and ``pred``.
    substrate
        Value of the ``substrate`` column to report on.

    Returns
    -------
    None
        One tab-separated line per label is printed to stdout.
    """
    table = pd.read_csv(csvpath)
    table = table.loc[table['substrate'] == substrate]

    for label in table['label'].unique():
        rows = table.loc[table['label'] == label]
        y_true, y_pred = rows['target'], rows['pred']

        maerr = mean_absolute_error(y_true, y_pred)
        # RMSE is derived from the MSE rather than computed separately.
        rmserr = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        print(
            f"{substrate} {label}\tMAErr={maerr:.4f}\tRMSErr={rmserr:.4f}\t"
            + r"$R^2$"
            + f"\t={r2:.2f}"
        )

# Read data

### Read data for implicit substrates

In [None]:
# Implicit substrates: feature table (df) and the matching target table.
df, target = (
    pd.read_csv(csv_file)
    for csv_file in ('data/implicit_dataset.csv', 'data/implicit_PT.csv')
)

### Read data for explicit substrates

In [None]:
# Explicit substrates: feature table (df_es) and the matching target table.
df_es, target_es = (
    pd.read_csv(csv_file)
    for csv_file in ('data/explicit_dataset.csv', 'data/explicit_PT.csv')
)

In [None]:
# Prepare data for LinearRegression and Lasso:
# remove columns containing NaNs and perform one-hot encoding.
df_nonans = df.dropna(axis=1)
df_es_nonans = df_es.dropna(axis=1)

# Keep only the columns that are NaN-free in BOTH datasets. The column order
# of df_nonans is preserved; going through list(set(...)) would give an
# arbitrary order that can change from one run to the next.
_es_nonan_columns = set(df_es_nonans.columns)
no_nans_common_columns = [col for col in df_nonans.columns if col in _es_nonan_columns]
df_nonans = df_nonans[no_nans_common_columns]
df_es_nonans = df_es_nonans[no_nans_common_columns]

# cat_features describes the full frame (used by CatBoost);
# cat_features_nonans describes the NaN-free frame that gets one-hot encoded.
cat_features = categorical_features(df)
cat_features_nonans = categorical_features(df_nonans)
df_nonans_onehot = pd.get_dummies(df_nonans, columns=cat_features_nonans)

# get_dummies only creates columns for categories present in the frame it is
# given, so the explicit-substrate frame is aligned to the training columns:
# categories unseen in df_es become all-zero columns and categories absent
# from the training frame are dropped. When both frames already contain the
# same categories this is a no-op.
df_es_nonans_onehot = pd.get_dummies(
    df_es_nonans, columns=cat_features_nonans
).reindex(columns=df_nonans_onehot.columns, fill_value=0)

# Training: Cross Validation using all features

In [None]:
# Scorers evaluated by cross_validate, keyed by the name used in its result.
scoring = dict(
    mean_squared_error='neg_mean_squared_error',
    mean_absolute_error='neg_mean_absolute_error',
    r2='r2',
)

# Hyperparameters of the CatBoost model.
cb_hyperparams = dict(max_depth=5, n_estimators=2000, eta=0.05)

# Options shared by every cross_validate call in this cell: 5 folds, train
# scores, fitted estimators and fold indices are all kept in the result.
_cv_options = dict(
    cv=5,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True,
    return_indices=True,
)

# CatBoost is fed the raw frame and told which columns are categorical.
cb = CatBoostRegressor(verbose=False, cat_features=cat_features, **cb_hyperparams)
cv_cb = cross_validate(cb, df, target, **_cv_options)

# The linear models are fed the NaN-free, one-hot encoded frame.
lasso = Lasso(alpha=0.2)
cv_lasso = cross_validate(lasso, df_nonans_onehot, target, **_cv_options)

lr = LinearRegression(fit_intercept=True)
cv_lr = cross_validate(lr, df_nonans_onehot, target, **_cv_options)

In [None]:
# Train each model on all implicit-substrate data (no CV); these models are
# the ones used for the explicit-substrate predictions.
nocv_cb, nocv_lasso, nocv_lr = [
    train_nocv(estimator, features, target)
    for estimator, features in (
        (cb, df),
        (lasso, df_nonans_onehot),
        (lr, df_nonans_onehot),
    )
]

In [None]:
# Make predictions on the whole implicit dataset with the cross-validated
# models. The returned value is later passed to compute_metrics_by_method as
# the path of the predictions CSV.
csv_path_all_features = save_fold_predictions(
    cv_objs=[cv_lr, cv_lasso, cv_cb],
    labels=['MLR', 'LASSO', 'ML'],
    # Both linear models use the one-hot frame; CatBoost uses the raw frame.
    dfs=[df_nonans_onehot] * 2 + [df],
    targets=[target] * 3,
    filenamebase='fit_all_features_PTpredict',
    substrate='implicit',
)

In [None]:
# Metrics for implicit substrates. PT used as target.
# Prints one line per method label with MAErr, RMSErr and R^2.
compute_metrics_by_method(csv_path_all_features, substrate='implicit')

In [None]:
# Metrics for explicit substrates.
# These metrics are given in Table XXX in the manuscript.
# The models trained on all implicit-substrate data are applied to the
# explicit-substrate frames.
csv_path_all_features_es = save_nofolds_predictions(
    estimators=[nocv_lr, nocv_lasso, nocv_cb],
    labels=['MLR', 'LASSO', 'ML'],
    # Both linear models use the one-hot frame; CatBoost uses the raw frame.
    dfs=[df_es_nonans_onehot] * 2 + [df_es],
    targets=[target_es] * 3,
    filenamebase='fit_all_features_es_PTpredict',
    substrate='explicit',
)

In [None]:
# Print MAErr, RMSErr and R^2 per method label for the explicit substrates.
compute_metrics_by_method(csv_path_all_features_es, substrate='explicit')

# Feature Selection

In [None]:
# Feature selection with CV.
# For every requested feature count, CatBoost's recursive SHAP-value based
# selection picks a subset of the columns of df; a CatBoost model restricted
# to that subset is then cross-validated and scored on the fold predictions.
num_features_to_selects = [2, 3, 5, 6, 7, 8, 9, 10, 12, 15, 17, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 80, 90, 100]

# Scores per entry of num_features_to_selects, in the same order.
mse_feature = []
mae_feature = []
r2_feature = []
# Maps a requested feature count to the list of selected feature names.
features_selection = {}

for num_features_to_select in num_features_to_selects:
    # The selection model sees the full frame, so the full-frame categorical
    # feature list is the right one here.
    mod = CatBoostRegressor(cat_features=cat_features)
    summary = mod.select_features(df, target,
                          features_for_select=list(range(df.shape[1])),
                          num_features_to_select=num_features_to_select,
                          steps=20,
                          algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
                          shap_calc_type=EShapCalcType.Regular,
                          train_final_model=True,
                          plot=False, logging_level='Silent')

    selected_names = summary['selected_features_names']
    features_selection[num_features_to_select] = selected_names
    df_selected = df[selected_names]

    # cat_features was computed for the full df and may reference columns
    # that were not selected. The categorical features are recomputed for the
    # subset, as the later cells do for the final selected-feature model.
    cb_mod = CatBoostRegressor(**cb_hyperparams,
                               cat_features=categorical_features(df_selected),
                               verbose=False)
    cv_cb_fetures = cross_validate(cb_mod, df_selected, target, cv=5, scoring=scoring,
                           return_train_score=True,
                           return_estimator=True,
                           return_indices=True)

    # Presumably gathers the predictions and true values of the held-out part
    # of each fold (helper defined outside this section; confirm there).
    y_pred_folds, y_test_folds = prediction_on_folds(cv_cb_fetures, df_selected, target)
    mae = mean_absolute_error(y_test_folds, y_pred_folds)
    mse = mean_squared_error(y_test_folds, y_pred_folds)
    r2 = r2_score(y_test_folds, y_pred_folds)
    mse_feature.append(mse)
    mae_feature.append(mae)
    r2_feature.append(r2)

    print('****')
    print(f'REQUIRED NUMBER OF FEATURES {num_features_to_select}')
    print(selected_names)
    print(f'MAE={mae} MSE={mse} RMSE={np.sqrt(mse)} R2={r2}')
    print('****')

# Build models for the list of 25 selected features

In [None]:
# Use the feature names chosen by the selection loop for N_FEATURES features.
N_FEATURES = 25
FEATURES = features_selection[N_FEATURES]
# To pin a specific feature list instead (e.g. one copy-pasted from the
# selection output above), uncomment the following line:
#FEATURES = ['BG_dir_up', 'dimer_len', 'h_substrate', 'sr_A2_dz2-r2UP', 'sr_A1_dz2-r2DOWN', 'sr_A2_dz2-r2DOWN', 'sr_A1_dxzUP', 'sr_A2_dxzUP', 'sr_A1_dosmodeldxzUP_integral_below_1.0', 'sr_A1_dosmodeldxzUP_integral_above_1.0', 'sr_A1_dosmodeldz2-r2DOWN_integral_below_1.0', 'sr_A2_dosmodeldz2-r2DOWN_integral_below_1.0', 'bader_A1', 'bader_A2', 'sr_A1_dosmodeldx2-y2UP_E_below', 'sr_A1_dosmodeldz2-r2UP_peak_above', 'sr_A1_dosmodeldxzUP_peak_above', 'sr_A1_dosmodeldxyDOWN_E_below', 'sr_A1_dosmodeldx2-y2DOWN_E_below', 'sr_A1_dosmodeldx2-y2DOWN_peak_below', 'sr_A1_dosmodeldyzDOWN_E_above', 'sr_A1_dosmodeldz2-r2DOWN_E_above', 'sr_A1_dosmodeldxzDOWN_E_above', 'sr_A2_dosmodeldz2-r2UP_peak_below', 'sr_A2_dosmodeldx2-y2UP_E_below']
print(f'FEATURES: {FEATURES}')
df_selected = df[FEATURES].copy()

df_es_selected = df_es[FEATURES]

# Prepare data for LinearRegression and Lasso:
# remove columns containing NaNs and do one-hot encoding.
df_nonans_selected = df_selected.dropna(axis=1)
df_es_nonans_selected = df_es_selected.dropna(axis=1)

# Keep only the columns that are NaN-free in BOTH datasets. The column order
# of df_nonans_selected is preserved; going through list(set(...)) would give
# an arbitrary order that can change from one run to the next.
_es_nonan_selected_columns = set(df_es_nonans_selected.columns)
no_nans_common_columns = [
    col for col in df_nonans_selected.columns if col in _es_nonan_selected_columns
]
df_nonans_selected = df_nonans_selected[no_nans_common_columns]
df_es_nonans_selected = df_es_nonans_selected[no_nans_common_columns]

# cat_features_selected describes df_selected (used by CatBoost);
# cat_features_nonans_selected describes the frame that gets one-hot encoded.
cat_features_selected = categorical_features(df_selected)
cat_features_nonans_selected = categorical_features(df_nonans_selected)
df_nonans_onehot_selected = pd.get_dummies(df_nonans_selected, columns=cat_features_nonans_selected)

# get_dummies only creates columns for categories present in the frame it is
# given, so the explicit-substrate frame is aligned to the training columns:
# categories unseen in df_es become all-zero columns and categories absent
# from the training frame are dropped. When both frames already contain the
# same categories this is a no-op.
df_es_nonans_onehot_selected = pd.get_dummies(
    df_es_nonans_selected, columns=cat_features_nonans_selected
).reindex(columns=df_nonans_onehot_selected.columns, fill_value=0)

# The training cells refer to the one-hot training frame under this second
# name; it is the same data, so it is copied rather than computed twice.
df_nonans_selected_onehot = df_nonans_onehot_selected.copy()

In [None]:
# Show the selected features in LaTeX format.
# `labels` is a helper defined outside this section; presumably it maps a
# feature name to its LaTeX label (confirm there). The list is the cell's
# last expression, so the notebook displays it as the cell output.
[labels(f) for f in FEATURES]

In [None]:
# Options shared by every cross_validate call in this cell: 5 folds, train
# scores, fitted estimators and fold indices are all kept in the result.
_cv_options = dict(
    cv=5,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True,
    return_indices=True,
)

# Linear models on the one-hot encoded selected features. `lasso` is rebound
# here with alpha=0.1; `lr` is the LinearRegression instance created in the
# all-features cell.
lasso = Lasso(alpha=0.1)
cv_lasso_selected = cross_validate(lasso, df_nonans_selected_onehot, target, **_cv_options)

cv_lr_selected = cross_validate(lr, df_nonans_selected_onehot, target, **_cv_options)

# CatBoost on the raw selected features.
# NOTE(review): unlike the all-features model and the feature-selection loop,
# this model is built without cb_hyperparams (CatBoost defaults are used).
# Confirm this is intentional before changing it, as it affects the results.
cb = CatBoostRegressor(verbose=False, cat_features=cat_features_selected)
cv_cb_selected = cross_validate(cb, df_selected, target, **_cv_options)

In [None]:
# Train each selected-feature model on all implicit-substrate data (no CV).
nocv_cb_selected, nocv_lasso_selected, nocv_lr_selected = [
    train_nocv(estimator, features, target)
    for estimator, features in (
        (cb, df_selected),
        (lasso, df_nonans_selected_onehot),
        (lr, df_nonans_selected_onehot),
    )
]

In [None]:
# Fold predictions of the selected-feature models on the implicit dataset.
# The returned value is later passed to compute_metrics_by_method as the path
# of the predictions CSV.
csv_path_selected = save_fold_predictions(
    cv_objs=[cv_lr_selected, cv_lasso_selected, cv_cb_selected],
    labels=['MLR', 'LASSO', 'ML'],
    # Both linear models use the one-hot frame; CatBoost uses the raw frame.
    dfs=[df_nonans_selected_onehot] * 2 + [df_selected],
    targets=[target] * 3,
    filenamebase='fit_selected_features_PTpredict',
    substrate='implicit',
)

## Training metrics when predicting PT: Reproduce Figure S7

In [None]:
# Print MAErr, RMSErr and R^2 per method label for the implicit substrates
# (models trained on the selected features).
compute_metrics_by_method(csv_path_selected, substrate='implicit')

**Conclusion**: The Machine Learning model (ML), MLR and LASSO all perform better when PT is used as the target value.

The metrics above correspond to Figure S7 and are further discussed in the Manuscript in lines 421-430. Small differences in RMSErr between the Supplementary Information and this code are due to stochastic effects (different train/val/test splits, etc.).

In [None]:
# Apply the selected-feature models trained on all implicit-substrate data
# to the explicit-substrate frames.
csv_path_selected_es = save_nofolds_predictions(
    estimators=[nocv_lr_selected, nocv_lasso_selected, nocv_cb_selected],
    labels=['MLR', 'LASSO', 'ML'],
    # Both linear models use the one-hot frame; CatBoost uses the raw frame.
    dfs=[df_es_nonans_onehot_selected] * 2 + [df_es_selected],
    targets=[target_es] * 3,
    filenamebase='fit_selected_features_es_PTpredict',
    substrate='explicit',
)

In [None]:
# Print MAErr, RMSErr and R^2 per method label for the explicit substrates
# (models trained on the selected features).
compute_metrics_by_method(csv_path_selected_es, substrate='explicit')

The table above shows the predictions for explicit substrates. Compared with Table 1, the $R^2$ coefficient for ML is larger than when MAE is used as the target.