In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
# from lightgbm.lgb import LGBMRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

In [2]:
def clean_feature_names(df):
    # Function to clean feature names
    def clean_name(name):
        # Replace any character that is not alphanumeric or underscore with an underscore
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)

    # Apply the cleaning function to each column name
    df.columns = [clean_name(col) for col in df.columns]
    return df

In [6]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [7]:
#Monomer composition
df_mc_train = pd.read_csv('Monomer_features/Train_mon_comp.csv')
df_mc_train = clean_feature_names(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('Monomer_features/Test_mon_comp.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(5568, 385)
(5568,)
(1392, 385)
(1392,)
0.5464820023986398
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 574
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 61
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 591
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 65
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000653 seconds.
You can set `force_row_wise

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.324,0.408,0.5692,0.4798,0.7077,0.678,0.287,0.3841,0.5357,0.5465,0.7448,0.7132
LGBMRegressor,0.3008,0.4004,0.5485,0.517,0.7202,0.6958,0.3008,0.4033,0.5484,0.5247,0.7259,0.7057
XGBRegressor,0.2699,0.3799,0.5195,0.5666,0.7528,0.7135,0.2559,0.3704,0.5058,0.5957,0.7728,0.7435
DecisionTreeRegressor,0.4343,0.4638,0.659,0.3028,0.632,0.6051,0.3224,0.4017,0.5678,0.4906,0.7168,0.6916
RandomForestRegressor,0.2928,0.3913,0.5411,0.5298,0.7298,0.693,0.2742,0.3794,0.5236,0.5667,0.7533,0.7215
GradientBoostingRegressor,0.3347,0.4324,0.5785,0.4626,0.6906,0.6539,0.3367,0.4335,0.5802,0.468,0.6956,0.6735
AdaBoostRegressor,0.5216,0.5764,0.7222,0.1625,0.4923,0.4384,0.497,0.5692,0.705,0.2146,0.5446,0.4979
SVR,0.3201,0.4012,0.5657,0.4861,0.7012,0.6885,0.3163,0.3982,0.5624,0.5002,0.7107,0.6991
LinearRegression,0.384,0.446,0.6196,0.3835,0.6264,0.6372,0.3637,0.4399,0.6031,0.4253,0.6543,0.668
KNeighborsRegressor,0.3702,0.4459,0.6084,0.4056,0.652,0.6163,0.3449,0.428,0.5873,0.455,0.6821,0.6512


In [8]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.682, -6.829199999999998, -5.6058, -5.39870...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9338000000000015, -6.576300000000003, -6....","[-6.92923, -6.662480000000002, -6.618329999999...","[0.021276503472140735, 0.13309262038144554, 0...."
1,LGBMRegressor,"[-6.474209958508049, -6.088745749683978, -5.12...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.658264488678373, -6.420684318030899, -6.7...","[-6.895847020647485, -6.4802321135909455, -6.6...","[0.23526243115789805, 0.057434991655542326, 0...."
2,XGBRegressor,"[-5.7332354, -6.5855656, -5.2311544, -5.181078...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.3695793, -6.166127, -7.0612116, -5.902663...","[-6.4050436, -6.4133353, -7.03028, -5.640264, ...","[0.25567287, 0.19114357, 0.34366196, 0.1417031..."
3,DecisionTreeRegressor,"[-7.0, -6.92, -5.15, -4.68, -5.15, -4.59, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -5.89, -6.24, -7.0, -7.0,...","[-7.0, -6.08, -6.16, -5.758, -5.879, -6.334000...","[0.0, 0.7593681584053944, 0.6914332939626208, ..."
4,RandomForestRegressor,"[-6.366176190476193, -6.334999999999999, -5.48...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8537, -6.423642857142861, -6.499599999999...","[-6.760695796358002, -6.539240034453241, -6.30...","[0.14228066585014743, 0.0921094304859949, 0.13..."
5,GradientBoostingRegressor,"[-5.661132826352657, -6.1049209987207815, -5.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.030654600992403, -5.686372544162835, -6.0...","[-5.965359961798017, -5.722811104192969, -5.93...","[0.07600969637957382, 0.06791902208528118, 0.1..."
6,AdaBoostRegressor,"[-5.746646566799839, -6.085248650371831, -5.67...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.954906012906758, -5.954906012906758, -5.7...","[-5.996077476395867, -5.950694138844628, -5.96...","[0.049043737214825626, 0.11624832276308386, 0...."
7,SVR,"[-4.746940374655192, -6.992144524494609, -4.61...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8630504517521915, -4.69010704863332, -5.2...","[-6.840530818840689, -4.729312626905109, -5.21...","[0.03300027405190978, 0.040329207650596534, 0...."
8,LinearRegression,"[-5.051873989189084, -5.837585594266053, -4.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.788147915702857, -4.910120991697232, -5.4...","[-5.8928536688605035, -4.947099605589861, -5.3...","[0.08600423802372432, 0.05053495664178832, 0.0..."
9,KNeighborsRegressor,"[-5.3566666666666665, -7.0, -5.853333333333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.433333333333334, -6.04, -5.38333333...","[-6.9093333333333335, -5.915333333333334, -6.2...","[0.04533333333333331, 0.3732178392782902, 0.23..."


In [9]:
result_df.to_csv('monomer_results/Monomer_comp_results.csv')
prediction_df.to_csv('monomer_results/Monomer_comp_prediction_data.csv')

In [3]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [11]:
df_mc_train = pd.read_csv('Monomer_features/Train_mon_comp.csv')
df_mc_train = clean_feature_names(df_mc_train)
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('Monomer_features/Test_mon_comp.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
print(X_test)
print(y_test)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(5568, 243)
(5568,)
(1392, 243)
(1392,)
             A        dA       meA     Me_dA  Ala_indol_2_yl_  \
0    -0.501813 -0.349957  1.366587 -0.476635        -0.032844   
1    -0.501813 -0.349957  1.366587  0.793660        -0.032844   
2    -0.501813 -0.349957  1.366587 -0.476635        -0.032844   
3     0.624153 -0.349957 -0.555249 -0.476635        -0.032844   
4    -0.501813 -0.349957  1.366587  0.793660        -0.032844   
...        ...       ...       ...       ...              ...   
1387  4.752694 -0.349957 -0.555249 -0.476635        -0.032844   
1388  4.752694 -0.349957 -0.555249 -0.476635        -0.032844   
1389  3.439067 -0.349957 -0.555249 -0.476635        -0.032844   
1390 -0.501813 -0.349957 -0.555249 -0.476635        -0.032844   
1391  4.752694 -0.349957 -0.555249 -0.476635        -0.032844   

      dAla_indol_2_yl_  Ala_5_Tet_       Abu      dAbu    Me_Abu  ...  \
0            -0.013403   -0.023154 -0.122013 -0.107696 -0.052596  ...   
1            -0.013403   -0.02315

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.3242,0.4079,0.5694,0.4794,0.7076,0.6773,0.2867,0.3838,0.5354,0.547,0.7451,0.7145
LGBMRegressor,0.3008,0.4004,0.5485,0.517,0.7202,0.6958,0.3008,0.4033,0.5484,0.5247,0.7259,0.7057
XGBRegressor,0.2699,0.3799,0.5195,0.5666,0.7528,0.7135,0.2559,0.3704,0.5058,0.5957,0.7728,0.7435
DecisionTreeRegressor,0.4315,0.4626,0.6569,0.3071,0.6344,0.6085,0.3207,0.4008,0.5663,0.4932,0.7186,0.692
RandomForestRegressor,0.2923,0.3911,0.5407,0.5306,0.7304,0.6935,0.2736,0.3791,0.5231,0.5676,0.7538,0.7218
GradientBoostingRegressor,0.3347,0.4324,0.5786,0.4625,0.6905,0.6539,0.3365,0.4335,0.5801,0.4682,0.6959,0.6736
AdaBoostRegressor,0.5151,0.5703,0.7177,0.173,0.4929,0.436,0.4921,0.564,0.7015,0.2223,0.5416,0.4927
SVR,0.3201,0.4012,0.5657,0.4861,0.7012,0.6885,0.3163,0.3982,0.5624,0.5002,0.7107,0.6991
LinearRegression,0.384,0.446,0.6196,0.3835,0.6264,0.6372,0.3637,0.4399,0.6031,0.4253,0.6543,0.668
KNeighborsRegressor,0.3698,0.4459,0.6081,0.4062,0.6521,0.6158,0.3448,0.4277,0.5872,0.4551,0.6821,0.6522


In [12]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.720999999999999, -6.836799999999999, -5.62...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9633, -6.477500000000004, -6.633549999999...","[-6.94846, -6.579693333333336, -6.58385, -6.13...","[0.020029438334611318, 0.0679120817266291, 0.1..."
1,LGBMRegressor,"[-6.474209958508049, -6.088745749683978, -5.12...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.658264488678373, -6.420684318030899, -6.7...","[-6.895847020647485, -6.4802321135909455, -6.6...","[0.23526243115789805, 0.057434991655542326, 0...."
2,XGBRegressor,"[-5.7332354, -6.5855656, -5.2311544, -5.181078...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.3695793, -6.166127, -7.0612116, -5.902663...","[-6.4050436, -6.4133353, -7.03028, -5.640264, ...","[0.25567287, 0.19114357, 0.34366196, 0.1417031..."
3,DecisionTreeRegressor,"[-7.0, -6.92, -5.15, -4.68, -5.15, -5.04, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -6.92, -5.89, -6.24, -7.0, -7.0...","[-7.0, -6.08, -6.4079999999999995, -5.80200000...","[0.0, 0.7593681584053944, 0.6971484777290989, ..."
4,RandomForestRegressor,"[-6.314800000000003, -6.355499999999998, -5.48...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.866200000000001, -6.3479000000000045, -6....","[-6.769855444444445, -6.521707277777781, -6.29...","[0.1357908885152214, 0.1017731964996332, 0.095..."
5,GradientBoostingRegressor,"[-5.661132826352657, -6.1049209987207815, -5.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.030654600992403, -5.686372544162835, -6.0...","[-5.965359961798017, -5.722811104192969, -5.93...","[0.07600969637957404, 0.06791902208528118, 0.1..."
6,AdaBoostRegressor,"[-5.746646566799846, -6.085248650371841, -5.67...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.95490601290677, -5.95490601290677, -5.746...","[-5.993029782302644, -5.96964426654275, -5.962...","[0.04783613247453708, 0.08195349802888562, 0.1..."
7,SVR,"[-4.747084708363476, -6.991938546745366, -4.61...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.862979570324438, -4.6903321426120135, -5....","[-6.840600493445666, -4.729365327013005, -5.21...","[0.03295343652981298, 0.04023257183936915, 0.0..."
8,LinearRegression,"[-5.051873989189086, -5.837585594266059, -4.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.788147915702867, -4.910120991697235, -5.4...","[-5.892853668860508, -4.947099605589864, -5.38...","[0.0860042380237223, 0.05053495664178806, 0.04..."
9,KNeighborsRegressor,"[-5.3566666666666665, -7.0, -5.853333333333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.433333333333334, -6.04, -5.38333333...","[-6.9093333333333335, -5.915333333333334, -6.2...","[0.04533333333333331, 0.3732178392782902, 0.23..."


In [13]:
const_col

['Ala_tBu_',
 'Me_Ala_indol_2_yl_',
 'Me_Abu_morpholino_',
 'meD',
 'Asp_Ph_2_NH2__',
 'Glu_3R_Me_',
 'Phe_CHF2_',
 'Me_Phe_4_Cl_',
 'Bn_4_OH__Gly',
 'Bu_Gly',
 'EtOEt_Gly',
 'PhEt_Gly',
 'isoamyl_Gly',
 '2_pyridylmethyl_Gly',
 'Me_Hph',
 'Hph_2_Cl_',
 'Hph_3_Cl_',
 'Hph_4_Cl_',
 'Hse_Et_',
 'Hyp_Et_',
 'dK',
 'meK',
 'Me_dK',
 'Lys_Cbz_',
 'Lys_iPr_',
 'Lys_Me_',
 'Me_Lys_Me_',
 'dLeu_3R_OH_',
 'dN',
 'Nle_CHF2_',
 'Nle_OH_',
 'Orn',
 '4Pal',
 'dPip',
 'Gln_Mes_',
 'Ser_Bn_',
 'Ser_EtNMe2_',
 'Ser_EtOH_',
 'Ser_isoamyl_',
 'dSer_Me_',
 'Ser_Ph_2_Cl__',
 'Ser_Ph_3_Cl__',
 'Ser_Pr_',
 'Me_Ser_isoamyl_',
 'Me_Ser_Pr_',
 'dT',
 'Me_Tza',
 '_N__O_Val',
 'meW',
 'Me_dW',
 'Trp_6_Br_',
 'Tyr_CHF2_',
 'dTyr_bR_OMe_',
 '_N__O_Tyr',
 'Mono3',
 'Mono4',
 'Mono5',
 'Mono15',
 'Mono17',
 'Mono18',
 'Mono19',
 'Mono20',
 'Mono23',
 'Mono24',
 'Mono25',
 'Mono32',
 'Mono33',
 'Mono36',
 'Mono48',
 'Mono49',
 'Mono50',
 'Mono51',
 'Mono52',
 'Mono53',
 'Mono54',
 'Mono55',
 'Mono56',
 'Mono57',
 'Mon

In [14]:
result_df.to_csv('monomer_results/Monomer_comp_constRemoval_results.csv')
prediction_df.to_csv('monomer_results/Monomer_comp_constRemoval_prediction_data.csv')

In [4]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [21]:
df_train = pd.read_csv('Monomer_features/Train_mon_comp.csv')
df_mc_train = clean_feature_names(df_train)
df_mc_train = df_mc_train.drop(['ID','SMILES','Permeability'],axis=1)
df_mc, const_col = remove_low_variance_columns(df_mc_train)
X_train = df_mc
y_train = df_train['Permeability']
print(X_train.shape)
print(y_train.shape)

df_mc_test = pd.read_csv('Monomer_features/Test_mon_comp.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(5568, 8)
(5568,)
(1392, 8)
(1392,)
0.3410821104954864
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 8
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 8
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 157
[Light



0.30401115376335774




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.4258,0.4766,0.6525,0.3164,0.5756,0.5533,0.417,0.473,0.6457,0.3411,0.5889,0.5887
LGBMRegressor,0.4141,0.475,0.6435,0.3351,0.5789,0.5587,0.4217,0.4793,0.6494,0.3337,0.5779,0.579
XGBRegressor,0.4199,0.4734,0.648,0.3258,0.5782,0.5587,0.416,0.4738,0.645,0.3426,0.588,0.5932
DecisionTreeRegressor,0.4471,0.4836,0.6687,0.2821,0.5556,0.541,0.4302,0.4782,0.6559,0.3202,0.5759,0.5829
RandomForestRegressor,0.4176,0.4743,0.6462,0.3294,0.5819,0.5539,0.4158,0.4734,0.6448,0.3429,0.5888,0.5863
GradientBoostingRegressor,0.4363,0.4902,0.6605,0.2995,0.5492,0.5422,0.4334,0.4866,0.6583,0.3152,0.5661,0.5672
AdaBoostRegressor,0.5762,0.6014,0.7591,0.0749,0.3905,0.3941,0.5503,0.593,0.7418,0.1304,0.4476,0.4397
SVR,0.4635,0.4803,0.6808,0.2558,0.5261,0.5353,0.463,0.4839,0.6804,0.2684,0.5368,0.5524
LinearRegression,0.527,0.5342,0.726,0.1538,0.3922,0.4645,0.5477,0.5426,0.7401,0.1345,0.3682,0.4534
KNeighborsRegressor,0.5076,0.5149,0.7125,0.185,0.5042,0.4922,0.4691,0.4966,0.6849,0.2588,0.5357,0.5318


In [22]:
result_df.to_csv('monomer_results/Monomer_comp_LVR_results.csv')
prediction_df.to_csv('monomer_results/Monomer_comp_LVR_prediction_data.csv')

In [23]:
#AA composition
df_aac_train = pd.read_csv('Monomer_features/Train_aac.csv')
X_train = df_aac_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_aac_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_aac_test = pd.read_csv('Monomer_features/Test_aac.csv')
X_test = df_aac_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_aac_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
aac_comp,prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
aac_comp

(5568, 21)
(5568,)
(1392, 21)
(1392,)
0.3606185478919217
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 265
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 14
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 269
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 14
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_row_wise=t



0.31262581423987024




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.3889,0.4635,0.6236,0.3756,0.6142,0.5286,0.4046,0.4665,0.6361,0.3606,0.6028,0.5487
LGBMRegressor,0.3945,0.4723,0.6281,0.3666,0.6055,0.5264,0.4022,0.4697,0.6342,0.3645,0.6037,0.5489
XGBRegressor,0.3997,0.4704,0.6322,0.3583,0.6006,0.519,0.401,0.4632,0.6333,0.3663,0.6063,0.5524
DecisionTreeRegressor,0.4287,0.4822,0.6547,0.3117,0.572,0.4987,0.4105,0.4696,0.6407,0.3514,0.5961,0.543
RandomForestRegressor,0.3914,0.4671,0.6256,0.3716,0.6099,0.5288,0.403,0.4663,0.6348,0.3632,0.6032,0.5483
GradientBoostingRegressor,0.4115,0.4887,0.6415,0.3392,0.5838,0.4987,0.4243,0.4899,0.6514,0.3295,0.5746,0.5231
AdaBoostRegressor,0.5533,0.5967,0.7438,0.1116,0.4306,0.3858,0.5405,0.5939,0.7352,0.1458,0.4498,0.4328
SVR,0.42,0.4721,0.6481,0.3256,0.5822,0.5071,0.434,0.468,0.6588,0.3141,0.5729,0.5254
LinearRegression,0.4971,0.5382,0.7051,0.2018,0.4494,0.4145,0.4887,0.5294,0.6991,0.2278,0.4777,0.4461
KNeighborsRegressor,0.5524,0.5548,0.7432,0.113,0.5079,0.447,0.5472,0.5442,0.7397,0.1353,0.5194,0.4883


In [24]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.467900000000003, -6.799633333333327, -6.36...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.856399999999997, -6.240000000000006, -6.7...","[-6.890009999999999, -6.372850000000005, -6.79...","[0.05233085514302277, 0.26569999999999644, 0.0..."
1,LGBMRegressor,"[-6.819708776899031, -6.741522957198804, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8824174200856305, -6.732630090516726, -6....","[-6.852068279536186, -6.59581741369701, -6.403...","[0.11170179221897755, 0.17425490343377098, 0.1..."
2,XGBRegressor,"[-6.53945, -6.331082, -6.351551, -5.277498, -5...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.6684756, -6.428478, -6.665841, -6.5263243...","[-6.78958, -6.3500047, -6.774565, -6.413087, -...","[0.16713886, 0.06491435, 0.2544896, 0.19761877..."
3,DecisionTreeRegressor,"[-6.24, -6.89, -5.88, -5.92, -5.88, -4.6850000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -7.0, -7.0, -6.7866666666...","[-7.0, -6.392, -6.006, -6.698, -6.848000000000...","[0.0, 0.3039999999999999, 0.8215984420627878, ..."
4,RandomForestRegressor,"[-6.662025, -6.649999999999997, -6.19659666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.731189444444443, -6.425633333333335, -6.5...","[-6.705324055555556, -6.4566971111111116, -6.4...","[0.033727279222509725, 0.1051401933965135, 0.1..."
5,GradientBoostingRegressor,"[-5.6719335838951555, -5.80558682864085, -5.81...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.6719335838951555, -6.166947448380577, -5....","[-5.707385290317585, -6.063318291225626, -5.75...","[0.038115408217044726, 0.13052320412081825, 0...."
6,AdaBoostRegressor,"[-5.679886214442038, -5.858147654584215, -5.58...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.679886214442038, -5.655448717948713, -5.6...","[-5.678199179006834, -5.672434775988421, -5.67...","[0.012810614882255955, 0.015089402287063593, 0..."
7,SVR,"[-5.927183211254778, -5.841334576665473, -4.91...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986617529188543, -5.706267302243386, -6.2...","[-6.8633311727334725, -5.711928660349113, -6.0...","[0.07100479555606345, 0.06968864693284889, 0.1..."
8,LinearRegression,"[-4.725680904639959, -4.998135232151143, -5.08...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-4.964648579345606, -4.822410898603708, -5.1...","[-4.997225506437236, -4.858070408550772, -5.09...","[0.030655284748342598, 0.032966270644718604, 0..."
9,KNeighborsRegressor,"[-5.633333333333333, -6.936666666666667, -4.53...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.973333333333334, -5.3500000000000005, -7....","[-6.8933333333333335, -5.826666666666666, -6.0...","[0.04173993557999636, 0.29902805516837017, 0.6..."


In [25]:
aac_comp.to_csv('monomer_results/AAC_comp_results.csv')
prediction_df.to_csv('monomer_results/AAC_comp_prediction_data.csv')

In [26]:
#Constant column removal
df_mc_train = pd.read_csv('Monomer_features/Train_aac.csv')
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('Monomer_features/Test_aac.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

(5568, 21)
(5568,)
(1392, 21)
(1392,)


In [27]:
#LVR column removal
df_mc_train = pd.read_csv('Monomer_features/Train_aac.csv')
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
X_train, const_col = remove_low_variance_columns(X_train)

y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('Monomer_features/Test_aac.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_mc = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_mc, X_train,y_train, X_test,  y_test)
result_df

(5568, 5)
(5568,)
(1392, 5)
(1392,)
0.31690614824044205
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 148
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 5
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 5
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149
[Ligh



0.2867090237170975


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.432,0.4836,0.6573,0.3063,0.5594,0.4943,0.4323,0.4803,0.6575,0.3169,0.5685,0.5282
LGBMRegressor,0.4231,0.4885,0.6505,0.3206,0.5662,0.4961,0.4191,0.4809,0.6474,0.3377,0.5814,0.5292
XGBRegressor,0.43,0.4832,0.6557,0.3096,0.562,0.4931,0.4226,0.4755,0.65,0.3323,0.5795,0.5322
DecisionTreeRegressor,0.4558,0.491,0.6752,0.2681,0.5364,0.4785,0.4409,0.4839,0.664,0.3034,0.5599,0.5219
RandomForestRegressor,0.4219,0.4814,0.6496,0.3226,0.5699,0.4953,0.4262,0.4787,0.6528,0.3266,0.5739,0.5325
GradientBoostingRegressor,0.4433,0.5038,0.6658,0.2883,0.5377,0.4703,0.4363,0.4972,0.6605,0.3106,0.5597,0.5123
AdaBoostRegressor,0.5377,0.5881,0.7333,0.1367,0.4386,0.3955,0.5202,0.5811,0.7213,0.1779,0.477,0.454
SVR,0.4778,0.4995,0.6912,0.2329,0.5027,0.4557,0.4609,0.4906,0.6789,0.2716,0.5406,0.4884
LinearRegression,0.5534,0.5745,0.7439,0.1115,0.3339,0.3312,0.5555,0.5736,0.7453,0.1221,0.3502,0.3441
KNeighborsRegressor,0.6313,0.5849,0.7946,-0.0137,0.4307,0.3815,0.6007,0.5677,0.7751,0.0507,0.4589,0.4423


In [28]:
result_df.to_csv('monomer_results/AAC_comp_LVR_results.csv')
prediction_df.to_csv('monomer_results/AAC_comp_LVR_prediction_data.csv')

In [29]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.142719047619044, -6.180599999999997, -5.94...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.339573809523806, -6.620000000000004, -5.4...","[-5.826437587301586, -6.620000000000003, -5.79...","[0.2909876448670126, 0.2403331021727948, 0.208..."
1,LGBMRegressor,"[-5.976825451316693, -5.887059516210813, -5.99...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.923473547800348, -6.244154870974508, -6.1...","[-5.893191470346106, -5.986203698348892, -5.99...","[0.07449471667393556, 0.18380319668265066, 0.1..."
2,XGBRegressor,"[-6.6075077, -6.0224543, -5.993145, -5.842207,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.4230413, -6.5967507, -6.5607395, -6.14823...","[-6.545579, -6.606382, -6.6169686, -6.1833873,...","[0.24737814, 0.26260722, 0.23212905, 0.2083770..."
3,DecisionTreeRegressor,"[-5.835, -4.68, -6.62, -5.92, -5.88, -4.55, -4...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.96, -6.62, -5.65, -6.126666666666666, -7....","[-6.984, -6.62, -6.2275, -6.126666666666667, -...","[0.01959591794226544, 0.24033310217279674, 0.9..."
4,RandomForestRegressor,"[-6.162554557109558, -5.606467142857142, -6.10...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.109942771395271, -6.538520000000001, -6.1...","[-6.163444914474415, -6.446531666666668, -6.09...","[0.2980911340070613, 0.18822081674340704, 0.25..."
5,GradientBoostingRegressor,"[-6.126947156479217, -5.212574111973634, -5.81...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.098635754560118, -5.2132252966222, -6.130...","[-5.72037334503896, -5.2236408739114815, -5.66...","[0.21353816733169637, 0.09392775506445697, 0.2..."
6,AdaBoostRegressor,"[-5.46850810810811, -5.46850810810811, -5.8252...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.46850810810811, -5.327770382695508, -5.46...","[-5.435808819875936, -5.430541770134086, -5.43...","[0.053288430056596385, 0.051931924586600155, 0..."
7,SVR,"[-5.035739048515927, -5.09510980483286, -5.086...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-4.900920359456845, -4.670978845536693, -4.8...","[-4.937783119503217, -4.613126966572476, -4.82...","[0.06061003804844521, 0.03672591032708144, 0.0..."
8,LinearRegression,"[-5.200498365394476, -5.337586828677065, -5.10...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.216527926195354, -5.347950483023906, -5.2...","[-5.223138485234157, -5.3589403902088595, -5.2...","[0.014302159346596418, 0.013000734351834633, 0..."
9,KNeighborsRegressor,"[-5.706666666666667, -6.28, -4.663333333333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-4.95, -5.926666666666667, -5.37666666666666...","[-5.012, -5.666666666666666, -5.164, -5.844666...","[0.2203391325510141, 0.32276582498427253, 0.32..."


In [30]:
const_col

['W',
 'H',
 'M',
 'X',
 'E',
 'Q',
 'T',
 'N',
 'D',
 'V',
 'Y',
 'S',
 'C',
 'R',
 'I',
 'K']

In [5]:
from sklearn.model_selection import GridSearchCV
def train_and_test_predict_with_tuning(models, param_grids, X_train, y_train, X_test, y_test):
   
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        best_params = None

        # hyperparameter tuning 
        if model_name in param_grids and param_grids[model_name]:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            grid_search = GridSearchCV(
                estimator=model, 
                param_grid=param_grids[model_name], 
                cv=kf,
                scoring='neg_mean_squared_error', 
                n_jobs=-1)
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(model_name)
            print(": best params",best_params)
        else:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            best_params = {}
            print(model_name, ':Used Default params')

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)  
            test_predictions_folds.append(predictions_test_fold)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
            'Best Parameters': best_params
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df


In [6]:
param_grids = {
        'ExtraTreesRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None,1,5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'LGBMRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'num_leaves': [31, 50, 100]
        },
        'DecisionTreeRegressor': {
            'max_depth': [None, 10, 20, 50, 100],
            'min_samples_split': [2, 5, 10]
        },
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None, 1, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'GradientBoostingRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7, 10]
        },
        'AdaBoostRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.1, 1.0]
        },
        'SVR': {
            'C': [0.001, 0.1, 1, 10],
            'epsilon': [0.1, 0.2, 0.5],
            'gamma': [0.001, 0.1, 1, 10]
        },
        'KNeighborsRegressor': {
            'n_neighbors': [3, 5, 10],
            'weights': ['uniform', 'distance']
        },
        'MLPRegressor': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'learning_rate': ['constant', 'adaptive'],
            'max_iter': [100,200, 400]
}
    }


In [7]:
df_mc_train = pd.read_csv('Monomer_features/Train_mon_comp.csv')
df_mc_train = clean_feature_names(df_mc_train)
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('Monomer_features/Test_mon_comp.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
print(X_test)
print(y_test)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict_with_tuning(models,param_grids, X_train,y_train, X_test,  y_test)
result_df

(5568, 243)
(5568,)
(1392, 243)
(1392,)
             A        dA       meA     Me_dA  Ala_indol_2_yl_  \
0    -0.501813 -0.349957  1.366587 -0.476635        -0.032844   
1    -0.501813 -0.349957  1.366587  0.793660        -0.032844   
2    -0.501813 -0.349957  1.366587 -0.476635        -0.032844   
3     0.624153 -0.349957 -0.555249 -0.476635        -0.032844   
4    -0.501813 -0.349957  1.366587  0.793660        -0.032844   
...        ...       ...       ...       ...              ...   
1387  4.752694 -0.349957 -0.555249 -0.476635        -0.032844   
1388  4.752694 -0.349957 -0.555249 -0.476635        -0.032844   
1389  3.439067 -0.349957 -0.555249 -0.476635        -0.032844   
1390 -0.501813 -0.349957 -0.555249 -0.476635        -0.032844   
1391  4.752694 -0.349957 -0.555249 -0.476635        -0.032844   

      dAla_indol_2_yl_  Ala_5_Tet_       Abu      dAbu    Me_Abu  ...  \
0            -0.013403   -0.023154 -0.122013 -0.107696 -0.052596  ...   
1            -0.013403   -0.02315



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 5568, number of used features: 71
[LightGBM] [Info] Start training from score -5.742906
LGBMRegressor
: best params {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 50}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 574
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 61
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[L

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.2794,0.3866,0.5286,0.5514,0.7426,0.7083,0.2726,0.3798,0.5221,0.5692,0.7548,0.7267
LGBMRegressor,0.2694,0.3732,0.5191,0.5674,0.7552,0.7209,0.2516,0.3656,0.5016,0.6025,0.7763,0.7456
XGBRegressor,0.2699,0.3799,0.5195,0.5666,0.7528,0.7135,0.2559,0.3704,0.5058,0.5957,0.7728,0.7435
DecisionTreeRegressor,0.3572,0.4278,0.5977,0.4264,0.674,0.6369,0.2845,0.3842,0.5334,0.5504,0.7431,0.715
RandomForestRegressor,0.2798,0.3845,0.529,0.5507,0.7422,0.7077,0.2666,0.3769,0.5163,0.5788,0.7612,0.7328
GradientBoostingRegressor,0.2629,0.3712,0.5127,0.5779,0.7605,0.7257,0.2495,0.3645,0.4995,0.6057,0.7784,0.7465
AdaBoostRegressor,0.4825,0.5214,0.6946,0.2253,0.4749,0.4591,0.4732,0.5162,0.6879,0.2522,0.5047,0.5209
SVR,0.3118,0.3997,0.5584,0.4993,0.7084,0.688,0.3147,0.3986,0.5609,0.5028,0.7107,0.6959
LinearRegression,0.3994,0.4479,0.6319,0.3588,0.6116,0.6371,0.3637,0.4399,0.6031,0.4253,0.6543,0.668
KNeighborsRegressor,0.3548,0.438,0.5957,0.4303,0.6612,0.6292,0.3452,0.4271,0.5875,0.4545,0.6775,0.6504


In [8]:
result_df.to_csv('monomer_results/Monomer_comp_constRemoval_results_with_HPT.csv')
prediction_df.to_csv('monomer_results/Monomer_comp_constRemoval_prediction_data_with_HPT.csv')