In [1]:
print('start')

start


In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -4.0)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [3]:
#Monomeric models
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)
    df.columns = [clean_name(col) for col in df.columns]
    return df

In [8]:
#Monomer composition
df_mc_train = pd.read_csv('features/Monomeric/Train_mon_comp_caco2.csv')
df_mc_train = clean_feature_names(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_Caco2.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(1008, 385)
(1008,)
(252, 385)
(252,)
0.580111592342035
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 342
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 41
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can s



0.09694411088317789




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.2194,0.3335,0.4684,0.6382,0.8021,0.7887,0.243,0.3617,0.4929,0.5801,0.7639,0.7597
LGBMRegressor,0.2849,0.3987,0.5338,0.5301,0.7283,0.7143,0.2618,0.3885,0.5116,0.5477,0.7426,0.749
XGBRegressor,0.2137,0.3395,0.4623,0.6475,0.8066,0.7879,0.2203,0.3521,0.4693,0.6194,0.7883,0.7754
DecisionTreeRegressor,0.3418,0.4127,0.5846,0.4363,0.7154,0.6973,0.2831,0.378,0.5321,0.5108,0.7262,0.7286
RandomForestRegressor,0.2263,0.3403,0.4757,0.6268,0.7928,0.7852,0.248,0.3671,0.498,0.5714,0.7568,0.753
GradientBoostingRegressor,0.2591,0.3862,0.509,0.5727,0.7636,0.7465,0.2649,0.3968,0.5147,0.5423,0.7482,0.7298
AdaBoostRegressor,0.4178,0.5404,0.6464,0.311,0.6234,0.5936,0.4029,0.5272,0.6347,0.3039,0.655,0.624
SVR,0.264,0.3815,0.5138,0.5646,0.7534,0.743,0.2748,0.3889,0.5242,0.5252,0.7261,0.7149
LinearRegression,0.3124,0.4067,0.559,0.4847,0.7158,0.7248,0.2496,0.3817,0.4996,0.5688,0.7586,0.7409
KNeighborsRegressor,0.3665,0.4295,0.6054,0.3955,0.6492,0.6376,0.3612,0.4391,0.601,0.3758,0.6326,0.6281


In [9]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.652200000000002, -7.506799999999998, -7.00...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.724900000000003, -6.528299999999998, -6.5...","[-7.092400000000014, -6.416161303327999, -6.46...","[0.1865115974946376, 0.05943425047060978, 0.22..."
1,LGBMRegressor,"[-6.498354448047952, -6.749015507587469, -6.53...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.573035422876963, -6.249070577797671, -6.0...","[-6.66522835331723, -6.247716263148864, -6.081...","[0.20861314792576288, 0.046786672410358075, 0...."
2,XGBRegressor,"[-6.675162, -7.0794444, -7.1032376, -6.960121,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6338024, -6.460422, -6.3340034, -5.745178...","[-6.8184595, -6.354491, -6.2241526, -5.774813,...","[0.19987, 0.0787556, 0.103004664, 0.021963114,..."
3,DecisionTreeRegressor,"[-7.24, -7.0, -7.0, -6.89, -6.24, -7.01, -5.28...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.28, -6.82, -5.68, -5.8, -6.89, -7....","[-7.228, -6.134, -6.534000000000001, -5.752000...","[0.009797958971132939, 0.33260186409579845, 0...."
4,RandomForestRegressor,"[-6.426050000000001, -7.264999999999997, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.494500000000001, -6.362259999999999, -6.4...","[-6.76550321080734, -6.346912, -6.314615, -5.8...","[0.1759774099110228, 0.0665824284327326, 0.100..."
5,GradientBoostingRegressor,"[-6.269181026009225, -7.220545515274205, -6.75...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.334304945055369, -6.493082045032435, -5.9...","[-6.417420329250184, -6.427409747647121, -5.99...","[0.14588358973431115, 0.07044407079835334, 0.1..."
6,AdaBoostRegressor,"[-6.383843780397887, -6.1803248988829225, -6.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.325271327547723, -6.503703703703707, -6.1...","[-6.306898643891217, -6.375790410166475, -6.19...","[0.044704540828054855, 0.09367398485557321, 0...."
7,SVR,"[-6.310582654844096, -7.097940458489336, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.49842725061972, -5.988407762877407, -5.49...","[-6.427842681072842, -6.004752149585627, -5.49...","[0.05012548389159596, 0.012801609027450898, 0...."
8,LinearRegression,"[-5.969249601595713, -7.176891768362518, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.146361230017039, -6.001837235668774, -5.1...","[-6.211648809845014, -5.979794362401708, -5.22...","[0.05395726953568732, 0.09763126047740729, 0.0..."
9,KNeighborsRegressor,"[-6.696666666666666, -7.296666666666667, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.19, -6.653333333333333, -5.85333333333333...","[-6.3533333333333335, -6.370666666666668, -5.9...","[0.1317067449550959, 0.1691600162896391, 0.156..."


In [10]:
result_df.to_csv('results/Monomeric/Monomer_comp_results_Caco2.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_prediction_data_Caco2.csv')

In [3]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [12]:
df_mc_train = pd.read_csv('features/Monomeric/Train_mon_comp_caco2.csv')
df_mc_train = clean_feature_names(df_mc_train)
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_Caco2.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(1008, 222)
(1008,)
(252, 222)
(252,)
0.580316877998511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 342
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 41
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[Ligh



0.08613042332012655




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.2215,0.335,0.4706,0.6347,0.8001,0.788,0.2429,0.3622,0.4928,0.5803,0.764,0.7592
LGBMRegressor,0.2849,0.3987,0.5338,0.5301,0.7283,0.7143,0.2618,0.3885,0.5116,0.5477,0.7426,0.749
XGBRegressor,0.2137,0.3395,0.4623,0.6475,0.8066,0.7879,0.2203,0.3521,0.4693,0.6194,0.7883,0.7754
DecisionTreeRegressor,0.3259,0.4073,0.5709,0.4625,0.7288,0.7088,0.2963,0.3872,0.5443,0.4881,0.7144,0.7161
RandomForestRegressor,0.2268,0.3409,0.4762,0.626,0.7922,0.7823,0.2487,0.3674,0.4987,0.5702,0.756,0.7527
GradientBoostingRegressor,0.2585,0.3859,0.5084,0.5736,0.7646,0.7482,0.2645,0.3962,0.5143,0.5429,0.7484,0.7293
AdaBoostRegressor,0.4267,0.55,0.6532,0.2962,0.6022,0.5734,0.4091,0.5346,0.6396,0.2931,0.6481,0.6197
SVR,0.264,0.3815,0.5138,0.5646,0.7534,0.743,0.2748,0.3889,0.5242,0.5252,0.7261,0.7149
LinearRegression,0.3124,0.4067,0.559,0.4847,0.7158,0.7248,0.2496,0.3817,0.4996,0.5688,0.7586,0.7409
KNeighborsRegressor,0.3657,0.4291,0.6048,0.3968,0.65,0.6386,0.3615,0.4391,0.6013,0.3753,0.6324,0.6271


In [13]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.626200000000003, -7.611199999999999, -7.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.649200000000004, -6.530499999999996, -6.6...","[-7.084560000000013, -6.4637362604519994, -6.5...","[0.21794076809996316, 0.05795858393719415, 0.2..."
1,LGBMRegressor,"[-6.498354448047952, -6.749015507587469, -6.53...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.573035422876963, -6.249070577797671, -6.0...","[-6.66522835331723, -6.247716263148864, -6.081...","[0.20861314792576288, 0.046786672410358075, 0...."
2,XGBRegressor,"[-6.675162, -7.0794444, -7.1032376, -6.960121,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6338024, -6.460422, -6.3340034, -5.745178...","[-6.8184595, -6.354491, -6.2241526, -5.774813,...","[0.19987, 0.0787556, 0.103004664, 0.021963114,..."
3,DecisionTreeRegressor,"[-7.24, -8.0, -6.89, -7.0, -6.24, -7.01, -5.28...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.28, -6.82, -5.68, -5.8, -7.0, -7.5...","[-7.228, -6.118, -6.566, -5.7620000000000005, ...","[0.009797958971132939, 0.26671332925071456, 0...."
4,RandomForestRegressor,"[-6.430964610920003, -7.261899999999998, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.490000000000002, -6.412633333333336, -6.4...","[-6.776047910158006, -6.351036470586667, -6.32...","[0.18134797894714744, 0.05730498900420543, 0.0..."
5,GradientBoostingRegressor,"[-6.269181026009225, -6.855172428897728, -6.75...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.334304945055369, -6.493082045032435, -5.9...","[-6.417423912437816, -6.427411817358864, -5.99...","[0.14588429135982317, 0.07044051788997106, 0.1..."
6,AdaBoostRegressor,"[-6.330680522982478, -6.180324898882928, -6.18...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.330680522982478, -6.503703703703705, -6.1...","[-6.341477100575583, -6.396179787636259, -6.20...","[0.06610411093187735, 0.09744797876661841, 0.0..."
7,SVR,"[-6.310582562292099, -7.097940367903124, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.498426766683607, -5.98840778252204, -5.49...","[-6.427842826400142, -6.004741914917428, -5.49...","[0.050130643889173244, 0.012769670885946183, 0..."
8,LinearRegression,"[-5.969249601595711, -7.1768917683625215, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.146361230017037, -6.001837235668778, -5.1...","[-6.211648809845014, -5.979794362401711, -5.22...","[0.05395726953568729, 0.09763126047740868, 0.0..."
9,KNeighborsRegressor,"[-6.696666666666666, -7.296666666666667, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.19, -6.653333333333333, -5.85333333333333...","[-6.3533333333333335, -6.370666666666668, -5.9...","[0.1317067449550959, 0.1691600162896391, 0.156..."


In [14]:
const_col

['Ala_tBu_',
 'Me_Ala_indol_2_yl_',
 'Ala_5_Tet_',
 'Me_dAbu',
 '2Abz',
 'HOCOCH2_Bal',
 'Cys_EtO2H__NH2',
 'Cha',
 'dCha',
 'Asp_OMe_',
 'Asp_Ph_2_NH2__',
 'dAsp_pyrrol_1_yl_',
 'E',
 'Glu_NH2',
 'Glu_3R_Me_',
 'Glu_OMe_',
 'dGlu_OMe_',
 'Phe_4_F_',
 'dPhe_4_F_',
 'Phe_4_NO2_',
 'dPhe_3_4_diF_',
 'Me_Phe_a_b_dehydro_',
 'Bn_4_OH__Gly',
 'HOCOCH2_Gly_ol',
 'NH2Bu_Gly',
 'PhPr_Gly',
 'cHexCH2_Gly',
 '2_pyridylmethyl_Gly',
 'd_N__O_Gly_allyl_',
 'GABA',
 'bHph',
 'dHyp',
 '_N__O_xiIle',
 'd_N__O_aIle',
 'Me_dK',
 'Lys_Cbz_',
 'Lys_Me_',
 'Me_Lys_Me_',
 'Lys_Tfa_',
 'aMeLeu',
 'dLeu_3R_OH_',
 '_N__O_Leu',
 'd_N__O_Leu',
 'M',
 'meM',
 'Met_O2_',
 'meN',
 'dAsn_Me2_',
 '1_Nal',
 'd1_Nal',
 'Me_dNle',
 'dNva',
 'Me_dNva',
 'Orn',
 'meQ',
 'dGln_Me2_',
 'R',
 'Arg_Me_Me_',
 'Ser_Ac_',
 'dSer_Me_',
 'Sta',
 'Sta_3R_4R_',
 'dT',
 'Tza',
 'Me_dV',
 '_N__O_Val',
 'd_N__O_Val',
 '_N__O_Val_3_OH_',
 'Me_dW',
 'Trp_5_Br_',
 'Trp_6_Br_',
 'Trp_7_Br_',
 'dY',
 'Me_dY',
 'Me_Tyr_Me_',
 'dTyr_bR_OMe_',

In [18]:
result_df.to_csv('results/Monomeric/Monomer_comp_constRemoval_results_Caco2.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_constRemoval_prediction_data_Caco2.csv')

In [5]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [16]:
df_train = pd.read_csv('features/Monomeric/Train_mon_comp_caco2.csv')
df_mc_train = clean_feature_names(df_train)
df_mc_train = df_mc_train.drop(['ID','SMILES','Permeability'],axis=1)
df_mc, const_col = remove_low_variance_columns(df_mc_train)
X_train = df_mc
y_train = df_train['Permeability']
print(X_train.shape)
print(y_train.shape)

df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_Caco2.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(1008, 5)
(1008,)
(252, 5)
(252,)
0.3218453784980745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 5
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 5
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83
[LightGBM] [Info] Number of data points in the train set: 806, number of used



-0.16478956567744585




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.4317,0.4966,0.657,0.288,0.573,0.5385,0.3925,0.4807,0.6265,0.3218,0.5897,0.53
LGBMRegressor,0.4232,0.4999,0.6505,0.302,0.5533,0.5078,0.4163,0.506,0.6452,0.2807,0.5355,0.491
XGBRegressor,0.4463,0.5064,0.6681,0.2639,0.561,0.5302,0.3917,0.4849,0.6259,0.3232,0.5912,0.5393
DecisionTreeRegressor,0.5133,0.5274,0.7164,0.1535,0.5157,0.4998,0.4182,0.4966,0.6467,0.2774,0.5631,0.5061
RandomForestRegressor,0.417,0.4901,0.6458,0.3123,0.5768,0.5406,0.4032,0.4951,0.635,0.3032,0.5656,0.5172
GradientBoostingRegressor,0.4146,0.5062,0.6439,0.3162,0.5629,0.5084,0.3949,0.505,0.6284,0.3177,0.5719,0.5235
AdaBoostRegressor,0.4807,0.5815,0.6933,0.2072,0.4629,0.4087,0.4813,0.5819,0.6938,0.1684,0.4676,0.4281
SVR,0.4358,0.5036,0.6602,0.2813,0.5438,0.4869,0.4816,0.5343,0.694,0.1678,0.4304,0.39
LinearRegression,0.5819,0.6336,0.7628,0.0403,0.2051,0.1922,0.5505,0.6174,0.742,0.0488,0.2223,0.1993
KNeighborsRegressor,0.4997,0.5408,0.7069,0.1759,0.4815,0.4386,0.459,0.5265,0.6775,0.2068,0.4841,0.4436


In [17]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.656475000000001, -6.179999999999995, -6.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.240000000000006, -6.336666666666666, -6.0...","[-7.171453333333341, -6.4257857142857135, -5.9...","[0.13709333333333368, 0.08761664075369621, 0.0..."
1,LGBMRegressor,"[-6.299147141949681, -6.071719488661043, -6.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.521021498146382, -6.174461871046051, -6.1...","[-6.58385324536282, -6.213737594350562, -6.118...","[0.2806316760054967, 0.10374678795144017, 0.04..."
2,XGBRegressor,"[-6.1325917, -6.1396337, -6.078206, -6.083127,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.2385, -6.226497, -6.5272536, -6.013157, -...","[-7.1323385, -6.3202715, -6.291472, -5.9834166...","[0.22118671, 0.106037535, 0.18238238, 0.032099..."
3,DecisionTreeRegressor,"[-6.2, -6.180000000000001, -6.061666666666667,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.336666666666667, -6.82, -5.946, -5...","[-6.9, -6.425785714285714, -6.4236666666666675...","[0.68, 0.0876166407536919, 0.48593598115160985..."
4,RandomForestRegressor,"[-6.318423015873013, -6.166567391774894, -6.03...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.672235844155846, -6.413104379509382, -6.2...","[-6.743550040043294, -6.443011351259853, -6.08...","[0.20590116043166343, 0.053913711789654525, 0...."
5,GradientBoostingRegressor,"[-6.825762223792113, -6.23428640439486, -6.032...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.903101172239964, -6.201058412954233, -6.3...","[-6.812967498618247, -6.207114010488236, -6.28...","[0.22750124916911613, 0.04440623484577123, 0.1..."
6,AdaBoostRegressor,"[-6.494307346377118, -6.318312213078638, -6.15...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.318312213078638, -6.287116884633327, -6.2...","[-6.262950725315946, -6.308914884114311, -6.23...","[0.0439005806112004, 0.09770467178685975, 0.07..."
7,SVR,"[-6.452643702787322, -6.138505063002963, -6.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.303362913392696, -6.049364975905198, -6.0...","[-6.50550855990547, -6.029261945015946, -6.042...","[0.1478185527105568, 0.03511442581654253, 0.06..."
8,LinearRegression,"[-6.288542287026149, -6.156128910308309, -6.12...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.108487335048254, -6.193935655028156, -5.8...","[-6.123218009407512, -6.208052471933376, -5.94...","[0.025417066659350174, 0.023648362650625972, 0..."
9,KNeighborsRegressor,"[-6.55, -6.27, -5.973333333333334, -6.26333333...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.586666666666666, -6.613333333333333, -6.0...","[-6.965333333333332, -6.503333333333333, -5.94...","[0.25472948614385227, 0.2478709341572748, 0.08..."


In [18]:
result_df.to_csv('results/Monomeric/Monomer_comp_LVR_results_Caco2.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_LVR_prediction_data_Caco2.csv')

In [19]:
#AA composition
df_aac_train = pd.read_csv('features/Monomeric/Train_aac_Caco2.csv')
X_train = df_aac_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_aac_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_aac_test = pd.read_csv('features/Monomeric/Test_aac_Caco2.csv')
X_test = df_aac_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_aac_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
aac_comp,prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
aac_comp

(1008, 21)
(1008,)
(252, 21)
(252,)
0.6144423831427892
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 178
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 13
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 13
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179
[Light



-0.07142284765620999




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.2524,0.3742,0.5024,0.5836,0.7694,0.7363,0.2231,0.3632,0.4724,0.6144,0.7862,0.7759
LGBMRegressor,0.2861,0.4054,0.5349,0.5282,0.7268,0.6937,0.2352,0.3878,0.4849,0.5936,0.7731,0.751
XGBRegressor,0.2569,0.3782,0.5068,0.5763,0.7651,0.7324,0.2558,0.3847,0.5058,0.5579,0.7556,0.7496
DecisionTreeRegressor,0.3641,0.4348,0.6034,0.3996,0.6864,0.6532,0.2688,0.3955,0.5185,0.5355,0.7451,0.7405
RandomForestRegressor,0.2422,0.3704,0.4921,0.6006,0.7753,0.7426,0.2265,0.3693,0.4759,0.6086,0.7807,0.7685
GradientBoostingRegressor,0.2812,0.4158,0.5303,0.5362,0.7342,0.7015,0.2448,0.3987,0.4948,0.5769,0.7666,0.7568
AdaBoostRegressor,0.4114,0.5445,0.6414,0.3214,0.5892,0.5724,0.38,0.5237,0.6164,0.3434,0.6462,0.6411
SVR,0.3073,0.4267,0.5544,0.4931,0.7035,0.6704,0.2637,0.4106,0.5135,0.5443,0.7401,0.7121
LinearRegression,0.4591,0.5552,0.6776,0.2428,0.4941,0.4741,0.409,0.5202,0.6395,0.2932,0.5423,0.5023
KNeighborsRegressor,0.3107,0.4177,0.5574,0.4875,0.7082,0.6622,0.3182,0.4361,0.5641,0.4501,0.6805,0.6571


In [20]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-7.240000000000006, -7.445000000000011, -6.92...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.240000000000006, -6.451549999999998, -6.5...","[-7.230000000000011, -6.5103608415346645, -6.7...","[0.006324555320333535, 0.04902707112102967, 0...."
1,LGBMRegressor,"[-6.885036443496819, -6.508432339278743, -6.52...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.885036443496819, -6.477142153017984, -6.0...","[-7.003516345466998, -6.505361150894795, -6.00...","[0.11388761905466495, 0.0773902325281761, 0.10..."
2,XGBRegressor,"[-7.227475, -7.4860234, -6.5240793, -6.3824825...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.227475, -6.4718294, -6.7645855, -5.80428,...","[-7.2225313, -6.422544, -6.8524237, -5.8481283...","[0.01198302, 0.15170343, 0.33546636, 0.0966097..."
3,DecisionTreeRegressor,"[-7.24, -7.445, -7.015000000000001, -7.445, -6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -5.96, -7.85, -5.8, -5.92, -7.0, -7.0...","[-7.230000000000001, -6.176, -7.60333333333333...","[0.006324555320336905, 0.43250895944477263, 0...."
4,RandomForestRegressor,"[-6.8987705999200015, -7.374499999999997, -6.6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.8987705999200015, -6.479327173699999, -6....","[-6.98563945331734, -6.471273577972856, -6.817...","[0.11943406396826285, 0.05440338769036966, 0.1..."
5,GradientBoostingRegressor,"[-6.441232430236538, -7.05082411872343, -7.201...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.441232430236538, -6.6699274817389655, -6....","[-6.610504534588509, -6.645583384672752, -6.48...","[0.18106457432881606, 0.14477337947855853, 0.1..."
6,AdaBoostRegressor,"[-6.606840802130835, -6.837977777777769, -6.76...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.606840802130835, -6.41314700738942, -6.41...","[-6.639689637893137, -6.418508434517584, -6.31...","[0.03557183981343186, 0.062461197929265785, 0...."
7,SVR,"[-6.441194364396824, -7.079131200183025, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.441194364396824, -6.632502828496561, -6.1...","[-6.4721203510061684, -6.725736790149783, -6.1...","[0.07675401497490701, 0.09149188996432588, 0.0..."
8,LinearRegression,"[-6.208949023822581, -6.9107658497571025, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.208949023822581, -5.717043932589794, -5.3...","[-6.171426907982546, -5.888639566358313, -5.41...","[0.02849486463249442, 0.1128691446147457, 0.04..."
9,KNeighborsRegressor,"[-7.4433333333333325, -7.3066666666666675, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.4433333333333325, -6.6499999999999995, -6...","[-7.346000000000001, -6.730666666666667, -7.00...","[0.2907033615981154, 0.15333913032520063, 0.38..."


In [21]:
aac_comp.to_csv('results/Monomeric/AAC_comp_results_Caco2.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_prediction_data_Caco2.csv')

In [6]:
#Constant column removal
df_mc_train = pd.read_csv('features/Monomeric/Train_aac_Caco2.csv')
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_aac_Caco2.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
aac_comp,prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
aac_comp

(1008, 18)
(1008,)
(252, 18)
(252,)
0.6179269685791806
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 178
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 13
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 13
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can se



0.010255344988451709




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.2546,0.3759,0.5046,0.5801,0.7675,0.7337,0.2211,0.3621,0.4702,0.6179,0.7883,0.7791
LGBMRegressor,0.2861,0.4054,0.5349,0.5282,0.7268,0.6937,0.2352,0.3878,0.4849,0.5936,0.7731,0.751
XGBRegressor,0.2569,0.3782,0.5068,0.5763,0.7651,0.7324,0.2558,0.3847,0.5058,0.5579,0.7556,0.7496
DecisionTreeRegressor,0.3523,0.4294,0.5935,0.419,0.697,0.6605,0.2698,0.3927,0.5194,0.5338,0.7446,0.7406
RandomForestRegressor,0.2404,0.3691,0.4903,0.6035,0.7771,0.7453,0.2264,0.3701,0.4758,0.6088,0.7808,0.7667
GradientBoostingRegressor,0.281,0.4158,0.5301,0.5366,0.7345,0.7018,0.2445,0.3983,0.4945,0.5775,0.7669,0.7574
AdaBoostRegressor,0.409,0.5413,0.6395,0.3255,0.5861,0.565,0.3797,0.5247,0.6162,0.3439,0.6339,0.6401
SVR,0.3073,0.4267,0.5544,0.4931,0.7035,0.6704,0.2637,0.4106,0.5135,0.5443,0.7401,0.712
LinearRegression,0.4591,0.5552,0.6776,0.2428,0.4941,0.4741,0.409,0.5202,0.6395,0.2932,0.5423,0.5023
KNeighborsRegressor,0.3116,0.4185,0.5582,0.4862,0.7073,0.6614,0.3177,0.4353,0.5637,0.451,0.6811,0.6587


In [7]:
aac_comp.to_csv('results/Monomeric/AAC_comp_results_const_rem_Caco2.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_prediction_data_const_rem_Caco2.csv')

In [23]:
#LVR column removal
df_mc_train = pd.read_csv('features/Monomeric/Train_aac_Caco2.csv')
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
X_train, const_col = remove_low_variance_columns(X_train)

y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_aac_Caco2.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_mc = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_mc, X_train,y_train, X_test,  y_test)
result_df

(1008, 5)
(1008,)
(252, 5)
(252,)
0.5231110399482617
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 5
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 5
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 806, number of used



0.1480522885433574




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.3795,0.461,0.616,0.3741,0.6378,0.6088,0.276,0.4181,0.5253,0.5231,0.726,0.6962
LGBMRegressor,0.3627,0.4635,0.6022,0.4018,0.6365,0.5952,0.2633,0.4153,0.5131,0.5451,0.7435,0.7114
XGBRegressor,0.3874,0.4678,0.6225,0.361,0.6317,0.5924,0.2689,0.4182,0.5186,0.5353,0.7334,0.6978
DecisionTreeRegressor,0.4265,0.4804,0.6531,0.2966,0.6046,0.5701,0.2855,0.4229,0.5343,0.5067,0.7175,0.6818
RandomForestRegressor,0.3557,0.4544,0.5964,0.4134,0.6521,0.6087,0.2591,0.4082,0.5091,0.5522,0.7445,0.7129
GradientBoostingRegressor,0.3533,0.4691,0.5944,0.4173,0.6462,0.6019,0.3086,0.4474,0.5555,0.4667,0.689,0.6443
AdaBoostRegressor,0.4527,0.5791,0.6728,0.2534,0.5224,0.484,0.4357,0.567,0.6601,0.2472,0.5479,0.5354
SVR,0.4098,0.4901,0.6402,0.3241,0.581,0.5289,0.3449,0.4592,0.5873,0.404,0.638,0.5681
LinearRegression,0.5336,0.6108,0.7305,0.12,0.3467,0.3232,0.5008,0.6011,0.7077,0.1347,0.3681,0.2939
KNeighborsRegressor,0.4591,0.5104,0.6776,0.2428,0.5576,0.5202,0.3419,0.4597,0.5847,0.4093,0.6496,0.6162


In [24]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-7.240000000000006, -6.302222222222216, -6.30...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.240000000000006, -6.6772166666666655, -6....","[-7.230000000000011, -6.679449281011334, -6.06...","[0.006324555320333535, 0.0558162913016872, 0.0..."
1,LGBMRegressor,"[-7.1369403774733104, -6.192754382150387, -6.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.1369403774733104, -6.4223574196702655, -6...","[-7.022203077673488, -6.483200540898531, -6.33...","[0.11580425885563135, 0.12686175083826434, 0.0..."
2,XGBRegressor,"[-7.251431, -6.2783113, -6.2783113, -5.9454646...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.251431, -6.3163, -5.724035, -5.798092, -5...","[-7.231393, -6.6721015, -5.824401, -5.8970366,...","[0.013492818, 0.24480946, 0.12565546, 0.185065..."
3,DecisionTreeRegressor,"[-7.24, -6.302222222222223, -6.302222222222223...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -5.526666666666666, -5.92, -5.8, -6.2...","[-7.230000000000001, -6.282033333333333, -6.01...","[0.006324555320336905, 0.6030029334736092, 0.1..."
4,RandomForestRegressor,"[-7.004740061864767, -6.294737015484514, -6.29...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.004740061864766, -6.483638333333334, -6.1...","[-7.07717961410456, -6.440275894681283, -6.153...","[0.0715569234319991, 0.11186487061609168, 0.03..."
5,GradientBoostingRegressor,"[-6.718738291669362, -5.938383237493036, -5.93...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.718738291669362, -6.6152367327455055, -6....","[-6.713255794620996, -6.687445773412115, -6.36...","[0.06218194581965795, 0.052797851441996085, 0...."
6,AdaBoostRegressor,"[-6.535920249998465, -6.383475887249133, -6.38...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.535920249998465, -6.581961485103581, -6.4...","[-6.468204022403782, -6.548916511238957, -6.33...","[0.11677338774896678, 0.01998094138606584, 0.1..."
7,SVR,"[-6.454578361268403, -5.900133187291434, -5.90...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.454578361268403, -6.188651681904176, -6.1...","[-6.559418720403872, -6.186223218564108, -6.14...","[0.09283199720873064, 0.022435221318779584, 0...."
8,LinearRegression,"[-6.1126432448203785, -6.105467072767023, -6.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.1126432448203785, -6.606769399335539, -5....","[-6.072958925878552, -6.662793763368074, -5.58...","[0.02270878848106582, 0.041534876650495114, 0...."
9,KNeighborsRegressor,"[-7.333333333333333, -6.8500000000000005, -6.8...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.333333333333333, -6.196666666666666, -6.0...","[-7.053333333333333, -6.103999999999999, -5.91...","[0.15999999999999992, 0.1247468547811039, 0.05..."


In [25]:
result_df.to_csv('results/Monomeric/AAC_comp_LVR_results_Caco2.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_LVR_prediction_data_Caco2.csv')

In [26]:
#Atomic models
df_train = pd.read_csv('features/Atomic/Train_all_atomic_desc_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Atomic/Test_all_atomic_desc_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_degree = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_degree, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 23)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 23)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 504
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 17
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 17
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead



0.4902597058809923




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2684,0.389,0.5181,0.5574,0.7473,0.7177,0.2215,0.3587,0.4706,0.6173,0.7859,0.7615
DecisionTreeRegressor,0.3663,0.4315,0.6052,0.3959,0.6857,0.6548,0.2571,0.3691,0.5071,0.5557,0.7582,0.7386
RandomForestRegressor,0.2618,0.3761,0.5117,0.5682,0.7561,0.7262,0.2281,0.3571,0.4776,0.6058,0.7792,0.7481
GradientBoostingRegressor,0.2601,0.3828,0.51,0.5711,0.7558,0.7179,0.2123,0.358,0.4608,0.6332,0.8001,0.7616
AdaBoostRegressor,0.3602,0.4956,0.6002,0.4059,0.6481,0.6014,0.3494,0.4955,0.5911,0.3963,0.6541,0.6192
XGBRegressor,0.2835,0.3905,0.5325,0.5324,0.7414,0.7117,0.2192,0.3431,0.4682,0.6212,0.7909,0.769
ExtraTreesRegressor,0.2741,0.3827,0.5236,0.5479,0.7493,0.7193,0.2168,0.3433,0.4656,0.6254,0.7936,0.7732
LinearRegression,0.4009,0.5017,0.6331,0.3389,0.5826,0.5313,0.3402,0.4863,0.5833,0.4121,0.646,0.5732
KNeighborsRegressor,0.2948,0.3989,0.5429,0.5139,0.7281,0.6945,0.2463,0.3801,0.4963,0.5744,0.7627,0.7365
SVR,0.2836,0.4042,0.5326,0.5323,0.7317,0.6915,0.2128,0.3644,0.4614,0.6322,0.7999,0.7577


In [27]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.785442850397671, -7.162945696316613, -7.16...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.785442850397671, -6.008832675993183, -6.2...","[-6.879893025995029, -6.314524960128983, -6.52...","[0.05359799761486307, 0.18628775092345418, 0.1..."
1,DecisionTreeRegressor,"[-6.21, -7.05, -7.05, -7.0, -5.7, -7.15, -6.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.21, -7.4, -6.21, -5.92, -6.66, -7.22, -5....","[-7.017999999999999, -5.886, -6.5, -5.928, -5....","[0.40399999999999986, 0.8459219822182187, 0.35..."
2,RandomForestRegressor,"[-6.890520880224292, -7.013099999999998, -7.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.890520880224292, -5.942608389466668, -6.5...","[-6.989216088368532, -5.946909789448667, -6.60...","[0.06826955604930098, 0.10504694123717265, 0.0..."
3,GradientBoostingRegressor,"[-7.314808768490747, -7.232058644420659, -7.23...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.314808768490747, -6.4606831558014735, -6....","[-7.2217138508825744, -6.506455389425888, -6.7...","[0.05381220953463885, 0.11911795109560126, 0.1..."
4,AdaBoostRegressor,"[-6.564452052367762, -6.720733040578791, -6.72...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.564452052367762, -6.243230769230767, -6.2...","[-6.568440160038245, -6.257742933757468, -6.43...","[0.1873201897075156, 0.1695581801270102, 0.129..."
5,XGBRegressor,"[-6.868759, -6.793629, -6.793629, -6.299522, -...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.868759, -6.0290594, -6.4102063, -5.920694...","[-7.1431823, -6.2309546, -6.547484, -5.9096847...","[0.1376938, 0.28824592, 0.3167449, 0.014137465..."
6,ExtraTreesRegressor,"[-7.079900000000006, -7.027999999999994, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.073500000000006, -5.760579053970003, -6.5...","[-7.165259803920014, -6.023377383755335, -6.68...","[0.04898850497424654, 0.19597958075466657, 0.0..."
7,LinearRegression,"[-6.511459707731925, -7.173040318900999, -7.17...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.518935378230914, -5.854003248234029, -5.7...","[-6.499810519021622, -5.930086057505843, -5.79...","[0.06193961817347407, 0.04967966255302722, 0.0..."
8,KNeighborsRegressor,"[-6.87, -7.3500000000000005, -7.35000000000000...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.87, -5.266666666666667, -6.86333333333333...","[-6.986666666666666, -5.433333333333333, -6.83...","[0.11083521301664244, 0.16481639616386573, 0.3..."
9,SVR,"[-6.679053459234488, -7.102383603994936, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.675830605402576, -5.964232367203491, -6.2...","[-6.68405437896876, -6.259770083183646, -6.465...","[0.11054051471616042, 0.17855062876977673, 0.0..."


In [28]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_Caco2.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_Caco2.csv')

In [10]:
#Atomic + monomeric_composition based features
df1 = pd.read_csv('features/Monomeric/Train_mon_comp_caco2.csv')
df2 = pd.read_csv('features/Atomic/Train_all_atomic_desc_Caco2.csv')
df_train = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_train

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_F,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,0.071429,0.000000,0.142857,0.000000,0.0,0.0,0.0,...,0,87,14,0,24,0,0,155,1,1
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,0.071429,0.071429,0.071429,0.000000,0.0,0.0,0.0,...,0,86,14,0,24,0,0,155,1,1
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,85,15,0,12,0,0,155,1,1
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,0.071429,0.000000,0.142857,0.071429,0.0,0.0,0.0,...,0,85,14,0,18,0,0,149,1,1
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,85,15,0,12,0,0,148,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.250000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,22,4,0,6,0,0,40,1,1
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,0.333333,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,22,3,0,6,0,0,35,1,1
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,0.333333,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,21,3,0,6,0,0,35,1,1
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,0.333333,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,20,3,0,6,0,0,35,1,1


In [11]:
df1 = pd.read_csv('features/Monomeric/Test_mon_comp_Caco2.csv')
df2 = pd.read_csv('features/Atomic/Test_all_atomic_desc_Caco2.csv')
df_test = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_test

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_F,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,0.071429,0.000000,0.071429,0.000000,0.0,0.0,0.0,...,0,87,14,0,24,0,0,154,1,1
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,0.000000,0.000000,0.000000,0.083333,0.0,0.0,0.0,...,0,95,12,0,6,0,0,141,1,1
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,81,12,0,24,0,0,140,1,1
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,1,89,12,0,12,0,0,148,1,1
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,0.000000,0.090909,0.000000,0.000000,0.0,0.0,0.0,...,1,90,12,0,12,0,0,143,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,20,3,0,11,0,0,42,1,1
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,23,3,0,6,0,0,35,1,1
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,23,3,0,6,0,0,35,1,1
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.333333,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,23,3,0,6,0,0,35,1,1


In [5]:
import re
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)
    df.columns = [clean_name(col) for col in df.columns]
    return df

In [4]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [14]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 408)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 408)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.085451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 843
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 57
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 843
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 58
[LightGBM] [Info] Start training from score -6.274773
[Lig

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2111,0.3445,0.4594,0.6519,0.8075,0.7879,0.1678,0.3227,0.4097,0.71,0.8451,0.8278
DecisionTreeRegressor,0.3151,0.4057,0.5614,0.4803,0.7347,0.7095,0.1853,0.315,0.4305,0.6798,0.8274,0.8039
RandomForestRegressor,0.1885,0.3172,0.4342,0.6891,0.8307,0.805,0.1612,0.3069,0.4015,0.7215,0.8501,0.8347
GradientBoostingRegressor,0.2034,0.3421,0.4511,0.6645,0.8189,0.7953,0.1913,0.348,0.4374,0.6695,0.8259,0.7927
AdaBoostRegressor,0.3476,0.4931,0.5896,0.4267,0.7037,0.6608,0.3324,0.4945,0.5766,0.4256,0.7268,0.6731
XGBRegressor,0.1913,0.3222,0.4373,0.6845,0.8292,0.8027,0.1523,0.2976,0.3903,0.7368,0.8595,0.8472
ExtraTreesRegressor,0.1835,0.3098,0.4284,0.6973,0.8363,0.8124,0.1687,0.3067,0.4107,0.7085,0.8418,0.8248
LinearRegression,0.4587,0.4439,0.6773,0.2435,0.6599,0.7239,0.3791,0.4237,0.6157,0.345,0.6608,0.6798
KNeighborsRegressor,0.2924,0.3928,0.5407,0.5178,0.7241,0.7018,0.3292,0.4185,0.5738,0.4312,0.6698,0.6651
SVR,0.25,0.3709,0.5,0.5878,0.7692,0.7608,0.2136,0.3598,0.4622,0.6309,0.7996,0.7786


In [18]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_and_mono_comp_Caco2.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_and_mono_comp_Caco2.csv')

In [19]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 239)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 239)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 843
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 57
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 843
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 58
[LightGBM] [Info] Start training from score -6.274773
[Lig



0.22432695519313683


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2111,0.3445,0.4594,0.6519,0.8075,0.7879,0.1678,0.3227,0.4097,0.71,0.8451,0.8278
DecisionTreeRegressor,0.3136,0.4073,0.56,0.4827,0.7356,0.7101,0.1914,0.3187,0.4375,0.6693,0.8217,0.7984
RandomForestRegressor,0.1884,0.3171,0.4341,0.6892,0.8309,0.806,0.1615,0.3072,0.4019,0.7209,0.8497,0.8345
GradientBoostingRegressor,0.2041,0.3424,0.4517,0.6635,0.8183,0.7947,0.1908,0.3472,0.4368,0.6703,0.8263,0.7936
AdaBoostRegressor,0.3443,0.4908,0.5867,0.4322,0.6995,0.6529,0.3308,0.4942,0.5752,0.4284,0.7273,0.6842
XGBRegressor,0.1913,0.3222,0.4373,0.6845,0.8292,0.8027,0.1523,0.2976,0.3903,0.7368,0.8595,0.8472
ExtraTreesRegressor,0.1869,0.3126,0.4323,0.6918,0.833,0.8107,0.1709,0.3083,0.4134,0.7046,0.8395,0.8217
LinearRegression,0.4587,0.4439,0.6773,0.2435,0.6599,0.7239,0.3791,0.4237,0.6157,0.345,0.6608,0.6798
KNeighborsRegressor,0.2924,0.3928,0.5407,0.5178,0.7241,0.7018,0.3292,0.4185,0.5738,0.4312,0.6698,0.6651
SVR,0.25,0.3709,0.5,0.5878,0.7692,0.7608,0.2136,0.3598,0.4622,0.6309,0.7996,0.7786


In [20]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.811913500485473, -6.940557733340329, -6.96...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.842443052619469, -6.314180550621558, -6.4...","[-7.002129629003934, -6.4251827125932, -6.6299...","[0.11765484271522285, 0.12483511480335593, 0.1..."
1,DecisionTreeRegressor,"[-6.21, -7.0, -7.0, -6.89, -6.66, -7.15, -5.49...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.21, -6.39, -6.21, -5.92, -5.8, -6.89, -7....","[-7.017999999999999, -6.242, -6.33800000000000...","[0.40399999999999986, 0.07440430095095292, 0.2..."
2,RandomForestRegressor,"[-6.833698039200004, -7.1262605664799965, -6.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.8700980392000055, -6.28818607315, -6.6287...","[-7.01310274509334, -6.3093265411753325, -6.59...","[0.10273721279395023, 0.0331129816829038, 0.11..."
3,GradientBoostingRegressor,"[-6.797795571513833, -6.958384973211656, -6.79...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7129409239731235, -6.3430355216795835, -6...","[-6.800583269791351, -6.445153128415211, -6.43...","[0.13505251280298725, 0.09687191662047147, 0.0..."
4,AdaBoostRegressor,"[-6.25923913043478, -6.641835730716036, -6.641...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.25923913043478, -6.577021224, -6.09, -6.4...","[-6.4313022772861785, -6.451644243307351, -6.2...","[0.09475563882340855, 0.09816632633228475, 0.1..."
5,XGBRegressor,"[-6.627334, -7.3082066, -7.026215, -6.9746356,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.661525, -6.3276634, -6.850144, -5.875889,...","[-7.088649, -6.325822, -6.6942863, -5.902438, ...","[0.21747775, 0.17571814, 0.15576974, 0.0263765..."
6,ExtraTreesRegressor,"[-7.182200000000006, -7.218299999999995, -7.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.167700000000007, -6.4179406940299994, -6....","[-7.172280000000015, -6.331654418089999, -6.60...","[0.025263760606846233, 0.0478102056084313, 0.0..."
7,LinearRegression,"[-6.087909833249171, -7.157194951178878, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.2287270823227106, -6.088920830665623, -5....","[-6.479506918301636, -6.063175311345572, -5.60...","[0.1335343639219489, 0.2523962727807513, 0.090..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -7.02666666666666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.19, -6.653333333333333, -6.14000000000000...","[-6.575999999999999, -6.370666666666668, -6.41...","[0.2599863244266685, 0.1691600162896391, 0.195..."
9,SVR,"[-6.323102107616452, -7.149387292371088, -7.09...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.512223702391223, -6.012406069161191, -5.6...","[-6.491831064350899, -6.03297400250199, -5.641...","[0.04612587597805733, 0.01674593187064185, 0.0..."


In [21]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_and_mono_comp_const_rem_Caco2.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_and_mono_comp_const_rem_Caco2.csv')

In [22]:
const_col

['Ala_tBu_',
 'Me_Ala_indol_2_yl_',
 'Ala_5_Tet_',
 'Me_dAbu',
 '2Abz',
 'HOCOCH2_Bal',
 'Cys_EtO2H__NH2',
 'Cha',
 'dCha',
 'Asp_OMe_',
 'Asp_Ph_2_NH2__',
 'dAsp_pyrrol_1_yl_',
 'E',
 'Glu_NH2',
 'Glu_3R_Me_',
 'Glu_OMe_',
 'dGlu_OMe_',
 'Phe_4_F_',
 'dPhe_4_F_',
 'Phe_4_NO2_',
 'dPhe_3_4_diF_',
 'Me_Phe_a_b_dehydro_',
 'Bn_4_OH__Gly',
 'HOCOCH2_Gly_ol',
 'NH2Bu_Gly',
 'PhPr_Gly',
 'cHexCH2_Gly',
 '2_pyridylmethyl_Gly',
 'd_N__O_Gly_allyl_',
 'GABA',
 'bHph',
 'dHyp',
 '_N__O_xiIle',
 'd_N__O_aIle',
 'Me_dK',
 'Lys_Cbz_',
 'Lys_Me_',
 'Me_Lys_Me_',
 'Lys_Tfa_',
 'aMeLeu',
 'dLeu_3R_OH_',
 '_N__O_Leu',
 'd_N__O_Leu',
 'M',
 'meM',
 'Met_O2_',
 'meN',
 'dAsn_Me2_',
 '1_Nal',
 'd1_Nal',
 'Me_dNle',
 'dNva',
 'Me_dNva',
 'Orn',
 'meQ',
 'dGln_Me2_',
 'R',
 'Arg_Me_Me_',
 'Ser_Ac_',
 'dSer_Me_',
 'Sta',
 'Sta_3R_4R_',
 'dT',
 'Tza',
 'Me_dV',
 '_N__O_Val',
 'd_N__O_Val',
 '_N__O_Val_3_OH_',
 'Me_dW',
 'Trp_5_Br_',
 'Trp_6_Br_',
 'Trp_7_Br_',
 'dY',
 'Me_dY',
 'Me_Tyr_Me_',
 'dTyr_bR_OMe_',

In [46]:
#Fingerprints models
#All fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/All_fingerprints_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/All_fingerprints_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 20188)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 20188)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.161332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12186
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2601
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12321
[LightGBM] [Info] Number of data points in the train set: 806, number of us

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1833,0.3168,0.4282,0.6976,0.8353,0.8191,0.1571,0.3007,0.3964,0.7285,0.8547,0.8441
DecisionTreeRegressor,0.3453,0.4214,0.5876,0.4305,0.714,0.6825,0.206,0.3362,0.4538,0.6441,0.8068,0.7966
RandomForestRegressor,0.1848,0.3137,0.4299,0.6953,0.8348,0.8206,0.1652,0.3069,0.4064,0.7146,0.8467,0.8393
GradientBoostingRegressor,0.198,0.3348,0.445,0.6734,0.8226,0.8101,0.165,0.3199,0.4062,0.7149,0.8506,0.8345
AdaBoostRegressor,0.3152,0.4652,0.5614,0.4802,0.7195,0.6754,0.2923,0.4598,0.5407,0.4949,0.7413,0.7108
XGBRegressor,0.1982,0.3245,0.4452,0.6731,0.824,0.8087,0.1555,0.2847,0.3944,0.7313,0.8555,0.8467
ExtraTreesRegressor,0.213,0.3319,0.4615,0.6488,0.8086,0.7962,0.1802,0.3043,0.4244,0.6887,0.8303,0.824
LinearRegression,2.056,1.0115,1.4339,-2.3908,0.3768,0.4478,0.7796,0.6407,0.883,-0.3472,0.568,0.5893
KNeighborsRegressor,0.2341,0.3539,0.4838,0.6139,0.7917,0.7737,0.2004,0.339,0.4477,0.6537,0.8103,0.7897
SVR,0.2068,0.3443,0.4548,0.6589,0.8159,0.8084,0.1809,0.3289,0.4254,0.6874,0.8334,0.8094


In [47]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.230442747040298, -7.174354110898877, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.592169102858935, -6.166135714699764, -6.3...","[-6.727040670043354, -6.0582497128898, -6.5971...","[0.09759813123230641, 0.1041653968640557, 0.18..."
1,DecisionTreeRegressor,"[-5.68, -6.82, -7.05, -7.22, -4.85, -6.68, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.52, -5.82, -6.21, -5.92, -5.62, -7.22, -6...","[-6.587999999999999, -5.406000000000001, -6.60...","[0.4714827674475495, 0.4664161232204566, 0.266..."
2,RandomForestRegressor,"[-6.110328033373329, -7.2697999999999965, -6.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.492399999999998, -6.1927666666666665, -6....","[-6.5086200000000005, -6.118692888888891, -6.4...","[0.05782710091298039, 0.0805781429929812, 0.08..."
3,GradientBoostingRegressor,"[-6.477180018910038, -7.158692699074034, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.313988250530257, -6.421150621899522, -6.3...","[-6.628202117190223, -6.11465572183787, -6.503...","[0.2562461721952333, 0.1902522702471319, 0.139..."
4,AdaBoostRegressor,"[-6.256625736088276, -7.0653424657534245, -6.5...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.465759319560004, -6.260835363062829, -6.1...","[-6.625223765224481, -6.124024096670052, -6.14...","[0.1317790706181564, 0.07444372228125373, 0.06..."
5,XGBRegressor,"[-6.407685, -7.5912657, -7.0415297, -6.463961,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6907377, -5.7143283, -6.471988, -5.922387...","[-6.9337173, -5.7293396, -6.5112367, -5.889043...","[0.14073293, 0.21111497, 0.2323347, 0.05400681..."
6,ExtraTreesRegressor,"[-6.115624999999998, -7.315899999999998, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.543900000000001, -5.9145999999999965, -6....","[-6.101799999999999, -6.084983923901996, -6.19...","[0.2584989284310489, 0.1433493328571115, 0.037..."
7,LinearRegression,"[-5.794773390702908, -7.646077956205973, -7.53...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.379546641767751, -8.211630364830109, -6.6...","[-7.9940057603591494, -6.924012369064836, -6.5...","[1.3339606920193718, 2.4666923916836234, 0.206..."
8,KNeighborsRegressor,"[-6.646666666666666, -7.343333333333334, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.646666666666666, -5.653333333333333, -6.8...","[-6.772, -5.836, -6.723999999999999, -5.840666...","[0.20227154465662678, 0.1290934200922383, 0.28..."
9,SVR,"[-6.300328806661749, -7.134840652962888, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.483978263308321, -6.090527898584579, -5.9...","[-6.6089339684804385, -6.250206244366101, -5.9...","[0.21008766634527612, 0.0820454123767831, 0.05..."


In [48]:
result_df.to_csv('results/Fingerprints/Results_All_fingerprints_fp_Caco2.csv')
prediction_df.to_csv('results/Fingerprints/Prediction_data_All_fingerprints_fp_Caco2.csv')

In [49]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [50]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [51]:
#All fingerprints constant removal
df_train = pd.read_csv('features/Fingerprints/Train/All_fingerprints_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/All_fingerprints_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 5613)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 5613)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089608 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12186
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2601
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12321
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2654
[LightGBM] [Info] Start training from score -6.2

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1833,0.3168,0.4282,0.6976,0.8353,0.8191,0.1571,0.3007,0.3964,0.7285,0.8547,0.8441
DecisionTreeRegressor,0.3346,0.4125,0.5784,0.4482,0.722,0.6914,0.2042,0.3365,0.4519,0.6472,0.8082,0.7976
RandomForestRegressor,0.1841,0.3133,0.429,0.6964,0.8355,0.8209,0.1645,0.3065,0.4056,0.7158,0.8474,0.8403
GradientBoostingRegressor,0.1992,0.3353,0.4463,0.6715,0.8215,0.8092,0.1653,0.3209,0.4065,0.7145,0.8504,0.8347
AdaBoostRegressor,0.3064,0.4567,0.5535,0.4947,0.7316,0.6954,0.2902,0.4568,0.5387,0.4986,0.7476,0.7225
XGBRegressor,0.1982,0.3245,0.4452,0.6731,0.824,0.8087,0.1555,0.2847,0.3944,0.7313,0.8555,0.8467
ExtraTreesRegressor,0.2136,0.3313,0.4621,0.6478,0.8079,0.794,0.1802,0.3054,0.4245,0.6886,0.8302,0.8225
LinearRegression,2.056,1.0115,1.4339,-2.3908,0.3768,0.4478,0.7796,0.6407,0.883,-0.3472,0.568,0.5893
KNeighborsRegressor,0.2341,0.3539,0.4838,0.6139,0.7917,0.7737,0.2004,0.339,0.4477,0.6537,0.8103,0.7897
SVR,0.2068,0.3443,0.4548,0.6589,0.8159,0.8084,0.1809,0.3289,0.4253,0.6874,0.8334,0.8095


In [52]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.230442747040298, -7.174354110898877, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.592169102858935, -6.166135714699764, -6.3...","[-6.727040670043354, -6.0582497128898, -6.5971...","[0.09759813123230641, 0.1041653968640557, 0.18..."
1,DecisionTreeRegressor,"[-5.68, -6.89, -7.05, -6.89, -5.03, -6.68, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.57, -5.41, -6.21, -5.92, -5.7, -6.89, -6....","[-6.308, -5.766, -6.618, -5.906000000000001, -...","[0.6087166828665039, 0.7220692487566549, 0.228..."
2,RandomForestRegressor,"[-6.074656666666666, -7.214899999999996, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.525899999999998, -6.113150000000001, -6.2...","[-6.52378, -6.095498369268, -6.433279270772, -...","[0.05207530700821667, 0.0688633947660544, 0.08..."
3,GradientBoostingRegressor,"[-6.477180018910038, -7.158692699074034, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.249060398612867, -6.421150621899523, -6.3...","[-6.616663825589808, -6.109122035703592, -6.48...","[0.2738773145424052, 0.2035862775394289, 0.117..."
4,AdaBoostRegressor,"[-6.360437678535297, -6.976706928599985, -6.74...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.397300531832775, -6.173369267711374, -6.0...","[-6.606721761229058, -6.12307064270879, -6.144...","[0.19175333534195715, 0.05849752278169166, 0.0..."
5,XGBRegressor,"[-6.407685, -7.5912657, -7.0415297, -6.463961,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6907377, -5.7143283, -6.471988, -5.922387...","[-6.9337173, -5.7293396, -6.5112367, -5.889043...","[0.14073293, 0.21111497, 0.2323347, 0.05400681..."
6,ExtraTreesRegressor,"[-6.202674999999997, -7.306200000000001, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.552499999999998, -5.893499999999998, -6.1...","[-6.108760000000001, -6.047421666666664, -6.20...","[0.2915548360085965, 0.15371302008757826, 0.06..."
7,LinearRegression,"[-5.794773390702893, -7.646077956206032, -7.53...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.379546641767764, -8.21163036483125, -6.66...","[-7.9940057603591415, -6.924012369064553, -6.5...","[1.333960692019349, 2.4666923916834347, 0.2067..."
8,KNeighborsRegressor,"[-6.646666666666666, -7.343333333333334, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.646666666666666, -5.653333333333333, -6.8...","[-6.772, -5.836, -6.723999999999999, -5.840666...","[0.20227154465662678, 0.1290934200922383, 0.28..."
9,SVR,"[-6.300623149414168, -7.134778052344182, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.484167353661235, -6.090434137151604, -5.9...","[-6.608964515812255, -6.250243994437875, -5.91...","[0.2100162638792912, 0.08211306524467218, 0.05..."


In [53]:
result_df.to_csv('results/Fingerprints/Results_All_const_rem_fingerprints_Caco2.csv')
prediction_df.to_csv('results/Fingerprints/Prediction_data_All_const_rem_fingerprints_Caco2.csv')

In [4]:
#Morgan fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/morgan_fp_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/morgan_fp_test_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_fp

X_train shape:  (1008, 2048)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 2048)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009282 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 260
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 792
[LightGBM] [Info] Number of data points in the train set: 806, number of used feat

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2325,0.3603,0.4822,0.6165,0.7855,0.7664,0.2044,0.3511,0.4521,0.6468,0.8064,0.7918
DecisionTreeRegressor,0.3886,0.4406,0.6234,0.3591,0.6731,0.6543,0.2334,0.3564,0.4831,0.5967,0.7801,0.7738
RandomForestRegressor,0.2205,0.344,0.4696,0.6363,0.7982,0.7835,0.1994,0.3381,0.4465,0.6555,0.8113,0.8013
GradientBoostingRegressor,0.253,0.3865,0.503,0.5828,0.7669,0.75,0.2121,0.3705,0.4605,0.6335,0.8056,0.7814
AdaBoostRegressor,0.395,0.5266,0.6285,0.3485,0.6235,0.5657,0.3541,0.5062,0.5951,0.3881,0.6975,0.6885
XGBRegressor,0.223,0.3429,0.4723,0.6322,0.7989,0.7871,0.192,0.3296,0.4382,0.6682,0.8177,0.8001
ExtraTreesRegressor,0.3601,0.426,0.6001,0.4061,0.691,0.6729,0.2354,0.359,0.4852,0.5933,0.778,0.7725
LinearRegression,0.3337,0.4314,0.5776,0.4497,0.7324,0.7463,0.3333,0.4334,0.5773,0.424,0.7152,0.7344
KNeighborsRegressor,0.2748,0.3791,0.5242,0.5469,0.7513,0.7338,0.231,0.3606,0.4807,0.6008,0.7814,0.7486
SVR,0.2356,0.3672,0.4854,0.6114,0.7851,0.783,0.2047,0.3538,0.4525,0.6462,0.8068,0.7889


In [5]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.7777494236190625, -7.108990348615672, -6.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.438911585554302, -5.932360031086494, -5.8...","[-6.486117892901231, -5.823162637349735, -5.88...","[0.07816151251693149, 0.09052157674433366, 0.1..."
1,DecisionTreeRegressor,"[-5.64, -8.0, -7.0, -8.0, -6.13, -7.15, -6.24,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.09, -5.82, -6.06, -5.92, -6.82, -6.89, -5...","[-6.516, -5.992, -5.867999999999999, -5.906000...","[0.6115423125181121, 0.7268397347421234, 0.131..."
2,RandomForestRegressor,"[-5.784633333333331, -7.447599999999998, -7.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.469099999999997, -6.045300000000002, -5.9...","[-6.392526666666665, -6.01684738095238, -5.930...","[0.05031308820222198, 0.04344036529498158, 0.0..."
3,GradientBoostingRegressor,"[-5.923994132941158, -7.149471241843286, -6.82...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.045786742619369, -6.169471097965965, -5.9...","[-6.303544907746516, -6.1454259612492645, -5.9...","[0.25178607597738717, 0.06070391043894799, 0.0..."
4,AdaBoostRegressor,"[-6.466574923547405, -6.619652733330639, -6.46...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.57706896551724, -6.284720646514601, -6.21...","[-6.447614138370464, -6.293568000510599, -6.25...","[0.1462982189710318, 0.06985179670279033, 0.06..."
5,XGBRegressor,"[-5.7744627, -7.670866, -6.7389984, -6.961161,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.17029, -5.871232, -5.8253117, -5.8753457,...","[-6.8922067, -5.7087917, -5.868038, -5.865766,...","[0.40262344, 0.0842747, 0.03972987, 0.02203313..."
6,ExtraTreesRegressor,"[-5.640099999999993, -7.681699999999998, -7.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.412000000000005, -6.066800000000003, -6.0...","[-6.5291399999999955, -5.913660000000002, -5.8...","[0.6474028903241044, 0.43860268854624695, 0.13..."
7,LinearRegression,"[-5.861736673874411, -7.400810482471522, -7.68...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.511112329632067, -5.244032507332353, -5.8...","[-6.8788590292881056, -5.710753884705049, -5.8...","[0.23204768393607297, 0.24187025174305868, 0.1..."
8,KNeighborsRegressor,"[-6.646666666666666, -7.343333333333334, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.646666666666666, -5.653333333333333, -6.3...","[-6.772, -5.836, -6.208, -5.872000000000001, -...","[0.20227154465662678, 0.1290934200922383, 0.14..."
9,SVR,"[-6.47659291703563, -7.185895492296423, -7.048...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.472790201999077, -5.912146782946962, -5.8...","[-6.735193999517543, -6.042036521380792, -5.82...","[0.2025973115455079, 0.07579990689698538, 0.03..."


In [6]:
df_morgan_fp.to_csv('results/Fingerprints/Results_Morgan_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Morgan_fp_Caco2.csv')

In [7]:
#Morgan count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/count_morgan_fp_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/count_morgan_fp_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_count_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_count_fp

X_train shape:  (1008, 2048)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 2048)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 272
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1395
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 276
[LightGBM] [Info] Start training from score -6.27477

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1929,0.3252,0.4392,0.6819,0.826,0.815,0.1649,0.3124,0.406,0.7151,0.8465,0.8341
DecisionTreeRegressor,0.344,0.4212,0.5865,0.4327,0.7199,0.6993,0.217,0.3363,0.4659,0.625,0.7963,0.7787
RandomForestRegressor,0.1817,0.3122,0.4263,0.7003,0.838,0.8215,0.1726,0.3145,0.4155,0.7017,0.8386,0.8306
GradientBoostingRegressor,0.2059,0.3422,0.4538,0.6603,0.8171,0.8038,0.179,0.3392,0.423,0.6908,0.8393,0.8253
AdaBoostRegressor,0.3554,0.499,0.5961,0.4139,0.6897,0.621,0.3226,0.4845,0.568,0.4426,0.7487,0.7383
XGBRegressor,0.1911,0.3191,0.4371,0.6849,0.8299,0.8145,0.1621,0.2975,0.4026,0.72,0.8485,0.8443
ExtraTreesRegressor,0.1997,0.3199,0.4469,0.6706,0.8207,0.8111,0.1826,0.3087,0.4273,0.6845,0.8277,0.8215
LinearRegression,0.2579,0.3787,0.5078,0.5747,0.771,0.7719,0.2318,0.3699,0.4815,0.5994,0.7835,0.7827
KNeighborsRegressor,0.243,0.3561,0.493,0.5992,0.78,0.7609,0.235,0.3622,0.4848,0.5939,0.7766,0.7541
SVR,0.2129,0.3463,0.4614,0.6489,0.8086,0.8016,0.1918,0.3356,0.4379,0.6686,0.8214,0.8079


In [8]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.402238347094769, -6.948371334904761, -6.83...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.667466790179692, -6.4060437584116965, -6....","[-6.582340234083034, -6.204841039349185, -6.56...","[0.0973380977931299, 0.11365961950143075, 0.07..."
1,DecisionTreeRegressor,"[-6.6, -7.0, -7.0, -5.96, -4.85, -7.24, -8.0, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6, -7.08, -7.24, -5.92, -6.11, -6.89, -5....","[-6.508, -6.518000000000001, -6.65999999999999...","[0.42035223325206694, 0.9087661965544273, 0.39..."
2,RandomForestRegressor,"[-6.362388333333332, -7.1221, -6.9462, -6.7054...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.542900000000002, -6.6358000000000015, -6....","[-6.556448138418666, -6.3180873333333345, -6.6...","[0.09498088700384165, 0.17613828026613387, 0.0..."
3,GradientBoostingRegressor,"[-6.169607279923105, -6.881480391846541, -6.87...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.5610227638858465, -6.567992123198884, -6....","[-6.572022448886967, -6.45091061287076, -6.429...","[0.24440106605941272, 0.08287639951831595, 0.0..."
4,AdaBoostRegressor,"[-6.2535072164177, -6.349125672379309, -6.3105...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.310546820010839, -6.2500323624595495, -6....","[-6.306933377046857, -6.243619630248549, -6.36...","[0.13494899408464991, 0.04729349805644178, 0.1..."
5,XGBRegressor,"[-6.083667, -7.7080774, -7.2891927, -6.3476515...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.8704486, -6.5507455, -6.568348, -5.918117...","[-6.935752, -6.1527624, -6.6195884, -5.894174,...","[0.04853356, 0.3159736, 0.12803175, 0.02964058..."
6,ExtraTreesRegressor,"[-6.01274410013, -7.528799999999999, -6.993399...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.496699999999996, -6.2998, -6.327900000000...","[-6.178859999999995, -6.091384999999998, -6.44...","[0.18974503524466868, 0.18219567201226286, 0.0..."
7,LinearRegression,"[-6.064268598002763, -7.304939723470838, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.685909322589697, -5.966987089141374, -5.4...","[-6.983171363648244, -6.079369365174255, -5.53...","[0.27502300759299037, 0.16828922963040766, 0.1..."
8,KNeighborsRegressor,"[-6.646666666666666, -7.343333333333334, -6.98...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.646666666666666, -5.63, -6.33000000000000...","[-6.772, -5.798, -6.334000000000001, -5.872000...","[0.20227154465662678, 0.09995221080318537, 0.0..."
9,SVR,"[-6.540189548498312, -7.212360217920473, -7.08...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.553719370452742, -6.131178232154553, -5.7...","[-6.801435589290695, -6.201101144959564, -5.75...","[0.18591960263093996, 0.03921235708645845, 0.0..."


In [9]:
df_morgan_count_fp.to_csv('results/Fingerprints/Results_Count_Morgan_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Count_Morgan_fp_Caco2.csv')

In [10]:
#AtomPairs2d fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/AtomPairs2D_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/AtomPairs2D_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2D_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2D_fp

X_train shape:  (1008, 780)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 780)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 270
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 90
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285
[LightGBM] [Info] Number of data points in the train set: 806, number of used feature

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4817,0.5754,0.694,0.2056,0.4543,0.3396,0.4232,0.5355,0.6505,0.2687,0.5241,0.4635
DecisionTreeRegressor,0.455,0.5444,0.6746,0.2496,0.5042,0.3995,0.3955,0.5024,0.6289,0.3166,0.5637,0.496
RandomForestRegressor,0.4522,0.5435,0.6725,0.2542,0.5062,0.3968,0.3926,0.5007,0.6266,0.3216,0.5671,0.4927
GradientBoostingRegressor,0.4489,0.5454,0.67,0.2596,0.5096,0.3926,0.3952,0.5094,0.6286,0.3171,0.565,0.5023
AdaBoostRegressor,0.5126,0.6075,0.716,0.1546,0.4005,0.2491,0.4643,0.5813,0.6814,0.1978,0.4608,0.413
XGBRegressor,0.4593,0.5464,0.6777,0.2425,0.4978,0.3915,0.3931,0.4996,0.627,0.3207,0.5673,0.4966
ExtraTreesRegressor,0.459,0.5469,0.6775,0.2429,0.4983,0.3936,0.3959,0.5025,0.6292,0.3159,0.563,0.496
LinearRegression,0.4667,0.5552,0.6832,0.2303,0.4854,0.366,0.4229,0.5247,0.6503,0.2692,0.5277,0.4665
KNeighborsRegressor,0.5185,0.5765,0.7201,0.1448,0.443,0.3531,0.4164,0.5141,0.6453,0.2805,0.5415,0.4823
SVR,0.457,0.5476,0.6761,0.2462,0.4984,0.3997,0.4086,0.5077,0.6392,0.294,0.5444,0.4929


In [11]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.448127955904098, -6.280332925127136, -6.28...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.448127955904098, -6.448127955904098, -6.4...","[-6.436246752047469, -6.331976743120181, -6.43...","[0.010367777571748605, 0.20758666035594822, 0...."
1,DecisionTreeRegressor,"[-6.461278195488719, -7.2700000000000005, -7.2...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.461278195488719, -5.688000000000001, -6.4...","[-6.450097219562224, -5.786524358974359, -6.45...","[0.008877451998333235, 0.057996826297807044, 0..."
2,RandomForestRegressor,"[-6.4545358946380915, -7.247043959025466, -7.2...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.454535894638093, -5.680632447640076, -6.4...","[-6.447383976444715, -5.786852175345666, -6.44...","[0.008534425318916879, 0.0634069764552592, 0.0..."
3,GradientBoostingRegressor,"[-6.448305793006271, -7.156398458127847, -7.15...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.448305793006271, -5.799348855127138, -6.4...","[-6.4397303413544975, -5.871326963532544, -6.4...","[0.007905437499843456, 0.053257278829145854, 0..."
4,AdaBoostRegressor,"[-6.464610828032873, -6.483054390824982, -6.48...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.464610828032873, -6.426389295689942, -6.4...","[-6.433274169858514, -6.410331825572776, -6.43...","[0.01942657034783069, 0.017273757282682045, 0...."
5,XGBRegressor,"[-6.4612446, -7.2688684, -7.2688684, -7.03745,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4612446, -5.688598, -6.4612446, -5.92031,...","[-6.4499846, -5.7876616, -6.4499846, -5.932719...","[0.008903019, 0.05859942, 0.008903019, 0.02153..."
6,ExtraTreesRegressor,"[-6.461278195488717, -7.26999999999999, -7.269...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.461278195488718, -5.687999999999992, -6.4...","[-6.450097219562228, -5.786524358974358, -6.45...","[0.008877451998331811, 0.05799682629781213, 0...."
7,LinearRegression,"[-6.445089019862817, -7.119097253480925, -7.11...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.445089019862817, -5.848914903204188, -6.4...","[-6.442292156422459, -5.8783864196711075, -6.4...","[0.007538953188552699, 0.041366052483521557, 0..."
8,KNeighborsRegressor,"[-6.28, -7.36, -7.36, -7.036666666666666, -5.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.973333333333334, -6.28, -5.8933333...","[-6.552, -6.040000000000001, -6.552, -5.968666...","[0.22816757974007662, 0.033333333333333576, 0...."
9,SVR,"[-6.4502285385631986, -7.100101976334238, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4502285385631986, -5.850228256781613, -6....","[-6.432255571156816, -5.865966198596407, -6.43...","[0.01826572574263611, 0.017309387295292977, 0...."


In [12]:
df_AtomPairs2D_fp.to_csv('results/Fingerprints/Results_AtomPairs2D_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_AtomPairs2D_fp_Caco2.csv')

In [13]:
#AtomPairs2d Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/AtomPairs2DCount_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/AtomPairs2DCount_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2DCount_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2DCount_fp

X_train shape:  (1008, 780)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 780)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2329
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 129
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2320
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 134
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ov

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2271,0.3596,0.4766,0.6254,0.7909,0.766,0.1876,0.3411,0.4332,0.6758,0.8228,0.8073
DecisionTreeRegressor,0.4193,0.4626,0.6476,0.3084,0.6572,0.6254,0.2194,0.3568,0.4684,0.6209,0.7919,0.7707
RandomForestRegressor,0.2295,0.3605,0.4791,0.6214,0.7885,0.7642,0.2114,0.3572,0.4597,0.6348,0.7971,0.784
GradientBoostingRegressor,0.2384,0.3755,0.4883,0.6067,0.7818,0.7577,0.2112,0.3618,0.4596,0.635,0.802,0.7832
AdaBoostRegressor,0.3806,0.5237,0.6169,0.3723,0.6352,0.6266,0.3568,0.514,0.5973,0.3836,0.6627,0.6505
XGBRegressor,0.2432,0.362,0.4932,0.5989,0.7788,0.7508,0.1991,0.3368,0.4462,0.6559,0.8103,0.8004
ExtraTreesRegressor,0.2105,0.3363,0.4588,0.6528,0.8083,0.7892,0.1906,0.332,0.4366,0.6706,0.819,0.8062
LinearRegression,0.4069,0.4678,0.6379,0.3288,0.6272,0.6554,0.3021,0.4396,0.5496,0.4781,0.6989,0.6632
KNeighborsRegressor,0.2988,0.4014,0.5467,0.5071,0.7264,0.6973,0.2498,0.3884,0.4998,0.5684,0.7597,0.7308
SVR,0.3347,0.4373,0.5785,0.4481,0.6767,0.6255,0.2868,0.4216,0.5355,0.5045,0.7214,0.6783


In [14]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.660849906792764, -7.150689952470668, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.513129634348976, -6.290504310876285, -6.4...","[-6.613891696404025, -6.2261278494400845, -6.6...","[0.1583715028637307, 0.0926538828681342, 0.135..."
1,DecisionTreeRegressor,"[-7.29, -6.89, -6.89, -7.22, -6.06, -7.05, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -5.66, -6.21, -5.92, -5.89, -7.0, -6....","[-6.5920000000000005, -6.0840000000000005, -6....","[0.7773390508651937, 0.8313025923212318, 0.244..."
2,RandomForestRegressor,"[-6.5504000000000016, -7.322599999999995, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.3503, -6.385007574910003, -6.394899999999...","[-6.540620000000001, -6.290877811484002, -6.51...","[0.14555773287599888, 0.0755525022931809, 0.09..."
3,GradientBoostingRegressor,"[-6.96211618490272, -6.982684054388686, -6.988...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.410798139877397, -5.8578090633312, -6.469...","[-6.628209688153092, -5.9444399565054145, -6.6...","[0.33145045661917377, 0.17018749718372908, 0.1..."
4,AdaBoostRegressor,"[-6.658108108108108, -6.702832117056246, -6.70...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.658108108108108, -6.417762160116755, -6.3...","[-6.641392384429092, -6.386414157654446, -6.38...","[0.08840149587533466, 0.12068912973441566, 0.1..."
5,XGBRegressor,"[-6.977788, -7.398928, -7.1102576, -6.466795, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.783427, -6.0749264, -6.700874, -5.927315,...","[-6.552813, -6.1289, -6.7890296, -5.9610734, -...","[0.32500666, 0.06950474, 0.3498317, 0.08271185..."
6,ExtraTreesRegressor,"[-6.795000000000001, -7.337099999999997, -7.05...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.463399999999998, -6.148000000000001, -6.5...","[-6.511220000000003, -6.364604119982001, -6.61...","[0.09766517086454245, 0.1346859716402307, 0.06..."
7,LinearRegression,"[-5.512934464531922, -7.162718360308874, -6.92...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.935004814572762, -5.086047229216421, -5.3...","[-5.977284123113788, -5.447096455262319, -5.37...","[0.11307984217940713, 0.18620191030317434, 0.1..."
8,KNeighborsRegressor,"[-6.28, -7.09, -7.3500000000000005, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.96, -6.109999999999999, -5.8933333...","[-6.552, -6.0373333333333346, -6.3953333333333...","[0.22816757974007726, 0.03866666666666703, 0.2..."
9,SVR,"[-5.729236958621189, -6.952574541176151, -6.95...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.860443644984318, -6.091046049195703, -5.4...","[-5.983526969756811, -6.278955373677448, -5.46...","[0.11393089962857657, 0.11265982217257632, 0.0..."


In [15]:
df_AtomPairs2DCount_fp.to_csv('results/Fingerprints/Results_AtomPairs2D_Count_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_df_AtomPairs2D_Count_fp_Caco2.csv')

In [16]:
#EState fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/EState_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/EState_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_estate_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_estate_fp

X_train shape:  (1008, 79)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 79)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 33
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 11
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5095,0.5764,0.7138,0.1598,0.4014,0.3745,0.4333,0.5332,0.6583,0.2513,0.504,0.4763
DecisionTreeRegressor,0.4988,0.5575,0.7063,0.1774,0.4399,0.4303,0.3734,0.4961,0.611,0.3548,0.5967,0.5685
RandomForestRegressor,0.4796,0.5536,0.6925,0.209,0.4626,0.4319,0.3734,0.4954,0.6111,0.3548,0.597,0.5715
GradientBoostingRegressor,0.4793,0.5605,0.6923,0.2095,0.4579,0.4389,0.389,0.5122,0.6237,0.3279,0.5806,0.5706
AdaBoostRegressor,0.5325,0.6188,0.7298,0.1217,0.3635,0.3477,0.4705,0.5833,0.6859,0.187,0.4789,0.4605
XGBRegressor,0.4899,0.5563,0.7,0.192,0.4514,0.4331,0.3736,0.4966,0.6112,0.3545,0.5961,0.5711
ExtraTreesRegressor,0.4988,0.5575,0.7063,0.1774,0.4399,0.4303,0.3733,0.4961,0.611,0.3549,0.5968,0.5685
LinearRegression,0.5077,0.5859,0.7126,0.1626,0.4059,0.3703,0.4512,0.5537,0.6717,0.2203,0.4697,0.4265
KNeighborsRegressor,0.5735,0.6044,0.7573,0.0541,0.3662,0.302,0.408,0.5297,0.6387,0.2951,0.5556,0.5242
SVR,0.4904,0.5364,0.7003,0.1913,0.4661,0.4416,0.3839,0.4852,0.6196,0.3366,0.5823,0.5613


In [17]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.621485448439028, -6.402550115139178, -6.40...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.621485448439028, -6.152606584729018, -6.1...","[-6.627602467447163, -6.18388245906429, -6.183...","[0.010730827452058611, 0.02726007079697141, 0...."
1,DecisionTreeRegressor,"[-6.628445692883897, -7.2700000000000005, -7.2...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.628445692883897, -6.157479674796746, -6.1...","[-6.6406685505650085, -6.182323232803335, -6.1...","[0.01442484404099163, 0.026270290443449723, 0...."
2,RandomForestRegressor,"[-6.624369982862886, -7.2367450633296775, -7.2...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.624369982862885, -6.151831556406173, -6.1...","[-6.6399102282178575, -6.183452109643852, -6.1...","[0.016040333130908618, 0.027915712391813965, 0..."
3,GradientBoostingRegressor,"[-6.593550184204047, -7.1913998664722865, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.593550184204047, -6.132261235651848, -6.1...","[-6.5864486038738335, -6.157351318469108, -6.1...","[0.016176211388206952, 0.01995280341413369, 0...."
4,AdaBoostRegressor,"[-6.519062432286461, -6.519062432286461, -6.51...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.519062432286461, -6.3371944444444415, -6....","[-6.531112823393748, -6.289451335061597, -6.28...","[0.03218704346732067, 0.05051660887176891, 0.0..."
5,XGBRegressor,"[-6.6282377, -7.2692437, -7.2692437, -6.604375...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6282377, -6.1573987, -6.1573987, -5.92214...","[-6.64017, -6.181825, -6.181825, -5.928, -5.84...","[0.01435182, 0.026233282, 0.026233282, 0.00767..."
6,ExtraTreesRegressor,"[-6.628445692883896, -7.26999999999999, -7.269...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.628445692883896, -6.1574796747967415, -6....","[-6.64066855056501, -6.182323232803331, -6.182...","[0.014424844040995036, 0.02627029044344882, 0...."
7,LinearRegression,"[-6.558445553336997, -7.263163103551602, -7.26...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.558445553336997, -6.1134965938498755, -6....","[-6.544458177881694, -6.148817934557244, -6.14...","[0.016549146399248555, 0.02185044439748011, 0...."
8,KNeighborsRegressor,"[-6.28, -7.36, -7.36, -7.036666666666666, -6.3...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.366666666666666, -6.36666666666666...","[-6.552, -6.380000000000001, -6.38000000000000...","[0.22816757974007662, 0.18086213288334022, 0.1..."
9,SVR,"[-6.7399397998909185, -7.100364415128385, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7399397998909185, -5.989935208505987, -5....","[-6.735986786768014, -5.978104955277831, -5.97...","[0.018747580529914147, 0.026375037877090203, 0..."


In [18]:
df_estate_fp.to_csv('results/Fingerprints/Results_EState_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_EState_fp_Caco2.csv')

In [19]:
#Extended fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Extended_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Extended_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_extended_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_extended_fp

X_train shape:  (1008, 1024)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1024)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 448
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1359
[LightGBM] [Info] Number of data points in the train set: 806, number of used fe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2652,0.397,0.515,0.5626,0.7506,0.7326,0.2446,0.3935,0.4946,0.5773,0.763,0.7375
DecisionTreeRegressor,0.4233,0.4664,0.6506,0.3019,0.6491,0.6343,0.26,0.3806,0.5099,0.5507,0.7465,0.7227
RandomForestRegressor,0.2656,0.3815,0.5154,0.5619,0.7505,0.7327,0.2309,0.3787,0.4805,0.601,0.7764,0.753
GradientBoostingRegressor,0.2997,0.4343,0.5474,0.5058,0.7159,0.6985,0.2563,0.416,0.5063,0.5571,0.759,0.741
AdaBoostRegressor,0.4683,0.5906,0.6843,0.2276,0.5047,0.4482,0.4408,0.575,0.6639,0.2384,0.5404,0.5457
XGBRegressor,0.2794,0.3952,0.5286,0.5391,0.746,0.7378,0.2421,0.3849,0.4921,0.5816,0.763,0.7419
ExtraTreesRegressor,0.3966,0.4562,0.6298,0.3458,0.6635,0.6484,0.2605,0.3809,0.5104,0.5499,0.7457,0.7226
LinearRegression,0.5962,0.534,0.7721,0.0168,0.5331,0.6085,0.3435,0.4521,0.5861,0.4065,0.6636,0.6657
KNeighborsRegressor,0.3373,0.4258,0.5807,0.4438,0.6846,0.6661,0.2775,0.4055,0.5268,0.5205,0.7266,0.6933
SVR,0.2975,0.4147,0.5455,0.5093,0.7158,0.6956,0.2576,0.4021,0.5075,0.5549,0.7508,0.7158


In [20]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.770348591291825, -7.235340709404083, -7.23...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.964382428309627, -6.48108753344209, -6.05...","[-6.956182435624673, -6.43262036612491, -6.186...","[0.10471999713708804, 0.07662945339263584, 0.0..."
1,DecisionTreeRegressor,"[-6.0, -7.343333333333334, -7.343333333333334,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.57, -7.053333333333334, -7.85, -5.92, -5....","[-6.866, -6.405066666666667, -7.22199999999999...","[0.6598666531959319, 0.48066206192893773, 0.51..."
2,RandomForestRegressor,"[-6.779997619047621, -7.326431666666666, -7.32...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.686846666666669, -6.768146666666667, -6.6...","[-6.6354151904761896, -6.519595857142858, -6.4...","[0.14354574846701876, 0.12943743304530894, 0.1..."
3,GradientBoostingRegressor,"[-6.543707238166324, -7.127504319507111, -7.12...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.79073759637364, -6.400294977132745, -6.02...","[-6.84821433144621, -6.539407719203989, -6.161...","[0.053484203527167276, 0.09733019108091313, 0...."
4,AdaBoostRegressor,"[-6.544555841630814, -6.584116331055549, -6.58...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.544555841630814, -6.42697505102408, -6.45...","[-6.5781218143571705, -6.40684014395319, -6.45...","[0.04006628444871538, 0.02356757252271974, 0.0..."
5,XGBRegressor,"[-7.2587557, -7.3070564, -7.3070564, -5.966183...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4305882, -6.5370893, -6.811117, -5.913132...","[-6.3178124, -6.5241547, -6.663755, -5.954848,...","[0.27444836, 0.27218875, 0.09860516, 0.0745708..."
6,ExtraTreesRegressor,"[-6.148199999999998, -7.3433333333333515, -7.3...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.396200000000008, -7.041666666666658, -7.3...","[-6.858220000000003, -6.490721333333335, -6.94...","[0.6053796243680444, 0.42512176352245656, 0.53..."
7,LinearRegression,"[-6.985217372869195, -7.172533771923354, -7.17...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.806845461743782, -7.286979732122333, -6.3...","[-6.348833617065793, -7.714265512737455, -6.32...","[0.2869467826191367, 0.25997068438923565, 0.10..."
8,KNeighborsRegressor,"[-6.72, -7.343333333333334, -7.343333333333334...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.19, -5.653333333333333, -5.58333333333333...","[-6.025333333333333, -5.824, -5.64666666666666...","[0.09349034412410966, 0.11825866188618528, 0.0..."
9,SVR,"[-6.507100859843202, -7.100339271609559, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.633717951911747, -6.1935977559323625, -5....","[-6.72256671804724, -6.355402519483421, -5.824...","[0.05344712814026276, 0.10050479348961944, 0.0..."


In [21]:
df_extended_fp.to_csv('results/Fingerprints/Results_Extended_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Extended_fp_Caco2.csv')

In [22]:
#Fingerprinter fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Fingerprinter_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Fingerprinter_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_fingerprinter_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_fingerprinter_fp

X_train shape:  (1008, 1024)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1024)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1320
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 440
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1320
[LightGBM] [Info] Number of data points in the train set: 806, number of used fe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2846,0.4083,0.5334,0.5307,0.7285,0.7112,0.2623,0.401,0.5122,0.5467,0.7408,0.7202
DecisionTreeRegressor,0.415,0.4628,0.6442,0.3155,0.644,0.6227,0.3179,0.423,0.5638,0.4508,0.6876,0.6541
RandomForestRegressor,0.2946,0.4046,0.5428,0.5141,0.7186,0.6982,0.2642,0.3972,0.514,0.5435,0.7379,0.712
GradientBoostingRegressor,0.3096,0.4421,0.5565,0.4893,0.702,0.683,0.2752,0.4282,0.5246,0.5245,0.7336,0.7074
AdaBoostRegressor,0.455,0.5764,0.6745,0.2496,0.5138,0.4588,0.4238,0.5608,0.651,0.2676,0.5533,0.5323
XGBRegressor,0.2921,0.4028,0.5404,0.5183,0.7316,0.72,0.2656,0.3883,0.5154,0.5411,0.7364,0.7145
ExtraTreesRegressor,0.3994,0.4577,0.632,0.3412,0.6502,0.627,0.3172,0.4224,0.5632,0.4518,0.686,0.6526
LinearRegression,0.5522,0.5207,0.7431,0.0893,0.5418,0.6005,0.3793,0.4604,0.6159,0.3446,0.6385,0.6605
KNeighborsRegressor,0.3766,0.4436,0.6137,0.3789,0.6509,0.6189,0.321,0.4204,0.5666,0.4453,0.6804,0.6495
SVR,0.2935,0.4149,0.5418,0.5159,0.721,0.7001,0.2605,0.4039,0.5104,0.5498,0.7469,0.7092


In [23]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.680506226144483, -7.130131326944558, -7.13...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.49706645630558, -6.35778896066796, -5.703...","[-6.532137437973984, -6.492191824753661, -5.72...","[0.026925430464688278, 0.0711843375052628, 0.0..."
1,DecisionTreeRegressor,"[-6.46, -7.5, -7.5, -5.865, -6.055, -5.92, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.74, -5.89, -5.51, -5.92, -5.92, -6.89, -5...","[-5.946, -6.808, -5.742, -5.832000000000001, -...","[0.24029981273400955, 0.4597999565028253, 0.11..."
2,RandomForestRegressor,"[-6.532885952380948, -7.4407, -7.4407, -5.9115...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.294466666666669, -6.584527777777777, -5.7...","[-6.199302333333333, -6.696627473507936, -5.74...","[0.07711103973131597, 0.09214556735702038, 0.0..."
3,GradientBoostingRegressor,"[-6.532358718252405, -7.017494264664842, -7.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.726795822569538, -6.507230151193977, -5.8...","[-6.654599934689786, -6.543575431944015, -5.94...","[0.046459671578161485, 0.11334471903205467, 0...."
4,AdaBoostRegressor,"[-6.59626229579909, -6.508575764733432, -6.508...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.59626229579909, -6.432949769885257, -6.29...","[-6.584103056538598, -6.319322022978568, -6.39...","[0.012822818793563104, 0.11756745015983344, 0...."
5,XGBRegressor,"[-6.66208, -7.4355793, -7.4355793, -6.075642, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.964388, -6.548087, -5.610661, -5.8615117,...","[-5.615836, -6.748742, -5.6446466, -5.7618446,...","[0.25598687, 0.17892, 0.03127731, 0.2233714, 0..."
6,ExtraTreesRegressor,"[-6.46, -7.5, -7.5, -5.786416666666671, -6.054...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.7400000000000055, -5.876599999999992, -5....","[-5.946000000000003, -6.765094666666658, -5.74...","[0.24029981273401177, 0.44459836689658583, 0.1..."
7,LinearRegression,"[-6.365324055505238, -7.163024774735796, -7.16...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.577842708392317, -6.757608942518033, -5.7...","[-6.422781453696741, -7.259410536223209, -5.75...","[0.1481253213214988, 0.315696449254561, 0.0166..."
8,KNeighborsRegressor,"[-6.489999999999999, -7.343333333333334, -7.34...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.213333333333334, -5.566666666666666, -5.6...","[-6.124666666666667, -5.825333333333334, -5.71...","[0.0939172212346833, 0.1966141172731783, 0.089..."
9,SVR,"[-6.488886093762809, -7.086116727719794, -7.08...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.647394581819928, -6.323214685829233, -5.7...","[-6.722155154286865, -6.428816297902779, -5.75...","[0.062175259315972314, 0.08665486193432681, 0...."


In [24]:
df_fingerprinter_fp.to_csv('results/Fingerprints/Results_Fingerprinter_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Fingerprinter_fp_Caco2.csv')

In [25]:
#GraphOnly fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Graphonly_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Graphonly_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_graph_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_graph_fp

X_train shape:  (1008, 1024)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1024)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 597
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 199
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 806, number of used feat

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3412,0.4549,0.5841,0.4373,0.6613,0.6351,0.2813,0.4265,0.5304,0.5139,0.7204,0.6893
DecisionTreeRegressor,0.4174,0.4685,0.6461,0.3116,0.6183,0.6033,0.3223,0.4342,0.5677,0.443,0.676,0.6421
RandomForestRegressor,0.3331,0.4352,0.5772,0.4506,0.6759,0.6441,0.2953,0.4258,0.5435,0.4897,0.7006,0.6672
GradientBoostingRegressor,0.344,0.4669,0.5865,0.4327,0.6589,0.6213,0.294,0.4452,0.5422,0.4919,0.7073,0.677
AdaBoostRegressor,0.4696,0.5882,0.6853,0.2256,0.4851,0.4093,0.4259,0.567,0.6526,0.2641,0.5439,0.5734
XGBRegressor,0.3537,0.4484,0.5947,0.4166,0.6649,0.6386,0.2981,0.4186,0.546,0.485,0.702,0.6794
ExtraTreesRegressor,0.4049,0.4662,0.6364,0.3321,0.6256,0.6095,0.3188,0.433,0.5647,0.4491,0.6795,0.6446
LinearRegression,0.4256,0.4959,0.6524,0.2981,0.598,0.6023,0.3351,0.448,0.5788,0.421,0.6692,0.6872
KNeighborsRegressor,0.4307,0.4823,0.6563,0.2897,0.5875,0.5647,0.3414,0.4431,0.5843,0.4101,0.652,0.6238
SVR,0.343,0.451,0.5856,0.4344,0.6591,0.6192,0.3064,0.4349,0.5536,0.4705,0.6859,0.6498


In [26]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.404611666143465, -7.167090837933117, -7.16...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.703118866496205, -6.177101701028665, -5.9...","[-6.691565785071961, -6.338222066733984, -5.97...","[0.06706140984693627, 0.125111424730029, 0.027..."
1,DecisionTreeRegressor,"[-6.417894736842103, -7.343333333333334, -7.34...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.246666666666666, -7.07, -6.00583333333333...","[-5.960666666666667, -6.942, -5.95704444444444...","[0.23351802214532957, 0.658009118477852, 0.047..."
2,RandomForestRegressor,"[-6.413662925061174, -7.318431666666665, -7.31...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.730823964646464, -6.628559090909092, -5.9...","[-6.320944121496147, -6.662687278499281, -5.94...","[0.2169281163336365, 0.06615535554699849, 0.04..."
3,GradientBoostingRegressor,"[-6.470189150429321, -7.030167260625403, -7.03...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.831458348368699, -6.160526580411803, -5.9...","[-6.621725569298126, -6.249369484774296, -6.00...","[0.1528684941885495, 0.07194557892633802, 0.02..."
4,AdaBoostRegressor,"[-6.454219976787326, -6.598434416146199, -6.59...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.598434416146199, -6.189166666666667, -6.3...","[-6.534801352657945, -6.282745796483319, -6.39...","[0.07572520900621312, 0.07765620675204678, 0.0..."
5,XGBRegressor,"[-6.432741, -7.324975, -7.324975, -6.0037594, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6080503, -6.8581815, -5.906696, -5.911326...","[-6.037893, -7.0004435, -5.90578, -5.853161, -...","[0.28987363, 0.10459778, 0.03276351, 0.1168502..."
6,ExtraTreesRegressor,"[-6.417894736842106, -7.3433333333333515, -7.3...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.246666666666669, -7.093600000000012, -6.0...","[-5.960666666666663, -6.985060000000004, -5.95...","[0.23351802214533438, 0.29588021630383077, 0.0..."
7,LinearRegression,"[-6.410985342361564, -7.149160586334667, -7.14...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6997979629913145, -6.780917546803412, -5....","[-6.718076591870824, -7.021451814510053, -5.95...","[0.04788434132007717, 0.15086769914059933, 0.0..."
8,KNeighborsRegressor,"[-6.28, -7.343333333333334, -7.343333333333334...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.913333333333334, -5.653333333333333, -6.6...","[-6.520666666666668, -5.824, -6.502, -5.891333...","[0.24435311425157757, 0.11825866188618528, 0.2..."
9,SVR,"[-6.518875069097527, -7.0999688787287845, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.887352135157818, -6.10112393854479, -5.86...","[-6.9107498349057455, -6.3041930946141935, -5....","[0.03409662579640579, 0.10322547355527936, 0.0..."


In [27]:
df_graph_fp.to_csv('results/Fingerprints/Results_Graphonly_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Graphonly_fp_Caco2.csv')

In [28]:
#KlekotaRoth fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/KlekotaRoth_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/KlekotaRoth_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRoth_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRoth_fp

X_train shape:  (1008, 4860)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 4860)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 597
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 199
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 600
[LightGBM] [Info] Number of data points in the train set: 806, number of used feat

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2603,0.3917,0.5102,0.5708,0.7556,0.7367,0.2261,0.374,0.4755,0.6094,0.7821,0.761
DecisionTreeRegressor,0.3944,0.4512,0.628,0.3496,0.6744,0.6534,0.2416,0.3713,0.4915,0.5825,0.7696,0.7426
RandomForestRegressor,0.2429,0.3675,0.4929,0.5994,0.7743,0.7606,0.2188,0.3606,0.4678,0.6219,0.7895,0.7698
GradientBoostingRegressor,0.261,0.396,0.5109,0.5695,0.757,0.7364,0.23,0.386,0.4795,0.6026,0.7799,0.7532
AdaBoostRegressor,0.3828,0.5184,0.6187,0.3687,0.6338,0.5838,0.3587,0.5089,0.5989,0.3802,0.6526,0.6087
XGBRegressor,0.2413,0.3695,0.4912,0.602,0.7801,0.7695,0.2118,0.3512,0.4602,0.634,0.7973,0.7786
ExtraTreesRegressor,0.3581,0.4311,0.5984,0.4095,0.6954,0.6698,0.2478,0.3736,0.4978,0.5718,0.7644,0.7411
LinearRegression,0.3739,0.4493,0.6115,0.3833,0.6816,0.7081,0.2229,0.3706,0.4721,0.6149,0.7877,0.7711
KNeighborsRegressor,0.2769,0.3835,0.5262,0.5434,0.7437,0.7234,0.2636,0.393,0.5134,0.5446,0.7475,0.7259
SVR,0.2575,0.3875,0.5074,0.5754,0.7597,0.7487,0.2098,0.36,0.458,0.6375,0.7999,0.7697


In [29]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.976915367415929, -7.125181447563859, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.58351842265396, -6.038995083580175, -5.92...","[-6.436503290596164, -6.039659909405022, -5.86...","[0.08683819999769471, 0.11007564482300597, 0.0..."
1,DecisionTreeRegressor,"[-6.55, -7.0, -7.05, -5.68, -6.11, -5.8, -6.89...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.47, -6.32, -6.82, -5.92, -5.92, -6.89, -5...","[-6.837999999999999, -6.040000000000001, -6.20...","[0.350679340708859, 0.37751821148124765, 0.506..."
2,RandomForestRegressor,"[-5.9836049999999945, -7.221724999999997, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.621600000000002, -6.225333333333334, -6.0...","[-6.496332, -6.139579145734335, -5.93566100000...","[0.14242084712569572, 0.06808930821972134, 0.0..."
3,GradientBoostingRegressor,"[-6.256623852154684, -6.914060647207159, -6.90...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.555338694628326, -6.021943890312423, -6.0...","[-6.482758679070825, -6.061837094331477, -6.02...","[0.04560694982212507, 0.06696656395494387, 0.0..."
4,AdaBoostRegressor,"[-6.3537042985056305, -6.472091972226149, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.462351129716583, -6.241143099760917, -6.2...","[-6.3141796952783995, -6.2284522872837185, -6....","[0.0844793993036661, 0.03940671355021276, 0.03..."
5,XGBRegressor,"[-6.0451365, -7.141987, -7.1366854, -6.1370687...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4093165, -5.9455776, -5.9564915, -5.84964...","[-6.338481, -5.8622155, -5.9129, -5.864321, -6...","[0.18744248, 0.1536118, 0.08771315, 0.07254211..."
6,ExtraTreesRegressor,"[-6.541299999999995, -7.190549999999997, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.298000000000014, -6.410000000000009, -6.8...","[-6.812580000000006, -6.110983749632007, -6.21...","[0.27251908116681234, 0.372424435574634, 0.519..."
7,LinearRegression,"[-6.28337656936606, -6.780279247358828, -7.294...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.681078623306621, -6.009952042351362, -5.9...","[-6.615055603739238, -5.639575624964111, -5.99...","[0.06662177975690593, 0.2066887410206464, 0.06..."
8,KNeighborsRegressor,"[-6.443333333333334, -7.343333333333334, -7.35...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.416666666666667, -5.653333333333333, -6.0...","[-6.427333333333334, -5.824, -5.894, -5.681999...","[0.15751402053997143, 0.11825866188618528, 0.0..."
9,SVR,"[-6.3605597545670225, -7.1059892156157165, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7217591296583805, -5.870416079065387, -5....","[-6.655809512527883, -6.002629110182968, -5.87...","[0.05162271879487273, 0.07110492462147618, 0.0..."


In [30]:
df_KlekotaRoth_fp.to_csv('results/Fingerprints/Results_KlekotaRoth_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_KlekotaRoth_fp_Caco2.csv')

In [31]:
#KlekotaRoth Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/KlekotaRothCount_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/KlekotaRothCount_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRothCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRothCount_fp

X_train shape:  (1008, 4860)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 4860)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2362
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 260
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2372
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 263
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2091,0.3398,0.4573,0.6551,0.8095,0.7918,0.175,0.3158,0.4183,0.6977,0.8359,0.8227
DecisionTreeRegressor,0.3518,0.4294,0.5931,0.4199,0.7033,0.682,0.2145,0.3385,0.4632,0.6293,0.7969,0.7864
RandomForestRegressor,0.2059,0.3321,0.4537,0.6604,0.8131,0.7987,0.1837,0.3202,0.4286,0.6826,0.827,0.8147
GradientBoostingRegressor,0.2139,0.3506,0.4625,0.6472,0.8074,0.7908,0.1901,0.3474,0.436,0.6715,0.8251,0.8021
AdaBoostRegressor,0.3666,0.5153,0.6055,0.3954,0.6567,0.5862,0.3328,0.4924,0.5769,0.4249,0.6984,0.6551
XGBRegressor,0.2137,0.3339,0.4623,0.6475,0.8093,0.8026,0.1799,0.3065,0.4241,0.6892,0.8306,0.8212
ExtraTreesRegressor,0.2023,0.3275,0.4497,0.6664,0.8175,0.8083,0.1907,0.3172,0.4367,0.6705,0.8192,0.8118
LinearRegression,0.3019,0.3989,0.5495,0.5021,0.7355,0.758,0.2803,0.3776,0.5295,0.5156,0.7438,0.7882
KNeighborsRegressor,0.2408,0.3533,0.4907,0.6028,0.7806,0.7584,0.2618,0.3798,0.5116,0.5477,0.7503,0.7313
SVR,0.2178,0.346,0.4667,0.6408,0.8024,0.7893,0.1888,0.3295,0.4345,0.6738,0.8225,0.7929


In [32]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.1503741210052345, -7.365654149717542, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.860539596579732, -6.466785838950487, -5.7...","[-6.769548101616424, -6.224306877464776, -5.84...","[0.0949236722259972, 0.14622588654270394, 0.11..."
1,DecisionTreeRegressor,"[-5.64, -8.0, -7.0, -7.03, -6.13, -5.96, -6.89...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.0, -7.49, -6.82, -5.92, -5.88, -6.89, -5....","[-6.638, -6.674000000000001, -6.144, -5.890000...","[0.5140972670613997, 0.6121307049968986, 0.398..."
2,RandomForestRegressor,"[-6.054245179673331, -7.462699999999997, -6.92...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.541328513006665, -6.430252380952383, -6.0...","[-6.514486141809336, -6.3084146428571435, -6.1...","[0.040032540191517946, 0.08465174296972229, 0...."
3,GradientBoostingRegressor,"[-6.480757897075449, -7.258357991723996, -6.94...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.765330395026985, -6.396504752292786, -6.0...","[-6.775328158796853, -6.314340653562237, -6.16...","[0.1127068657502677, 0.084146032104977, 0.1095..."
4,AdaBoostRegressor,"[-6.49292280354958, -6.9074796747967575, -6.32...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.431676018730639, -6.317532247055143, -6.1...","[-6.319037622617292, -6.299602649185561, -6.22...","[0.09871088966248892, 0.059440711081254566, 0...."
5,XGBRegressor,"[-6.46839, -7.7039385, -7.307966, -6.284219, -...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.266291, -6.400317, -6.1285553, -5.9148207...","[-7.3358207, -6.1145997, -6.1450605, -5.932498...","[0.15792708, 0.29897732, 0.055645257, 0.040789..."
6,ExtraTreesRegressor,"[-6.044324999999995, -7.896599999999999, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.665950000000002, -6.276599999999999, -6.2...","[-6.608581863912003, -6.35963, -6.336400000000...","[0.10774921542444775, 0.1045436157782967, 0.19..."
7,LinearRegression,"[-5.968425773331034, -7.198101742291227, -7.08...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.385737931525088, -5.876602710109797, -5.3...","[-6.4301746620744, -5.82242820717264, -5.43520...","[0.0670068002715355, 0.10814189800202613, 0.11..."
8,KNeighborsRegressor,"[-6.28, -7.343333333333334, -6.98, -7.03666666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.0633333333333335, -5.63, -6.2333333333333...","[-6.378666666666667, -5.824333333333333, -6.13...","[0.18820319987833475, 0.10870244809672953, 0.1..."
9,SVR,"[-5.980659971657707, -7.167153963380204, -7.14...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.392093750099015, -6.147144718844885, -5.7...","[-6.31239682556461, -6.1889208271487375, -5.74...","[0.10481967297472793, 0.03198092688063068, 0.0..."


In [33]:
df_KlekotaRothCount_fp.to_csv('results/Fingerprints/Results_KlekotaRoth_Count_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_KlekotaRoth_Count_fp_Caco2.csv')

In [34]:
#MACCS fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/MACCS_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/MACCS_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_MACCS_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_MACCS_fp

X_train shape:  (1008, 166)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 166)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 144
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 48
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 806, number of used feature



0.3944777369857846


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3095,0.4374,0.5563,0.4895,0.6998,0.6525,0.2694,0.4136,0.519,0.5345,0.7348,0.7036
DecisionTreeRegressor,0.3729,0.4589,0.6106,0.385,0.6541,0.624,0.2633,0.397,0.5132,0.5449,0.7435,0.712
RandomForestRegressor,0.3002,0.424,0.5479,0.5049,0.7126,0.6764,0.246,0.3892,0.496,0.5748,0.7589,0.7308
GradientBoostingRegressor,0.326,0.4581,0.5709,0.4624,0.6826,0.6414,0.2714,0.4299,0.5209,0.5311,0.7374,0.7039
AdaBoostRegressor,0.444,0.5702,0.6663,0.2677,0.525,0.4726,0.4121,0.5576,0.642,0.2879,0.5633,0.5483
XGBRegressor,0.2898,0.4143,0.5383,0.522,0.7273,0.6922,0.2585,0.3985,0.5084,0.5534,0.7483,0.7212
ExtraTreesRegressor,0.3469,0.4457,0.589,0.4279,0.6756,0.6411,0.2619,0.3958,0.5118,0.5474,0.7447,0.7092
LinearRegression,0.3659,0.4826,0.6049,0.3965,0.6365,0.596,0.2913,0.437,0.5397,0.4967,0.7068,0.6865
KNeighborsRegressor,0.3527,0.4449,0.5939,0.4183,0.6689,0.6313,0.2952,0.4224,0.5433,0.4899,0.7122,0.682
SVR,0.3137,0.431,0.5601,0.4825,0.6951,0.6575,0.2464,0.3879,0.4964,0.5742,0.7589,0.7319


In [35]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.188545464265373, -6.969422649674304, -6.55...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.704283054476951, -6.385868559008707, -6.1...","[-6.697150201213555, -6.238022044167134, -6.17...","[0.039268063792877526, 0.0957931899994108, 0.0..."
1,DecisionTreeRegressor,"[-6.185681818181819, -7.515000000000001, -7.02...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6536, -6.436249999999999, -6.058181818181...","[-6.6575224078669235, -6.379607142857144, -6.2...","[0.04386882198770176, 0.06046982296812104, 0.0..."
2,RandomForestRegressor,"[-6.16649811554382, -7.339771666666662, -7.032...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.647336017953259, -6.244143618187369, -6.0...","[-6.661830442903666, -6.31401890997891, -6.215...","[0.04991204260679366, 0.08098456068449737, 0.0..."
3,GradientBoostingRegressor,"[-6.437643660302735, -7.316651723882255, -6.87...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.5686023176908455, -6.221777836706621, -6....","[-6.591152629346442, -6.20555109809563, -6.127...","[0.022396867022547423, 0.051240735542770574, 0..."
4,AdaBoostRegressor,"[-6.52753378227803, -6.589085753791073, -6.527...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.537597874228514, -6.3381683100865125, -6....","[-6.537667228577945, -6.28149046195069, -6.403...","[0.04135461336761183, 0.04046992874484107, 0.0..."
5,XGBRegressor,"[-6.223743, -7.51151, -7.0348263, -6.3634024, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.654967, -6.3212624, -6.066043, -5.9111743...","[-6.657028, -6.363036, -6.186167, -5.8172836, ...","[0.047189415, 0.06255388, 0.070041046, 0.17970..."
6,ExtraTreesRegressor,"[-6.18568181818182, -7.51499999999999, -7.0249...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.653599999999992, -6.436249999999984, -6.0...","[-6.657522407866921, -6.379607142857137, -6.20...","[0.043868821987703206, 0.06046982296812034, 0...."
7,LinearRegression,"[-6.4316346629661565, -7.219587540770002, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.565115765867795, -6.25932714520153, -6.14...","[-6.5600101096126355, -6.29831375144676, -6.19...","[0.01461057844609525, 0.06147685934297204, 0.0..."
8,KNeighborsRegressor,"[-5.913333333333333, -7.36, -7.350000000000000...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.586666666666666, -5.828333333333333, -5.8...","[-6.382666666666667, -5.843666666666666, -6.30...","[0.16585669852147802, 0.023247461032216775, 0...."
9,SVR,"[-6.424375808498325, -7.12998784857563, -7.100...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.622769735375762, -6.066405978130842, -6.0...","[-6.623236690325356, -6.1635729309298855, -6.0...","[0.024309665635105955, 0.06365788142701458, 0...."


In [36]:
df_MACCS_fp.to_csv('results/Fingerprints/Results_MACCS_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_MACCS_fp_Caco2.csv')

In [37]:
#PubChem fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/PubChem_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/PubChem_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_PubChem_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_PubChem_fp

X_train shape:  (1008, 881)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 881)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 609
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 203
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 806, number of used featur

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3327,0.4492,0.5768,0.4512,0.6718,0.6197,0.2812,0.4209,0.5303,0.5141,0.7202,0.7008
DecisionTreeRegressor,0.3257,0.4319,0.5707,0.4628,0.6951,0.6547,0.281,0.4016,0.5301,0.5145,0.7202,0.7042
RandomForestRegressor,0.2914,0.4093,0.5399,0.5193,0.7217,0.6793,0.272,0.401,0.5216,0.5299,0.7281,0.7064
GradientBoostingRegressor,0.3276,0.457,0.5724,0.4597,0.6813,0.6379,0.2829,0.4327,0.5319,0.5112,0.7232,0.6909
AdaBoostRegressor,0.4622,0.5834,0.6799,0.2377,0.4973,0.4583,0.4269,0.5655,0.6533,0.2624,0.5384,0.5688
XGBRegressor,0.2898,0.4102,0.5384,0.522,0.7263,0.6914,0.2664,0.3975,0.5162,0.5396,0.7359,0.7105
ExtraTreesRegressor,0.31,0.4217,0.5567,0.4888,0.7087,0.6677,0.2821,0.4021,0.5311,0.5126,0.7189,0.7
LinearRegression,0.3692,0.4807,0.6076,0.3911,0.6364,0.6085,0.3285,0.4666,0.5732,0.4323,0.6648,0.6474
KNeighborsRegressor,0.3693,0.4558,0.6077,0.391,0.6508,0.605,0.3266,0.4336,0.5715,0.4356,0.6728,0.6332
SVR,0.321,0.4341,0.5666,0.4706,0.6864,0.6438,0.2692,0.4035,0.5188,0.5349,0.7327,0.6976


In [38]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.910858710601152, -6.934339734306734, -6.93...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.910858710601152, -5.952612994610728, -6.6...","[-6.908055709489693, -5.9171857314616805, -6.7...","[0.041006627380878825, 0.034258140573544905, 0..."
1,DecisionTreeRegressor,"[-6.924285714285714, -7.025, -7.025, -5.865, -...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.924285714285714, -6.484761904761903, -7.3...","[-6.875702380952381, -6.744952380952381, -7.35...","[0.051246044037354214, 0.4634272013758544, 0.1..."
2,RandomForestRegressor,"[-6.898882881840382, -7.0582333333333285, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.898882881840381, -5.992494499696711, -7.1...","[-6.8673918207397175, -6.085251867201086, -7.2...","[0.04902811767270143, 0.08066420737848456, 0.1..."
3,GradientBoostingRegressor,"[-6.8077006280115295, -7.020118036857051, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.8077006280115295, -6.161304734015358, -7....","[-6.797144521047651, -6.376724864896322, -7.04...","[0.03667135360830248, 0.13610798990940054, 0.1..."
4,AdaBoostRegressor,"[-6.546456056055551, -6.6392750296023735, -6.6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.546456056055551, -6.397331154684089, -6.4...","[-6.498520100493084, -6.361655273461963, -6.40...","[0.04901008396458998, 0.019819604654535642, 0...."
5,XGBRegressor,"[-6.9275346, -7.014682, -7.014682, -5.957666, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.9275346, -5.91576, -7.324376, -5.87471, -...","[-6.8802247, -6.317999, -7.342369, -5.816217, ...","[0.051193148, 0.2209877, 0.10006126, 0.1371919..."
6,ExtraTreesRegressor,"[-6.9242857142857055, -7.024999999999988, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.9242857142857055, -6.374685714285711, -7....","[-6.875702380952383, -6.332302597402597, -7.35...","[0.05124604403735177, 0.10317871030766224, 0.1..."
7,LinearRegression,"[-7.1164529924459625, -7.105537871009618, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.1164529924459625, -6.601863658414984, -6....","[-7.120943860164049, -6.8081681362279145, -6.7...","[0.05148252075968824, 0.2109178183703982, 0.05..."
8,KNeighborsRegressor,"[-6.739999999999999, -7.3500000000000005, -7.3...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.739999999999999, -5.821666666666666, -6.9...","[-6.885333333333333, -6.019000000000001, -7.21...","[0.11030462869304807, 0.09921693403850021, 0.1..."
9,SVR,"[-6.9876343085645845, -7.0218099326631425, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.9876343085645845, -5.9363787165416975, -6...","[-7.025664051226244, -6.182202076617092, -6.81...","[0.07183784141829634, 0.12671308812485393, 0.0..."


In [39]:
df_PubChem_fp.to_csv('results/Fingerprints/Results_PubChem_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_PubChem_fp_Caco2.csv')

In [40]:
#Substructure fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Substructure_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Substructure_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_Substructure_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_Substructure_fp

X_train shape:  (1008, 307)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 307)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 16
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features:



0.26408363914989785


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4684,0.5421,0.6844,0.2275,0.4771,0.4532,0.395,0.5016,0.6285,0.3174,0.5668,0.5559
DecisionTreeRegressor,0.4464,0.5175,0.6681,0.2638,0.5225,0.4954,0.3803,0.486,0.6166,0.3429,0.5888,0.5836
RandomForestRegressor,0.4407,0.5162,0.6638,0.2732,0.5263,0.4975,0.369,0.4776,0.6074,0.3624,0.6027,0.5966
GradientBoostingRegressor,0.4441,0.538,0.6664,0.2676,0.5179,0.4882,0.3923,0.5076,0.6263,0.3221,0.5704,0.5769
AdaBoostRegressor,0.5237,0.6071,0.7237,0.1363,0.3758,0.3686,0.4488,0.5702,0.67,0.2244,0.5174,0.492
XGBRegressor,0.4341,0.514,0.6588,0.2841,0.5382,0.51,0.3702,0.4794,0.6084,0.3603,0.6022,0.5972
ExtraTreesRegressor,0.4468,0.5174,0.6684,0.2631,0.522,0.4945,0.3804,0.4862,0.6167,0.3428,0.5887,0.5835
LinearRegression,0.4622,0.5562,0.6799,0.2377,0.4898,0.4602,0.4285,0.5345,0.6546,0.2596,0.512,0.5027
KNeighborsRegressor,0.4554,0.5398,0.6748,0.2489,0.5121,0.482,0.3922,0.4994,0.6262,0.3224,0.5741,0.5741
SVR,0.4384,0.5035,0.6622,0.2769,0.5335,0.5064,0.3884,0.482,0.6232,0.3289,0.5796,0.5888


In [41]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.701182561864496, -6.293908077148077, -6.29...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.701182561864496, -5.739583701137474, -6.0...","[-6.692768454438664, -5.7621253998362665, -6.1...","[0.019895684800163665, 0.04943782263245176, 0...."
1,DecisionTreeRegressor,"[-6.315, -7.2700000000000005, -7.2700000000000...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.315, -5.735555555555556, -6.1279279279279...","[-6.345603174603175, -5.828603968253968, -6.15...","[0.11824035129867072, 0.08109294823512306, 0.0..."
2,RandomForestRegressor,"[-6.288119685314687, -7.233434859307356, -7.23...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.288119685314684, -5.735649190462315, -6.1...","[-6.3311980967708745, -5.789387144598866, -6.1...","[0.1263360564935011, 0.05428048843832001, 0.02..."
3,GradientBoostingRegressor,"[-6.440404791868899, -7.139978642939512, -7.13...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.440404791868899, -5.770958846664279, -6.1...","[-6.459697999841301, -5.82763651666952, -6.152...","[0.09553907802356357, 0.07230481789528269, 0.0..."
4,AdaBoostRegressor,"[-6.561225352112669, -6.561225352112669, -6.56...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.561225352112669, -6.095304568527916, -6.2...","[-6.553922544770413, -6.1566506719329395, -6.2...","[0.02711733111380686, 0.07885141367432297, 0.0..."
5,XGBRegressor,"[-6.321786, -7.26646, -7.26646, -6.7488, -5.73...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.321786, -5.4703407, -6.1291285, -5.932962...","[-6.3541265, -5.654248, -6.1572194, -6.053292,...","[0.12037042, 0.10360348, 0.018817937, 0.232587..."
6,ExtraTreesRegressor,"[-6.315000000000009, -7.26999999999999, -7.269...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.315000000000009, -5.735555555555557, -6.1...","[-6.34560317460318, -5.828603968253967, -6.156...","[0.11824035129867044, 0.08109294823511959, 0.0..."
7,LinearRegression,"[-6.432668397106541, -7.221348630048239, -7.22...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.432668397106541, -5.798749100413431, -6.1...","[-6.434193756947525, -5.803154005847483, -6.18...","[0.07636337283525031, 0.0510713691524852, 0.02..."
8,KNeighborsRegressor,"[-6.28, -7.36, -7.36, -7.036666666666666, -5.6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.653333333333333, -6.41333333333333...","[-6.552, -5.800666666666666, -6.34133333333333...","[0.22816757974007662, 0.15788040480763332, 0.0..."
9,SVR,"[-6.059833782536692, -7.099723529566326, -7.09...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.059833782536692, -5.799653108976287, -5.9...","[-6.033748821802074, -5.847189230682391, -5.98...","[0.06578337451812258, 0.03488551718095578, 0.0..."


In [42]:
df_Substructure_fp.to_csv('results/Fingerprints/Results_Substructure_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Substructure_fp_Caco2.csv')

In [43]:
#Substructure Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/SubstructureCount_train_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/SubstructureCount_test_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_SubstructureCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_SubstructureCount_fp

X_train shape:  (1008, 307)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 307)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.196922 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 26
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 806, number of used feature



0.5119208918182989




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2234,0.3553,0.4726,0.6316,0.7948,0.7778,0.1856,0.3321,0.4308,0.6793,0.8248,0.8074
DecisionTreeRegressor,0.3176,0.4095,0.5635,0.4763,0.7308,0.7109,0.2118,0.3421,0.4602,0.6341,0.8012,0.7845
RandomForestRegressor,0.2135,0.3457,0.4621,0.6479,0.8054,0.7865,0.183,0.3257,0.4278,0.6838,0.827,0.812
GradientBoostingRegressor,0.2428,0.3815,0.4928,0.5996,0.7767,0.7563,0.2052,0.3617,0.453,0.6454,0.8094,0.7881
AdaBoostRegressor,0.4096,0.5494,0.64,0.3245,0.5931,0.5853,0.3992,0.5444,0.6318,0.3101,0.5875,0.588
XGBRegressor,0.2157,0.342,0.4644,0.6443,0.8072,0.7868,0.1858,0.3141,0.4311,0.6789,0.8252,0.8102
ExtraTreesRegressor,0.2074,0.3346,0.4555,0.6579,0.8145,0.7912,0.1932,0.3281,0.4395,0.6662,0.8184,0.8052
LinearRegression,0.3028,0.4137,0.5503,0.5006,0.7101,0.7117,0.2826,0.4318,0.5316,0.5116,0.7169,0.6728
KNeighborsRegressor,0.2661,0.3772,0.5159,0.5611,0.7572,0.7307,0.2163,0.3523,0.4651,0.6262,0.7942,0.7755
SVR,0.3323,0.434,0.5765,0.452,0.6764,0.658,0.2523,0.3922,0.5023,0.5641,0.7609,0.7386


In [44]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.831468738542402, -7.140948445569862, -7.16...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.955223132222581, -5.741055746517004, -6.2...","[-7.05323723697208, -6.030278560631518, -6.523...","[0.07932465922257062, 0.19583895182921815, 0.1..."
1,DecisionTreeRegressor,"[-7.24, -6.89, -6.89, -5.77, -4.85, -6.82, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.66, -5.92, -5.92, -5.88, -6.89, -5...","[-7.112, -6.444, -6.503, -5.928, -5.812, -6.93...","[0.22613270440164102, 0.3636261816756324, 0.49..."
2,RandomForestRegressor,"[-6.824300000000003, -7.102699999999993, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.830900000000003, -6.002900000000003, -6.4...","[-6.959790000000007, -6.160980000000003, -6.45...","[0.09594578886017137, 0.1401810743288831, 0.08..."
3,GradientBoostingRegressor,"[-6.87176976698039, -6.924656253441981, -7.014...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.955103681898174, -5.893700973994982, -6.3...","[-7.0916989148046286, -6.2200130056713165, -6....","[0.1813915522638697, 0.18233571707129323, 0.21..."
4,AdaBoostRegressor,"[-6.3859972299168986, -6.458147676062737, -6.3...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.3859972299168986, -6.22970353532405, -6.2...","[-6.529334964908264, -6.2723977363291, -6.3418...","[0.0980588388063182, 0.04361397143279695, 0.05..."
5,XGBRegressor,"[-7.1239324, -7.217034, -7.229846, -6.304128, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.1208906, -5.595731, -5.9826164, -5.929946...","[-7.163526, -5.7536683, -6.319442, -5.9726, -5...","[0.04025642, 0.3593009, 0.24551731, 0.11066435..."
6,ExtraTreesRegressor,"[-7.208000000000006, -7.323099999999993, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.130100000000006, -6.110200000000001, -6.2...","[-7.138000000000012, -6.401220000000002, -6.29...","[0.03381746590151436, 0.17390677560118337, 0.0..."
7,LinearRegression,"[-5.675222717853205, -7.259362280089988, -7.19...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.945527700329984, -5.356085116163308, -5.3...","[-6.156583302111928, -5.5329753612621895, -5.3...","[0.1345049458927704, 0.12132277007483369, 0.03..."
8,KNeighborsRegressor,"[-6.646666666666666, -7.3500000000000005, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.646666666666666, -5.573333333333333, -5.9...","[-6.772, -5.738666666666666, -5.86399999999999...","[0.20227154465662678, 0.1570081385010201, 0.07..."
9,SVR,"[-6.3712895557778975, -7.107729586121078, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.398890596265454, -6.204216845056135, -5.7...","[-6.553663742618847, -6.249136938843404, -5.73...","[0.2688041846533252, 0.031030824818462793, 0.0..."


In [45]:
df_SubstructureCount_fp.to_csv('results/Fingerprints/Results_Substructure_Count_fp_Caco2.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Substructure_Count_fp_Caco2.csv')

In [91]:
#Descriptors models
#2d RDKit descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2drdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2drdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 217)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 217)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18045
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 147
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18192
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 148
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1923,0.3182,0.4385,0.6829,0.8265,0.8108,0.1488,0.2965,0.3857,0.7429,0.8639,0.8564
DecisionTreeRegressor,0.3232,0.4101,0.5685,0.467,0.7267,0.6994,0.1925,0.3225,0.4388,0.6673,0.8205,0.8052
RandomForestRegressor,0.1969,0.3234,0.4438,0.6752,0.8227,0.8082,0.1514,0.3016,0.3891,0.7384,0.8619,0.8543
GradientBoostingRegressor,0.2002,0.3314,0.4475,0.6698,0.8198,0.8031,0.1552,0.3084,0.394,0.7318,0.8593,0.8502
AdaBoostRegressor,0.2996,0.4459,0.5473,0.5059,0.7326,0.6875,0.2648,0.4362,0.5146,0.5424,0.7714,0.7498
XGBRegressor,0.194,0.3162,0.4404,0.6801,0.8265,0.8122,0.152,0.2924,0.3899,0.7374,0.8588,0.8524
ExtraTreesRegressor,0.1874,0.3115,0.4329,0.6909,0.8316,0.8185,0.1531,0.2944,0.3913,0.7354,0.8577,0.8518
LinearRegression,0.4323,0.4277,0.6575,0.287,0.6403,0.7292,0.2678,0.3895,0.5175,0.5373,0.7439,0.7635
KNeighborsRegressor,0.2363,0.3455,0.4861,0.6103,0.7873,0.7615,0.185,0.3354,0.4301,0.6804,0.8266,0.8166
SVR,0.2157,0.3494,0.4644,0.6443,0.8056,0.7957,0.1628,0.3164,0.4035,0.7186,0.8518,0.8434


In [92]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.850917959383622, -7.1626225809589075, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.795668137077063, -6.573237077432057, -6.5...","[-6.963403993280272, -6.824736378316096, -6.66...","[0.09530666912733227, 0.2057626340061331, 0.13..."
1,DecisionTreeRegressor,"[-5.64, -7.03, -7.03, -5.92, -5.86999999999999...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.89, -5.96, -6.66, -5.92, -5.92, -6.89, -6...","[-7.119999999999999, -6.742, -6.85200000000000...","[0.1325141501878196, 0.8077227247019858, 0.246..."
2,RandomForestRegressor,"[-6.714889700040003, -7.1418999999999935, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7095897000399995, -6.507533333333335, -6....","[-6.734391481068004, -6.796343262732668, -6.69...","[0.1349033565818589, 0.20942957576301832, 0.10..."
3,GradientBoostingRegressor,"[-6.852288166896131, -6.7806569737335085, -6.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.855303870518325, -6.532806105625549, -6.4...","[-7.156847035388718, -6.758559132069844, -6.60...","[0.174443087222356, 0.406663241888479, 0.07033..."
4,AdaBoostRegressor,"[-6.868, -6.757622744966875, -6.57760241141904...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.788378965563219, -6.57760241141904, -6.19...","[-6.836982016492236, -6.386948454509524, -6.33...","[0.04314750594497502, 0.1303000870300217, 0.14..."
5,XGBRegressor,"[-6.7751746, -7.145078, -7.3264694, -6.3086677...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.13844, -6.2356434, -6.789014, -5.9201264,...","[-6.9433594, -6.487053, -6.9402237, -5.964231,...","[0.107681856, 0.40687966, 0.112068675, 0.08978..."
6,ExtraTreesRegressor,"[-6.6141000000000005, -7.074599999999993, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.776236758840005, -6.2797000000000045, -6....","[-6.968322735746009, -6.294793400472002, -6.64...","[0.10285807895035368, 0.12443536613953143, 0.0..."
7,LinearRegression,"[-10.0, -7.173440288315399, -7.085368263792001...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-10.0, -5.453335698220039, -5.51968314986289...","[-8.507101571750564, -5.781599944710214, -5.60...","[0.7473717820671271, 0.22484706835136897, 0.05..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.363333333333333, -5.776666666666666, -6.3...","[-6.601999999999999, -6.065333333333333, -6.58...","[0.20942885294162442, 0.19108578643577284, 0.1..."
9,SVR,"[-6.4277748106701935, -7.120289618729317, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.427777874754721, -6.294087492634307, -6.4...","[-6.942852017443842, -6.485552792371112, -6.40...","[0.2575383401298369, 0.14330675949138533, 0.05..."


In [93]:
result_df.to_csv('results/Descriptors/Results_2d_RDKit_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_RDKit_desc_Caco2.csv')

In [95]:
#2d Mordred descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df , prediction_df= train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')


X_train shape:  (1008, 1428)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1428)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 259889
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1176
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 259978
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1182
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.189,0.3197,0.4347,0.6883,0.8298,0.8114,0.153,0.2921,0.3911,0.7357,0.8593,0.8485
DecisionTreeRegressor,0.3626,0.4325,0.6021,0.4021,0.7012,0.675,0.1972,0.3394,0.444,0.6593,0.8153,0.7969
RandomForestRegressor,0.1931,0.3265,0.4394,0.6816,0.8276,0.8105,0.1612,0.3083,0.4015,0.7215,0.8532,0.8428
GradientBoostingRegressor,0.1979,0.3282,0.4449,0.6735,0.8211,0.8009,0.1565,0.3054,0.3956,0.7296,0.8575,0.8406
AdaBoostRegressor,0.2931,0.4433,0.5414,0.5165,0.744,0.7004,0.2583,0.4314,0.5082,0.5537,0.7825,0.7612
XGBRegressor,0.2124,0.3321,0.4608,0.6498,0.8084,0.7925,0.1642,0.3024,0.4052,0.7163,0.8468,0.8375
ExtraTreesRegressor,0.1812,0.3069,0.4257,0.7012,0.8377,0.8249,0.1621,0.2973,0.4026,0.72,0.849,0.8399
LinearRegression,3.9737,1.5466,1.9934,-5.5537,0.228,0.257,1.2861,0.862,1.1341,-1.2223,0.3193,0.3394
KNeighborsRegressor,0.2131,0.3357,0.4616,0.6486,0.8104,0.7857,0.2006,0.3369,0.4479,0.6533,0.812,0.7933
SVR,0.2018,0.334,0.4492,0.6672,0.8189,0.8066,0.1681,0.3217,0.41,0.7095,0.8454,0.8328


In [96]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.831529275669986, -7.404400896768025, -7.09...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.841735362329452, -6.478576032910392, -6.2...","[-6.9745473931988275, -6.385404796737414, -6.3...","[0.09970805427699535, 0.08033971216763644, 0.1..."
1,DecisionTreeRegressor,"[-8.04, -6.89, -7.05, -7.22, -6.0, -7.15, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.66, -6.28, -5.92, -5.88, -7.0, -5....","[-6.659999999999999, -6.728, -6.118, -5.928, -...","[0.7094786818502725, 0.8926903158430701, 0.404..."
2,RandomForestRegressor,"[-6.6446, -7.209132980329998, -6.9700387196399...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.714647274949999, -6.264949999999999, -6.1...","[-6.831440822404002, -6.500148184602, -6.30830...","[0.08674536825572056, 0.1714502375943906, 0.15..."
3,GradientBoostingRegressor,"[-6.669756640310448, -7.148745777507401, -6.96...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6803481793407276, -6.185682465000282, -6....","[-6.946992228226482, -6.415270361316682, -6.50...","[0.153418380988621, 0.28820488305160036, 0.126..."
4,AdaBoostRegressor,"[-6.550074646937984, -7.0523276563124995, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.550074646937984, -6.073948831273812, -6.2...","[-6.607751631650617, -6.247678056787832, -6.22...","[0.16606198776306247, 0.17475272601925004, 0.0..."
5,XGBRegressor,"[-6.968932, -7.6268244, -7.0346174, -6.875161,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.0574203, -6.077678, -6.182101, -5.9197435...","[-7.016031, -6.398041, -6.4642653, -5.9418316,...","[0.2304262, 0.37430385, 0.17610942, 0.04416263..."
6,ExtraTreesRegressor,"[-6.67224433375, -7.396586073149998, -7.029261...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7257999478500015, -5.978600000000004, -6....","[-6.930775312852006, -6.171088729612003, -6.34...","[0.10902436727813337, 0.12232276908547152, 0.0..."
7,LinearRegression,"[-9.166936508810348, -10.0, -4.0, -10.0, -7.18...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-9.676565372774402, -4.0, -8.173520099928165...","[-6.946838899657038, -8.177839732294567, -8.15...","[1.4518531262877648, 2.411461994797378, 0.7095..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.4799999999999995, -6.3433333333333...","[-6.552, -6.672666666666666, -6.15866666666666...","[0.22816757974007726, 0.09641115656972055, 0.1..."
9,SVR,"[-6.164256869134834, -7.179188456779414, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.386251594715867, -6.335719968344946, -6.1...","[-6.574852842180792, -6.446151272974381, -6.27...","[0.1850700111139141, 0.08311509449152962, 0.06..."


In [97]:
result_df.to_csv('results/Descriptors/Results_2d_Mordred_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_Mordred_desc_Caco2.csv')

In [8]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [9]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [100]:
#2d RDKit descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 170)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 170)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18045
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 147
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18192
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 148
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1923,0.3182,0.4385,0.6829,0.8265,0.8108,0.1488,0.2965,0.3857,0.7429,0.8639,0.8564
DecisionTreeRegressor,0.3512,0.4217,0.5926,0.4208,0.711,0.6897,0.1832,0.3086,0.428,0.6835,0.8295,0.8126
RandomForestRegressor,0.1959,0.3227,0.4426,0.6769,0.8237,0.8091,0.1531,0.3037,0.3913,0.7355,0.86,0.852
GradientBoostingRegressor,0.1995,0.3311,0.4466,0.671,0.8206,0.8039,0.1553,0.308,0.3941,0.7316,0.8591,0.8502
AdaBoostRegressor,0.2944,0.4441,0.5426,0.5144,0.7387,0.6913,0.2643,0.4372,0.5141,0.5432,0.7717,0.7526
XGBRegressor,0.194,0.3162,0.4404,0.6801,0.8265,0.8122,0.152,0.2924,0.3899,0.7374,0.8588,0.8524
ExtraTreesRegressor,0.1853,0.3086,0.4305,0.6944,0.8336,0.8205,0.1542,0.2963,0.3927,0.7335,0.8565,0.8514
LinearRegression,0.4323,0.4277,0.6575,0.287,0.6403,0.7292,0.2678,0.3895,0.5175,0.5373,0.7439,0.7635
KNeighborsRegressor,0.2363,0.3455,0.4861,0.6103,0.7873,0.7615,0.186,0.3364,0.4313,0.6786,0.8255,0.8144
SVR,0.2157,0.3494,0.4644,0.6443,0.8056,0.7956,0.1628,0.3164,0.4035,0.7186,0.8518,0.8434


In [101]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.850917959383622, -7.1626225809589075, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.795668137077063, -6.573237077432057, -6.5...","[-6.963403993280272, -6.824736378316096, -6.66...","[0.09530666912733227, 0.2057626340061331, 0.13..."
1,DecisionTreeRegressor,"[-5.64, -7.03, -7.03, -5.92, -6.0, -6.77, -5.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.89, -5.96, -6.21, -5.92, -5.92, -6.89, -6...","[-7.153999999999999, -6.76, -6.825999999999999...","[0.13200000000000003, 0.7209160838821671, 0.37..."
2,RandomForestRegressor,"[-6.633210299960001, -7.151249019599995, -7.10...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.68577940008, -6.505582352933334, -6.44762...","[-6.706461563952004, -6.793053137253335, -6.69...","[0.12595141725522796, 0.20411779279227352, 0.1..."
3,GradientBoostingRegressor,"[-6.85228816689613, -6.7806569737335085, -6.76...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.905876157852879, -6.3779890767215015, -6....","[-7.179602463423912, -6.7190189665874955, -6.6...","[0.16019926410642898, 0.41953145095201017, 0.0..."
4,AdaBoostRegressor,"[-6.8168000000000015, -6.990556952647316, -6.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7883789655632185, -6.5886545966578725, -6...","[-6.832669017644139, -6.445384804053328, -6.46...","[0.07004737607109757, 0.10313758925763429, 0.1..."
5,XGBRegressor,"[-6.7751746, -7.145078, -7.3264694, -6.3086677...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.13844, -6.2356434, -6.789014, -5.9201264,...","[-6.9433594, -6.487053, -6.9402237, -5.964231,...","[0.107681856, 0.40687966, 0.112068675, 0.08978..."
6,ExtraTreesRegressor,"[-6.614600000000003, -7.212557574909996, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.59756910013, -6.170743527120004, -6.53538...","[-6.891230130744006, -6.2733887054240025, -6.6...","[0.15187666712133047, 0.10772892906394595, 0.0..."
7,LinearRegression,"[-10.0, -7.17344028831026, -7.085368263775948,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-10.0, -5.453335698210314, -5.51968314985794...","[-8.50710157180304, -5.7815999446819974, -5.60...","[0.7473717820348645, 0.22484706836500945, 0.05..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.363333333333333, -5.776666666666666, -6.3...","[-6.601999999999999, -6.065333333333333, -6.58...","[0.20942885294162442, 0.19108578643577284, 0.1..."
9,SVR,"[-6.4276872503176605, -7.1204058571342586, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.427690304575386, -6.293980741833711, -6.4...","[-6.942804304040817, -6.485579721586921, -6.40...","[0.25755850203825326, 0.14325970911277497, 0.0..."


In [102]:
result_df.to_csv('results/Descriptors/Results_2d_rdkit_const_rem_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_rdkit_const_rem_Caco2.csv')

In [103]:
#2d Mordred descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')


X_train shape:  (1008, 1217)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1217)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 259889
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1176
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 259978
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1182
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.189,0.3197,0.4347,0.6883,0.8298,0.8114,0.153,0.2921,0.3911,0.7357,0.8593,0.8485
DecisionTreeRegressor,0.3527,0.4261,0.5939,0.4184,0.709,0.6825,0.1948,0.3336,0.4413,0.6635,0.8179,0.7996
RandomForestRegressor,0.1912,0.3237,0.4373,0.6847,0.8297,0.813,0.1619,0.3089,0.4024,0.7202,0.8523,0.8419
GradientBoostingRegressor,0.1976,0.3276,0.4445,0.6741,0.8215,0.8016,0.1567,0.3058,0.3958,0.7293,0.8574,0.8405
AdaBoostRegressor,0.286,0.4335,0.5348,0.5284,0.7504,0.7094,0.2577,0.4285,0.5076,0.5547,0.7765,0.7526
XGBRegressor,0.2124,0.3321,0.4608,0.6498,0.8084,0.7925,0.1642,0.3024,0.4052,0.7163,0.8468,0.8375
ExtraTreesRegressor,0.1808,0.3051,0.4253,0.7017,0.8381,0.825,0.1621,0.2973,0.4026,0.72,0.8492,0.8393
LinearRegression,3.9737,1.5466,1.9934,-5.5537,0.228,0.257,1.2861,0.862,1.1341,-1.2223,0.3193,0.3394
KNeighborsRegressor,0.2133,0.3352,0.4619,0.6482,0.81,0.786,0.2027,0.3382,0.4503,0.6497,0.8099,0.7905
SVR,0.2018,0.334,0.4492,0.6672,0.8189,0.8066,0.1681,0.3217,0.41,0.7095,0.8454,0.8328


In [104]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.831529275669986, -7.404400896768025, -7.09...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.841735362329452, -6.478576032910392, -6.2...","[-6.9745473931988275, -6.385404796737414, -6.3...","[0.09970805427699535, 0.08033971216763644, 0.1..."
1,DecisionTreeRegressor,"[-8.04, -6.85, -7.05, -7.0, -5.92, -7.15, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.85, -6.28, -5.92, -5.886056648, -7...","[-6.595999999999999, -6.574, -6.144, -5.928, -...","[0.7806048936561956, 0.3392698041382405, 0.373..."
2,RandomForestRegressor,"[-6.6345, -7.1916507790166655, -6.968700000000...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.744610092750001, -6.288199999999997, -6.1...","[-6.802962018550003, -6.470896473540002, -6.29...","[0.08804757129610163, 0.1547116997784663, 0.11..."
3,GradientBoostingRegressor,"[-6.673827275783399, -7.129238596452819, -6.94...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.839110399787352, -6.1212931169649725, -6....","[-6.987004735674906, -6.414584750747634, -6.50...","[0.09121564808752236, 0.2840710342461792, 0.12..."
4,AdaBoostRegressor,"[-6.793692565599381, -7.144620045049296, -6.70...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.454951296320436, -6.200000000000001, -6.3...","[-6.6379018390036775, -6.24466645467365, -6.20...","[0.15418684292603632, 0.17070290965478385, 0.0..."
5,XGBRegressor,"[-6.968932, -7.6268244, -7.0346174, -6.875161,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.0574203, -6.077678, -6.182101, -5.9197435...","[-7.016031, -6.398041, -6.4642653, -5.9418316,...","[0.2304262, 0.37430385, 0.17610942, 0.04416263..."
6,ExtraTreesRegressor,"[-6.689289700040002, -7.441599999999994, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.712728073570003, -6.196600000000002, -6.1...","[-6.9326565899680075, -6.300985796862001, -6.3...","[0.11645233080110637, 0.1252108919223208, 0.12..."
7,LinearRegression,"[-9.166936508784957, -10.0, -4.0, -10.0, -7.18...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-9.676565372753501, -4.0, -8.173520099828735...","[-6.946838899640184, -8.177839732368588, -8.15...","[1.4518531262735797, 2.4114619947578215, 0.709..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.4799999999999995, -6.3433333333333...","[-6.552, -6.672666666666666, -6.15866666666666...","[0.22816757974007726, 0.09641115656972055, 0.1..."
9,SVR,"[-6.164154138570254, -7.179394282949861, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.386097074507676, -6.336081850159685, -6.1...","[-6.574898173982541, -6.44628648004454, -6.273...","[0.18496283816368603, 0.08280314170492947, 0.0..."


In [105]:
result_df.to_csv('results/Descriptors/Results_2d_Mordred_const_rem_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_df_2d_Mordred_const_rem_Caco2.csv')

In [106]:
#2d RDKit descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_LVR_rdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_LVR_rdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 153)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 153)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15327
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 136
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15459
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 137
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 



0.595027894206617




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2034,0.3288,0.451,0.6645,0.8153,0.7966,0.1596,0.3095,0.3995,0.7242,0.8528,0.8444
DecisionTreeRegressor,0.345,0.4141,0.5873,0.4311,0.7176,0.6918,0.198,0.3314,0.445,0.6578,0.8145,0.7998
RandomForestRegressor,0.1994,0.3258,0.4465,0.6711,0.8198,0.8014,0.1609,0.3114,0.4011,0.722,0.8517,0.8447
GradientBoostingRegressor,0.2076,0.3382,0.4556,0.6576,0.8117,0.7925,0.1667,0.3199,0.4083,0.7119,0.8473,0.8331
AdaBoostRegressor,0.2983,0.4481,0.5462,0.508,0.738,0.6889,0.266,0.4412,0.5157,0.5404,0.7744,0.7415
XGBRegressor,0.2029,0.3222,0.4504,0.6654,0.8176,0.7984,0.1515,0.2901,0.3892,0.7382,0.8593,0.8528
ExtraTreesRegressor,0.1901,0.3114,0.436,0.6864,0.8288,0.8151,0.1604,0.3018,0.4005,0.7229,0.8502,0.8448
LinearRegression,0.4558,0.4347,0.6752,0.2482,0.6214,0.7246,0.2771,0.3922,0.5264,0.5211,0.7439,0.7842
KNeighborsRegressor,0.2351,0.3466,0.4849,0.6122,0.7882,0.762,0.1953,0.3412,0.442,0.6625,0.8162,0.8043
SVR,0.2164,0.349,0.4652,0.643,0.8045,0.7934,0.1665,0.3197,0.408,0.7124,0.8475,0.8386


In [107]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.800574214822957, -7.1223516863592105, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.893263862392471, -6.647064909229915, -6.3...","[-7.02126316940203, -6.811783380089625, -6.574...","[0.06858583103249369, 0.13324716382305085, 0.1..."
1,DecisionTreeRegressor,"[-5.64, -7.05, -7.03, -6.54, -6.14, -6.74, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -5.72, -6.21, -5.92, -5.92, -6.89, -6...","[-6.911999999999999, -6.645999999999999, -6.75...","[0.6360628899723676, 0.8866476188430218, 0.311..."
2,RandomForestRegressor,"[-6.67191029996, -7.137689700039996, -7.156099...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.761100000000002, -6.456500000000001, -6.4...","[-6.819516702878005, -6.818156470586669, -6.67...","[0.10999049764991402, 0.22826755571124435, 0.1..."
3,GradientBoostingRegressor,"[-6.8094710312781705, -6.7561533031663705, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7915936199836375, -6.335561900455404, -6....","[-7.012629945910584, -6.555141124254992, -6.61...","[0.13565139306178067, 0.34263433619810263, 0.1..."
4,AdaBoostRegressor,"[-6.887272727272732, -6.887272727272732, -6.68...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.788378965563216, -6.183884562520864, -6.1...","[-6.813429242352322, -6.338389081705698, -6.30...","[0.09211324903224437, 0.1959493255430156, 0.09..."
5,XGBRegressor,"[-6.401748, -7.0872817, -7.3376927, -6.517508,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4591837, -6.076312, -6.672585, -5.918654,...","[-7.068312, -6.277921, -6.755786, -5.952565, -...","[0.3285023, 0.33871534, 0.07774746, 0.06847361..."
6,ExtraTreesRegressor,"[-6.708269100130002, -7.108199999999994, -7.07...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.829769100130002, -6.273000000000003, -6.4...","[-7.070076353950009, -6.323680000000001, -6.56...","[0.13662547212727444, 0.08463709352287484, 0.0..."
7,LinearRegression,"[-10.0, -7.154030151625629, -7.025637261910769...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-10.0, -5.500862054091206, -5.43544395148187...","[-8.383345505650553, -5.885494186754398, -5.51...","[0.8102166743902873, 0.24534923385021531, 0.04..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.363333333333333, -5.776666666666666, -6.3...","[-6.601999999999999, -6.065333333333333, -6.58...","[0.20942885294162442, 0.19108578643577284, 0.1..."
9,SVR,"[-6.4129127160895765, -7.131906769378596, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.412856677483656, -6.345392219782349, -6.4...","[-6.936401132637042, -6.522128961739872, -6.44...","[0.2617747145247884, 0.13542919789178878, 0.05..."


In [108]:
result_df.to_csv('results/Descriptors/Results_2d_rdkit_LVR_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_rdkit_LVR_Caco2.csv')

In [109]:
#2d Mordred descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101,),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
results_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
results_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')


X_train shape:  (1008, 841)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 841)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170273
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 814
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170318
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 820
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, th

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1988,0.3241,0.4458,0.6722,0.8199,0.8009,0.1519,0.2934,0.3898,0.7375,0.8609,0.8498
DecisionTreeRegressor,0.3498,0.422,0.5914,0.4232,0.7076,0.6817,0.204,0.3212,0.4517,0.6475,0.8074,0.7853
RandomForestRegressor,0.1915,0.3233,0.4376,0.6842,0.8289,0.8132,0.1625,0.3082,0.4031,0.7192,0.8516,0.839
GradientBoostingRegressor,0.201,0.3316,0.4484,0.6684,0.8182,0.8001,0.1586,0.3091,0.3982,0.726,0.856,0.8354
AdaBoostRegressor,0.2834,0.4334,0.5324,0.5326,0.7557,0.7056,0.2566,0.4312,0.5065,0.5567,0.7823,0.7513
XGBRegressor,0.206,0.3288,0.4538,0.6603,0.8145,0.7962,0.1698,0.3066,0.4121,0.7065,0.8411,0.8254
ExtraTreesRegressor,0.1812,0.3038,0.4256,0.7012,0.8377,0.8248,0.16,0.2951,0.4001,0.7235,0.8511,0.8417
LinearRegression,2.6046,1.1329,1.6139,-3.2956,0.2941,0.3741,0.9937,0.6847,0.9968,-0.717,0.4105,0.4741
KNeighborsRegressor,0.2154,0.3374,0.4641,0.6448,0.8069,0.7875,0.1883,0.3318,0.4339,0.6747,0.8231,0.8063
SVR,0.2022,0.3335,0.4497,0.6665,0.8188,0.8088,0.1656,0.3208,0.407,0.7138,0.8482,0.8369


In [110]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.831497639280336, -7.29083934887742, -6.896...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7288407474530025, -6.196301104225642, -6....","[-6.9052377836536705, -6.447767343769508, -6.5...","[0.1369189850449255, 0.18399310574931552, 0.10..."
1,DecisionTreeRegressor,"[-8.04, -7.0, -7.0, -7.0, -6.28, -7.0, -7.14, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -4.85, -5.77, -5.92, -5.88, -6.89, -6...","[-6.726000000000001, -6.151999999999999, -5.94...","[0.7634553032103452, 0.8678340855255687, 0.141..."
2,RandomForestRegressor,"[-6.813687760899999, -7.194169100129996, -7.01...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.866338373530003, -6.362350000000003, -6.3...","[-6.860509312214002, -6.468133761312667, -6.38...","[0.13151612211522198, 0.09706466301980568, 0.0..."
3,GradientBoostingRegressor,"[-6.681939985658362, -7.092539181011661, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.614661804942996, -6.01569139332828, -6.48...","[-6.854906352681911, -6.452886196910518, -6.49...","[0.15619929597359325, 0.2374919303692213, 0.11..."
4,AdaBoostRegressor,"[-6.755153337427239, -7.008508695102558, -6.94...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.604380578409158, -6.1355657894736835, -6....","[-6.655168979552164, -6.340401035649477, -6.32...","[0.17253781055961434, 0.14336154353384473, 0.1..."
5,XGBRegressor,"[-7.176573, -7.3430324, -6.952058, -7.0933166,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.182548, -5.896726, -6.2369995, -5.9204803...","[-6.945615, -6.3467493, -6.382258, -5.9387093,...","[0.24265453, 0.3466754, 0.28244063, 0.03687465..."
6,ExtraTreesRegressor,"[-6.7275078197700005, -7.370786073149998, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.750638719640001, -6.004749019600005, -6.2...","[-6.9613252917760065, -6.233267743928003, -6.4...","[0.12853689717486705, 0.16266890604019088, 0.1..."
7,LinearRegression,"[-6.5558992535925995, -10.0, -4.0, -10.0, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.651841502779519, -10.0, -5.91620456360857...","[-7.138694438937404, -8.702326090591626, -6.58...","[1.0379532098495832, 1.9899651470585251, 0.596..."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.823333333333333, -6.34333333333333...","[-6.552, -6.409999999999999, -6.22866666666666...","[0.22816757974007726, 0.3907826903934091, 0.12..."
9,SVR,"[-6.364489538893381, -7.180569322490727, -7.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.527385998249593, -6.316544886958452, -6.2...","[-6.7772012881233845, -6.418802067758027, -6.3...","[0.25456912402949317, 0.07039285360249453, 0.0..."


In [111]:
results_df.to_csv('results/Descriptors/Results_2d_Mordred_LVR_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_Mordred_LVR_Caco2.csv')

In [112]:
#2d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_Caco2.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-1.2947,1.676248,186.2062,119.101580,0,0,111,51,60,...,100.950923,1.979430,36.643890,17.922198,18.721692,9274.0,80.0,6.411,254.0,1003
1,0,-1.2947,1.676248,186.2062,119.101580,0,0,111,51,60,...,100.950923,1.979430,36.643890,17.922198,18.721692,9274.0,80.0,6.411,254.0,1001
2,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.404267,1.970449,37.175727,17.865461,19.310266,10383.0,92.0,5.550,272.0,1014
3,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.404039,1.970445,37.173701,17.864949,19.308751,10375.0,92.0,5.550,272.0,1009
4,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.404039,1.970445,37.173701,17.864949,19.308751,10375.0,92.0,5.550,272.0,1010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,0,-9.6097,92.346334,344.0469,231.595816,0,0,214,102,112,...,204.450386,2.004416,78.927096,30.584156,38.662023,60444.0,180.0,7.726,532.0,8354
1004,0,-2.6870,7.219969,379.0408,217.916128,0,0,194,98,96,...,195.069444,1.990505,73.605382,30.474598,38.084600,54340.0,164.0,3.846,494.0,8493
1005,0,-3.4642,12.000682,366.2199,208.635370,0,0,185,95,90,...,189.206654,1.991649,73.459617,30.422275,37.991158,50819.0,160.0,2.411,480.0,8492
1006,0,-2.9426,8.658895,374.5890,214.822542,0,0,191,97,94,...,193.061774,1.990328,73.567692,30.461069,38.060439,53039.0,164.0,3.066,490.0,8491


In [113]:
df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')
df 


  df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.002370,2.434280,4.865870,...,11.590451,162.312312,1663.015310,6.547304,91297,211,602.0,703.0,50.611111,26.777778
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,148.508426,2.439658,4.878720,...,11.569485,161.240399,1648.999660,6.569720,90388,207,596.0,694.0,49.750000,26.555556
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,130.882818,2.423927,4.847855,...,11.437501,150.677817,1585.815201,6.894849,69754,181,540.0,618.0,48.506944,24.125000
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,140.033288,2.434167,4.866917,...,11.543988,155.034166,1572.968359,6.526840,79884,202,566.0,663.0,49.750000,25.222222
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,132.454775,2.413793,4.827585,...,11.409840,150.603018,1565.869824,6.720471,69806,179,536.0,612.0,47.284722,24.347222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,37.855097,2.317338,4.629714,...,9.940542,65.841858,430.258006,6.619354,2644,45,148.0,164.0,11.750000,7.000000
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,37.048032,2.303371,4.602630,...,9.872152,64.585516,416.278741,6.307254,2503,43,142.0,157.0,10.888889,6.833333
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,35.896500,2.304816,4.609632,...,9.859065,63.474076,402.263091,6.385128,2286,42,138.0,153.0,10.638889,6.583333
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,34.487329,2.307278,4.602399,...,9.845805,62.359842,388.247441,6.470791,2069,41,134.0,149.0,10.388889,6.333333


In [114]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1003,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](...,-5.100000,0,-1.2947,1.676248,186.2062,119.101580,0,0,...,6.418490,100.950923,1.979430,36.643890,17.922198,18.721692,9274.0,80.0,6.411,254.0
1,1001,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-6.370000,0,-1.2947,1.676248,186.2062,119.101580,0,0,...,6.418490,100.950923,1.979430,36.643890,17.922198,18.721692,9274.0,80.0,6.411,254.0
2,1014,CC(C)C[C@H]1C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](Cc...,-4.630000,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.404267,1.970449,37.175727,17.865461,19.310266,10383.0,92.0,5.550,272.0
3,1009,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H...,-5.690000,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.404039,1.970445,37.173701,17.864949,19.308751,10375.0,92.0,5.550,272.0
4,1010,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H...,-5.950000,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.404039,1.970445,37.173701,17.864949,19.308751,10375.0,92.0,5.550,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,8354,CC[C@H](C)[C@@H]1NC(=O)[C@H](C2CCCC2)N(C)C(=O)...,-6.207608,0,-9.6097,92.346334,344.0469,231.595816,0,0,...,6.788861,204.450386,2.004416,78.927096,30.584156,38.662023,60444.0,180.0,7.726,532.0
1004,8493,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)C[C@@H...,-7.096910,0,-2.6870,7.219969,379.0408,217.916128,0,0,...,7.168375,195.069444,1.990505,73.605382,30.474598,38.084600,54340.0,164.0,3.846,494.0
1005,8492,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)C[C@@H...,-7.000000,0,-3.4642,12.000682,366.2199,208.635370,0,0,...,7.289826,189.206654,1.991649,73.459617,30.422275,37.991158,50819.0,160.0,2.411,480.0
1006,8491,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)C[C@@H...,-6.958607,0,-2.9426,8.658895,374.5890,214.822542,0,0,...,7.207587,193.061774,1.990328,73.567692,30.461069,38.060439,53039.0,164.0,3.066,490.0


In [115]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,0,-3.6573,13.375843,462.8222,276.940262,0,0,...,6.547304,237.423604,1.978530,82.566872,37.965469,44.601403,91297.0,211.0,8.405,602.0
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,0,-3.7596,14.134592,457.1255,273.846676,0,0,...,6.569720,235.601472,1.979844,82.357901,37.975600,44.382301,90388.0,207.0,8.692,596.0
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,0,-4.7036,22.123853,403.5966,250.593953,0,0,...,6.894849,214.100758,1.964227,89.673466,40.258888,41.162462,69754.0,181.0,6.958,540.0
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,0,-5.1318,26.335371,429.6680,260.619504,0,0,...,6.526840,222.500266,1.969029,82.337179,37.880967,44.456212,79884.0,202.0,6.376,566.0
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,0,-6.2030,38.477209,402.2859,252.174332,0,0,...,6.720471,214.421483,1.967170,87.553124,40.278853,41.547594,69806.0,179.0,7.107,536.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0,-1.6788,2.818369,115.5380,70.758962,0,0,...,6.619354,61.526924,1.984739,22.541021,10.153913,12.387108,2644.0,45.0,1.941,148.0
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,0,-1.4869,2.210872,112.9715,71.290548,0,0,...,6.307254,59.707951,1.990265,19.682842,7.618137,12.064705,2503.0,43.0,3.753,142.0
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,0,-1.1989,1.437361,110.0599,68.196962,0,0,...,6.385128,57.707840,1.989926,19.684197,7.618398,12.065799,2286.0,42.0,3.395,138.0
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,0,-0.9109,0.829739,107.1483,65.103376,0,0,...,6.470791,55.707630,1.989558,19.686908,7.618921,12.067987,2069.0,41.0,3.037,134.0


In [116]:
df_ordered.to_csv('features/Descriptors/Train_2d_padel_curated_Caco2.csv', index=False)

In [117]:
#2d test padel descriptors
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_Caco2.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-4.0344,16.276383,122.7648,75.470134,0,0,72,34,38,...,65.139553,1.915869,33.439372,14.950444,18.488928,3129.0,64.0,-0.386,168.0,1024
1,0,-4.2390,17.969121,111.3714,69.282962,0,0,66,32,34,...,61.494634,1.921707,33.024381,14.973124,18.051258,2735.0,56.0,0.188,156.0,1021
2,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.404267,1.970449,37.175727,17.865461,19.310266,10383.0,92.0,5.550,272.0,1013
3,0,-1.2947,1.676248,186.2062,119.101580,0,0,111,51,60,...,100.950923,1.979430,36.643890,17.922198,18.721692,9274.0,80.0,6.411,254.0,1006
4,0,-1.1924,1.421818,191.9029,122.195166,0,0,114,52,62,...,102.767992,1.976308,36.822189,17.902958,18.919231,9650.0,84.0,6.124,260.0,1051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,0,-3.5051,12.285726,462.8621,276.940262,0,0,254,120,134,...,237.600778,1.980006,82.612830,37.983553,44.629277,94190.0,208.0,8.466,600.0,2064
248,0,-2.7880,7.772944,363.1421,205.541784,0,0,182,94,88,...,187.209315,1.991588,73.414277,30.406151,37.961942,49207.0,158.0,1.842,476.0,8075
249,0,-2.8466,8.103132,379.2485,217.916128,0,0,194,98,96,...,194.883801,1.988610,73.534572,30.449360,38.039053,54177.0,168.0,3.592,496.0,8077
250,0,-3.4877,12.164051,371.6893,214.602956,0,0,191,99,92,...,197.073279,1.990639,78.403426,30.481362,38.096680,55739.0,165.0,3.613,498.0,8370


In [118]:
df = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_Caco2.csv')
df

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.071798,2.429231,4.858462,...,11.569599,162.262107,1663.015310,6.547304,94190,208,600.0,698.0,50.000000,26.805556
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,133.544218,2.399797,4.799594,...,11.363578,152.529489,1571.020120,6.309318,74798,175,538.0,606.0,48.763889,24.777778
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,138.687365,2.431377,4.862754,...,11.482930,153.892806,1550.951647,6.516604,73336,187,562.0,647.0,45.645833,24.736111
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,131.842973,2.484317,4.905149,...,11.534364,163.933604,1535.873430,6.736287,67356,187,554.0,645.0,46.569444,23.486111
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,133.583282,2.474028,4.901032,...,11.515712,164.907694,1529.943702,6.538221,69591,185,556.0,644.0,46.208333,23.763889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,40.208444,2.339628,4.650167,...,10.071076,80.884603,440.242356,6.878787,2857,47,160.0,181.0,11.111111,7.083333
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.475166,2.317701,4.635401,...,9.885578,65.694814,430.294391,6.236151,2660,44,146.0,161.0,11.138889,7.083333
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.474998,2.320224,4.640447,...,9.885578,65.694814,430.294391,6.236151,2648,44,146.0,161.0,11.138889,7.083333
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333


In [119]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1024,C[C@H]1C(=O)N(C)[C@H](C)C(=O)N[C@H](C)C(=O)N(C...,-7.100000,0,-4.0344,16.276383,122.7648,75.470134,0,0,...,6.698407,65.139553,1.915869,33.439372,14.950444,18.488928,3129.0,64.0,-0.386,168.0
1,1021,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.200000,0,-4.2390,17.969121,111.3714,69.282962,0,0,...,6.882636,61.494634,1.921707,33.024381,14.973124,18.051258,2735.0,56.0,0.188,156.0
2,1013,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H...,-6.420000,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.404267,1.970449,37.175727,17.865461,19.310266,10383.0,92.0,5.550,272.0
3,1006,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-5.440000,0,-1.2947,1.676248,186.2062,119.101580,0,0,...,6.418490,100.950923,1.979430,36.643890,17.922198,18.721692,9274.0,80.0,6.411,254.0
4,1051,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...,-6.120000,0,-1.1924,1.421818,191.9029,122.195166,0,0,...,6.372526,102.767992,1.976308,36.822189,17.902958,18.919231,9650.0,84.0,6.124,260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.190000,0,-3.5051,12.285726,462.8621,276.940262,0,0,...,6.547304,237.600778,1.980006,82.612830,37.983553,44.629277,94190.0,208.0,8.466,600.0
248,8075,CC(C)[C@H]1C(=O)N[C@H](C(=O)N2CCCC2)CC(=O)N(C)...,-7.000000,0,-2.7880,7.772944,363.1421,205.541784,0,0,...,7.332979,187.209315,1.991588,73.414277,30.406151,37.961942,49207.0,158.0,1.842,476.0
249,8077,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)C[C@@H...,-7.301030,0,-2.8466,8.103132,379.2485,217.916128,0,0,...,7.168375,194.883801,1.988610,73.534572,30.449360,38.039053,54177.0,168.0,3.592,496.0
250,8370,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)C[C@@H...,-7.154902,0,-3.4877,12.164051,371.6893,214.602956,0,0,...,7.395970,197.073279,1.990639,78.403426,30.481362,38.096680,55739.0,165.0,3.613,498.0


In [120]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,0,-3.5051,12.285726,462.8621,276.940262,0,0,...,6.547304,237.600778,1.980006,82.612830,37.983553,44.629277,94190.0,208.0,8.466,600.0
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,0,-6.4306,41.352616,403.2109,261.255434,0,0,...,6.309318,217.445869,1.958972,88.222714,50.298796,37.923918,74798.0,175.0,6.768,538.0
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,0,-0.7114,0.506090,433.7132,259.803918,0,0,...,6.516604,222.331854,1.985106,74.211794,35.995049,38.216745,73336.0,187.0,10.339,562.0
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,0,-3.8138,14.545070,395.3981,246.117160,0,0,...,6.736287,213.827378,1.979883,85.394192,35.950728,41.996280,67356.0,187.0,6.203,554.0
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,0,-3.6776,13.524742,398.5483,250.791125,0,0,...,6.538221,215.983270,1.981498,82.730212,35.983356,41.825434,69591.0,185.0,8.843,556.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,0,-0.7287,0.531004,126.1856,71.185376,0,0,...,6.878787,64.397738,2.012429,23.109637,10.742164,12.367472,2857.0,47.0,2.181,160.0
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,0,-1.7749,3.150270,115.8831,74.384134,0,0,...,6.236151,61.707690,1.990571,19.711184,7.642415,12.068770,2660.0,44.0,4.111,146.0
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,0,-1.7749,3.150270,115.8831,74.384134,0,0,...,6.236151,61.707448,1.990563,19.711947,7.642792,12.069155,2648.0,44.0,4.111,146.0
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0,-1.7749,3.150270,115.8831,74.384134,0,0,...,6.236151,61.708009,1.990581,19.682164,7.618006,12.064158,2750.0,44.0,4.111,146.0


In [121]:
df_ordered.to_csv('features/Descriptors/Test_2d_padel_curated_Caco2.csv', index=False)

In [122]:
#3d Train descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_padel_Caco2.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.261422,2.182556,3.016840,3.747558,4.585447,5.349741,6.059172,6.782624,7.603431,8.149217,...,0.320443,0.384955,0.406224,0.360825,29.462072,259.036766,949.539592,0.289425,1.152004,1014
1,1.261060,2.183198,3.009105,3.722036,4.578436,5.422106,6.121418,6.872774,7.648109,8.389204,...,0.446290,0.444096,0.446906,0.465390,32.015971,298.604824,977.973873,0.354920,1.356392,1008
2,1.259386,2.182188,3.005605,3.717280,4.574053,5.376599,6.069151,6.954843,7.643213,8.345460,...,0.368342,0.507049,0.518004,0.324569,31.288209,290.116262,1039.941353,0.311100,1.349622,1016
3,1.260388,2.180694,3.015214,3.741983,4.570048,5.416361,6.097201,6.901325,7.606820,8.311535,...,0.350633,0.406441,0.396193,0.333635,31.396052,292.290922,1070.664468,0.299868,1.136269,1015
4,1.257092,2.181489,3.032191,3.753863,4.486237,5.468836,6.249346,7.188905,7.799954,8.240230,...,0.440577,0.476716,0.498522,0.481411,19.097691,103.313248,235.332527,0.385791,1.456650,1019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,1.278342,2.213446,3.032582,3.836659,4.613495,5.505439,6.266794,6.908455,7.696370,8.375593,...,0.336653,0.443898,0.460493,0.424060,31.227117,286.293277,1037.943344,0.301479,1.328452,994
1004,1.277362,2.209598,3.027707,3.809947,4.616642,5.525675,6.283940,7.108358,8.019565,8.889049,...,0.435164,0.490527,0.529497,0.249504,33.366540,323.512401,1080.856686,0.356894,1.269529,997
1005,1.276213,2.208211,3.038565,3.787472,4.588814,5.431469,6.176457,6.995361,7.723967,8.368548,...,0.400499,0.459952,0.444234,0.380963,30.248235,267.449785,876.394034,0.341393,1.285150,996
1006,1.277643,2.215089,3.038606,3.845464,4.646214,5.495176,6.232398,6.882603,7.498702,8.082292,...,0.367093,0.536585,0.481797,0.429295,25.906154,202.866669,675.165705,0.285999,1.447677,989


In [123]:
df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')
df 

  df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.002370,2.434280,4.865870,...,11.590451,162.312312,1663.015310,6.547304,91297,211,602.0,703.0,50.611111,26.777778
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,148.508426,2.439658,4.878720,...,11.569485,161.240399,1648.999660,6.569720,90388,207,596.0,694.0,49.750000,26.555556
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,130.882818,2.423927,4.847855,...,11.437501,150.677817,1585.815201,6.894849,69754,181,540.0,618.0,48.506944,24.125000
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,140.033288,2.434167,4.866917,...,11.543988,155.034166,1572.968359,6.526840,79884,202,566.0,663.0,49.750000,25.222222
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,132.454775,2.413793,4.827585,...,11.409840,150.603018,1565.869824,6.720471,69806,179,536.0,612.0,47.284722,24.347222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,37.855097,2.317338,4.629714,...,9.940542,65.841858,430.258006,6.619354,2644,45,148.0,164.0,11.750000,7.000000
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,37.048032,2.303371,4.602630,...,9.872152,64.585516,416.278741,6.307254,2503,43,142.0,157.0,10.888889,6.833333
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,35.896500,2.304816,4.609632,...,9.859065,63.474076,402.263091,6.385128,2286,42,138.0,153.0,10.638889,6.583333
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,34.487329,2.307278,4.602399,...,9.845805,62.359842,388.247441,6.470791,2069,41,134.0,149.0,10.388889,6.333333


In [124]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1014,CC(C)C[C@H]1C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](Cc...,-4.63,1.261422,2.182556,3.016840,3.747558,4.585447,5.349741,6.059172,...,0.526283,0.320443,0.384955,0.406224,0.360825,29.462072,259.036766,949.539592,0.289425,1.152004
1,1008,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]...,-4.70,1.261060,2.183198,3.009105,3.722036,4.578436,5.422106,6.121418,...,0.456990,0.446290,0.444096,0.446906,0.465390,32.015971,298.604824,977.973873,0.354920,1.356392
2,1016,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N(C)[C@...,-4.79,1.259386,2.182188,3.005605,3.717280,4.574053,5.376599,6.069151,...,0.505724,0.368342,0.507049,0.518004,0.324569,31.288209,290.116262,1039.941353,0.311100,1.349622
3,1015,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@...,-5.67,1.260388,2.180694,3.015214,3.741983,4.570048,5.416361,6.097201,...,0.515946,0.350633,0.406441,0.396193,0.333635,31.396052,292.290922,1070.664468,0.299868,1.136269
4,1019,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.30,1.257092,2.181489,3.032191,3.753863,4.486237,5.468836,6.249346,...,0.483284,0.440577,0.476716,0.498522,0.481411,19.097691,103.313248,235.332527,0.385791,1.456650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,994,CC[C@H](C)[C@@H]1NC(=O)c2csc(n2)[C@H]([C@H](C)...,-5.21,1.278342,2.213446,3.032582,3.836659,4.613495,5.505439,6.266794,...,0.531000,0.336653,0.443898,0.460493,0.424060,31.227117,286.293277,1037.943344,0.301479,1.328452
1004,997,CC[C@H](C)[C@@H]1NC(=O)c2csc(n2)[C@H]([C@H](C)...,-5.82,1.277362,2.209598,3.027707,3.809947,4.616642,5.525675,6.283940,...,0.469432,0.435164,0.490527,0.529497,0.249504,33.366540,323.512401,1080.856686,0.356894,1.269529
1005,996,CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](...,-5.68,1.276213,2.208211,3.038565,3.787472,4.588814,5.431469,6.176457,...,0.493763,0.400499,0.459952,0.444234,0.380963,30.248235,267.449785,876.394034,0.341393,1.285150
1006,989,CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](...,-6.30,1.277643,2.215089,3.038606,3.845464,4.646214,5.495176,6.232398,...,0.490240,0.367093,0.536585,0.481797,0.429295,25.906154,202.866669,675.165705,0.285999,1.447677


In [125]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,1.262351,2.191106,3.031628,3.774598,4.573977,5.360347,6.066094,...,0.652402,0.288976,0.490060,0.392125,0.340978,84.972151,1759.679739,8625.222611,0.478603,1.223163
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,1.262071,2.193514,3.023748,3.764570,4.560785,5.361645,6.051195,...,0.640138,0.291958,0.430048,0.470112,0.328022,82.497866,1702.744759,8910.830409,0.460207,1.228181
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,1.269448,2.197140,3.020301,3.742178,4.584264,5.351257,6.095552,...,0.590397,0.362577,0.489361,0.496496,0.344430,71.764722,1333.271798,5125.668681,0.429461,1.330288
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,1.260561,2.187814,3.028857,3.752989,4.510073,5.360820,6.133947,...,0.668782,0.278567,0.451178,0.416564,0.348152,83.135006,1632.338554,7351.509889,0.503173,1.215893
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,1.267412,2.193809,3.019227,3.758527,4.513875,5.295577,5.990756,...,0.522315,0.388730,0.541868,0.551796,0.382011,65.301775,1211.414868,6306.229872,0.366567,1.475675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,1.259089,2.194381,3.007751,3.751590,4.611812,5.242195,5.769198,...,0.620584,0.265348,0.444383,0.502894,0.436832,18.554653,91.483311,230.026534,0.430876,1.384109
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,1.258198,2.191566,2.973671,3.705202,4.460698,5.237321,5.878186,...,0.692270,0.249929,0.426958,0.468043,0.444280,20.215607,92.963698,195.799164,0.538405,1.339282
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,1.257014,2.193448,2.984844,3.726942,4.585777,5.368594,5.948557,...,0.589060,0.305502,0.509137,0.469219,0.387322,18.258258,91.435060,225.184830,0.383590,1.365678
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,1.255311,2.194277,2.981488,3.712347,4.620760,5.423795,6.057763,...,0.785075,0.156266,0.462273,0.392676,0.434064,21.643247,83.333105,177.935116,0.677612,1.289012


In [126]:
df_ordered.to_csv('features/Descriptors/Train_3d_padel_curated_Caco2.csv', index=False)

In [127]:
#3d test padel descriptors
df_test = pd.read_csv('features/Descriptors/Test_3d_padel_Caco2.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.257998,2.169271,3.046798,3.768489,4.472643,5.345855,6.122972,6.984056,7.583037,8.010907,...,0.361169,0.542484,0.533207,0.368279,18.971254,102.737281,256.669405,0.347099,1.443970,1024
1,1.267299,2.223130,3.034084,3.818011,4.700741,5.508629,6.244134,6.978876,7.702789,8.497747,...,0.269507,0.412058,0.442294,0.385391,31.503378,231.948844,573.858037,0.514000,1.239743,1078
2,1.256988,2.183509,3.026473,3.763624,4.574110,5.498503,6.260755,7.292527,7.915936,8.430562,...,0.453107,0.525119,0.553819,0.452893,20.037088,110.787714,233.410885,0.413737,1.531831,1021
3,1.262306,2.195495,3.011909,3.748835,4.676414,5.488707,6.238683,6.959283,7.562872,8.251838,...,0.333676,0.372473,0.466480,0.335823,32.421398,307.341154,1136.341637,0.303787,1.174777,1070
4,1.260535,2.191391,3.042007,3.769350,4.552662,5.298062,6.044804,6.843005,7.573255,8.223910,...,0.291952,0.435630,0.506810,0.343169,29.767700,239.618472,759.926494,0.403431,1.285609,1087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,1.260013,2.180436,3.010307,3.733501,4.578559,5.341091,6.006795,6.786576,7.508179,8.173988,...,0.382321,0.521283,0.492929,0.352270,61.482682,1085.456805,5678.334369,0.352756,1.366482,8483
248,1.257796,2.177640,3.004684,3.723547,4.558057,5.347310,6.067333,6.866360,7.585379,8.307844,...,0.423942,0.585092,0.481894,0.345968,64.061968,1234.047118,7593.562929,0.312035,1.412954,8484
249,1.279398,2.215153,3.043351,3.856027,4.664819,5.494090,6.246800,6.933532,7.576441,8.293859,...,0.258720,0.531005,0.455378,0.396786,28.204480,216.704016,712.951221,0.413272,1.383169,991
250,1.259992,2.182698,3.014043,3.737413,4.562238,5.412389,6.139010,6.845320,7.678431,8.458133,...,0.331361,0.461258,0.474559,0.361166,32.919132,305.560834,1052.549065,0.341426,1.296983,8462


In [128]:
df = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_Caco2.csv')
df

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.071798,2.429231,4.858462,...,11.569599,162.262107,1663.015310,6.547304,94190,208,600.0,698.0,50.000000,26.805556
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,133.544218,2.399797,4.799594,...,11.363578,152.529489,1571.020120,6.309318,74798,175,538.0,606.0,48.763889,24.777778
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,138.687365,2.431377,4.862754,...,11.482930,153.892806,1550.951647,6.516604,73336,187,562.0,647.0,45.645833,24.736111
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,131.842973,2.484317,4.905149,...,11.534364,163.933604,1535.873430,6.736287,67356,187,554.0,645.0,46.569444,23.486111
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,133.583282,2.474028,4.901032,...,11.515712,164.907694,1529.943702,6.538221,69591,185,556.0,644.0,46.208333,23.763889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,40.208444,2.339628,4.650167,...,10.071076,80.884603,440.242356,6.878787,2857,47,160.0,181.0,11.111111,7.083333
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.475166,2.317701,4.635401,...,9.885578,65.694814,430.294391,6.236151,2660,44,146.0,161.0,11.138889,7.083333
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.474998,2.320224,4.640447,...,9.885578,65.694814,430.294391,6.236151,2648,44,146.0,161.0,11.138889,7.083333
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333


In [129]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1024,C[C@H]1C(=O)N(C)[C@H](C)C(=O)N[C@H](C)C(=O)N(C...,-7.100,1.257998,2.169271,3.046798,3.768489,4.472643,5.345855,6.122972,...,0.536897,0.361169,0.542484,0.533207,0.368279,18.971254,102.737281,256.669405,0.347099,1.443970
1,1078,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)CNC(=...,-8.000,1.267299,2.223130,3.034084,3.818011,4.700741,5.508629,6.244134,...,0.676000,0.269507,0.412058,0.442294,0.385391,31.503378,231.948844,573.858037,0.514000,1.239743
2,1021,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.200,1.256988,2.183509,3.026473,3.763624,4.574110,5.498503,6.260755,...,0.489385,0.453107,0.525119,0.553819,0.452893,20.037088,110.787714,233.410885,0.413737,1.531831
3,1070,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](...,-5.600,1.262306,2.195495,3.011909,3.748835,4.676414,5.488707,6.238683,...,0.535515,0.333676,0.372473,0.466480,0.335823,32.421398,307.341154,1136.341637,0.303787,1.174777
4,1087,CC(C)C[C@H]1C(=O)N[C@@H](Cc2ccc(O)cc2)C(=O)N(C...,-6.030,1.260535,2.191391,3.042007,3.769350,4.552662,5.298062,6.044804,...,0.602287,0.291952,0.435630,0.506810,0.343169,29.767700,239.618472,759.926494,0.403431,1.285609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8483,CCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](COCC...,-5.810,1.260013,2.180436,3.010307,3.733501,4.578559,5.341091,6.006795,...,0.519516,0.382321,0.521283,0.492929,0.352270,61.482682,1085.456805,5678.334369,0.352756,1.366482
248,8484,CCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](COCC...,-5.835,1.257796,2.177640,3.004684,3.723547,4.558057,5.347310,6.067333,...,0.450747,0.423942,0.585092,0.481894,0.345968,64.061968,1234.047118,7593.562929,0.312035,1.412954
249,991,CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](...,-4.960,1.279398,2.215153,3.043351,3.856027,4.664819,5.494090,6.246800,...,0.608848,0.258720,0.531005,0.455378,0.396786,28.204480,216.704016,712.951221,0.413272,1.383169
250,8462,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-5.175,1.259992,2.182698,3.014043,3.737413,4.562238,5.412389,6.139010,...,0.560951,0.331361,0.461258,0.474559,0.361166,32.919132,305.560834,1052.549065,0.341426,1.296983


In [130]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,1.261635,2.190215,3.027568,3.762918,4.545954,5.392501,6.072081,...,0.652497,0.280515,0.445002,0.413012,0.296281,83.880226,1727.561974,9047.590565,0.478746,1.154294
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,1.256557,2.169821,2.986649,3.670414,4.476205,5.313816,6.086273,...,0.485352,0.422296,0.447930,0.517179,0.314599,73.405636,1556.088414,9116.543699,0.361471,1.279709
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,1.262680,2.195242,3.024928,3.752221,4.598067,5.402628,6.155935,...,0.510350,0.444723,0.537150,0.549475,0.350829,73.976557,1476.888699,5678.961564,0.432609,1.437454
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,1.268133,2.190418,3.025417,3.756084,4.530515,5.333679,6.067635,...,0.497711,0.432923,0.510948,0.511623,0.391263,64.740473,1173.676682,5294.086991,0.395951,1.413835
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,1.265152,2.188857,3.010896,3.743219,4.577397,5.377621,6.111278,...,0.486151,0.417556,0.491781,0.507360,0.399975,68.119851,1345.764797,7592.643463,0.355561,1.399116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,1.262446,2.201498,3.045532,3.794079,4.642616,5.291164,5.944925,...,0.754131,0.182249,0.480010,0.459342,0.384289,22.131975,96.501291,213.423664,0.631197,1.323641
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,1.254869,2.184131,2.960468,3.701093,4.520896,5.286871,5.909763,...,0.551299,0.372080,0.515541,0.521631,0.360145,19.167585,101.356334,231.204419,0.385069,1.397318
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,1.255465,2.185980,2.969142,3.714962,4.589486,5.350734,6.014608,...,0.531900,0.364522,0.414856,0.423812,0.431576,19.944376,114.058462,293.327379,0.344633,1.270243
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,1.255587,2.184126,2.976109,3.730063,4.649851,5.464935,6.146713,...,0.576767,0.278730,0.433292,0.469774,0.341165,19.549575,108.687575,301.806263,0.365151,1.244231


In [131]:
df_ordered.to_csv('features/Descriptors/Test_3d_padel_curated_Caco2.csv', index=False)

In [132]:
#2d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 1444)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1444)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222118
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1025
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222272
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1038
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1883,0.319,0.4339,0.6894,0.8307,0.8129,0.1571,0.3025,0.3963,0.7286,0.8552,0.8473
DecisionTreeRegressor,0.3845,0.4479,0.6201,0.3658,0.692,0.6669,0.1944,0.3408,0.4409,0.6641,0.8175,0.8019
RandomForestRegressor,0.1999,0.3339,0.4471,0.6704,0.8222,0.8037,0.1685,0.3169,0.4104,0.7089,0.8466,0.8398
GradientBoostingRegressor,0.2026,0.3379,0.4501,0.6659,0.817,0.8003,0.1596,0.3072,0.3995,0.7242,0.8544,0.8423
AdaBoostRegressor,0.2721,0.4185,0.5217,0.5512,0.7613,0.7205,0.2461,0.4185,0.4961,0.5747,0.7841,0.7499
XGBRegressor,0.2042,0.3307,0.4519,0.6632,0.8152,0.8018,0.1603,0.3015,0.4004,0.723,0.8521,0.8403
ExtraTreesRegressor,0.1815,0.3082,0.4261,0.7006,0.8376,0.8246,0.1583,0.2987,0.3978,0.7265,0.8536,0.8435
LinearRegression,9.4694,2.8239,3.0772,-14.6174,-0.0417,-0.0741,2.5425,1.275,1.5945,-3.3933,0.0107,-0.0057
KNeighborsRegressor,0.217,0.3372,0.4658,0.6422,0.8062,0.7799,0.1917,0.3306,0.4379,0.6687,0.82,0.803
SVR,0.2032,0.3332,0.4507,0.6649,0.8183,0.8108,0.1672,0.321,0.4089,0.711,0.8469,0.8315


In [133]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.389178066549693, -7.484833891087416, -6.90...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.364721837150334, -6.329036983571328, -6.3...","[-6.748701599014778, -6.306523243562145, -6.35...","[0.22676744111895714, 0.17268557162567982, 0.0..."
1,DecisionTreeRegressor,"[-5.64, -8.0, -7.06, -6.89, -6.28, -6.82, -6.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.64, -6.0, -6.82, -5.92, -5.545, -7.0, -6....","[-6.3987743928, -5.9799999999999995, -6.371443...","[0.5754984359909545, 0.6122417823049977, 0.273..."
2,RandomForestRegressor,"[-6.251424910869996, -7.148939053969998, -6.88...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.239285210829996, -6.2911065945099995, -6....","[-6.564609701372002, -6.380330455286001, -6.23...","[0.16855783716503597, 0.058346971799754545, 0...."
3,GradientBoostingRegressor,"[-6.402647273378198, -7.32756011474585, -6.984...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.439378808872545, -6.211424587060593, -6.2...","[-6.776035288528499, -6.1665976794809705, -6.3...","[0.20289085782963517, 0.2108679754768676, 0.07..."
4,AdaBoostRegressor,"[-6.437172287092586, -7.019846153846152, -6.90...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.387795262807797, -5.971156612434907, -6.1...","[-6.532379463315179, -6.23443783131268, -6.111...","[0.14445378011885546, 0.27345996628363245, 0.1..."
5,XGBRegressor,"[-6.610179, -7.3762727, -7.124862, -7.1663704,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4336205, -6.0404596, -6.0617943, -5.92045...","[-6.8466783, -6.199047, -6.1872587, -6.055405,...","[0.22090368, 0.18399765, 0.1853347, 0.2703841,..."
6,ExtraTreesRegressor,"[-6.311316295409999, -7.497099999999997, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.267843707509997, -6.1852092190800025, -6....","[-6.765819506058004, -6.401847532084, -6.29268...","[0.2552141934552706, 0.12543494146888423, 0.10..."
7,LinearRegression,"[-4.0, -4.0, -4.0, -10.0, -4.0, -10.0, -4.0, -...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-4.0, -4.0, -4.0, -5.919999957519622, -4.0, ...","[-6.4, -5.2, -6.4, -6.7359999950652165, -6.4, ...","[2.939387691339814, 2.4, 2.939387691339814, 1...."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.053333333333334, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.776666666666666, -6.31666666666666...","[-6.552, -6.427333333333334, -6.46333333333333...","[0.22816757974007726, 0.3429311560324869, 0.15..."
9,SVR,"[-6.253466007635598, -7.1941011932863566, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4145641239262545, -6.283168767128786, -6....","[-6.548230445095605, -6.403085553067666, -6.25...","[0.22945689415019524, 0.09114257628522188, 0.0..."


In [134]:
result_df.to_csv('results/Descriptors/Results_2D_padel_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_Caco2.csv')

In [135]:
#2d padel descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 1088)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1088)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222118
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1025
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222272
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1038
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1883,0.319,0.4339,0.6894,0.8307,0.8129,0.1571,0.3025,0.3963,0.7286,0.8552,0.8473
DecisionTreeRegressor,0.3916,0.4528,0.6257,0.3542,0.6837,0.6573,0.1875,0.333,0.433,0.6761,0.825,0.8128
RandomForestRegressor,0.2005,0.3352,0.4478,0.6693,0.8216,0.8025,0.1696,0.3174,0.4118,0.707,0.8454,0.838
GradientBoostingRegressor,0.2019,0.3376,0.4493,0.667,0.8177,0.8008,0.1593,0.3069,0.3991,0.7247,0.8546,0.8428
AdaBoostRegressor,0.2656,0.4149,0.5154,0.562,0.7698,0.7309,0.2477,0.4206,0.4977,0.5721,0.7844,0.7586
XGBRegressor,0.2042,0.3307,0.4519,0.6632,0.8152,0.8018,0.1603,0.3015,0.4004,0.723,0.8521,0.8403
ExtraTreesRegressor,0.1786,0.3064,0.4226,0.7055,0.8406,0.8274,0.1569,0.2964,0.3961,0.7288,0.8551,0.8452
LinearRegression,9.4694,2.8239,3.0772,-14.6174,-0.0417,-0.0741,2.5425,1.275,1.5945,-3.3933,0.0107,-0.0057
KNeighborsRegressor,0.217,0.3372,0.4658,0.6422,0.8062,0.7799,0.1917,0.3306,0.4379,0.6687,0.82,0.803
SVR,0.2032,0.3332,0.4507,0.6649,0.8183,0.8108,0.1672,0.321,0.4089,0.711,0.8469,0.8315


In [136]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.389178066549693, -7.484833891087416, -6.90...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.364721837150334, -6.329036983571328, -6.3...","[-6.748701599014778, -6.306523243562145, -6.35...","[0.22676744111895714, 0.17268557162567982, 0.0..."
1,DecisionTreeRegressor,"[-5.64, -8.0, -7.06, -6.57, -6.28, -6.82, -6.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.545, -7.57, -6.54, -5.92, -5.68, -7.0, -6...","[-6.379, -6.151999999999999, -6.38000000000000...","[0.6081644514438507, 0.8709626857678808, 0.291..."
2,RandomForestRegressor,"[-6.2635242969499965, -7.144389700039993, -6.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.254594932769996, -6.287133333333335, -6.1...","[-6.537290936680002, -6.340236029452668, -6.21...","[0.15016973368603243, 0.0698278330909723, 0.03..."
3,GradientBoostingRegressor,"[-6.485929780358171, -7.32756011474585, -6.984...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.313862016147416, -6.177866189923363, -6.2...","[-6.768668680421865, -6.185903188559523, -6.33...","[0.24105269610226324, 0.20164889694167898, 0.0..."
4,AdaBoostRegressor,"[-6.199918827832431, -7.133488372093026, -6.88...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.199918827832431, -6.33327868852459, -6.17...","[-6.448721705257448, -6.4285392035579845, -6.1...","[0.1349729146109685, 0.13482550824915546, 0.11..."
5,XGBRegressor,"[-6.610179, -7.3762727, -7.124862, -7.1663704,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4336205, -6.0404596, -6.0617943, -5.92045...","[-6.8466783, -6.199047, -6.1872587, -6.055405,...","[0.22090368, 0.18399765, 0.1853347, 0.2703841,..."
6,ExtraTreesRegressor,"[-6.343160992483332, -7.544499999999998, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.397310441259999, -6.205550000000003, -6.1...","[-6.786663841168005, -6.409679803920001, -6.28...","[0.21193534715641404, 0.14227394902881701, 0.0..."
7,LinearRegression,"[-4.0, -4.0, -4.0, -10.0, -4.0, -10.0, -4.0, -...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-4.0, -4.0, -4.0, -5.920000255967125, -4.0, ...","[-6.4, -5.2, -6.4, -6.736000050535492, -6.4, -...","[2.939387691339814, 2.4, 2.939387691339814, 1...."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.053333333333334, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.776666666666666, -6.31666666666666...","[-6.552, -6.427333333333334, -6.46333333333333...","[0.22816757974007726, 0.3429311560324869, 0.15..."
9,SVR,"[-6.253614875662789, -7.194272609859479, -7.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.414773037166665, -6.283325038224237, -6.1...","[-6.548321439089233, -6.403122587814376, -6.25...","[0.22953190311086438, 0.09103572302729751, 0.0..."


In [137]:
result_df.to_csv('results/Descriptors/Results_2D_padel_const_rem_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_const_rem_Caco2.csv')

In [138]:
#2d padel descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 759)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 759)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148231
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 722
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148289
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 735
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, th

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1835,0.3184,0.4284,0.6973,0.8356,0.8176,0.1561,0.3004,0.3952,0.7302,0.8561,0.8481
DecisionTreeRegressor,0.414,0.4712,0.6434,0.3172,0.6623,0.6493,0.2042,0.3489,0.4519,0.6472,0.8063,0.7894
RandomForestRegressor,0.196,0.3303,0.4427,0.6768,0.8259,0.8077,0.1688,0.3164,0.4109,0.7083,0.8456,0.8364
GradientBoostingRegressor,0.196,0.3334,0.4428,0.6767,0.8237,0.8077,0.1548,0.3021,0.3934,0.7325,0.8598,0.8435
AdaBoostRegressor,0.2708,0.4185,0.5204,0.5534,0.7611,0.7163,0.2482,0.4199,0.4982,0.5712,0.7826,0.7553
XGBRegressor,0.1947,0.3265,0.4413,0.6789,0.8242,0.8077,0.1612,0.3025,0.4016,0.7214,0.8504,0.8381
ExtraTreesRegressor,0.175,0.3007,0.4184,0.7113,0.844,0.8325,0.155,0.2955,0.3937,0.7321,0.8568,0.8465
LinearRegression,1.5056,0.8507,1.227,-1.4831,0.413,0.496,0.6634,0.5738,0.8145,-0.1463,0.553,0.6227
KNeighborsRegressor,0.2116,0.3328,0.46,0.651,0.8116,0.7903,0.2016,0.3368,0.449,0.6517,0.8104,0.7927
SVR,0.2024,0.3333,0.4499,0.6662,0.819,0.8117,0.1644,0.32,0.4054,0.716,0.8504,0.8381


In [139]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.331132060709635, -7.407016980807575, -7.08...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.291231578778258, -6.308917859240777, -6.3...","[-6.710730738516551, -6.264906050627461, -6.29...","[0.2514804768368459, 0.11368522205045893, 0.09..."
1,DecisionTreeRegressor,"[-5.77, -7.03, -8.0, -7.47, -5.96, -5.85, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.77, -5.77, -5.77, -5.92, -6.82, -6.89, -6...","[-6.726000000000001, -5.927, -6.38600000000000...","[0.5953687932701881, 0.4294135535820918, 0.495..."
2,RandomForestRegressor,"[-6.261529431696661, -7.216049019599995, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.222232399306663, -6.25342059991, -6.17125...","[-6.566180100287335, -6.339046117500859, -6.27...","[0.19649181736107882, 0.09522185164116778, 0.0..."
3,GradientBoostingRegressor,"[-6.577724141054342, -7.361142486708802, -6.96...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.490674891575447, -6.558432290985637, -6.3...","[-6.7901497517043286, -6.239566784472143, -6.4...","[0.19756843498185958, 0.23186736336854236, 0.0..."
4,AdaBoostRegressor,"[-6.2618613550534565, -7.271176470588234, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.178627991800439, -6.108636363636364, -6.0...","[-6.366002292166532, -6.4278208523149285, -6.1...","[0.16830003121955658, 0.2685654878944652, 0.10..."
5,XGBRegressor,"[-6.32191, -7.308255, -6.812135, -7.243532, -6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.1859818, -6.098912, -5.8455453, -5.919533...","[-6.641716, -6.191452, -6.13704, -6.0202866, -...","[0.2588396, 0.19450311, 0.20654963, 0.20038813..."
6,ExtraTreesRegressor,"[-6.377860812209997, -7.453849999999996, -7.00...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.319370225009997, -6.391799999999998, -6.1...","[-6.838400952774006, -6.443807817481999, -6.35...","[0.26171738260274496, 0.07244790435818096, 0.1..."
7,LinearRegression,"[-5.7294838430054655, -6.244546388136524, -5.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.006251590782296, -4.0, -7.653103021158341...","[-7.629736967097112, -4.704482676908631, -7.27...","[1.39479788449747, 0.8892531952081758, 0.41077..."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.053333333333334, -5.9...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.173333333333333, -6.31666666666666...","[-6.552, -6.213333333333334, -6.53866666666666...","[0.22816757974007726, 0.2589723279940669, 0.15..."
9,SVR,"[-6.50369701817419, -7.1918207835632195, -7.04...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6233129133532325, -6.396462843536985, -6....","[-6.756604145794749, -6.484114877003305, -6.28...","[0.23180207058219274, 0.08137887923407588, 0.0..."


In [140]:
result_df.to_csv('results/Descriptors/Results_2D_padel_LVR_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_const_LVR_Caco2.csv')

In [141]:
#2d All descriptors
df_train_padel = pd.read_csv('features/Descriptors/Train_2d_padel_curated_Caco2.csv')
df_train_rdkit = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_Caco2.csv')
df_train_mordred = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')

df_2d_train = df_train_rdkit.merge(df_train_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_train

  df_train_mordred = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_Caco2.csv')


Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,16.271931,16.271931,0.025756,-1.930898,0.058792,23.408333,1664.156,...,6.547304,237.423604,1.978530,82.566872,37.965469,44.601403,91297.0,211.0,8.405,602.0
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,16.083777,16.083777,0.027668,-1.934865,0.053531,23.411765,1650.129,...,6.569720,235.601472,1.979844,82.357901,37.975600,44.382301,90388.0,207.0,8.692,596.0
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,15.668220,15.668220,0.003714,-4.150777,0.085546,24.018349,1587.863,...,6.894849,214.100758,1.964227,89.673466,40.258888,41.162462,69754.0,181.0,6.958,540.0
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,16.048909,16.048909,0.025704,-1.891709,0.101375,24.159292,1574.031,...,6.526840,222.500266,1.969029,82.337179,37.880967,44.456212,79884.0,202.0,6.376,566.0
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,15.511700,15.511700,0.033138,-4.150544,0.085338,23.366972,1567.445,...,6.720471,214.421483,1.967170,87.553124,40.278853,41.547594,69806.0,179.0,7.107,536.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,12.898424,12.898424,0.143657,-0.742398,0.662387,22.258065,430.549,...,6.619354,61.526924,1.984739,22.541021,10.153913,12.387108,2644.0,45.0,1.941,148.0
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,12.903074,12.903074,0.161549,-0.731330,0.598056,24.933333,416.566,...,6.307254,59.707951,1.990265,19.682842,7.618137,12.064705,2503.0,43.0,3.753,142.0
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,12.835172,12.835172,0.168936,-0.728726,0.606745,24.965517,402.539,...,6.385128,57.707840,1.989926,19.684197,7.618398,12.065799,2286.0,42.0,3.395,138.0
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,12.763992,12.763992,0.179945,-0.726122,0.611430,25.000000,388.512,...,6.470791,55.707630,1.989558,19.686908,7.618921,12.067987,2069.0,41.0,3.037,134.0


In [142]:
df_2d_train.to_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv', index=False)

In [143]:
df_test_padel = pd.read_csv('features/Descriptors/Test_2d_padel_curated_Caco2.csv')
df_test_rdkit = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_Caco2.csv')
df_test_mordred = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_Caco2.csv')

df_2d_test = df_test_rdkit.merge(df_test_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_test

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,15.976055,15.976055,0.030813,-1.935404,0.058779,22.883333,1664.156,...,6.547304,237.600778,1.980006,82.612830,37.983553,44.629277,94190.0,208.0,8.466,600.0
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,15.516152,15.516152,0.000091,-1.679143,0.049536,24.360360,1572.049,...,6.309318,217.445869,1.958972,88.222714,50.298796,37.923918,74798.0,175.0,6.768,538.0
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,15.993973,15.993973,0.020362,-1.750211,0.057933,24.294643,1552.024,...,6.516604,222.331854,1.985106,74.211794,35.995049,38.216745,73336.0,187.0,10.339,562.0
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,15.804049,15.804049,0.002346,-2.970606,0.165351,26.981481,1537.344,...,6.736287,213.827378,1.979883,85.394192,35.950728,41.996280,67356.0,187.0,6.203,554.0
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,15.545371,15.545371,0.013183,-2.948188,0.126607,27.422018,1530.953,...,6.538221,215.983270,1.981498,82.730212,35.983356,41.825434,69591.0,185.0,8.843,556.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,13.192828,13.192828,0.126957,-0.677034,0.670121,21.000000,440.544,...,6.878787,64.397738,2.012429,23.109637,10.742164,12.367472,2857.0,47.0,2.181,160.0
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,13.114656,13.114656,0.186152,-0.689890,0.586385,24.903226,430.593,...,6.236151,61.707690,1.990571,19.711184,7.642415,12.068770,2660.0,44.0,4.111,146.0
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,13.174810,13.174810,0.158364,-0.755076,0.586385,24.903226,430.593,...,6.236151,61.707448,1.990563,19.711947,7.642792,12.069155,2648.0,44.0,4.111,146.0
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,6.236151,61.708009,1.990581,19.682164,7.618006,12.064158,2750.0,44.0,4.111,146.0


In [144]:
df_2d_test.to_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv', index=False)

In [145]:
#2d All descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
# X_test = X_test.select_dtypes(include=['number'])
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 3089)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 3089)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 500050
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2348
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 500440
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2368
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1803,0.3134,0.4246,0.7027,0.839,0.8213,0.1534,0.2964,0.3916,0.735,0.859,0.8512
DecisionTreeRegressor,0.412,0.4686,0.6419,0.3204,0.6589,0.6291,0.1914,0.3427,0.4375,0.6692,0.8191,0.7966
RandomForestRegressor,0.1997,0.3318,0.4469,0.6706,0.8222,0.8055,0.1628,0.3119,0.4034,0.7188,0.8527,0.8449
GradientBoostingRegressor,0.1874,0.3244,0.4329,0.6909,0.8328,0.8167,0.1552,0.3041,0.3939,0.7319,0.8591,0.8444
AdaBoostRegressor,0.2696,0.424,0.5192,0.5554,0.7628,0.7284,0.2455,0.4177,0.4955,0.5758,0.7833,0.7601
XGBRegressor,0.2086,0.3364,0.4567,0.6559,0.811,0.7932,0.1542,0.2943,0.3927,0.7335,0.8576,0.8455
ExtraTreesRegressor,0.178,0.3038,0.4219,0.7064,0.8409,0.8293,0.15,0.2903,0.3873,0.7408,0.8621,0.8505
LinearRegression,9.0745,2.7587,3.0124,-13.9661,0.0523,0.0242,2.8559,1.3313,1.6899,-3.9348,0.0695,0.0839
KNeighborsRegressor,0.206,0.3289,0.4538,0.6603,0.8166,0.7924,0.1927,0.3317,0.439,0.667,0.8197,0.8021
SVR,0.201,0.3322,0.4483,0.6685,0.8201,0.8129,0.1613,0.3167,0.4016,0.7213,0.8531,0.841


In [146]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.479671151227762, -7.41669759409729, -7.063...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.432406597296459, -6.2790921894970415, -6....","[-6.723402756600431, -6.172909024851341, -6.29...","[0.1495408069036402, 0.16090280747994334, 0.03..."
1,DecisionTreeRegressor,"[-6.06, -7.05, -7.03, -7.44, -6.28, -6.68, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.77, -7.15490196, -5.919999999999999, -5.9...","[-6.859999999999999, -5.932980392, -5.98999999...","[0.48427265047698076, 0.7611623214439184, 0.67..."
2,RandomForestRegressor,"[-6.276506564519996, -7.121599999999998, -6.93...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.2959649503099975, -6.496289353930001, -6....","[-6.618319526612001, -6.341820091124669, -6.27...","[0.17467095629525753, 0.12309184793441003, 0.0..."
3,GradientBoostingRegressor,"[-6.534337590075724, -7.123129282965317, -6.93...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.568118283255722, -6.2613300204956355, -6....","[-6.816046887293496, -6.367913277236352, -6.47...","[0.20299719247638845, 0.31531679542715807, 0.1..."
4,AdaBoostRegressor,"[-6.13915555616269, -7.157666666666666, -7.023...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.076759259259256, -6.2155000000000005, -6....","[-6.3260093566413955, -6.527999188986376, -6.1...","[0.16891696289109745, 0.2316597344263591, 0.06..."
5,XGBRegressor,"[-6.1415243, -7.3276725, -7.0832243, -6.928905...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.0814095, -6.405711, -6.163932, -5.920123,...","[-6.623914, -6.3319077, -6.2759256, -5.9760385...","[0.32296768, 0.08779229, 0.12127504, 0.1117505..."
6,ExtraTreesRegressor,"[-6.462376675039999, -7.445157574909993, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4958341486999975, -6.227850000000001, -6....","[-6.8504471229360036, -6.451332813634001, -6.3...","[0.1863855535371024, 0.11694932641337916, 0.10..."
7,LinearRegression,"[-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -10.0, -4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-4.0, -4.0, -10.0, -5.919999956251559, -4.0,...","[-6.4, -7.6, -6.4, -6.735999992462138, -8.8, -...","[2.939387691339814, 2.939387691339814, 2.93938..."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.036666666666666, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.173333333333333, -6.34333333333333...","[-6.552, -6.213333333333334, -6.22866666666666...","[0.22816757974007726, 0.2589723279940669, 0.12..."
9,SVR,"[-6.297048693073095, -7.195193489734953, -7.05...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.414359484541146, -6.315265975209755, -6.2...","[-6.877899336813002, -6.476904198438865, -6.28...","[0.27209998099347255, 0.10341635662636661, 0.0..."


In [147]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_Caco2.csv')

In [148]:
#2d All descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 2475)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 2475)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 500050
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2348
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 500440
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2368
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1803,0.3134,0.4246,0.7027,0.839,0.8213,0.1534,0.2964,0.3916,0.735,0.859,0.8512
DecisionTreeRegressor,0.4037,0.4633,0.6354,0.3341,0.6676,0.6395,0.1951,0.3416,0.4416,0.663,0.816,0.8025
RandomForestRegressor,0.1988,0.3304,0.4459,0.6721,0.823,0.8065,0.1631,0.313,0.4038,0.7182,0.8525,0.8447
GradientBoostingRegressor,0.1876,0.3245,0.4331,0.6906,0.8325,0.8157,0.1554,0.3048,0.3943,0.7314,0.8588,0.8453
AdaBoostRegressor,0.2676,0.4175,0.5173,0.5587,0.7656,0.7234,0.2465,0.4193,0.4965,0.574,0.7816,0.7643
XGBRegressor,0.2086,0.3364,0.4567,0.6559,0.811,0.7932,0.1542,0.2943,0.3927,0.7335,0.8576,0.8455
ExtraTreesRegressor,0.1736,0.3008,0.4166,0.7137,0.8452,0.835,0.1511,0.2905,0.3887,0.7389,0.861,0.8493
LinearRegression,9.0745,2.7587,3.0124,-13.9661,0.0523,0.0242,2.8559,1.3313,1.6899,-3.9348,0.0695,0.0839
KNeighborsRegressor,0.206,0.3289,0.4538,0.6603,0.8166,0.7924,0.1927,0.3317,0.439,0.667,0.8197,0.8021
SVR,0.201,0.3322,0.4483,0.6685,0.8201,0.8129,0.1613,0.3167,0.4016,0.7213,0.8531,0.8409


In [149]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.479671151227762, -7.41669759409729, -7.063...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.432406597296459, -6.2790921894970415, -6....","[-6.723402756600431, -6.172909024851341, -6.29...","[0.1495408069036402, 0.16090280747994334, 0.03..."
1,DecisionTreeRegressor,"[-6.124938737, -7.03, -7.05, -7.44, -6.1249387...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.85, -6.82, -5.74, -5.919999999999999, -5....","[-6.894, -6.1579999999999995, -6.148, -6.14600...","[0.48849155571002445, 0.4711008384624252, 0.32..."
2,RandomForestRegressor,"[-6.25004982355333, -7.136657207139998, -6.931...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.31349631951333, -6.521559953840003, -6.28...","[-6.60933628752867, -6.377830770956668, -6.269...","[0.16596367737940515, 0.09937330979599539, 0.0..."
3,GradientBoostingRegressor,"[-6.558233554173521, -7.099233318867523, -6.93...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.67564123228679, -6.342459488043285, -6.49...","[-6.814015049318021, -6.438025520089131, -6.46...","[0.18717924768407035, 0.2670468339225202, 0.05..."
4,AdaBoostRegressor,"[-6.227827596669797, -7.144238424832405, -7.08...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.156275076834236, -6.657272727272726, -6.1...","[-6.409485238017671, -6.434070060164646, -6.18...","[0.17474045508813824, 0.18219619426103761, 0.0..."
5,XGBRegressor,"[-6.1415243, -7.3276725, -7.0832243, -6.928905...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.0814095, -6.405711, -6.163932, -5.920123,...","[-6.623914, -6.3319077, -6.2759256, -5.9760385...","[0.32296768, 0.08779229, 0.12127504, 0.1117505..."
6,ExtraTreesRegressor,"[-6.433645509519998, -7.427849019599996, -6.98...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.454020205509998, -6.2230500000000015, -6....","[-6.852326414554005, -6.480627743928001, -6.34...","[0.2001269692481339, 0.13946819098699845, 0.14..."
7,LinearRegression,"[-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -10.0, -4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-4.0, -4.0, -10.0, -5.919999967121839, -4.0,...","[-6.4, -7.6, -6.4, -6.736000012308091, -8.8, -...","[2.939387691339814, 2.939387691339814, 2.93938..."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.036666666666666, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.173333333333333, -6.34333333333333...","[-6.552, -6.213333333333334, -6.22866666666666...","[0.22816757974007726, 0.2589723279940669, 0.12..."
9,SVR,"[-6.29715211733134, -7.194964785380774, -7.049...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4144479138134916, -6.3150883862643665, -6...","[-6.877893141092555, -6.476841689125385, -6.28...","[0.272056930856736, 0.1035220834353901, 0.0666..."


In [150]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_const_rem_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_const_rem_Caco2.csv')

In [151]:
#2d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 1753)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1753)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 333829
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1672
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334064
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1692
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1862,0.3204,0.4315,0.6929,0.8328,0.8147,0.151,0.2936,0.3886,0.7391,0.862,0.8519
DecisionTreeRegressor,0.3836,0.4523,0.6194,0.3673,0.6855,0.6597,0.2027,0.3301,0.4502,0.6498,0.8088,0.7886
RandomForestRegressor,0.1963,0.33,0.443,0.6763,0.8254,0.8075,0.1611,0.3106,0.4014,0.7216,0.8543,0.8452
GradientBoostingRegressor,0.19,0.3289,0.4359,0.6866,0.8296,0.8067,0.1545,0.3011,0.3931,0.733,0.8598,0.8458
AdaBoostRegressor,0.2638,0.4082,0.5136,0.565,0.7693,0.7268,0.244,0.4131,0.4939,0.5785,0.7832,0.744
XGBRegressor,0.2186,0.3434,0.4675,0.6395,0.8017,0.7834,0.1553,0.2924,0.3941,0.7317,0.8567,0.8438
ExtraTreesRegressor,0.1743,0.2993,0.4175,0.7126,0.8445,0.8323,0.1488,0.2885,0.3858,0.7428,0.8632,0.8545
LinearRegression,9.1202,2.7721,3.02,-14.0416,0.0325,-0.0048,3.1393,1.3979,1.7718,-4.4245,-0.0109,-0.011
KNeighborsRegressor,0.2115,0.3315,0.4599,0.6512,0.812,0.7941,0.1925,0.3334,0.4387,0.6674,0.8196,0.8043
SVR,0.2006,0.331,0.4479,0.6692,0.8207,0.8141,0.1591,0.3153,0.3989,0.725,0.8557,0.8453


In [152]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.346724589011055, -7.367496194037714, -6.92...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4113793687869265, -6.166183522801234, -6....","[-6.729754414305003, -6.282671798126207, -6.32...","[0.18023105992448019, 0.16374769765588057, 0.1..."
1,DecisionTreeRegressor,"[-6.148741651, -7.03, -7.03, -7.27, -4.85, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.148741651, -5.77, -5.869999999999999, -5....","[-6.6548998284, -6.004, -6.0729999999999995, -...","[0.644618318469631, 0.6296221088875456, 0.2792..."
2,RandomForestRegressor,"[-6.297117882066666, -7.157199999999994, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.322250877666664, -6.48585801642, -6.30674...","[-6.623305673899336, -6.3747422194873335, -6.3...","[0.1637563886249004, 0.11557953231779378, 0.06..."
3,GradientBoostingRegressor,"[-6.489321609710626, -7.1500722745355985, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.499524386339651, -6.601372787025921, -6.2...","[-6.845074970735425, -6.4954977771100975, -6.4...","[0.1774525440988286, 0.18767039699182028, 0.11..."
4,AdaBoostRegressor,"[-6.184578620387182, -7.1233308637636386, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.101547169022219, -6.805482983167881, -6.0...","[-6.4271730510966325, -6.469740323795129, -6.0...","[0.17175134309421394, 0.2500488025802665, 0.05..."
5,XGBRegressor,"[-6.460424, -7.2003856, -7.1333, -6.600131, -6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4040246, -6.0130663, -6.552376, -5.918923...","[-6.768569, -6.1115236, -6.4207215, -6.019611,...","[0.24923892, 0.18988077, 0.20199122, 0.1995938..."
6,ExtraTreesRegressor,"[-6.386021475819998, -7.469238719639998, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4350539480199975, -6.456138719640002, -6....","[-6.860245971060006, -6.485386959608002, -6.35...","[0.21491651937890957, 0.04848410720345306, 0.0..."
7,LinearRegression,"[-4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-4.0, -4.0, -10.0, -5.919999931506027, -4.0,...","[-6.4, -6.4, -8.8, -5.5360032693882655, -7.6, ...","[2.939387691339814, 2.939387691339814, 2.4, 0...."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.173333333333333, -6.34333333333333...","[-6.552, -6.213333333333334, -6.55466666666666...","[0.22816757974007726, 0.2589723279940669, 0.14..."
9,SVR,"[-6.4813095633243885, -7.1965286044788765, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.552698980994173, -6.355893216991171, -6.2...","[-7.012064702628804, -6.495006893389268, -6.35...","[0.23904902347614815, 0.10076698599852575, 0.0..."


In [153]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_LVR_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_LVR_Caco2.csv')

In [154]:
#2d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 1753)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1753)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 333829
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1672
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334064
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1692
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1862,0.3204,0.4315,0.6929,0.8328,0.8147,0.151,0.2936,0.3886,0.7391,0.862,0.8519
DecisionTreeRegressor,0.3836,0.4523,0.6194,0.3673,0.6855,0.6597,0.2027,0.3301,0.4502,0.6498,0.8088,0.7886
RandomForestRegressor,0.1963,0.33,0.443,0.6763,0.8254,0.8075,0.1611,0.3106,0.4014,0.7216,0.8543,0.8452
GradientBoostingRegressor,0.19,0.3289,0.4359,0.6866,0.8296,0.8067,0.1545,0.3011,0.3931,0.733,0.8598,0.8458
AdaBoostRegressor,0.2638,0.4082,0.5136,0.565,0.7693,0.7268,0.244,0.4131,0.4939,0.5785,0.7832,0.744
XGBRegressor,0.2186,0.3434,0.4675,0.6395,0.8017,0.7834,0.1553,0.2924,0.3941,0.7317,0.8567,0.8438
ExtraTreesRegressor,0.1743,0.2993,0.4175,0.7126,0.8445,0.8323,0.1488,0.2885,0.3858,0.7428,0.8632,0.8545
LinearRegression,9.1202,2.7721,3.02,-14.0416,0.0325,-0.0048,3.1393,1.3979,1.7718,-4.4245,-0.0109,-0.011
KNeighborsRegressor,0.2115,0.3315,0.4599,0.6512,0.812,0.7941,0.1925,0.3334,0.4387,0.6674,0.8196,0.8043
SVR,0.2006,0.331,0.4479,0.6692,0.8207,0.8141,0.1591,0.3153,0.3989,0.725,0.8557,0.8453


In [155]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.346724589011055, -7.367496194037714, -6.92...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4113793687869265, -6.166183522801234, -6....","[-6.729754414305003, -6.282671798126207, -6.32...","[0.18023105992448019, 0.16374769765588057, 0.1..."
1,DecisionTreeRegressor,"[-6.148741651, -7.03, -7.03, -7.27, -4.85, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.148741651, -5.77, -5.869999999999999, -5....","[-6.6548998284, -6.004, -6.0729999999999995, -...","[0.644618318469631, 0.6296221088875456, 0.2792..."
2,RandomForestRegressor,"[-6.297117882066663, -7.157199999999996, -6.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.322250877666664, -6.485858016420001, -6.3...","[-6.623305673899336, -6.3747422194873335, -6.3...","[0.16375638862490055, 0.1155795323177943, 0.06..."
3,GradientBoostingRegressor,"[-6.489321609710626, -7.1500722745355985, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.499524386339651, -6.601372787025921, -6.2...","[-6.845074970735425, -6.4954977771100975, -6.4...","[0.1774525440988286, 0.18767039699182028, 0.11..."
4,AdaBoostRegressor,"[-6.184578620387182, -7.1233308637636386, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.101547169022219, -6.805482983167881, -6.0...","[-6.4271730510966325, -6.469740323795129, -6.0...","[0.17175134309421394, 0.2500488025802665, 0.05..."
5,XGBRegressor,"[-6.460424, -7.2003856, -7.1333, -6.600131, -6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4040246, -6.0130663, -6.552376, -5.918923...","[-6.768569, -6.1115236, -6.4207215, -6.019611,...","[0.24923892, 0.18988077, 0.20199122, 0.1995938..."
6,ExtraTreesRegressor,"[-6.386021475819998, -7.469238719639998, -6.99...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.435053948019998, -6.456138719640003, -6.2...","[-6.860245971060006, -6.485386959608002, -6.35...","[0.21491651937890957, 0.04848410720345307, 0.0..."
7,LinearRegression,"[-4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-4.0, -4.0, -10.0, -5.919999931506027, -4.0,...","[-6.4, -6.4, -8.8, -5.5360032693882655, -7.6, ...","[2.939387691339814, 2.939387691339814, 2.4, 0...."
8,KNeighborsRegressor,"[-6.28, -7.3500000000000005, -6.98, -7.0366666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.173333333333333, -6.34333333333333...","[-6.552, -6.213333333333334, -6.55466666666666...","[0.22816757974007726, 0.2589723279940669, 0.14..."
9,SVR,"[-6.4813095633243885, -7.1965286044788765, -7....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.552698980994173, -6.355893216991171, -6.2...","[-7.012064702628804, -6.495006893389268, -6.35...","[0.23904902347614815, 0.10076698599852575, 0.0..."


In [156]:
def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [157]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [158]:
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
X_train = df_train[selected_features] 
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv')
df_test =df_test.dropna()
X_test =  df_test[X_train.columns]
y_test =  df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 231)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 231)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 46352
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 222
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 46358
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 223
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1958,0.3265,0.4424,0.6772,0.823,0.8055,0.1596,0.3097,0.3995,0.7242,0.8535,0.8432
DecisionTreeRegressor,0.3791,0.4457,0.6157,0.3748,0.686,0.6613,0.2004,0.3431,0.4477,0.6537,0.8104,0.7838
RandomForestRegressor,0.205,0.3358,0.4528,0.6619,0.8161,0.7959,0.1737,0.3273,0.4168,0.6998,0.8416,0.8265
GradientBoostingRegressor,0.1984,0.334,0.4454,0.6728,0.8212,0.7994,0.1633,0.3192,0.4041,0.7178,0.8521,0.8348
AdaBoostRegressor,0.2824,0.4285,0.5314,0.5343,0.7441,0.7009,0.2581,0.4285,0.5081,0.5539,0.7657,0.736
XGBRegressor,0.2132,0.3406,0.4618,0.6483,0.8062,0.789,0.1677,0.3073,0.4095,0.7103,0.8444,0.8298
ExtraTreesRegressor,0.1798,0.3083,0.4241,0.7034,0.8392,0.8241,0.1574,0.3019,0.3967,0.7281,0.8551,0.8452
LinearRegression,0.3235,0.4001,0.5687,0.4665,0.7318,0.7571,0.3065,0.3722,0.5536,0.4704,0.7266,0.7868
KNeighborsRegressor,0.2008,0.3229,0.4481,0.6688,0.8237,0.8045,0.1853,0.3237,0.4304,0.6799,0.8273,0.8117
SVR,0.1848,0.3203,0.4299,0.6952,0.8365,0.8341,0.1549,0.3126,0.3936,0.7323,0.8601,0.8427


In [159]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.272900148090574, -7.355750800398436, -6.93...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.200032288215409, -6.452770425323021, -6.0...","[-6.446557725915774, -6.39848974924397, -6.327...","[0.15895917416421448, 0.17578809542582055, 0.1..."
1,DecisionTreeRegressor,"[-5.25, -8.0, -6.89, -7.07, -6.013228266, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.759450752, -5.48, -5.92, -5.92, -5.7, -6....","[-6.875890150400001, -5.942, -6.37400000000000...","[0.5672294355681291, 0.5627219562092811, 0.603..."
2,RandomForestRegressor,"[-6.219199029363329, -7.283099999999995, -7.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.199553112000001, -6.384382352933335, -6.2...","[-6.462085058032002, -6.484324958558, -6.32878...","[0.13649827208949578, 0.15327121573647734, 0.1..."
3,GradientBoostingRegressor,"[-6.4072589616544064, -7.186259191028184, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.236446277592802, -6.196066627972347, -6.4...","[-6.4615472883128104, -6.425828864520523, -6.4...","[0.15803347688306327, 0.17767351688266497, 0.0..."
4,AdaBoostRegressor,"[-6.189174600338083, -7.202592592592591, -7.15...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.189174600338083, -6.19284090909091, -6.08...","[-6.4874605529428235, -6.4171164426336205, -6....","[0.17896995947351227, 0.18553577501643537, 0.0..."
5,XGBRegressor,"[-5.8586645, -7.531838, -7.1303935, -6.9076457...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.8133574, -6.0459175, -5.811527, -5.920682...","[-6.539759, -6.230113, -6.485222, -6.0168347, ...","[0.4125309, 0.2772074, 0.3606255, 0.19383083, ..."
6,ExtraTreesRegressor,"[-6.1670443603599985, -7.613699999999997, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.268139857559999, -6.523900000000003, -6.1...","[-6.641014863454004, -6.578365154638002, -6.21...","[0.1876165718021266, 0.07131699990325498, 0.08..."
7,LinearRegression,"[-10.0, -7.662524690899571, -7.161643710442831...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-10.0, -7.015829050764486, -5.85222313114096...","[-8.4966603385358, -6.813328683700917, -5.9425...","[0.7531155277970678, 0.1467984371408742, 0.075..."
8,KNeighborsRegressor,"[-6.28, -7.3066666666666675, -6.98, -7.0533333...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -5.760000000000001, -6.04333333333333...","[-6.552, -6.249333333333333, -5.97866666666666...","[0.22816757974007726, 0.3613394833424965, 0.04..."
9,SVR,"[-6.27529793656065, -7.237439323918242, -7.014...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.278976475027309, -6.248813053582453, -6.1...","[-6.852196298365918, -6.395455703178094, -6.23...","[0.286623945793615, 0.10221339850200456, 0.043..."


In [160]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_LVR_remove_corr_features_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_LVRremove_corr_features_Caco2.csv')

In [161]:
#3d RDKit descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_RDKit_desc_Caco2.csv')
df_train = df_train.fillna(0)
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_RDKit_desc_Caco2.csv')
df_test = df_test.fillna(0)
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 11)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 11)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 11
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 11
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



-0.03850147424213257




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5551,0.6075,0.745,0.0846,0.3457,0.283,0.5292,0.5833,0.7275,0.0855,0.3367,0.2903
DecisionTreeRegressor,0.9865,0.7796,0.9932,-0.6269,0.201,0.1621,0.69,0.6469,0.8307,-0.1923,0.2226,0.226
RandomForestRegressor,0.5353,0.6031,0.7317,0.1171,0.3664,0.2748,0.5366,0.5918,0.7325,0.0728,0.3122,0.2784
GradientBoostingRegressor,0.5248,0.5989,0.7244,0.1345,0.3794,0.275,0.5443,0.5988,0.7377,0.0595,0.2793,0.263
AdaBoostRegressor,0.5427,0.6396,0.7367,0.1049,0.3547,0.267,0.5573,0.6318,0.7465,0.0371,0.2634,0.1694
XGBRegressor,0.6122,0.6331,0.7824,-0.0096,0.3118,0.2335,0.554,0.5867,0.7443,0.0427,0.3128,0.2797
ExtraTreesRegressor,0.5456,0.604,0.7387,0.1001,0.3674,0.2832,0.5388,0.591,0.734,0.0691,0.3216,0.2739
LinearRegression,0.5686,0.635,0.754,0.0623,0.2582,0.2127,0.5738,0.6338,0.7575,0.0085,0.1687,0.0974
KNeighborsRegressor,0.6634,0.6526,0.8145,-0.0941,0.2618,0.1836,0.6702,0.6488,0.8187,-0.1581,0.1787,0.171
SVR,0.5545,0.6008,0.7447,0.0854,0.3433,0.2727,0.5608,0.5951,0.7489,0.031,0.2772,0.2294


In [162]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.4929384359796645, -6.261072475804215, -5.8...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.574935421930776, -6.180501267243096, -6.4...","[-6.5722553003588615, -6.317465485385701, -6.2...","[0.06706350585911902, 0.15144834919901487, 0.0..."
1,DecisionTreeRegressor,"[-7.24, -6.82, -5.89, -5.74, -6.62, -6.8, -7.1...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.07, -5.64, -5.68, -6.13, -5.89, -6.24, -6...","[-6.8981162148, -6.901999999999999, -5.9659999...","[0.4180234470872645, 0.9481012604147304, 0.405..."
2,RandomForestRegressor,"[-6.888258795869998, -6.318141830470004, -5.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.619728257549999, -6.703451561509996, -6.2...","[-6.7010126901580005, -6.567109415287999, -6.1...","[0.12135164281363073, 0.1791123482464455, 0.04..."
3,GradientBoostingRegressor,"[-6.95566366721235, -6.273891713443746, -6.165...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.833283924893092, -5.988813677824892, -6.1...","[-6.714040633972806, -6.416373525401416, -6.11...","[0.11770964887095577, 0.46444480582216585, 0.0..."
4,AdaBoostRegressor,"[-6.477016950103275, -6.4609160563183075, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.549740347911483, -6.512876477196267, -6.4...","[-6.499383630123971, -6.474856415512131, -6.42...","[0.05115122778171236, 0.04673629029464207, 0.0..."
5,XGBRegressor,"[-7.0172153, -6.524013, -5.4764, -6.707936, -6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.482342, -6.6057444, -6.102097, -6.4760613...","[-6.793859, -6.55914, -6.214556, -6.2802634, -...","[0.24383767, 0.1809014, 0.18709227, 0.14548123..."
6,ExtraTreesRegressor,"[-6.967442785740002, -6.235413943490004, -5.97...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.715952984170003, -6.505079400079994, -6.1...","[-6.723901873036003, -6.644436374627998, -6.14...","[0.14024361587101936, 0.2905924392604327, 0.05..."
7,LinearRegression,"[-6.232821610538682, -5.9334250610011345, -6.3...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.556666099022283, -6.2114222361058395, -6....","[-6.434198350372452, -6.170975965880979, -6.09...","[0.08298944324790397, 0.04008934127296255, 0.0..."
8,KNeighborsRegressor,"[-6.28, -6.16, -5.803333333333334, -6.85241405...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.753333333333333, -6.746666666666667, -6.4...","[-6.857333333333334, -6.530666666666667, -6.40...","[0.16003332986183413, 0.3216374770866521, 0.06..."
9,SVR,"[-6.483096514167082, -6.0739655648415365, -5.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.481501987025421, -5.940858638117705, -5.8...","[-6.693539304781353, -6.054013070678326, -5.84...","[0.21713267313883866, 0.14122145779951173, 0.0..."


In [163]:
result_df.to_csv('results/Descriptors/Results_3D_RDKit_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_RDKit_desc_Caco2.csv')

In [164]:
#3d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_padel_curated_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_padel_curated_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 431)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 431)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109905
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 431
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109905
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 431
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, th

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2932,0.419,0.5415,0.5165,0.7188,0.6767,0.235,0.3909,0.4847,0.594,0.7743,0.7397
DecisionTreeRegressor,0.6116,0.5737,0.782,-0.0086,0.4835,0.4533,0.2894,0.4179,0.5379,0.5,0.7093,0.6456
RandomForestRegressor,0.3077,0.4397,0.5547,0.4926,0.7076,0.6647,0.2728,0.429,0.5223,0.5286,0.7358,0.6952
GradientBoostingRegressor,0.3118,0.434,0.5584,0.4858,0.6976,0.6484,0.2575,0.4124,0.5075,0.555,0.7497,0.695
AdaBoostRegressor,0.3995,0.5324,0.632,0.3412,0.6173,0.5495,0.3628,0.5171,0.6023,0.3732,0.6685,0.6079
XGBRegressor,0.315,0.4347,0.5612,0.4805,0.6949,0.646,0.2372,0.3882,0.487,0.5901,0.7726,0.7306
ExtraTreesRegressor,0.2913,0.4272,0.5397,0.5196,0.7262,0.6866,0.2579,0.4123,0.5079,0.5543,0.7536,0.716
LinearRegression,0.5028,0.5554,0.7091,0.1707,0.5942,0.5826,0.3911,0.4893,0.6253,0.3243,0.659,0.6231
KNeighborsRegressor,0.4297,0.5054,0.6555,0.2913,0.5743,0.4944,0.3552,0.4617,0.596,0.3862,0.6384,0.5598
SVR,0.3128,0.4332,0.5593,0.4841,0.6991,0.6487,0.288,0.4298,0.5366,0.5024,0.7092,0.6724


In [165]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.584137262963216, -6.9830323568094075, -6.5...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.303663440742395, -6.329417681047214, -6.3...","[-6.587878112280312, -6.543992018150149, -6.65...","[0.16077384152980917, 0.1228399796361004, 0.16..."
1,DecisionTreeRegressor,"[-7.54, -5.96, -7.045757491, -7.15, -7.15, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -5.92, -5.96, -5.886056648, -7.045757...","[-7.148000000000001, -6.469999999999999, -6.69...","[0.16424372134118237, 0.5485982136317982, 0.77..."
2,RandomForestRegressor,"[-6.370062535330001, -6.696095237860002, -6.48...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4136893539300015, -6.596886073149999, -6....","[-6.630417674706001, -6.619476365763999, -6.62...","[0.1503428301914405, 0.056493802307405465, 0.1..."
3,GradientBoostingRegressor,"[-6.929634804006416, -6.848906239419724, -6.75...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.655568626025116, -6.811533638965246, -7.0...","[-6.648864688770667, -6.58548802204479, -7.173...","[0.2438892095750038, 0.22555783462293302, 0.21..."
4,AdaBoostRegressor,"[-6.426708496831248, -6.812172285600002, -6.55...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.426708496831248, -6.602554099714274, -6.6...","[-6.457238036627101, -6.581748820836408, -6.78...","[0.06337447528311273, 0.12334134095907172, 0.1..."
5,XGBRegressor,"[-6.905374, -6.637857, -6.642309, -6.3650374, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.5776997, -6.016545, -6.0968814, -6.252019...","[-6.9092016, -6.3959265, -6.630951, -6.1052394...","[0.26055577, 0.4999867, 0.32150853, 0.15536807..."
6,ExtraTreesRegressor,"[-6.789989353930001, -6.902369100129997, -6.78...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.830241021569998, -6.629099999999999, -6.6...","[-6.867330702370005, -6.6175479400079995, -6.6...","[0.11940512154469354, 0.08619209251535216, 0.0..."
7,LinearRegression,"[-6.130903562382553, -8.093810391990827, -7.61...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.400872924882592, -7.374735028461243, -5.8...","[-7.233260785860726, -7.596454348798867, -6.52...","[0.3735005322900029, 0.4005684625404825, 0.495..."
8,KNeighborsRegressor,"[-6.28, -5.986666666666667, -6.216666666666666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.41, -6.03, -5.883333333333333, -5....","[-6.552, -6.2746666666666675, -6.3353333333333...","[0.22816757974007726, 0.1608118293050743, 0.15..."
9,SVR,"[-6.633542442281951, -6.765975303896598, -6.34...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.637314057114391, -6.274539095812915, -6.4...","[-6.703179689701064, -6.337894720705084, -6.56...","[0.15982068383775597, 0.06843779716052734, 0.1..."


In [166]:
result_df.to_csv('results/Descriptors/Results_3D_padel_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_padel_desc_Caco2.csv')

In [167]:
df_train_rdkit = pd.read_csv('features/Descriptors/Train_3d_RDKit_desc_Caco2.csv')
df_train_rdkit = df_train_rdkit.fillna(0)
df_train_padel = pd.read_csv('features/Descriptors/Train_3d_padel_curated_Caco2.csv')

df_3d_descriptors = df_train_rdkit.merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,44715.312675,91107.809558,123619.238351,0.361718,0.737003,8.828937,0.000016,...,0.652402,0.288976,0.490060,0.392125,0.340978,84.972151,1759.679739,8625.222611,0.478603,1.223163
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,39878.864245,83243.633963,109538.317627,0.364063,0.759950,8.396295,0.000019,...,0.640138,0.291958,0.430048,0.470112,0.328022,82.497866,1702.744759,8910.830409,0.460207,1.228181
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,38542.444878,76292.960568,100389.431793,0.383929,0.759970,8.232366,0.000020,...,0.590397,0.362577,0.489361,0.496496,0.344430,71.764722,1333.271798,5125.668681,0.429461,1.330288
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,34566.515473,82504.806655,101941.820182,0.339081,0.809332,8.340910,0.000023,...,0.668782,0.278567,0.451178,0.416564,0.348152,83.135006,1632.338554,7351.509889,0.503173,1.215893
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,43675.463614,58178.760645,90645.390361,0.481828,0.641828,7.836169,0.000015,...,0.522315,0.388730,0.541868,0.551796,0.382011,65.301775,1211.414868,6306.229872,0.366567,1.475675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,3108.457288,3620.937641,5561.408951,0.558933,0.651083,3.778017,0.000209,...,0.620584,0.265348,0.444383,0.502894,0.436832,18.554653,91.483311,230.026534,0.430876,1.384109
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,1705.198171,6092.708948,7027.507516,0.242646,0.866980,4.218388,0.000508,...,0.692270,0.249929,0.426958,0.468043,0.444280,20.215607,92.963698,195.799164,0.538405,1.339282
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,1587.669882,5846.537779,6808.341490,0.233195,0.858732,4.206054,0.000541,...,0.589060,0.305502,0.509137,0.469219,0.387322,18.258258,91.435060,225.184830,0.383590,1.365678
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,1686.370397,5331.220745,6490.249963,0.259831,0.821420,4.169421,0.000487,...,0.785075,0.156266,0.462273,0.392676,0.434064,21.643247,83.333105,177.935116,0.677612,1.289012


In [168]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [169]:
df_3d_descriptors.to_csv('features/Descriptors/Train_3d_all_descriptors_Caco2.csv', index=False)

In [170]:
df_test_rdkit = pd.read_csv('features/Descriptors/Test_3d_RDKit_desc_Caco2.csv')
df_test_rdkit = df_test_rdkit.fillna(0)
df_test_padel = pd.read_csv('features/Descriptors/Test_3d_padel_curated_Caco2.csv')

df_3d_descriptors = df_test_rdkit.merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,39993.254223,103505.490779,135075.847693,0.296080,0.766277,9.148686,0.000019,...,0.652497,0.280515,0.445002,0.413012,0.296281,83.880226,1727.561974,9047.590565,0.478746,1.154294
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,34076.712402,75970.792197,94707.190141,0.359811,0.802165,8.069914,0.000024,...,0.485352,0.422296,0.447930,0.517179,0.314599,73.405636,1556.088414,9116.543699,0.361471,1.279709
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,39130.184499,55100.825424,80924.805604,0.483538,0.680889,7.511870,0.000017,...,0.510350,0.444723,0.537150,0.549475,0.350829,73.976557,1476.888699,5678.961564,0.432609,1.437454
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,44470.276090,60415.318636,87212.902057,0.509905,0.692734,7.904264,0.000016,...,0.497711,0.432923,0.510948,0.511623,0.391263,64.740473,1173.676682,5294.086991,0.395951,1.413835
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,40729.464320,63771.554832,88096.437834,0.462328,0.723883,7.931026,0.000018,...,0.486151,0.417556,0.491781,0.507360,0.399975,68.119851,1345.764797,7592.643463,0.355561,1.399116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,2624.075321,5553.550321,7616.331460,0.344533,0.729163,4.233854,0.000278,...,0.754131,0.182249,0.480010,0.459342,0.384289,22.131975,96.501291,213.423664,0.631197,1.323641
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,2849.828808,4851.229336,6659.776052,0.427917,0.728437,4.083583,0.000256,...,0.551299,0.372080,0.515541,0.521631,0.360145,19.167585,101.356334,231.204419,0.385069,1.397318
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,3315.000108,4300.114122,6782.316344,0.488771,0.634019,4.088783,0.000191,...,0.531900,0.364522,0.414856,0.423812,0.431576,19.944376,114.058462,293.327379,0.344633,1.270243
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,3105.006206,4093.853252,5671.614623,0.547464,0.721814,3.865884,0.000232,...,0.576767,0.278730,0.433292,0.469774,0.341165,19.549575,108.687575,301.806263,0.365151,1.244231


In [171]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [172]:
df_3d_descriptors.to_csv('features/Descriptors/Test_3d_all_descriptors_Caco2.csv', index=False)

In [173]:
#3d All descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models_3dall = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_3dall, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 442)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 442)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 112710
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 442
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 112710
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 442
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, th

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2935,0.4176,0.5418,0.5159,0.7184,0.678,0.2408,0.3949,0.4907,0.5839,0.7682,0.7331
DecisionTreeRegressor,0.6218,0.5818,0.7885,-0.0255,0.4778,0.4466,0.3245,0.4408,0.5697,0.4393,0.6684,0.6102
RandomForestRegressor,0.307,0.44,0.554,0.4937,0.7084,0.6649,0.2768,0.4334,0.5261,0.5217,0.7316,0.6929
GradientBoostingRegressor,0.3105,0.4317,0.5572,0.488,0.6994,0.6502,0.2605,0.4128,0.5104,0.5499,0.7477,0.6958
AdaBoostRegressor,0.3962,0.5287,0.6295,0.3465,0.6193,0.55,0.3587,0.5142,0.5989,0.3802,0.6701,0.5978
XGBRegressor,0.3421,0.4492,0.5849,0.4357,0.6642,0.6114,0.2484,0.4005,0.4984,0.5708,0.7593,0.7211
ExtraTreesRegressor,0.2876,0.428,0.5363,0.5256,0.7307,0.6913,0.2569,0.4123,0.5069,0.5561,0.7546,0.7111
LinearRegression,0.5052,0.5604,0.7108,0.1668,0.5993,0.5821,0.4265,0.51,0.653,0.2631,0.6312,0.5927
KNeighborsRegressor,0.428,0.5042,0.6542,0.2942,0.5794,0.5011,0.3616,0.4625,0.6014,0.3751,0.6318,0.5578
SVR,0.3108,0.4297,0.5575,0.4874,0.7018,0.6503,0.2893,0.4288,0.5378,0.5001,0.7077,0.6767


In [174]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.613083094952405, -6.921966977584403, -6.59...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.497556102423055, -6.203169375927687, -6.4...","[-6.589243933084594, -6.46108275677815, -6.672...","[0.10581843157294986, 0.13866955427588826, 0.1..."
1,DecisionTreeRegressor,"[-7.24, -5.96, -6.89, -6.82, -7.15, -6.3, -5.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -5.89, -5.96, -5.950781977, -7.045757...","[-6.773999999999999, -6.536, -6.85199999999999...","[0.5681408276123096, 0.6353455752580639, 0.820..."
2,RandomForestRegressor,"[-6.407621450289996, -6.7042072280600005, -6.5...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.46466932, -6.580500000000002, -6.48721020...","[-6.627743667920003, -6.589519988275998, -6.58...","[0.1610900161208192, 0.03496030711841602, 0.12..."
3,GradientBoostingRegressor,"[-6.946302980579683, -6.475575001931822, -6.77...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.573012061495892, -7.161055663467298, -6.9...","[-6.694914153548129, -6.809549385951927, -7.02...","[0.14458892091716022, 0.48038613186466017, 0.2..."
4,AdaBoostRegressor,"[-6.308570744303284, -6.685239979399316, -6.40...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.498425656147735, -6.538026315789475, -6.6...","[-6.579395067985, -6.585678182525802, -6.83128...","[0.1343198286155975, 0.1294113754288366, 0.191..."
5,XGBRegressor,"[-6.8477807, -6.5712824, -6.2852316, -6.491227...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.546626, -6.459586, -6.5167823, -5.919629,...","[-6.679862, -6.407274, -6.642984, -6.0348067, ...","[0.260521, 0.31118798, 0.41817024, 0.24408276,..."
6,ExtraTreesRegressor,"[-6.736532282659999, -6.864913630499999, -6.77...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7358322826600014, -6.594900000000001, -6....","[-6.869550340782004, -6.663659999999998, -6.67...","[0.1070310626576817, 0.06122418966388902, 0.09..."
7,LinearRegression,"[-6.296173287215084, -7.813307282117476, -7.79...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.862000637825103, -7.756908965577798, -5.6...","[-7.6705681932938194, -7.811534208983403, -6.2...","[0.3753272866288221, 0.44872924622687527, 0.46..."
8,KNeighborsRegressor,"[-6.28, -6.403333333333333, -6.496666666666666...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.41, -6.03, -5.996666666666666, -5....","[-6.552, -6.2746666666666675, -6.3353333333333...","[0.22816757974007726, 0.1608118293050743, 0.15..."
9,SVR,"[-6.7013873067629515, -6.749709783909729, -6.4...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.673688839196699, -6.3642694358036165, -6....","[-6.733234153274319, -6.312869861756234, -6.49...","[0.16367281240027912, 0.07501025413840558, 0.1..."


In [175]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_Caco2.csv')

In [176]:
#3d All descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")


X_train shape:  (1008, 442)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 442)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [177]:
#3d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_Caco2.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 372)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 372)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94860
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 372
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94860
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 372
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2899,0.4176,0.5384,0.5219,0.723,0.6776,0.2417,0.3994,0.4916,0.5823,0.7697,0.7387
DecisionTreeRegressor,0.6477,0.6001,0.8048,-0.0681,0.468,0.4318,0.3012,0.4323,0.5488,0.4796,0.6937,0.6491
RandomForestRegressor,0.3106,0.4407,0.5573,0.4877,0.7035,0.651,0.2732,0.4303,0.5227,0.5279,0.7376,0.697
GradientBoostingRegressor,0.3241,0.4428,0.5693,0.4655,0.6825,0.6278,0.2696,0.4226,0.5192,0.5342,0.736,0.6791
AdaBoostRegressor,0.396,0.5305,0.6293,0.347,0.6177,0.5568,0.3658,0.5203,0.6048,0.3679,0.6589,0.598
XGBRegressor,0.3395,0.4432,0.5827,0.44,0.6662,0.6192,0.242,0.398,0.4919,0.5819,0.7691,0.7366
ExtraTreesRegressor,0.2924,0.4279,0.5407,0.5178,0.7253,0.6843,0.258,0.4122,0.5079,0.5543,0.7545,0.7114
LinearRegression,0.4912,0.5467,0.7008,0.19,0.5889,0.5715,0.4036,0.5113,0.6353,0.3026,0.6407,0.5825
KNeighborsRegressor,0.3977,0.4849,0.6306,0.3441,0.613,0.5483,0.3436,0.4491,0.5861,0.4064,0.6577,0.5915
SVR,0.311,0.431,0.5576,0.4871,0.7023,0.6514,0.2864,0.4284,0.5352,0.5051,0.7118,0.6805


In [178]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.772220261824416, -6.937947858268684, -6.55...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.5116007299859024, -6.362071367479643, -6....","[-6.6384844719350395, -6.559490144849233, -6.6...","[0.1414192782616468, 0.14401678274818602, 0.12..."
1,DecisionTreeRegressor,"[-5.28, -7.03, -6.89, -6.89, -5.96, -6.85, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-7.24, -6.96, -6.66, -5.64, -5.8, -7.0, -5.2...","[-6.5040000000000004, -6.736, -6.6659999999999...","[0.6022823258240275, 0.4859876541641774, 0.762..."
2,RandomForestRegressor,"[-6.479099999999999, -6.798729221829999, -6.65...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.5079033305600005, -6.49668607315, -6.4601...","[-6.6605361539680015, -6.582658294484, -6.5872...","[0.15284956746337552, 0.0508995921028729, 0.10..."
3,GradientBoostingRegressor,"[-6.867958574394078, -6.441506384482038, -6.50...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.981818464552121, -7.199232888597217, -6.8...","[-6.88225059066191, -6.897734939670686, -6.890...","[0.16053581265072475, 0.3024540025591479, 0.16..."
4,AdaBoostRegressor,"[-6.618908457399992, -6.705832086171658, -6.48...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6809938622961305, -6.575671646704264, -6....","[-6.591257836513941, -6.535053305367327, -6.68...","[0.12344925703902834, 0.07315628584816435, 0.1..."
5,XGBRegressor,"[-6.658747, -6.8383975, -6.682644, -6.699721, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.6192503, -6.757822, -6.4840307, -6.320499...","[-6.687722, -6.683091, -6.6873426, -6.285418, ...","[0.32753578, 0.09894828, 0.357142, 0.089390285..."
6,ExtraTreesRegressor,"[-6.743239700039998, -6.90824793462, -6.814827...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.731899999999999, -6.459599999999997, -6.7...","[-6.867180000000005, -6.607157940007999, -6.79...","[0.11064556746657592, 0.07764196780333082, 0.0..."
7,LinearRegression,"[-5.807743069860188, -8.218118031396457, -7.59...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.994961555462269, -7.167018444969636, -5.2...","[-7.101488889287291, -7.217298737099507, -6.04...","[0.38926196962570814, 0.508880975097409, 0.523..."
8,KNeighborsRegressor,"[-6.28, -6.55, -6.576666666666667, -6.12333333...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.503333333333334, -6.03, -5.9966666...","[-6.552, -6.260000000000001, -6.35466666666666...","[0.22816757974007726, 0.1463481389624676, 0.16..."
9,SVR,"[-6.873083044519573, -6.856089885383239, -6.44...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.7870911702230226, -6.42703112828175, -6.4...","[-6.820704310630248, -6.440670664897278, -6.54...","[0.13345435681487666, 0.07189156924700424, 0.1..."


In [179]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_LVR_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_LVR_Caco2.csv')

In [180]:
#2d and 3d descriptors all
df_train_2d = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')
df_train_2d
df_train_3d = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_Caco2.csv')
df_train_3d

df_2d_3d_train = df_train_2d.merge(df_train_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_train.to_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv', index=False)
df_2d_3d_train

  df_train_2d = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_Caco2.csv')


Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2065,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.22,16.271931,16.271931,0.025756,-1.930898,0.058792,23.408333,1664.156,...,0.652402,0.288976,0.490060,0.392125,0.340978,84.972151,1759.679739,8625.222611,0.478603,1.223163
1,2067,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.24,16.083777,16.083777,0.027668,-1.934865,0.053531,23.411765,1650.129,...,0.640138,0.291958,0.430048,0.470112,0.328022,82.497866,1702.744759,8910.830409,0.460207,1.228181
2,1914,CCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CCC...,-8.00,15.668220,15.668220,0.003714,-4.150777,0.085546,24.018349,1587.863,...,0.590397,0.362577,0.489361,0.496496,0.344430,71.764722,1333.271798,5125.668681,0.429461,1.330288
3,2026,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.64,16.048909,16.048909,0.025704,-1.891709,0.101375,24.159292,1574.031,...,0.668782,0.278567,0.451178,0.416564,0.348152,83.135006,1632.338554,7351.509889,0.503173,1.215893
4,1920,CCCCN1CC(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@...,-7.05,15.511700,15.511700,0.033138,-4.150544,0.085338,23.366972,1567.445,...,0.522315,0.388730,0.541868,0.551796,0.382011,65.301775,1211.414868,6306.229872,0.366567,1.475675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,12.898424,12.898424,0.143657,-0.742398,0.662387,22.258065,430.549,...,0.620584,0.265348,0.444383,0.502894,0.436832,18.554653,91.483311,230.026534,0.430876,1.384109
1004,2470,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.60,12.903074,12.903074,0.161549,-0.731330,0.598056,24.933333,416.566,...,0.692270,0.249929,0.426958,0.468043,0.444280,20.215607,92.963698,195.799164,0.538405,1.339282
1005,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.70,12.835172,12.835172,0.168936,-0.728726,0.606745,24.965517,402.539,...,0.589060,0.305502,0.509137,0.469219,0.387322,18.258258,91.435060,225.184830,0.383590,1.365678
1006,2468,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.90,12.763992,12.763992,0.179945,-0.726122,0.611430,25.000000,388.512,...,0.785075,0.156266,0.462273,0.392676,0.434064,21.643247,83.333105,177.935116,0.677612,1.289012


In [181]:
df_test_2d = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_Caco2.csv')
df_test_2d
df_test_3d = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_Caco2.csv')
df_test_3d

df_2d_3d_test = df_test_2d.merge(df_test_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_test.to_csv('features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv', index=False)
df_2d_3d_test

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2064,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.19,15.976055,15.976055,0.030813,-1.935404,0.058779,22.883333,1664.156,...,0.652497,0.280515,0.445002,0.413012,0.296281,83.880226,1727.561974,9047.590565,0.478746,1.154294
1,8066,CC[C@H]1C(=O)N[C@@H](COCCC(C)C)C(=O)N(C)[C@@H]...,-6.21,15.516152,15.516152,0.000091,-1.679143,0.049536,24.360360,1572.049,...,0.485352,0.422296,0.447930,0.517179,0.314599,73.405636,1556.088414,9116.543699,0.361471,1.279709
2,2068,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-7.24,15.993973,15.993973,0.020362,-1.750211,0.057933,24.294643,1552.024,...,0.510350,0.444723,0.537150,0.549475,0.350829,73.976557,1476.888699,5678.961564,0.432609,1.437454
3,2234,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-5.85,15.804049,15.804049,0.002346,-2.970606,0.165351,26.981481,1537.344,...,0.497711,0.432923,0.510948,0.511623,0.391263,64.740473,1173.676682,5294.086991,0.395951,1.413835
4,2230,CC(C)C[C@H]1C(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N[...,-5.96,15.545371,15.545371,0.013183,-2.948188,0.126607,27.422018,1530.953,...,0.486151,0.417556,0.491781,0.507360,0.399975,68.119851,1345.764797,7592.643463,0.355561,1.399116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,8448,CC(C)C[C@@H]1NC(=O)CN(C)C(=O)[C@H](Cc2ccccc2)N...,-5.88,13.192828,13.192828,0.126957,-0.677034,0.670121,21.000000,440.544,...,0.754131,0.182249,0.480010,0.459342,0.384289,22.131975,96.501291,213.423664,0.631197,1.323641
248,2478,CC(C)C[C@@H]1NC(=O)[C@H](C)NCCCCCCNC(=O)[C@H](...,-4.50,13.114656,13.114656,0.186152,-0.689890,0.586385,24.903226,430.593,...,0.551299,0.372080,0.515541,0.521631,0.360145,19.167585,101.356334,231.204419,0.385069,1.397318
249,2477,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-4.20,13.174810,13.174810,0.158364,-0.755076,0.586385,24.903226,430.593,...,0.531900,0.364522,0.414856,0.423812,0.431576,19.944376,114.058462,293.327379,0.344633,1.270243
250,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,0.576767,0.278730,0.433292,0.469774,0.341165,19.549575,108.687575,301.806263,0.365151,1.244231


In [182]:
#All 2d and 3d descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 3531)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 3531)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 612762
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2790
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 613150
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2810
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1846,0.32,0.4296,0.6956,0.8355,0.8186,0.1557,0.3055,0.3946,0.731,0.8579,0.8459
DecisionTreeRegressor,0.3835,0.4564,0.6193,0.3675,0.6789,0.6497,0.2243,0.3618,0.4736,0.6124,0.7849,0.7712
RandomForestRegressor,0.2054,0.3387,0.4533,0.6612,0.818,0.7974,0.1774,0.3298,0.4212,0.6935,0.8386,0.8272
GradientBoostingRegressor,0.1939,0.3338,0.4404,0.6801,0.8263,0.8039,0.1589,0.3137,0.3986,0.7254,0.8572,0.8409
AdaBoostRegressor,0.2623,0.4099,0.5122,0.5674,0.7726,0.7326,0.2434,0.4141,0.4933,0.5795,0.785,0.7567
XGBRegressor,0.2006,0.3365,0.4479,0.6691,0.818,0.7988,0.1674,0.3119,0.4092,0.7107,0.8445,0.8348
ExtraTreesRegressor,0.1741,0.3061,0.4173,0.7129,0.8458,0.8314,0.155,0.2959,0.3937,0.7322,0.8578,0.8503
LinearRegression,1.5619,0.9249,1.2498,-1.5759,0.41,0.4604,0.7023,0.6271,0.8381,-0.2136,0.5551,0.5926
KNeighborsRegressor,0.2225,0.3393,0.4717,0.6331,0.8019,0.7769,0.1963,0.3317,0.443,0.6608,0.8162,0.7992
SVR,0.2037,0.3355,0.4514,0.664,0.8177,0.8075,0.1664,0.3227,0.4079,0.7125,0.8481,0.8356


In [183]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.3239551943750865, -7.4300588420614595, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.441359680671323, -6.2598048395463435, -6....","[-6.7086548897806235, -6.217133668304724, -6.3...","[0.14057401251212948, 0.1706962991944757, 0.07..."
1,DecisionTreeRegressor,"[-6.07, -7.37, -6.27, -7.43, -6.28, -6.82, -5....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.82, -7.1, -5.85, -5.92, -5.92, -7.0, -6.7...","[-6.816, -6.51, -6.026, -6.382000000000001, -6...","[0.6274264897181184, 0.6971943774873691, 0.335..."
2,RandomForestRegressor,"[-6.202858549559997, -7.211357574909997, -6.88...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.305169771049999, -6.5803411998199985, -6....","[-6.587035445574003, -6.414469627444001, -6.32...","[0.15232881640359397, 0.09988190079560458, 0.0..."
3,GradientBoostingRegressor,"[-6.458802739245218, -7.166853467379407, -7.05...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.36453522008232, -6.533752663648551, -6.52...","[-6.7613359174520955, -6.467589434471785, -6.4...","[0.22096782003708035, 0.24920961937784228, 0.0..."
4,AdaBoostRegressor,"[-6.190325550228773, -7.131596355724548, -6.94...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.459716775555553, -6.129521115807699, -6.4...","[-6.34663091628367, -6.393858755444659, -6.277...","[0.16612683684319493, 0.25235488687873336, 0.0..."
5,XGBRegressor,"[-6.3436275, -7.349347, -6.92147, -7.0928574, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.410254, -6.6115594, -6.43151, -6.01403, -...","[-6.8194633, -6.330651, -6.324813, -5.985284, ...","[0.2070164, 0.18104887, 0.1206626, 0.104193635..."
6,ExtraTreesRegressor,"[-6.394755614109996, -7.363399999999996, -7.06...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.376585749729998, -6.392799999999999, -6.1...","[-6.8297227901920055, -6.480563231786, -6.3224...","[0.22829975530755686, 0.10230727001701186, 0.1..."
7,LinearRegression,"[-10.0, -6.876081771731154, -6.72058471128096,...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-10.0, -9.181060896261307, -6.65316421157715...","[-8.520844750698263, -7.011347518329975, -7.69...","[1.1830204979968597, 2.538387978580786, 0.8191..."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.036666666666666, -6.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.776666666666666, -6.31666666666666...","[-6.552, -6.728, -6.298666666666667, -5.805333...","[0.22816757974007726, 0.02463962481676815, 0.0..."
9,SVR,"[-6.323726069194609, -7.152231219528151, -7.03...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.4217105733227235, -6.451571065910834, -6....","[-6.824478247718709, -6.5573376233780865, -6.2...","[0.2579454326829819, 0.08412043504736268, 0.07..."


In [184]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_Caco2.csv')

In [185]:
#All 2d and 3d descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 2917)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 2917)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097677 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 612762
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2790
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 613150
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2810
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1846,0.32,0.4296,0.6956,0.8355,0.8186,0.1557,0.3055,0.3946,0.731,0.8579,0.8459
DecisionTreeRegressor,0.3938,0.4601,0.6276,0.3505,0.6747,0.6508,0.2083,0.3455,0.4564,0.6401,0.801,0.7793
RandomForestRegressor,0.2069,0.3395,0.4548,0.6588,0.8164,0.7949,0.1779,0.33,0.4217,0.6927,0.8382,0.826
GradientBoostingRegressor,0.1941,0.3333,0.4406,0.6799,0.8261,0.804,0.1599,0.3149,0.3999,0.7236,0.856,0.8391
AdaBoostRegressor,0.2662,0.4129,0.5159,0.561,0.7673,0.7369,0.247,0.4193,0.497,0.5732,0.7811,0.741
XGBRegressor,0.2006,0.3365,0.4479,0.6691,0.818,0.7988,0.1674,0.3119,0.4092,0.7107,0.8445,0.8348
ExtraTreesRegressor,0.1744,0.3034,0.4176,0.7124,0.8454,0.8323,0.1542,0.2963,0.3926,0.7336,0.8588,0.85
LinearRegression,1.5619,0.9249,1.2498,-1.5759,0.41,0.4604,0.7023,0.6271,0.8381,-0.2136,0.5551,0.5926
KNeighborsRegressor,0.2225,0.3393,0.4717,0.6331,0.8019,0.7769,0.1963,0.3317,0.443,0.6608,0.8162,0.7992
SVR,0.2037,0.3355,0.4514,0.664,0.8177,0.8075,0.1664,0.3227,0.4079,0.7125,0.8481,0.8356


In [186]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.3239551943750865, -7.4300588420614595, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.441359680671323, -6.2598048395463435, -6....","[-6.7086548897806235, -6.217133668304724, -6.3...","[0.14057401251212948, 0.1706962991944757, 0.07..."
1,DecisionTreeRegressor,"[-6.07, -7.37, -6.49, -6.89, -6.28, -6.82, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.57, -7.1, -5.869999999999999, -5.92, -5.9...","[-6.789999999999999, -6.294, -5.984, -6.084000...","[0.5912698199637793, 0.6520613468071849, 0.350..."
2,RandomForestRegressor,"[-6.194859707699997, -7.1810999999999945, -6.8...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.274973982899999, -6.550726136149999, -6.2...","[-6.587565984342002, -6.402276940807999, -6.33...","[0.16524027584157175, 0.0940200917001238, 0.08..."
3,GradientBoostingRegressor,"[-6.62045983701556, -7.166853467379407, -7.056...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.387448547697937, -6.555870848310229, -6.5...","[-6.769782278105415, -6.465872366659797, -6.46...","[0.21439808690960072, 0.2584268567602794, 0.05..."
4,AdaBoostRegressor,"[-6.5549005543361725, -7.288683096877405, -6.6...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.531255768166381, -6.557999999999995, -6.2...","[-6.411304669150927, -6.527354152602823, -6.25...","[0.18573412965654898, 0.19841157567324882, 0.1..."
5,XGBRegressor,"[-6.3436275, -7.349347, -6.92147, -7.0928574, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.410254, -6.6115594, -6.43151, -6.01403, -...","[-6.8194633, -6.330651, -6.324813, -5.985284, ...","[0.2070164, 0.18104887, 0.1206626, 0.104193635..."
6,ExtraTreesRegressor,"[-6.4520437893699984, -7.382089700039995, -7.0...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.454133489419999, -6.278250000000001, -6.1...","[-6.803843668492005, -6.4719379238400006, -6.3...","[0.18692165387932033, 0.10071266347937356, 0.1..."
7,LinearRegression,"[-10.0, -6.876081771731369, -6.720584711281286...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-10.0, -9.181060896261075, -6.65316421157709...","[-8.520844750698298, -7.011347518329876, -7.69...","[1.183020497996832, 2.5383879785807197, 0.8191..."
8,KNeighborsRegressor,"[-6.28, -7.36, -6.98, -7.036666666666666, -6.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.28, -6.776666666666666, -6.31666666666666...","[-6.552, -6.728, -6.298666666666667, -5.805333...","[0.22816757974007726, 0.02463962481676815, 0.0..."
9,SVR,"[-6.323678948199774, -7.152503061312745, -7.03...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.421720452667785, -6.451579586938511, -6.2...","[-6.82452490570338, -6.557357935117706, -6.290...","[0.2579773008779037, 0.08409299593401605, 0.07..."


In [187]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_const_rem_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_const_rem_Caco2.csv')

In [188]:
#All 2d and 3d descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


X_train shape:  (1008, 2125)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 2125)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 428691
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2044
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.186283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 428924
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 2064
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.184,0.3215,0.429,0.6965,0.8357,0.8181,0.1556,0.3051,0.3945,0.7311,0.858,0.8468
DecisionTreeRegressor,0.4284,0.4774,0.6545,0.2934,0.6528,0.6321,0.2026,0.3456,0.4501,0.6499,0.8068,0.7822
RandomForestRegressor,0.203,0.338,0.4506,0.6651,0.8196,0.7982,0.1751,0.3269,0.4184,0.6975,0.8409,0.8272
GradientBoostingRegressor,0.1934,0.3299,0.4398,0.681,0.8265,0.8025,0.1603,0.3117,0.4003,0.7231,0.8545,0.8385
AdaBoostRegressor,0.2656,0.4128,0.5154,0.5619,0.7706,0.7334,0.2512,0.4217,0.5012,0.566,0.7766,0.735
XGBRegressor,0.22,0.349,0.469,0.6372,0.7984,0.7727,0.1707,0.3152,0.4132,0.705,0.8414,0.8286
ExtraTreesRegressor,0.1742,0.3039,0.4173,0.7127,0.8454,0.8322,0.1519,0.2944,0.3897,0.7376,0.8612,0.8495
LinearRegression,2.2766,1.1839,1.5089,-2.7547,0.3352,0.351,0.7238,0.6617,0.8508,-0.2508,0.5283,0.5326
KNeighborsRegressor,0.2208,0.3406,0.4699,0.6358,0.803,0.7817,0.1989,0.3399,0.446,0.6563,0.8134,0.7959
SVR,0.2034,0.335,0.451,0.6645,0.818,0.8099,0.1654,0.3232,0.4067,0.7141,0.8498,0.8405


In [189]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_LVR_Caco2.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_LVR_Caco2.csv')

In [54]:
#Stacked architecture model
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import joblib

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [55]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [56]:
from tqdm import tqdm
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Train/All_fingerprints_train_Caco2.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Test/All_fingerprints_test_Caco2.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Atomic/Train_all_atomic_desc_Caco2.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Atomic/Test_all_atomic_desc_Caco2.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
target_column = 'Permeability'
def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)
print(df_desc_train)
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(), 
    MLPRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),

]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]


  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(1008, 262)
(252, 262)
(1008, 916)
(252, 916)
(1008, 763)
(252, 763)
(1008, 12)
(252, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       ID                                             SMILES  Permeability  \
875    33  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -5.810   
846    40  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -6.340   
841    41  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -6.560   
932   927  CC(C)C[C@@H]1NC(=O)CNC(=O)[C@@H]2CCCN2[C@H](C(...        -6.400   
881   982  CC[C@H](C)[C@@H]1NC(

In [57]:
df_desc_train

Unnamed: 0,ID,SMILES,Permeability,qed,SPS,FpDensityMorgan1,AvgIpc,Ipc,PEOE_VSA14,EState_VSA11,...,LOBMAX,LOBMIN,MOMI-XY,MOMI-YZ,geomShape,RDF30s,L2m,L3m,Dv,De
875,33,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-5.810,0.703228,0.385496,0.761608,1.468187,-0.031659,-0.271516,-1.115788,...,0.311732,0.159893,-0.391345,-0.215417,1.700807,-1.471896,-1.440537,-0.827156,-2.942554,-3.149153
846,40,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-6.340,0.556275,-0.056922,1.009841,1.482616,-0.031659,3.339650,-1.115788,...,0.360008,0.813132,-1.454567,0.020463,-0.682773,-1.334859,-1.628581,-0.086557,-1.400372,-0.533062
841,41,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-6.560,0.238877,-0.168818,0.911163,1.508891,-0.031659,-0.271516,-1.115788,...,0.972995,-0.744163,-0.858476,1.408864,0.627862,-1.614832,-1.461375,-1.568574,-1.434428,-0.483362
932,927,CC(C)C[C@@H]1NC(=O)CNC(=O)[C@@H]2CCCN2[C@H](C(...,-6.400,0.438976,0.476621,2.623356,1.338114,-0.031659,-0.271516,-1.115788,...,2.138251,1.619295,-2.164387,2.700145,1.488521,-1.420723,-2.067594,-1.120407,-0.720228,-0.235899
881,982,CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](...,-5.280,1.154721,1.703431,1.639716,1.775562,-0.031659,-0.271516,-1.115788,...,-0.352665,-0.445131,0.836253,-0.387743,-0.156282,-1.488111,-1.102026,-1.594992,-1.792233,-1.564364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,8496,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-5.600,-0.607415,-0.834554,-0.689958,-0.892283,-0.031659,-0.271516,0.945818,...,0.860800,-0.401754,-0.615772,0.006028,1.717829,-0.226197,0.261951,1.375161,1.704078,1.218580
60,8498,CC[C@H](C)[C@H]1C(=O)N[C@@H]([C@@H](C)O)C(=O)N...,-5.960,-0.677203,-0.801970,-0.232814,-0.907508,-0.031655,-0.271516,1.289420,...,-0.834112,-0.313196,1.792746,-1.023875,-1.522834,0.301591,1.289178,-0.081311,1.105211,1.101078
310,8499,CC[C@H](C)[C@H]1C(=O)N[C@@H]([C@@H](C)O)C(=O)N...,-5.595,-0.539252,-0.234947,-0.219363,-0.903582,-0.031659,-0.271516,0.602217,...,-0.869882,-0.346934,0.949490,-0.657730,-1.493457,0.778807,0.925891,0.092756,1.372040,1.174441
286,8500,CC[C@H](C)[C@H]1C(=O)N[C@@H]([C@@H](C)O)C(=O)N...,-5.890,-0.426822,-0.377577,-0.060946,-0.895950,-0.031659,-0.271516,0.258616,...,-0.203118,-0.773476,0.350372,-0.120445,1.208900,0.079513,0.705817,-0.219285,0.880309,0.667069


In [58]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 5-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=5, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=5, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (5 fold CV)': mse_train,
        'Train MAE (5 fold CV)': mae_train,
        'Train RMSE (5 fold CV)': rmse_train,
        'Train R2 (5 fold CV)': r2_train,
        'Train PCC (5 fold CV)': pearson_train,
        'Train SCC (5 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53544
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 251
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53503
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53502
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start t


Training models: 1it [00:03,  3.04s/it][A




Training models: 2it [00:05,  2.92s/it][A
Training models: 3it [00:31, 13.46s/it][A
Training models: 4it [00:39, 11.19s/it][A
Training models: 5it [00:43,  8.51s/it][A
Training models: 6it [00:44,  6.14s/it][A
Training models: 7it [00:45,  4.22s/it][A

Training models: 9it [00:50,  3.52s/it][A
Training models: 10it [00:51,  5.12s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:51<02:33, 51.22s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2431
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 654
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021297 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 671
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training models: 1it [00:03,  3.09s/it][A




Training models: 2it [00:05,  2.51s/it][A
Training models: 3it [00:10,  3.84s/it][A
Training models: 4it [00:14,  3.70s/it][A
Training models: 5it [00:17,  3.48s/it][A
Training models: 6it [00:18,  2.90s/it][A
Training models: 7it [00:19,  2.04s/it][A
Training models: 8it [00:20,  1.77s/it][A
Training models: 9it [00:23,  2.32s/it][A
Training models: 10it [00:24,  2.43s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [01:15<01:10, 35.42s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Star


Training models: 1it [00:03,  3.36s/it][A




Training models: 2it [00:08,  4.26s/it][A
Training models: 3it [01:44, 46.08s/it][A
Training models: 4it [02:11, 38.80s/it][A
Training models: 5it [02:21, 28.43s/it][A
Training models: 6it [02:23, 19.47s/it][A
Training models: 7it [02:24, 13.19s/it][A
Training models: 8it [02:25,  9.31s/it][A
Training models: 9it [02:28,  7.55s/it][A
Training models: 10it [02:31, 15.19s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [03:47<01:28, 88.62s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training models: 1it [00:01,  1.51s/it][A




Training models: 2it [00:03,  1.69s/it][A
Training models: 3it [00:03,  1.04s/it][A
Training models: 4it [00:03,  1.49it/s][A
Training models: 5it [00:04,  1.36it/s][A
Training models: 6it [00:05,  1.07it/s][A

Training models: 10it [00:07,  1.32it/s][A
Processing dataframe pairs: 100%|██████████| 4/4 [03:55<00:00, 58.77s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (1008, 40)
Dimensions of meta_features_test: (252, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063403 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9277
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000891 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9254
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000892 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9097
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40
[LightGBM] [Info] Start training from score -6.270552




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000888 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9187
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 40
[LightGBM] [Info] Start training from score -6.284236




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9255
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 40
[LightGBM] [Info] Start training from score -6.280596




LGBMRegressor Evaluation completed: Test R2 score: 0.7491076260502498




DecisionTreeRegressor Evaluation completed: Test R2 score: 0.7354640526538386
RandomForestRegressor Evaluation completed: Test R2 score: 0.7430470678296435
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.744179993491047
AdaBoostRegressor Evaluation completed: Test R2 score: 0.704560942331324
XGBRegressor Evaluation completed: Test R2 score: 0.7485242724486367
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.7402286688165898
LinearRegression Evaluation completed: Test R2 score: 0.749050116022123
KNeighborsRegressor Evaluation completed: Test R2 score: 0.702067754582006
SVR Evaluation completed: Test R2 score: 0.7565272823996034
MLPRegressor Evaluation completed: Test R2 score: 0.6847942818924024




Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.159956,0.291163,0.399944,0.736193,0.85822,0.846713,0.145197,0.284837,0.381048,0.749108,0.866689,0.84996
DecisionTreeRegressor,0.299475,0.41532,0.547243,0.50609,0.743898,0.726259,0.153093,0.283637,0.391271,0.735464,0.85843,0.844976
RandomForestRegressor,0.152452,0.287027,0.390451,0.748568,0.865201,0.852316,0.148705,0.284626,0.385622,0.743047,0.863077,0.850381
GradientBoostingRegressor,0.15621,0.290291,0.395234,0.742371,0.861785,0.848611,0.148049,0.284608,0.384771,0.74418,0.863906,0.848251
AdaBoostRegressor,0.169276,0.313129,0.411431,0.720822,0.850118,0.830827,0.170977,0.318872,0.413494,0.704561,0.843893,0.83188
XGBRegressor,0.172412,0.302033,0.415226,0.715648,0.847095,0.832813,0.145535,0.286552,0.38149,0.748524,0.866385,0.850713
ExtraTreesRegressor,0.149728,0.281803,0.386947,0.753061,0.867792,0.856207,0.150336,0.288282,0.387732,0.740229,0.861691,0.847543
LinearRegression,0.152395,0.291064,0.390378,0.748662,0.865421,0.857749,0.145231,0.283996,0.381091,0.74905,0.867,0.853699
KNeighborsRegressor,0.170697,0.31027,0.413155,0.718478,0.848212,0.831957,0.17242,0.312902,0.415235,0.702068,0.839186,0.819159
SVR,0.163206,0.296467,0.403987,0.730832,0.855191,0.842456,0.140903,0.28208,0.375371,0.756527,0.870071,0.856755


In [59]:
results_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Results_5_folds_stacked_archi_Caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Prediction_data_5_folds_stacked_archi_Caco2.csv')

In [60]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 10-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=10, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10,-4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=10, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (10 fold CV)': mse_train,
        'Train MAE (10 fold CV)': mae_train,
        'Train RMSE (10 fold CV)': rmse_train,
        'Train R2 (10 fold CV)': r2_train,
        'Train PCC (10 fold CV)': pearson_train,
        'Train SCC (10 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53693
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 251
[LightGBM] [Info] Start training from score -6.283689
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53716
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 251
[LightGBM] [Info] Start training from score -6.294458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53692
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 251
[LightGBM] [Info] Start t


Training models: 1it [00:06,  6.77s/it][A
Training models: 2it [00:12,  6.07s/it][A
Training models: 3it [01:11, 30.18s/it][A
Training models: 4it [01:28, 25.10s/it][A
Training models: 5it [01:36, 18.76s/it][A
Training models: 6it [01:39, 13.52s/it][A
Training models: 7it [01:39,  9.25s/it][A

Training models: 9it [01:51,  8.01s/it][A
Training models: 10it [01:54, 11.42s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [01:54<05:42, 114.20s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2558
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 693
[LightGBM] [Info] Start training from score -6.283689
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2541
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 689
[LightGBM] [Info] Start training from score -6.294458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi


Training models: 1it [00:07,  7.07s/it][A
Training models: 2it [00:11,  5.39s/it][A
Training models: 3it [00:23,  8.44s/it][A
Training models: 4it [00:30,  8.10s/it][A
Training models: 5it [00:37,  7.54s/it][A
Training models: 6it [00:41,  6.20s/it][A
Training models: 7it [00:41,  4.34s/it][A
Training models: 8it [00:43,  3.72s/it][A
Training models: 9it [00:54,  5.72s/it][A
Training models: 10it [00:54,  5.50s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [02:49<02:38, 79.36s/it] 
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028532 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 760
[LightGBM] [Info] Start training from score -6.283689
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 760
[LightGBM] [Info] Start training from score -6.294458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023474 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 760
[LightGBM] [Info] Star


Training models: 1it [00:06,  6.11s/it][A
Training models: 2it [00:17,  9.42s/it][A
Training models: 3it [03:56, 104.97s/it][A
Training models: 4it [04:58, 87.98s/it] [A
Training models: 5it [05:19, 63.82s/it][A
Training models: 6it [05:23, 43.58s/it][A
Training models: 7it [05:24, 29.49s/it][A
Training models: 8it [05:26, 20.74s/it][A
Training models: 9it [05:41, 19.01s/it][A
Training models: 10it [05:48, 34.87s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [08:37<03:22, 202.35s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 9
[LightGBM] [Info] Start training from score -6.283689
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 9
[LightGBM] [Info] Start training from score -6.294458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training models: 1it [00:03,  3.72s/it][A
Training models: 2it [00:07,  3.54s/it][A
Training models: 3it [00:07,  2.17s/it][A
Training models: 4it [00:07,  1.39s/it][A
Training models: 5it [00:09,  1.42s/it][A
Training models: 6it [00:12,  1.89s/it][A

Training models: 10it [00:15,  1.58s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [08:53<00:00, 133.42s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (1008, 40)
Dimensions of meta_features_test: (252, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9708
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.283689




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9708
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.294458




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9687
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.292229




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9681
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.287011




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9696
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.268746




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9689
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.288406




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9623
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.291423




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9627
[LightGBM] [Info] Number of data points in the train set: 907, number of used features: 40
[LightGBM] [Info] Start training from score -6.277034




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000884 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9619
[LightGBM] [Info] Number of data points in the train set: 908, number of used features: 40
[LightGBM] [Info] Start training from score -6.287175




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000905 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9708
[LightGBM] [Info] Number of data points in the train set: 908, number of used features: 40
[LightGBM] [Info] Start training from score -6.279756




LGBMRegressor Evaluation completed: Test R2 score: 0.7472685539232105
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.7168634807686989
RandomForestRegressor Evaluation completed: Test R2 score: 0.7453569668192688
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.742698399645302
AdaBoostRegressor Evaluation completed: Test R2 score: 0.7051909530578123
XGBRegressor Evaluation completed: Test R2 score: 0.7417718929422539
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.7450056455146163
LinearRegression Evaluation completed: Test R2 score: 0.7410408315080039
KNeighborsRegressor Evaluation completed: Test R2 score: 0.707951108745701
SVR Evaluation completed: Test R2 score: 0.7467678908730655




MLPRegressor Evaluation completed: Test R2 score: 0.7161462058737579


Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.152143,0.284523,0.390055,0.749078,0.865619,0.851738,0.146262,0.285215,0.382442,0.747269,0.865548,0.847621
DecisionTreeRegressor,0.301162,0.401213,0.548782,0.503309,0.754309,0.746712,0.163858,0.296626,0.404793,0.716863,0.8478,0.835092
RandomForestRegressor,0.145697,0.279353,0.381703,0.759708,0.871632,0.859232,0.147368,0.28279,0.383885,0.745357,0.864132,0.851576
GradientBoostingRegressor,0.148757,0.284208,0.385691,0.754662,0.868857,0.857435,0.148906,0.289896,0.385884,0.742698,0.862902,0.84975
AdaBoostRegressor,0.158634,0.301448,0.398289,0.738372,0.860013,0.842947,0.170613,0.315919,0.413053,0.705191,0.843601,0.829994
XGBRegressor,0.163378,0.296894,0.4042,0.730549,0.855572,0.84196,0.149443,0.28956,0.386578,0.741772,0.862356,0.846438
ExtraTreesRegressor,0.147052,0.282668,0.383474,0.757474,0.870338,0.855836,0.147571,0.283408,0.38415,0.745006,0.863849,0.848919
LinearRegression,0.151227,0.290445,0.388879,0.750589,0.866457,0.854528,0.149866,0.290985,0.387125,0.741041,0.862482,0.852507
KNeighborsRegressor,0.160306,0.302871,0.400382,0.735616,0.858192,0.832382,0.169016,0.307547,0.411115,0.707951,0.842485,0.825084
SVR,0.152938,0.288693,0.391072,0.747767,0.86481,0.848001,0.146551,0.2862,0.38282,0.746768,0.865245,0.847982


In [61]:
results_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Results_10_folds_stacked_archi_Caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Prediction_data_10_folds_stacked_archi_Caco2.csv')

In [62]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 15-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=15, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=15, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (15 fold CV)': mse_train,
        'Train MAE (15 fold CV)': mae_train,
        'Train RMSE (15 fold CV)': rmse_train,
        'Train R2 (15 fold CV)': r2_train,
        'Train PCC (15 fold CV)': pearson_train,
        'Train SCC (15 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53739
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 251
[LightGBM] [Info] Start training from score -6.283466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53755
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 251
[LightGBM] [Info] Start training from score -6.294726
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53756
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 251
[LightGBM] [Info] Start t


Training models: 1it [00:09,  9.31s/it][A
Training models: 2it [00:17,  8.92s/it][A
Training models: 3it [01:49, 46.87s/it][A
Training models: 4it [02:16, 38.95s/it][A
Training models: 5it [02:28, 28.99s/it][A
Training models: 6it [02:33, 20.88s/it][A
Training models: 7it [02:33, 14.26s/it][A

Training models: 9it [02:53, 12.70s/it][A
Training models: 10it [02:57, 17.72s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [02:57<08:51, 177.16s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2578
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 699
[LightGBM] [Info] Start training from score -6.283466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2591
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 703
[LightGBM] [Info] Start training from score -6.294726
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi


Training models: 1it [00:10, 10.61s/it][A
Training models: 2it [00:16,  8.10s/it][A
Training models: 3it [00:35, 12.95s/it][A
Training models: 4it [00:47, 12.59s/it][A
Training models: 5it [00:57, 11.45s/it][A
Training models: 6it [01:02,  9.47s/it][A
Training models: 7it [01:03,  6.62s/it][A
Training models: 8it [01:07,  5.64s/it][A
Training models: 9it [01:23,  8.86s/it][A
Training models: 10it [01:24,  8.44s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [04:21<04:05, 122.59s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 760
[LightGBM] [Info] Start training from score -6.283466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 760
[LightGBM] [Info] Start training from score -6.294726
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 760
[LightGBM] [Info] Star


Training models: 1it [00:09,  9.45s/it][A
Training models: 2it [00:27, 14.75s/it][A
Training models: 3it [06:08, 163.51s/it][A
Training models: 4it [07:43, 136.64s/it][A
Training models: 5it [08:14, 98.56s/it] [A
Training models: 6it [08:21, 67.29s/it][A
Training models: 7it [08:22, 45.51s/it][A
Training models: 8it [08:25, 31.97s/it][A
Training models: 9it [08:47, 29.06s/it][A
Training models: 10it [08:59, 53.91s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [13:20<05:12, 312.77s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 9
[LightGBM] [Info] Start training from score -6.283466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 9
[LightGBM] [Info] Start training from score -6.294726
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training models: 1it [00:05,  5.57s/it][A




Training models: 2it [00:10,  5.34s/it][A
Training models: 3it [00:11,  3.26s/it][A
Training models: 4it [00:11,  2.13s/it][A
Training models: 5it [00:14,  2.32s/it][A
Training models: 6it [00:18,  2.95s/it][A

Training models: 10it [00:24,  2.43s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [13:44<00:00, 206.24s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (1008, 40)
Dimensions of meta_features_test: (252, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9795
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 40
[LightGBM] [Info] Start training from score -6.283466




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9808
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 40
[LightGBM] [Info] Start training from score -6.294726




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000918 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9809
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 40
[LightGBM] [Info] Start training from score -6.286257




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9808
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.289338




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9798
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.285950




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9807
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.287022




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9797
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.276292




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9804
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.275942




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9814
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.290368




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9803
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.294256




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9800
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.278588




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9807
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.281485




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9811
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.285528




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000928 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9804
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.286695




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9798
[LightGBM] [Info] Number of data points in the train set: 941, number of used features: 40
[LightGBM] [Info] Start training from score -6.278984




LGBMRegressor Evaluation completed: Test R2 score: 0.7484132027182668
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.7422595455233149
RandomForestRegressor Evaluation completed: Test R2 score: 0.7522682320122523
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.7448191389317815
AdaBoostRegressor Evaluation completed: Test R2 score: 0.7129057387503221
XGBRegressor Evaluation completed: Test R2 score: 0.7456950615003126
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.7405408799819135
LinearRegression Evaluation completed: Test R2 score: 0.7448750857188178
KNeighborsRegressor Evaluation completed: Test R2 score: 0.6986184184414312
SVR Evaluation completed: Test R2 score: 0.7496407170712985




MLPRegressor Evaluation completed: Test R2 score: 0.7201846739756324




Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.138781,0.281379,0.372533,0.771115,0.878297,0.858302,0.145599,0.284723,0.381575,0.748413,0.865909,0.849685
DecisionTreeRegressor,0.276678,0.390456,0.526002,0.543688,0.771085,0.751175,0.14916,0.28705,0.386213,0.74226,0.862916,0.845664
RandomForestRegressor,0.136046,0.27715,0.368844,0.775626,0.880698,0.859618,0.143368,0.281294,0.37864,0.752268,0.868129,0.853225
GradientBoostingRegressor,0.140135,0.280946,0.374347,0.768881,0.877017,0.855443,0.147679,0.2868,0.38429,0.744819,0.86394,0.847147
AdaBoostRegressor,0.154673,0.303994,0.393284,0.744906,0.863583,0.842,0.166148,0.311375,0.407613,0.712906,0.847116,0.838879
XGBRegressor,0.156615,0.297704,0.395746,0.741703,0.862023,0.841742,0.147172,0.287893,0.38363,0.745695,0.864243,0.848203
ExtraTreesRegressor,0.134415,0.27525,0.366627,0.778316,0.882228,0.862582,0.150155,0.286764,0.387498,0.740541,0.861294,0.846426
LinearRegression,0.140423,0.284186,0.374731,0.768407,0.876667,0.861608,0.147647,0.28923,0.384248,0.744875,0.864491,0.855644
KNeighborsRegressor,0.155587,0.300242,0.394445,0.743398,0.862451,0.839192,0.174417,0.307937,0.417632,0.698618,0.836646,0.817156
SVR,0.142655,0.284328,0.377697,0.764725,0.87452,0.85193,0.144889,0.284136,0.380643,0.749641,0.866472,0.853639


In [63]:
results_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Results_15_folds_stacked_archi_Caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Prediction_data_15_folds_stacked_archi_Caco2.csv')

In [64]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 20-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=20, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=20, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (20 fold CV)': mse_train,
        'Train MAE (20 fold CV)': mae_train,
        'Train RMSE (20 fold CV)': rmse_train,
        'Train R2 (20 fold CV)': r2_train,
        'Train PCC (20 fold CV)': pearson_train,
        'Train SCC (20 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53770
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 251
[LightGBM] [Info] Start training from score -6.284925
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53774
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 251
[LightGBM] [Info] Start training from score -6.284310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53781
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 251
[LightGBM] [Info] Start t


Training models: 1it [00:12, 12.93s/it][A
Training models: 2it [00:24, 12.02s/it][A
Training models: 3it [02:28, 63.46s/it][A
Training models: 4it [03:05, 52.70s/it][A
Training models: 5it [03:20, 39.05s/it][A
Training models: 6it [03:26, 28.09s/it][A
Training models: 7it [03:27, 19.21s/it][A

Training models: 9it [03:54, 17.06s/it][A
Training models: 10it [03:59, 23.90s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [03:59<11:57, 239.05s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2608
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 708
[LightGBM] [Info] Start training from score -6.284925
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 705
[LightGBM] [Info] Start training from score -6.284310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi


Training models: 1it [00:13, 13.66s/it][A
Training models: 2it [00:22, 10.59s/it][A
Training models: 3it [00:47, 17.33s/it][A
Training models: 4it [01:03, 16.96s/it][A
Training models: 5it [01:16, 15.24s/it][A
Training models: 6it [01:23, 12.63s/it][A
Training models: 7it [01:24,  8.80s/it][A
Training models: 8it [01:29,  7.51s/it][A
Training models: 9it [01:54, 13.06s/it][A
Training models: 10it [01:56, 11.64s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [05:55<05:33, 166.89s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 760
[LightGBM] [Info] Start training from score -6.284925
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 760
[LightGBM] [Info] Start training from score -6.284310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 760
[LightGBM] [Info] Star


Training models: 1it [00:12, 12.63s/it][A
Training models: 2it [00:36, 19.06s/it][A
Training models: 3it [08:19, 222.14s/it][A
Training models: 4it [10:29, 185.67s/it][A
Training models: 5it [11:11, 133.94s/it][A
Training models: 6it [11:20, 91.29s/it] [A
Training models: 7it [11:21, 61.76s/it][A
Training models: 8it [11:25, 43.36s/it][A
Training models: 9it [11:53, 38.66s/it][A
Training models: 10it [12:09, 72.90s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [18:04<07:03, 423.58s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.077324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 9
[LightGBM] [Info] Start training from score -6.284925
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 9
[LightGBM] [Info] Start training from score -6.284310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training models: 1it [00:07,  7.60s/it][A




Training models: 2it [00:14,  7.24s/it][A
Training models: 3it [00:15,  4.42s/it][A
Training models: 4it [00:16,  2.87s/it][A
Training models: 5it [00:19,  2.94s/it][A
Training models: 6it [00:24,  3.77s/it][A
Training models: 7it [00:24,  2.57s/it][A

Training models: 10it [00:32,  3.21s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [18:36<00:00, 279.13s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (1008, 40)
Dimensions of meta_features_test: (252, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065843 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9844
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.284925




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9841
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.284310




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9843
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.290182




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9846
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.289855




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9833
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.289846




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9841
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.286311




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9835
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.283376




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9837
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 40
[LightGBM] [Info] Start training from score -6.286658




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9839
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.278916




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9838
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.276664




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9845
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.282745




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9842
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.290288




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9835
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.295392




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000937 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9833
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.279925




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9842
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.280986




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9843
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.282409




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000920 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9843
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.280643




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000928 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9842
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.291410




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9839
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.285486




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000935 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9843
[LightGBM] [Info] Number of data points in the train set: 958, number of used features: 40
[LightGBM] [Info] Start training from score -6.279535




LGBMRegressor Evaluation completed: Test R2 score: 0.7442433843047853
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.7237275505780998
RandomForestRegressor Evaluation completed: Test R2 score: 0.747604673354638
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.7440185380341222
AdaBoostRegressor Evaluation completed: Test R2 score: 0.7240135106374106
XGBRegressor Evaluation completed: Test R2 score: 0.7320089066097493
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.7442110723074681
LinearRegression Evaluation completed: Test R2 score: 0.7430037000171565
KNeighborsRegressor Evaluation completed: Test R2 score: 0.6990414292845415
SVR Evaluation completed: Test R2 score: 0.742286449142805




MLPRegressor Evaluation completed: Test R2 score: 0.7132801241959464


Unnamed: 0,Train MSE (20 fold CV),Train MAE (20 fold CV),Train RMSE (20 fold CV),Train R2 (20 fold CV),Train PCC (20 fold CV),Train SCC (20 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.144796,0.285493,0.38052,0.761195,0.87265,0.852499,0.148012,0.287533,0.384724,0.744243,0.863093,0.850617
DecisionTreeRegressor,0.264988,0.386069,0.51477,0.562967,0.773974,0.764317,0.159885,0.301789,0.399857,0.723728,0.852391,0.840202
RandomForestRegressor,0.140866,0.279415,0.375322,0.767676,0.87618,0.859886,0.146067,0.278888,0.382187,0.747605,0.865297,0.852456
GradientBoostingRegressor,0.140961,0.27969,0.375448,0.767519,0.876184,0.860036,0.148142,0.284196,0.384893,0.744019,0.863714,0.849767
AdaBoostRegressor,0.157096,0.300576,0.396353,0.740909,0.861398,0.840698,0.15972,0.300592,0.39965,0.724014,0.853147,0.843551
XGBRegressor,0.154853,0.294281,0.393514,0.744608,0.863644,0.84245,0.155093,0.287789,0.393818,0.732009,0.856449,0.845644
ExtraTreesRegressor,0.138215,0.276321,0.371773,0.772049,0.878671,0.861394,0.148031,0.28293,0.384748,0.744211,0.863235,0.850443
LinearRegression,0.141875,0.284878,0.376664,0.766012,0.875297,0.860234,0.14873,0.290816,0.385655,0.743004,0.863023,0.853195
KNeighborsRegressor,0.152791,0.302093,0.390885,0.748009,0.86514,0.839466,0.174172,0.30761,0.417339,0.699041,0.836832,0.816638
SVR,0.144352,0.286002,0.379937,0.761926,0.872949,0.852179,0.149145,0.284828,0.386193,0.742286,0.862271,0.849236


In [65]:
results_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Results_20_folds_stacked_archi_Caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/Stacked_architecture/Prediction_data_20_folds_stacked_archi_Caco2.csv')

In [78]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import joblib

# Ensure the models directory exists
os.makedirs('/home/users/akshay/PCPpred/Caco2/models_Caco2/', exist_ok=True)

# Assuming remove_low_variance_columns and features functions are defined elsewhere
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_desc = features(train, "Permeability")
joblib.dump(selected_features_desc, '/home/users/akshay/PCPpred/Caco2/models_Caco2/selected_features_descriptors.joblib')
df_desc_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_desc]], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test = df_desc_test.dropna()
df_desc_test = df_desc_test[df_desc_train.columns]

# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Train/All_fingerprints_train_Caco2.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_fp = features(train, "Permeability")
joblib.dump(selected_features_fp, '/home/users/akshay/PCPpred/Caco2/models_Caco2/selected_features_fingerprints.joblib')
df_fp_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_fp]], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Test/All_fingerprints_test_Caco2.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test = df_fp_test[df_fp_train.columns]

# Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_emb = features(train, "Permeability")
joblib.dump(selected_features_emb, '/home/users/akshay/PCPpred/Caco2/models_Caco2/selected_features_embeddings.joblib')
df_emb_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_emb]], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test = df_emb_test[df_emb_train.columns]

# Atomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Atomic/Train_all_atomic_desc_Caco2.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_atomic = features(train, "Permeability")
joblib.dump(selected_features_atomic, '/home/users/akshay/PCPpred/Caco2/models_Caco2/selected_features_atomic.joblib')
df_atomic_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_atomic]], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Atomic/Test_all_atomic_desc_Caco2.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test = df_atomic_test[df_atomic_train.columns]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Filter dataframes to have consistent IDs
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]
df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]
df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

target_column = 'Permeability'

def scale_features(df_train, df_test, feature_type):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    # Save the scaler
    joblib.dump(scaler, f'/home/users/akshay/PCPpred/Caco2/models_Caco2/scaler_{feature_type}.joblib')
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test, 'Descriptor')
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test, 'Fingerprints')
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test, 'Embeddings')
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test , 'Atomic')

models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]

dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
data_names = ['descriptors', 'fingerprints', 'embeddings', 'atomic']

meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 5-fold cross-validation
for df_idx, (df_train, df_test) in enumerate(tqdm(dataframes, desc="Processing dataframe pairs")):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=5, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for fold_idx, (train_index, val_index) in enumerate(kf.split(X_weak)):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)
            
            model_name = model.__class__.__name__
            joblib.dump(model, f'/home/users/akshay/PCPpred/Caco2/models_Caco2/weak_{data_names[df_idx]}_{model_name}_fold_{fold_idx}.joblib')

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -4.0)

            test_predictions_fold = np.clip(model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)
    
    joblib.dump(fold_meta_features_train, f'/home/users/akshay/PCPpred/Caco2/models_Caco2/meta_features_train_{data_names[df_idx]}.joblib')
    joblib.dump(fold_meta_features_test, f'/home/users/akshay/PCPpred/Caco2/models_Caco2/meta_features_test_{data_names[df_idx]}.joblib')

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

joblib.dump(meta_features_train, '/home/users/akshay/PCPpred/Caco2/models_Caco2/meta_features_train_combined.joblib')
joblib.dump(meta_features_test, '/home/users/akshay/PCPpred/Caco2/models_Caco2/meta_features_test_combined.joblib')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=5, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for fold_idx, (train_index, val_index) in enumerate(kf.split(meta_features_train)):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        
        joblib.dump(model, f'/home/users/akshay/PCPpred/Caco2/models_Caco2/meta_{model_name}_fold_{fold_idx}.joblib')

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Test actual': y_eval,
        'Test prediction folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_mean,
    })

    results[model_name] = {
        'Train MSE (5 fold CV)': mse_train,
        'Train MAE (5 fold CV)': mae_train,
        'Train RMSE (5 fold CV)': rmse_train,
        'Train R2 (5 fold CV)': r2_train,
        'Train PCC (5 fold CV)': pearson_train,
        'Train SCC (5 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T

  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(1008, 262)
(252, 262)
(1008, 916)
(252, 916)
(1008, 763)
(252, 763)
(1008, 12)
(252, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       ID                                             SMILES  Permeability  \
875    33  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -5.810   
846    40  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -6.340   
841    41  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -6.560   
932   927  CC(C)C[C@@H]1NC(=O)CNC(=O)[C@@H]2CCCN2[C@H](C(...        -6.400   
881   982  CC[C@H](C)[C@@H]1NC(

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53544
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 251
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53503
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53502
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start t


Training models:  10%|█         | 1/10 [00:03<00:35,  3.98s/it][A




Training models:  20%|██        | 2/10 [00:07<00:30,  3.83s/it][A
Training models:  30%|███       | 3/10 [00:33<01:37, 13.97s/it][A
Training models:  40%|████      | 4/10 [00:41<01:08, 11.49s/it][A
Training models:  50%|█████     | 5/10 [00:45<00:43,  8.66s/it][A
Training models:  60%|██████    | 6/10 [00:47<00:25,  6.41s/it][A
Training models:  70%|███████   | 7/10 [00:47<00:13,  4.50s/it][A
Training models:  80%|████████  | 8/10 [00:48<00:06,  3.22s/it][A
Training models:  90%|█████████ | 9/10 [00:53<00:03,  3.75s/it][A
Training models: 100%|██████████| 10/10 [00:54<00:00,  5.41s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:54<02:42, 54.10s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2431
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 654
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 671
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi


Training models:  10%|█         | 1/10 [00:04<00:40,  4.48s/it][A
Training models:  20%|██        | 2/10 [00:07<00:30,  3.78s/it][A
Training models:  30%|███       | 3/10 [00:13<00:32,  4.59s/it][A
Training models:  40%|████      | 4/10 [00:16<00:24,  4.16s/it][A
Training models:  50%|█████     | 5/10 [00:20<00:19,  3.84s/it][A
Training models:  60%|██████    | 6/10 [00:22<00:13,  3.27s/it][A
Training models:  70%|███████   | 7/10 [00:23<00:07,  2.64s/it][A
Training models:  80%|████████  | 8/10 [00:25<00:05,  2.51s/it][A
Training models:  90%|█████████ | 9/10 [00:29<00:02,  2.82s/it][A
Training models: 100%|██████████| 10/10 [00:29<00:00,  2.97s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [01:23<01:19, 39.79s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Star


Training models:  10%|█         | 1/10 [00:02<00:25,  2.88s/it][A
Training models:  20%|██        | 2/10 [00:08<00:34,  4.36s/it][A
Training models:  30%|███       | 3/10 [01:43<05:22, 46.06s/it][A
Training models:  40%|████      | 4/10 [02:11<03:52, 38.83s/it][A
Training models:  50%|█████     | 5/10 [02:21<02:22, 28.48s/it][A
Training models:  60%|██████    | 6/10 [02:24<01:18, 19.67s/it][A
Training models:  70%|███████   | 7/10 [02:25<00:40, 13.52s/it][A
Training models:  80%|████████  | 8/10 [02:26<00:19,  9.56s/it][A
Training models:  90%|█████████ | 9/10 [02:30<00:07,  7.87s/it][A
Training models: 100%|██████████| 10/10 [02:33<00:00, 15.36s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [03:57<01:31, 91.81s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training models:  10%|█         | 1/10 [00:01<00:14,  1.65s/it][A




Training models:  20%|██        | 2/10 [00:03<00:14,  1.77s/it][A
Training models:  30%|███       | 3/10 [00:03<00:07,  1.09s/it][A
Training models:  40%|████      | 4/10 [00:04<00:04,  1.33it/s][A
Training models:  50%|█████     | 5/10 [00:04<00:03,  1.31it/s][A
Training models:  60%|██████    | 6/10 [00:06<00:03,  1.03it/s][A
Training models:  80%|████████  | 8/10 [00:06<00:01,  1.85it/s][A
Training models: 100%|██████████| 10/10 [00:08<00:00,  1.13it/s][A
Processing dataframe pairs: 100%|██████████| 4/4 [04:06<00:00, 61.61s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (1008, 40)
Dimensions of meta_features_test: (252, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9279
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9252
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9100
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 40
[LightGBM] [Info] Start training from score -6.270552




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9190
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 40
[LightGBM] [Info] Start training from score -6.284236




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9254
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 40
[LightGBM] [Info] Start training from score -6.280596








In [67]:
#Saving best model
#2d RDKit descriptors const removal
import os
import joblib 


def train_and_test_predict(models, X_train, y_train, X_test, y_test, save_dir='models_Caco2_2dRDKit'):
   
    os.makedirs(save_dir, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        fold_no = 1
        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            fold_model_path = os.path.join(save_dir, f"{model_name}_fold{fold_no}_Caco2.joblib")
            joblib.dump(model, fold_model_path)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -4.0)
            test_predictions_folds.append(predictions_test_fold)

            fold_no += 1

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df

df_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_RDKit_des_Caco2.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Test_2d_RDKit_des_Caco2.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Saving the scaler and const_col
joblib.dump(scaler, '/home/users/akshay/PCPpred/Caco2/models_Caco2_2dRDKit/scaler_caco2.joblib')
joblib.dump(const_col, '/home/users/akshay/PCPpred/Caco2/models_Caco2_2dRDKit/const_col_lgbmreg_caco2.joblib')

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df


X_train shape:  (1008, 170)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 170)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18045
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 147
[LightGBM] [Info] Start training from score -6.296846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18192
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 148
[LightGBM] [Info] Start training from score -6.274773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1923,0.3182,0.4385,0.6829,0.8265,0.8108,0.1488,0.2965,0.3857,0.7429,0.8639,0.8564


In [68]:
models_dir = '/home/users/akshay/PCPpred/Caco2/models_Caco2_2dRDKit' 
scaler_path = '/home/users/akshay/PCPpred/Caco2/models_Caco2_2dRDKit/scaler_caco2.joblib' 
const_col_path =   'models_Caco2_2dRDKit/const_col_lgbmreg_caco2.joblib'                         
model_base_name = 'LGBMRegressor'                   
n_folds = 5                                    
const_col = joblib.load('/home/users/akshay/PCPpred/Caco2/models_Caco2_2dRDKit/const_col_lgbmreg_caco2.joblib')
print(const_col)

df_new_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Test_2d_RDKit_des_Caco2.csv')


df_new_test = df_new_test.drop(columns=const_col, errors='ignore') 

X_new_test_features = df_new_test.drop(columns=['ID', 'SMILES','Permeability'], errors='ignore')
y_test = df_new_test['Permeability']

scaler = joblib.load(scaler_path)
X_new_scaled = scaler.transform(X_new_test_features)
X_new_scaled = pd.DataFrame(X_new_scaled, columns=X_new_test_features.columns,index=X_new_test_features.index)

all_fold_preds = []

for fold in range(1, n_folds + 1):
    fold_model_path = os.path.join(models_dir, f"{model_base_name}_fold{fold}_Caco2.joblib")
    fold_model = joblib.load(fold_model_path)
    preds = fold_model.predict(X_new_scaled)
    preds = np.clip(preds, -10, -4.0)  
    all_fold_preds.append(preds)


all_fold_preds = np.array(all_fold_preds)
mean_prediction = np.mean(all_fold_preds, axis=0)

mse_test = mean_squared_error(y_test, mean_prediction)
print(f"{mse_test:.4f}")
mae_test = mean_absolute_error(y_test, mean_prediction)
print(f"{mae_test:.4f}")
rmse_test = np.sqrt(mse_test)
print(f"{rmse_test:.4f}")
r2_test = r2_score(y_test, mean_prediction)
print(f"{r2_test:.4f}")
pearson_test, _ = pearsonr(y_test, mean_prediction)
print(f"{pearson_test:.4f}")
spearman_test, _ = spearmanr(y_test, mean_prediction)
print(f"{spearman_test:.4f}")

print("Prediction on new data complete.")


['NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'fr_ArN', 'fr_Ar_COO', 'fr_HOCCN', 'fr_Imine', 'fr_N_O', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_amidine', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzodiazepine', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_hdrzine', 'fr_hdrzone', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperzine', 'fr_prisulfonamd', 'fr_quatN', 'fr_sulfide', 'fr_sulfone', 'fr_term_acetylene', 'fr_thiocyan', 'fr_thiophene', 'fr_urea']
0.1488
0.2965
0.3857
0.7429
0.8639
0.8564
Prediction on new data complete.


In [69]:
#Ablation study
import os
import joblib
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [70]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [71]:
# 2D and 3D descriptors dataframes
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Train/All_fingerprints_train_Caco2.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Test/All_fingerprints_test_Caco2.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Atomic/Train_all_atomic_desc_Caco2.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Atomic/Test_all_atomic_desc_Caco2.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

target_column = 'Permeability'
df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(1008, 262)
(252, 262)
(1008, 916)
(252, 916)
(1008, 763)
(252, 763)
(1008, 12)
(252, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(1008, 262)
(252, 262)
(1008, 916)
(252, 916)
(1008, 763)
(252, 763)
(1008, 12)
(252, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       ID                                             SMILES  Permeability  \
875    33  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...        -5.810   

In [72]:
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),

]
models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500)
]

dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

In [73]:
ablation_results = {}

for ablation_idx in range(len(dataframes)):
    print(f"========== Ablation: Excluding feature at index {ablation_idx} ==========")
    feature_names = ['Descriptor', 'Fingerprints', 'Embeddings', 'Atomic']
    print(f"========== Ablation: Excluding feature :-- {feature_names[ablation_idx]} ==========")

    ablated_dataframes = [pair for i, pair in enumerate(dataframes) if i != ablation_idx]

    meta_features_train = []
    meta_features_test = []

    # Stage 1
    for df_train, df_test in tqdm(ablated_dataframes, desc="Processing ablated dataframes"):
        X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
        y_weak = df_train[target_column]
        X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
        y_eval = df_test[target_column]

        kf = KFold(n_splits=5, shuffle=True, random_state=101)

        fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
        fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

        for i, model in tqdm(enumerate(models_weak), desc="Training weak models", total=len(models_weak)):
            fold_predictions = np.zeros(X_weak.shape[0])
            test_predictions_folds = []

            for train_index, val_index in kf.split(X_weak):
                X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
                y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

                model.fit(X_train, y_train)

                fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -4.0)
                test_predictions_fold = np.clip(model.predict(X_eval), -10, -4.0)
                test_predictions_folds.append(test_predictions_fold)

            fold_meta_features_train[:, i] = fold_predictions
            fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
            print(f'Model training done {i}: {model.__class__.__name__}')

        meta_features_train.append(fold_meta_features_train)
        meta_features_test.append(fold_meta_features_test)
        print('Dataframe training completed')

    # Stack all meta-features
    meta_features_train = np.hstack(meta_features_train)
    meta_features_test = np.hstack(meta_features_test)

    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
    print('Stage 1 completed (Weak Learners)')
    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

    # Stage 2
    results = {}
    kf = KFold(n_splits=5, shuffle=True, random_state=101)

    for model in models_meta:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        for train_index, val_index in kf.split(meta_features_train):
            X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
            y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_fold_train, y_fold_train)
            y_pred_fold = np.clip(model.predict(X_fold_val), -10, -4.0)

            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_fold_val)

            test_predictions_fold = model.predict(meta_features_test)
            test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        mse_test = mean_squared_error(y_eval, predictions_test_mean)
        mae_test = mean_absolute_error(y_eval, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_eval, predictions_test_mean)
        pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
        spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

        results[model_name] = {
            'Train MSE (5 fold CV)': mse_train,
            'Train MAE (5 fold CV)': mae_train,
            'Train RMSE (5 fold CV)': rmse_train,
            'Train R2 (5 fold CV)': r2_train,
            'Train PCC (5 fold CV)': pearson_train,
            'Train SCC (5 fold CV)': spearman_train,
            'Test MSE': mse_test,
            'Test MAE': mae_test,
            'Test RMSE': rmse_test,
            'Test R2': r2_test,
            'Test PCC': pearson_test,
            'Test SCC': spearman_test,
        }

    ablation_results[f"Ablation_{feature_names[ablation_idx]}"] = pd.DataFrame(results).T

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Ablation Study Completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# To view the results
ablation_results_df = {key: value for key, value in ablation_results.items()}




Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2431
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 654
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 671
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training weak models:  10%|█         | 1/10 [00:03<00:27,  3.09s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:20,  2.59s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:10<00:27,  3.89s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:14<00:22,  3.74s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:17<00:17,  3.48s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:19<00:11,  2.92s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:19<00:06,  2.06s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:20<00:03,  1.78s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:24<00:02,  2.39s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:24<00:00,  2.47s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:24<00:49, 24.72s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011647 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:03<00:28,  3.13s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:08<00:36,  4.51s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [01:44<05:23, 46.15s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [02:11<03:52, 38.75s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [02:22<02:22, 28.49s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [02:24<01:17, 19.49s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [02:24<00:39, 13.21s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [02:25<00:18,  9.31s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [02:28<00:07,  7.52s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [02:32<00:00, 15.20s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [02:56<01:39, 99.60s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training weak models:  10%|█         | 1/10 [00:01<00:16,  1.79s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:03<00:14,  1.80s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:03<00:07,  1.10s/it][A
Training weak models:  40%|████      | 4/10 [00:03<00:04,  1.41it/s][A

Model training done 2: GradientBoostingRegressor
Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:04<00:04,  1.23it/s][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:06<00:03,  1.04it/s][A
Training weak models:  80%|████████  | 8/10 [00:06<00:01,  1.88it/s][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:08<00:00,  1.13it/s][A
Processing ablated dataframes: 100%|██████████| 3/3 [03:05<00:00, 61.87s/it]

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6858
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.294177





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6834
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.295407








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6691
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.270552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6777
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.284236




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6849
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.280596










Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53544
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 251
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006740 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53503
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53502
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start t


Training weak models:  10%|█         | 1/10 [00:03<00:28,  3.14s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:24,  3.02s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:32<01:34, 13.51s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:39<01:07, 11.20s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:43<00:42,  8.49s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:45<00:24,  6.15s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:45<00:12,  4.22s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:45<00:06,  3.03s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:50<00:03,  3.60s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:51<00:00,  5.16s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:51<01:43, 51.59s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:06<00:55,  6.15s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:12<00:48,  6.05s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [01:47<05:29, 47.00s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [02:15<03:56, 39.38s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [02:25<02:24, 28.82s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [02:27<01:18, 19.72s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [02:28<00:40, 13.37s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [02:29<00:18,  9.44s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [02:33<00:07,  7.74s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [02:36<00:00, 15.62s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [03:27<01:53, 113.14s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092578 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training weak models:  10%|█         | 1/10 [00:01<00:16,  1.79s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:03<00:14,  1.82s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:03<00:07,  1.11s/it][A
Training weak models:  40%|████      | 4/10 [00:04<00:04,  1.40it/s][A

Model training done 2: GradientBoostingRegressor
Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:04<00:04,  1.25it/s][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:06<00:03,  1.02it/s][A
Training weak models:  80%|████████  | 8/10 [00:06<00:01,  1.86it/s][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:08<00:00,  1.12it/s][A
Processing ablated dataframes: 100%|██████████| 3/3 [03:36<00:00, 72.26s/it] 

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6878
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.294177





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6845
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.295407




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6777
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.270552




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6784
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.284236




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000733 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6850
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.280596










Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53544
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 251
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53503
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53502
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start t


Training weak models:  10%|█         | 1/10 [00:03<00:28,  3.20s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:24,  3.04s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:32<01:34, 13.53s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:39<01:07, 11.21s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:43<00:42,  8.45s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:44<00:24,  6.11s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:45<00:12,  4.19s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:45<00:06,  3.01s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:50<00:03,  3.47s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:51<00:00,  5.11s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:51<01:42, 51.10s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2431
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 654
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 671
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training weak models:  10%|█         | 1/10 [00:03<00:29,  3.24s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:20,  2.61s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:10<00:27,  3.90s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:14<00:22,  3.74s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:17<00:17,  3.52s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:19<00:11,  2.96s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:19<00:06,  2.08s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:20<00:03,  1.79s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:24<00:02,  2.40s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:24<00:00,  2.49s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [01:16<00:35, 35.70s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 9
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c


Training weak models:  10%|█         | 1/10 [00:01<00:16,  1.80s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:03<00:13,  1.72s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:03<00:07,  1.06s/it][A
Training weak models:  40%|████      | 4/10 [00:03<00:04,  1.47it/s][A

Model training done 2: GradientBoostingRegressor
Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:04<00:03,  1.28it/s][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:06<00:03,  1.05it/s][A
Training weak models:  80%|████████  | 8/10 [00:06<00:01,  1.90it/s][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:08<00:00,  1.15it/s][A
Processing ablated dataframes: 100%|██████████| 3/3 [01:24<00:00, 28.24s/it]

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6852
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.294177





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6831
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.295407




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6671
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.270552




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6768
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.284236




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6833
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.280596










Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53544
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 251
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004921 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53503
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005487 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53502
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 250
[LightGBM] [Info] Start t


Training weak models:  10%|█         | 1/10 [00:03<00:28,  3.22s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:23,  2.92s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:31<01:34, 13.46s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:39<01:07, 11.17s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:43<00:42,  8.47s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:45<00:24,  6.22s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:45<00:12,  4.27s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:45<00:06,  3.06s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:50<00:03,  3.52s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:51<00:00,  5.14s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:51<01:42, 51.36s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019949 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2431
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 654
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2478
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 671
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.122352 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training weak models:  10%|█         | 1/10 [00:03<00:28,  3.17s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:20,  2.55s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:10<00:27,  3.86s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:14<00:22,  3.71s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:17<00:17,  3.50s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:19<00:11,  2.94s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:19<00:06,  2.09s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:20<00:03,  1.79s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:24<00:02,  2.32s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:24<00:00,  2.45s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [01:15<00:35, 35.58s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010928 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193800
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 760
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:03<00:29,  3.31s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:08<00:34,  4.37s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [01:44<05:22, 46.05s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [02:11<03:52, 38.68s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [02:22<02:22, 28.53s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [02:24<01:18, 19.55s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [02:24<00:39, 13.25s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [02:25<00:18,  9.36s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [02:29<00:07,  7.67s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [02:32<00:00, 15.25s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [03:48<00:00, 76.13s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7249
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.294177





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7246
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.295407




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7161
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 30
[LightGBM] [Info] Start training from score -6.270552




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7241
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.284236




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7230
[LightGBM] [Info] Number of data points in the train set: 807, number of used features: 30
[LightGBM] [Info] Start training from score -6.280596








XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Ablation Study Completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [74]:
ablation_results

{'Ablation_Descriptor':                            Train MSE (5 fold CV)  Train MAE (5 fold CV)  \
 LGBMRegressor                           0.158338               0.294101   
 DecisionTreeRegressor                   0.289521               0.402188   
 RandomForestRegressor                   0.149608               0.286725   
 GradientBoostingRegressor               0.152904               0.291063   
 AdaBoostRegressor                       0.172118               0.315982   
 XGBRegressor                            0.176798               0.314544   
 ExtraTreesRegressor                     0.150305               0.287463   
 LinearRegression                        0.154187               0.296893   
 KNeighborsRegressor                     0.173353               0.310813   
 SVR                                     0.163390               0.299419   
 MLPRegressor                            0.188291               0.330079   
 
                            Train RMSE (5 fold CV)  Train R2 (5

In [75]:
import os
import pickle

ablation_result_dir = '/home/users/akshay/PCPpred/Caco2/results/Ablation/'
os.makedirs(ablation_result_dir, exist_ok=True)

pickle_path = os.path.join(ablation_result_dir, 'ablation_results.pkl')
with open(pickle_path, 'wb') as f:
    pickle.dump(ablation_results, f)


with open(pickle_path, 'rb') as f:
    ablation_results = pickle.load(f)


ablation_results

{'Ablation_Descriptor':                            Train MSE (5 fold CV)  Train MAE (5 fold CV)  \
 LGBMRegressor                           0.158338               0.294101   
 DecisionTreeRegressor                   0.289521               0.402188   
 RandomForestRegressor                   0.149608               0.286725   
 GradientBoostingRegressor               0.152904               0.291063   
 AdaBoostRegressor                       0.172118               0.315982   
 XGBRegressor                            0.176798               0.314544   
 ExtraTreesRegressor                     0.150305               0.287463   
 LinearRegression                        0.154187               0.296893   
 KNeighborsRegressor                     0.173353               0.310813   
 SVR                                     0.163390               0.299419   
 MLPRegressor                            0.188291               0.330079   
 
                            Train RMSE (5 fold CV)  Train R2 (5

In [76]:
ablation_result_dir = '/home/users/akshay/PCPpred/Caco2/results/Ablation'
os.makedirs(ablation_result_dir, exist_ok=True)

for ablation_label, df in ablation_results.items():
    print(f"Results for {ablation_label}: \n")
    safe_label = ablation_label.replace(" ", "_").replace("/", "_")
    file_path = os.path.join(ablation_result_dir, f"{safe_label}.csv")
    df.to_csv(file_path)

Results for Ablation_Descriptor: 

Results for Ablation_Fingerprints: 

Results for Ablation_Embeddings: 

Results for Ablation_Atomic: 



In [77]:
from IPython.display import display
for ablation_label, df in ablation_results.items():
    print(f"Results for {ablation_label}: \n")
    display(df)

Results for Ablation_Descriptor: 



Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.158338,0.294101,0.397917,0.738861,0.859932,0.8444,0.164078,0.303597,0.405065,0.716483,0.847724,0.828361
DecisionTreeRegressor,0.289521,0.402188,0.538071,0.522507,0.763556,0.741629,0.202359,0.332568,0.449843,0.650336,0.811208,0.77653
RandomForestRegressor,0.149608,0.286725,0.386791,0.753259,0.867917,0.853036,0.16199,0.29644,0.402479,0.720091,0.849519,0.83471
GradientBoostingRegressor,0.152904,0.291063,0.39103,0.747822,0.864931,0.849723,0.158913,0.294054,0.398639,0.725407,0.852846,0.835209
AdaBoostRegressor,0.172118,0.315982,0.414871,0.716134,0.847147,0.8287,0.173399,0.318353,0.416412,0.700377,0.841169,0.826924
XGBRegressor,0.176798,0.314544,0.420474,0.708415,0.843291,0.831849,0.1645,0.300641,0.405586,0.715754,0.84761,0.830663
ExtraTreesRegressor,0.150305,0.287463,0.387691,0.75211,0.867244,0.852672,0.160573,0.301016,0.400716,0.722539,0.851049,0.837502
LinearRegression,0.154187,0.296893,0.392667,0.745706,0.863646,0.853803,0.154096,0.294056,0.39255,0.733732,0.858145,0.84411
KNeighborsRegressor,0.173353,0.310813,0.416357,0.714097,0.845478,0.825148,0.18277,0.319751,0.427516,0.684184,0.828843,0.799765
SVR,0.16339,0.299419,0.404215,0.730529,0.855025,0.840515,0.148218,0.286891,0.384991,0.743887,0.862643,0.848372


Results for Ablation_Fingerprints: 



Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.163424,0.29629,0.404257,0.730473,0.855211,0.842113,0.14137,0.287796,0.375992,0.755721,0.870632,0.852477
DecisionTreeRegressor,0.330961,0.414856,0.575293,0.454161,0.720668,0.707356,0.182576,0.317869,0.427289,0.68452,0.831783,0.806495
RandomForestRegressor,0.159795,0.291629,0.399744,0.736457,0.858258,0.846524,0.144796,0.286766,0.380521,0.7498,0.867222,0.851779
GradientBoostingRegressor,0.164412,0.295147,0.405477,0.728844,0.854133,0.842057,0.144322,0.287829,0.379897,0.75062,0.868517,0.855616
AdaBoostRegressor,0.182109,0.319344,0.426742,0.699656,0.837539,0.825181,0.166721,0.318466,0.408315,0.711916,0.848061,0.838182
XGBRegressor,0.175446,0.310137,0.418862,0.710646,0.844267,0.831917,0.143451,0.290573,0.378749,0.752125,0.868948,0.854876
ExtraTreesRegressor,0.155126,0.288394,0.393861,0.744157,0.862667,0.849591,0.148495,0.293721,0.385351,0.743409,0.863744,0.847454
LinearRegression,0.154394,0.290791,0.39293,0.745365,0.863446,0.8542,0.153784,0.296586,0.392153,0.73427,0.858801,0.85051
KNeighborsRegressor,0.185981,0.320583,0.431256,0.69327,0.83351,0.813471,0.167434,0.314954,0.409187,0.710684,0.8452,0.822254
SVR,0.157237,0.291074,0.396531,0.740677,0.860868,0.847754,0.142273,0.286808,0.37719,0.754161,0.868905,0.855918


Results for Ablation_Embeddings: 



Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.171908,0.30476,0.414618,0.716481,0.846603,0.833225,0.147541,0.290239,0.384111,0.745058,0.863947,0.843721
DecisionTreeRegressor,0.310027,0.418524,0.556801,0.488687,0.740877,0.728025,0.179872,0.313806,0.424114,0.689191,0.832881,0.814868
RandomForestRegressor,0.160902,0.296409,0.401126,0.734632,0.857127,0.841031,0.148076,0.285064,0.384806,0.744134,0.863321,0.847732
GradientBoostingRegressor,0.164741,0.296106,0.405883,0.728301,0.853743,0.838056,0.148092,0.286779,0.384827,0.744106,0.863957,0.842844
AdaBoostRegressor,0.184186,0.325007,0.429169,0.696231,0.835194,0.817472,0.166636,0.312699,0.408211,0.712063,0.847735,0.837089
XGBRegressor,0.179702,0.314586,0.423913,0.703625,0.839295,0.822799,0.15309,0.29307,0.391267,0.73547,0.858723,0.843806
ExtraTreesRegressor,0.157395,0.294622,0.396731,0.740416,0.860489,0.844628,0.150232,0.291634,0.387597,0.740408,0.861211,0.844301
LinearRegression,0.162636,0.29634,0.403282,0.731772,0.855562,0.848086,0.138787,0.283374,0.372542,0.760183,0.872834,0.855949
KNeighborsRegressor,0.195123,0.32583,0.441728,0.678192,0.824338,0.800873,0.180282,0.325029,0.424596,0.688484,0.830447,0.808964
SVR,0.176302,0.306012,0.419884,0.709233,0.842445,0.825357,0.140207,0.286046,0.374442,0.757731,0.870654,0.858834


Results for Ablation_Atomic: 



Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.163456,0.293771,0.404297,0.730419,0.855005,0.845316,0.147156,0.282907,0.383609,0.745723,0.864125,0.84854
DecisionTreeRegressor,0.302177,0.405041,0.549707,0.501633,0.74753,0.733744,0.160315,0.290106,0.400394,0.722985,0.851363,0.840208
RandomForestRegressor,0.153607,0.288526,0.391927,0.746664,0.864114,0.851188,0.148118,0.28363,0.384861,0.744061,0.863372,0.852331
GradientBoostingRegressor,0.160624,0.294426,0.400779,0.735091,0.85781,0.846602,0.147424,0.286162,0.383959,0.745259,0.863998,0.851639
AdaBoostRegressor,0.174562,0.315122,0.417807,0.712103,0.844807,0.827215,0.174677,0.317587,0.417943,0.698169,0.838709,0.82798
XGBRegressor,0.178299,0.310213,0.422255,0.705939,0.841427,0.828903,0.153222,0.290903,0.391436,0.735241,0.858993,0.84514
ExtraTreesRegressor,0.152798,0.287382,0.390894,0.747998,0.864885,0.85348,0.150377,0.28935,0.387785,0.740157,0.861451,0.848296
LinearRegression,0.150981,0.288247,0.388562,0.750995,0.866685,0.859378,0.142686,0.281813,0.377738,0.753447,0.869373,0.85765
KNeighborsRegressor,0.173767,0.31021,0.416854,0.713415,0.845396,0.828722,0.168573,0.311912,0.410577,0.708716,0.842637,0.824516
SVR,0.162908,0.299177,0.403619,0.731323,0.855504,0.844324,0.14151,0.280107,0.376178,0.755479,0.869527,0.857334
