In [1]:
print('start')

start


In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -4.0)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [4]:
#Monomeric models
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)

    df.columns = [clean_name(col) for col in df.columns]
    return df

In [9]:
#Monomer composition
df_mc_train = pd.read_csv('features/Monomeric/Train_mon_comp_MDCK.csv')
df_mc_train = clean_feature_names(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_MDCK.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(51, 385)
(51,)
(13, 385)
(13,)
0.2923202922753174
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 1
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 41, number of used feature



-1.2776285723570941




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.7296,0.5823,0.8542,-0.1279,0.2558,0.3734,0.4163,0.5215,0.6452,0.2923,0.5979,0.2865
LGBMRegressor,0.6446,0.6796,0.8028,0.0036,0.1308,0.1395,0.5276,0.5751,0.7263,0.1031,0.4182,0.3245
XGBRegressor,0.8507,0.6431,0.9224,-0.3151,0.2721,0.356,0.3258,0.4432,0.5708,0.4462,0.711,0.4628
DecisionTreeRegressor,1.0375,0.6894,1.0186,-0.6039,0.0966,0.2015,0.3747,0.4881,0.6121,0.363,0.6606,0.4077
RandomForestRegressor,0.6361,0.6084,0.7976,0.0166,0.2675,0.3387,0.3256,0.5001,0.5706,0.4465,0.6707,0.4353
GradientBoostingRegressor,0.781,0.6339,0.8837,-0.2073,0.1955,0.2986,0.3042,0.4338,0.5515,0.4829,0.7093,0.4628
AdaBoostRegressor,0.7158,0.6462,0.8461,-0.1066,0.1428,0.2387,0.258,0.4479,0.5079,0.5614,0.7532,0.562
SVR,0.7202,0.6786,0.8487,-0.1133,0.0479,-0.0587,0.3813,0.5052,0.6175,0.3518,0.6237,0.5785
LinearRegression,1.1792,0.8393,1.0859,-0.8229,0.179,0.2027,0.5803,0.5368,0.7618,0.0134,0.7032,0.6116
KNeighborsRegressor,0.857,0.7338,0.9258,-0.3249,0.1005,0.1947,0.3572,0.4786,0.5976,0.3928,0.6808,0.3917


In [10]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.048558046430011, -6.80190000000001, -5.657...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.8029044125550024, -5.753935149195002, -5....","[-5.700078470685001, -5.582273631594001, -5.96...","[0.3900394892473089, 0.09680365467181355, 0.28..."
1,LGBMRegressor,"[-5.72429500042461, -5.781901428122076, -5.781...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.72429500042461, -5.72429500042461, -5.781...","[-5.508673793612709, -5.508673793612709, -5.62...","[0.1956217934002597, 0.1956217934002597, 0.192..."
2,XGBRegressor,"[-6.2183466, -7.569528, -5.4499307, -5.596678,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.985038, -5.706633, -6.759508, -6.9262185,...","[-5.8299103, -5.498244, -6.1002665, -6.6718307...","[0.43846768, 0.26189202, 0.6996025, 0.5103311,..."
3,DecisionTreeRegressor,"[-5.22, -6.97, -5.06, -5.22, -6.929917083, -5....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.295, -5.22, -6.85, -6.927245587, -5.22, -...","[-5.832, -5.3859667324, -6.609017847999999, -6...","[1.138229326629744, 0.24822520835348863, 0.300..."
4,RandomForestRegressor,"[-5.865097338736674, -6.371899999999998, -5.77...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.692643656850002, -5.5981434520383395, -5....","[-5.653318263936335, -5.516528619457403, -5.86...","[0.10946989771195936, 0.05882262933001048, 0.2..."
5,GradientBoostingRegressor,"[-6.048170161042695, -6.6922024002120635, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.871766622783936, -5.6245522063437186, -6....","[-5.739107746439394, -5.420930018864825, -5.97...","[0.33465932282986494, 0.13293890920620488, 0.4..."
6,AdaBoostRegressor,"[-5.639976052304348, -6.77, -5.663214181444443...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.663214181444443, -5.600981205750001, -5.5...","[-5.574862276957454, -5.546830403219381, -5.62...","[0.13067365045804516, 0.08403370405403282, 0.2..."
7,SVR,"[-5.693891035066837, -5.65413016780909, -5.747...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.666661130052129, -5.701594019876477, -5.4...","[-5.531594480261154, -5.693265666274938, -5.46...","[0.14780547145157835, 0.029695389201786812, 0...."
8,LinearRegression,"[-4.126989483536487, -4.717425534753499, -6.53...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -4.0, -6.272531131732464, -6.922973078...","[-4.0, -5.632525333476553, -5.828073967233101,...","[0.0, 1.0051831044888202, 0.8372471943944391, ..."
9,KNeighborsRegressor,"[-5.1866666666666665, -6.07, -6.07, -6.0404790...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.2783009283333335, -5....","[-5.528666666666667, -5.258158495866667, -4.96...","[0.1545689346393881, 0.0726709195073452, 0.137..."


In [11]:
result_df.to_csv('results/Monomeric/Monomer_comp_results_MDCK.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_prediction_data_MDCK.csv')

In [5]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [13]:
df_mc_train = pd.read_csv('features/Monomeric/Train_mon_comp_MDCK.csv')
df_mc_train = clean_feature_names(df_mc_train)
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_MDCK.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(51, 33)
(51,)
(13, 33)
(13,)
0.2999025104491113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 1
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Numb



-1.2748725696629006




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.7537,0.5905,0.8682,-0.1651,0.2344,0.359,0.4118,0.5214,0.6417,0.2999,0.6024,0.2865
LGBMRegressor,0.6446,0.6796,0.8028,0.0036,0.1308,0.1395,0.5276,0.5751,0.7263,0.1031,0.4182,0.3245
XGBRegressor,0.8507,0.6431,0.9224,-0.3151,0.2721,0.356,0.3258,0.4432,0.5708,0.4462,0.711,0.4628
DecisionTreeRegressor,0.8154,0.6095,0.903,-0.2606,0.2759,0.3843,0.4612,0.5201,0.6791,0.216,0.5814,0.3636
RandomForestRegressor,0.6376,0.6071,0.7985,0.0143,0.2657,0.3357,0.32,0.4941,0.5657,0.456,0.6792,0.4187
GradientBoostingRegressor,0.8067,0.6449,0.8982,-0.247,0.1782,0.2606,0.3139,0.447,0.5602,0.4664,0.6983,0.4628
AdaBoostRegressor,0.7124,0.666,0.844,-0.1012,0.1452,0.1895,0.2491,0.4436,0.4991,0.5766,0.7709,0.5655
SVR,0.7202,0.6786,0.8487,-0.1134,0.0478,-0.0587,0.3813,0.5052,0.6175,0.3518,0.6237,0.5785
LinearRegression,1.1792,0.8393,1.0859,-0.8229,0.179,0.2027,0.5803,0.5368,0.7618,0.0134,0.7032,0.6116
KNeighborsRegressor,0.857,0.7338,0.9258,-0.3249,0.1005,0.1947,0.3572,0.4786,0.5976,0.3928,0.6808,0.3917


In [14]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.043058404850012, -6.858800000000014, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.827461524290002, -5.842012402510005, -5.6...","[-5.710546744568001, -5.608915493959001, -5.95...","[0.3906250531695762, 0.12201030103337158, 0.24..."
1,LGBMRegressor,"[-5.72429500042461, -5.781901428122076, -5.781...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.72429500042461, -5.72429500042461, -5.781...","[-5.508673793612709, -5.508673793612709, -5.62...","[0.1956217934002597, 0.1956217934002597, 0.192..."
2,XGBRegressor,"[-6.2183466, -7.569528, -5.4499307, -5.596678,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.985038, -5.706633, -6.759508, -6.9262185,...","[-5.8299103, -5.498244, -6.1002665, -6.6718307...","[0.43846768, 0.26189202, 0.6996025, 0.5103311,..."
3,DecisionTreeRegressor,"[-5.22, -6.97, -5.93, -5.22, -6.929917083, -5....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.295, -5.22, -6.85, -6.927245587, -5.22, -...","[-5.804, -5.3859667324, -6.384, -6.6710733941,...","[0.4317105511798386, 0.24822520835348863, 0.79..."
4,RandomForestRegressor,"[-5.789949056884673, -6.3782637986, -5.7064473...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.726976346974669, -5.602294175914672, -5.9...","[-5.6589558541721345, -5.500899846052269, -5.8...","[0.11596493811230052, 0.07139148305302195, 0.2..."
5,GradientBoostingRegressor,"[-6.01954291884001, -6.896882968962064, -5.848...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.871766622783936, -5.6245522063437186, -6....","[-5.656593247102809, -5.426784452298417, -5.91...","[0.254960206681533, 0.12943965011407715, 0.440..."
6,AdaBoostRegressor,"[-5.732601341625002, -6.826250000000001, -5.68...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.664027293571428, -5.5025569934375005, -5....","[-5.633116740219047, -5.646040413013983, -5.85...","[0.1488838914294579, 0.15159057733235587, 0.52..."
7,SVR,"[-5.693891238094307, -5.654130024953056, -5.74...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.6666614021657615, -5.7015941795430045, -5...","[-5.531588059386451, -5.693262986683411, -5.46...","[0.14780058278796937, 0.02969453085614056, 0.0..."
8,LinearRegression,"[-4.126989483536487, -4.717425534753501, -6.53...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -4.0, -6.272531131732463, -6.922973078...","[-4.0, -5.632525333476573, -5.828073967233261,...","[0.0, 1.0051831044888169, 0.8372471943942058, ..."
9,KNeighborsRegressor,"[-5.1866666666666665, -6.07, -6.07, -6.0404790...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.2783009283333335, -5....","[-5.528666666666667, -5.258158495866667, -4.96...","[0.1545689346393881, 0.0726709195073452, 0.137..."


In [15]:
const_col

['Ala_tBu_',
 'Ala_indol_2_yl_',
 'dAla_indol_2_yl_',
 'Me_Ala_indol_2_yl_',
 'Ala_5_Tet_',
 'Me_dAbu',
 'Me_Abu_morpholino_',
 '2Abz',
 'Aib',
 'Aoc_2_',
 '5_Ava',
 'Bal',
 'Me_Bal',
 'HOCOCH2_Bal',
 'Cys_EtO2H__NH2',
 'dCha',
 'Me_Cha',
 'D',
 'meD',
 'Asp_piperidide',
 'Asp_OMe_',
 'Asp_Ph_2_NH2__',
 'dAsp_pyrrol_1_yl_',
 'E',
 'Glu_NH2',
 'Glu_3R_Me_',
 'Glu_OMe_',
 'dGlu_OMe_',
 'Phe_4_F_',
 'dPhe_4_F_',
 'Phe_4_CF3_',
 'Phe_4_NO2_',
 'Phe_CHF2_',
 'dPhe_3_4_diF_',
 'Et_Phe',
 'H2NEt_Phe',
 'Me_Phe_3_Cl_',
 'Me_Phe_4_Cl_',
 'Me_Phe_a_b_dehydro_',
 'G',
 'Bn_Gly',
 'Bn_4_Cl__Gly',
 'Bn_4_OH__Gly',
 'Bu_Gly',
 'EtOEt_Gly',
 'HOCOCH2_Gly_ol',
 'MeOEt_Gly',
 'NH2Bu_Gly',
 'PhEt_Gly',
 'PhPr_Gly',
 'isoamyl_Gly',
 'pentyl_Gly',
 '3_pyridylethyl_Gly',
 '2_pyridylmethyl_Gly',
 'd_N__O_Gly_allyl_',
 'GABA',
 'H',
 'Hph',
 'Me_Hph',
 'bHph',
 'Hph_2_Cl_',
 'Hph_3_Cl_',
 'Hph_4_Cl_',
 'Hse_Et_',
 'dHyp',
 'Hyp_Et_',
 'dI',
 'meI',
 'Me_dI',
 '_N__O_xiIle',
 'd_N__O_aIle',
 'K',
 'dK',
 'meK

In [16]:
result_df.to_csv('results/Monomeric/Monomer_comp_constRemoval_results_MDCK.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_constRemoval_prediction_data_MDCK.csv')

In [6]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [18]:
df_train = pd.read_csv('features/Monomeric/Train_mon_comp_MDCK.csv')
df_mc_train = clean_feature_names(df_train)
df_mc_train = df_mc_train.drop(['ID','SMILES','Permeability'],axis=1)
df_mc, const_col = remove_low_variance_columns(df_mc_train)
X_train = df_mc
y_train = df_train['Permeability']
print(X_train.shape)
print(y_train.shape)

df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_MDCK.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(51, 7)
(51,)
(13, 7)
(13,)
0.3807880935153942
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 1
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1



-1.377499296426219




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.733,0.6004,0.8562,-0.1331,0.2581,0.352,0.3642,0.4812,0.6035,0.3808,0.6495,0.4662
LGBMRegressor,0.6446,0.6796,0.8028,0.0036,0.1308,0.1395,0.5276,0.5751,0.7263,0.1031,0.4182,0.3245
XGBRegressor,0.851,0.6477,0.9225,-0.3155,0.2543,0.2836,0.2849,0.4069,0.5338,0.5156,0.7517,0.5655
DecisionTreeRegressor,0.9274,0.6765,0.963,-0.4336,0.1762,0.2434,0.348,0.4894,0.5899,0.4084,0.6746,0.5048
RandomForestRegressor,0.6588,0.6228,0.8116,-0.0184,0.2437,0.2863,0.3054,0.4776,0.5527,0.4808,0.6955,0.56
GradientBoostingRegressor,0.7493,0.6204,0.8656,-0.1583,0.2217,0.2691,0.3581,0.5027,0.5984,0.3912,0.6624,0.4662
AdaBoostRegressor,0.7706,0.6881,0.8778,-0.1912,0.0359,0.047,0.26,0.442,0.5099,0.558,0.7535,0.5986
SVR,0.7647,0.6556,0.8745,-0.1822,0.048,0.1248,0.4922,0.5599,0.7016,0.1632,0.4749,0.2648
LinearRegression,0.7019,0.6858,0.8378,-0.085,0.1833,0.1044,0.4294,0.5007,0.6553,0.27,0.575,0.6621
KNeighborsRegressor,0.8033,0.7184,0.8963,-0.2418,-0.0628,-0.0163,0.5373,0.6093,0.733,0.0866,0.3205,0.3122


In [19]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.220000000000016, -6.733600000000011, -5.67...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.220000000000016, -6.129689426090013, -5.5...","[-6.019999999999998, -5.871163893482627, -5.86...","[0.12649110640673977, 0.1526077768687823, 0.36..."
1,LGBMRegressor,"[-5.72429500042461, -5.781901428122076, -5.781...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.72429500042461, -5.72429500042461, -5.781...","[-5.508673793612709, -5.508673793612709, -5.62...","[0.1956217934002597, 0.1956217934002597, 0.192..."
2,XGBRegressor,"[-6.218839, -6.9695864, -5.9245267, -5.540827,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.218839, -5.885452, -6.576931, -6.7478294,...","[-6.019603, -5.778534, -6.264598, -6.5054007, ...","[0.12594935, 0.23116368, 0.9168826, 0.48475742..."
3,DecisionTreeRegressor,"[-6.22, -6.97, -5.93, -5.5406878929999985, -6....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.22, -5.5406878929999985, -6.06, -6.748412...","[-6.02, -5.7864103901199995, -5.592, -6.505002...","[0.126491106406735, 0.2171104101154356, 0.4400..."
4,RandomForestRegressor,"[-5.981101583799694, -6.357333925255005, -5.77...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.981101583799692, -5.9044673180604335, -5....","[-5.908064044092657, -5.75531720488374, -5.993...","[0.10046023128790825, 0.10967448772312777, 0.4..."
5,GradientBoostingRegressor,"[-5.957345547511532, -6.681787871239538, -6.03...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.957345547511532, -5.896933982419868, -5.5...","[-5.897891961686751, -5.790969423295177, -5.57...","[0.07287761692143958, 0.1132398759357142, 0.53..."
6,AdaBoostRegressor,"[-5.612922772666667, -6.1899999999999995, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.612922772666667, -5.612922772666667, -5.7...","[-5.617266178315978, -5.6086070870843105, -5.8...","[0.061062592201174014, 0.05490751298716359, 0...."
7,SVR,"[-5.790601714892611, -6.346966090150607, -5.82...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.790601714892611, -5.99114964406607, -5.68...","[-5.734015775011687, -5.756433936050504, -5.53...","[0.02829382771721455, 0.12837677657045846, 0.1..."
8,LinearRegression,"[-5.146109898623878, -6.299279317299723, -5.97...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.146109898623878, -5.038286352092517, -5.8...","[-5.199828524003662, -5.126777978281019, -6.14...","[0.0916506680229044, 0.18610857810771805, 0.26..."
9,KNeighborsRegressor,"[-5.691405793666665, -6.293333333333333, -5.86...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.691405793666665, -5.691405793666665, -5.3...","[-5.7596246898, -5.7596246898, -5.308666666666...","[0.1003320012147988, 0.1003320012147988, 0.089..."


In [20]:
result_df.to_csv('results/Monomeric/Monomer_comp_LVR_results_MDCK.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_LVR_prediction_data_MDCK.csv')

In [21]:
#AA composition
df_aac_train = pd.read_csv('features/Monomeric/Train_aac_MDCK.csv')
X_train = df_aac_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_aac_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_aac_test = pd.read_csv('features/Monomeric/Test_aac_MDCK.csv')
X_test = df_aac_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_aac_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
aac_comp,prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
aac_comp

(51, 21)
(51,)
(13, 21)
(13,)
0.14535106206428194
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 0
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.694363
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of



-1.1766483032998134




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.9105,0.7913,0.9542,-0.4075,0.0662,0.0958,0.5027,0.6301,0.709,0.1454,0.4367,0.171
LGBMRegressor,0.7103,0.6855,0.8428,-0.098,-0.0872,-0.0382,0.5616,0.6033,0.7494,0.0453,0.2513,0.2733
XGBRegressor,0.8803,0.7819,0.9382,-0.3608,0.1149,0.13,0.571,0.6658,0.7557,0.0292,0.3601,0.171
DecisionTreeRegressor,0.8846,0.7892,0.9405,-0.3674,0.121,0.1529,0.5861,0.6714,0.7656,0.0036,0.3416,0.1186
RandomForestRegressor,0.7994,0.7403,0.8941,-0.2358,0.1161,0.1597,0.4479,0.5874,0.6692,0.2386,0.5065,0.2152
GradientBoostingRegressor,0.8959,0.7802,0.9465,-0.385,0.0762,0.1065,0.4825,0.6189,0.6947,0.1797,0.4651,0.2538
AdaBoostRegressor,0.6927,0.6829,0.8323,-0.0708,0.2134,0.1629,0.437,0.5747,0.661,0.2571,0.5126,0.3586
SVR,0.7771,0.7354,0.8815,-0.2013,-0.0179,0.0463,0.5337,0.573,0.7306,0.0927,0.4019,0.2372
LinearRegression,0.6729,0.657,0.8203,-0.0401,0.2564,0.2745,0.5771,0.6134,0.7597,0.0188,0.382,0.2538
KNeighborsRegressor,0.8517,0.7793,0.9229,-0.3166,0.0841,0.1587,0.6329,0.6084,0.7955,-0.0759,0.2872,0.2648


In [22]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-5.2950000000000035, -6.221999999999991, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.366350000000006, -6.25, -5.29500000000000...","[-5.17693135088315, -6.022842056844859, -5.470...","[0.3538069070444989, 0.4543158863102818, 0.147..."
1,LGBMRegressor,"[-5.753098213672638, -5.753098213672638, -5.75...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.753098213672638, -5.753098213672638, -5.7...","[-5.551464208338411, -5.661144897664251, -5.55...","[0.14966825667694483, 0.07209968741488275, 0.1..."
2,XGBRegressor,"[-5.29548, -6.2220545, -6.2220545, -6.0047245,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.7313776, -6.24856, -5.29548, -6.9255056, ...","[-4.87979, -6.0119925, -5.4701047, -6.6387153,...","[0.3947066, 0.47240928, 0.14748119, 0.5733854,..."
3,DecisionTreeRegressor,"[-5.295, -6.2219999999999995, -6.2219999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -5.295, -6.927245587, -5.45349...","[-4.8556, -6.038, -5.470000000000001, -6.64376...","[0.4911999999999996, 0.4239999999999998, 0.147..."
4,RandomForestRegressor,"[-5.368503426139999, -6.193135364357863, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.130029644703837, -5.868885041386251, -5.3...","[-5.250813480455994, -5.726103386135561, -5.50...","[0.20369569259445852, 0.2784406428026411, 0.12..."
5,GradientBoostingRegressor,"[-5.346659526734528, -6.216450432194516, -6.21...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.670062194839065, -6.199897901570898, -5.3...","[-5.224380017202935, -5.946667124709225, -5.48...","[0.4669255230923063, 0.5513825208708881, 0.137..."
6,AdaBoostRegressor,"[-5.592, -6.054603827840001, -6.05460382784000...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.778333333333333, -5.830838848000001, -5.5...","[-5.156329494904314, -5.6220478939843135, -5.5...","[0.39867371132640117, 0.33264565920264694, 0.2..."
7,SVR,"[-5.413858664579116, -5.82956840885068, -5.829...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.363969723180899, -5.932967024967623, -5.4...","[-5.239878595545685, -5.758998418057883, -5.44...","[0.14297492202777257, 0.3118904242653825, 0.14..."
8,LinearRegression,"[-5.375994735925552, -6.2219999999999995, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.759507417751967, -5.229049185524878, -5.3...","[-4.594681909532405, -5.083414903889928, -5.40...","[0.24062890175128807, 0.2421002251172405, 0.04..."
9,KNeighborsRegressor,"[-5.183333333333334, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.319999999999999, -5.503333333333333, -5.1...","[-5.101333333333333, -5.472, -5.368, -6.240415...","[0.20126930549225178, 0.18465102220134053, 0.1..."


In [23]:
aac_comp.to_csv('results/Monomeric/AAC_comp_results_MDCK.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_prediction_data_MDCK.csv')

In [24]:
#Constant column removal
df_mc_train = pd.read_csv('features/Monomeric/Train_aac_MDCK.csv')
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_aac_MDCK.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_mc = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_mc, X_train,y_train, X_test,  y_test)
result_df

(51, 11)
(51,)
(13, 11)
(13,)
0.12519451244472146
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 0
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.694363
[LightGBM] [Info] Total Bins 0
[LightG



-1.0883109453369375




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.9191,0.8004,0.9587,-0.4208,0.0563,0.0858,0.5146,0.6357,0.7173,0.1252,0.423,0.171
LGBMRegressor,0.7103,0.6855,0.8428,-0.098,-0.0872,-0.0382,0.5616,0.6033,0.7494,0.0453,0.2513,0.2733
XGBRegressor,0.8803,0.7819,0.9382,-0.3608,0.1149,0.13,0.571,0.6658,0.7557,0.0292,0.3601,0.171
DecisionTreeRegressor,0.8935,0.7805,0.9453,-0.3813,0.081,0.1225,0.6123,0.6824,0.7825,-0.0409,0.3195,0.0662
RandomForestRegressor,0.7929,0.7376,0.8905,-0.2257,0.1219,0.1717,0.458,0.5959,0.6767,0.2214,0.4893,0.2152
GradientBoostingRegressor,0.899,0.7823,0.9482,-0.3897,0.0738,0.1135,0.4839,0.6198,0.6956,0.1774,0.4639,0.2538
AdaBoostRegressor,0.6846,0.6823,0.8274,-0.0583,0.2317,0.1776,0.401,0.5536,0.6333,0.3183,0.5647,0.2924
SVR,0.7771,0.7354,0.8815,-0.2013,-0.0179,0.0463,0.5337,0.5729,0.7305,0.0928,0.402,0.2372
LinearRegression,0.6729,0.657,0.8203,-0.0401,0.2564,0.2745,0.5771,0.6134,0.7597,0.0188,0.382,0.2538
KNeighborsRegressor,0.8517,0.7793,0.9229,-0.3166,0.0841,0.1607,0.6329,0.6084,0.7955,-0.0759,0.2872,0.2648


In [25]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-5.2950000000000035, -6.221999999999991, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.267200000000006, -6.25, -5.29500000000000...","[-5.108304899434407, -6.007034036174002, -5.47...","[0.39648064298813945, 0.48593192765199694, 0.1..."
1,LGBMRegressor,"[-5.753098213672638, -5.753098213672638, -5.75...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.753098213672638, -5.753098213672638, -5.7...","[-5.551464208338411, -5.661144897664251, -5.55...","[0.14966825667694483, 0.07209968741488275, 0.1..."
2,XGBRegressor,"[-5.29548, -6.2220545, -6.2220545, -6.0047245,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.7313776, -6.24856, -5.29548, -6.9255056, ...","[-4.87979, -6.0119925, -5.4701047, -6.6387153,...","[0.3947066, 0.47240928, 0.14748119, 0.5733854,..."
3,DecisionTreeRegressor,"[-5.295, -6.2219999999999995, -6.2219999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -5.295, -6.927245587, -5.22, -...","[-4.7487342614, -6.038, -5.470000000000001, -6...","[0.27746852279999995, 0.4239999999999998, 0.14..."
4,RandomForestRegressor,"[-5.378811377966667, -6.193135364357865, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.151233314944576, -5.8698978409558356, -5....","[-5.21894731896135, -5.7283366737621835, -5.51...","[0.18159984983084346, 0.26673485448750067, 0.1..."
5,GradientBoostingRegressor,"[-5.346659526734528, -6.216450432194516, -6.21...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.673265557952084, -6.199897901570898, -5.3...","[-5.212882550149957, -5.932998463099878, -5.48...","[0.4387149057116776, 0.5787140006464542, 0.137..."
6,AdaBoostRegressor,"[-5.639388053500001, -6.054603827840001, -6.05...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.778333333333333, -5.798008666214286, -5.6...","[-5.288716192270877, -5.687453808178583, -5.65...","[0.37632143239522436, 0.26006021173346516, 0.1..."
7,SVR,"[-5.41385908973807, -5.8295669306780855, -5.82...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.363974989317022, -5.932975284667058, -5.4...","[-5.239930218648427, -5.759037921475825, -5.44...","[0.14292461001976278, 0.31186830181873276, 0.1..."
8,LinearRegression,"[-5.375994735925552, -6.2219999999999995, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.759507417751967, -5.229049185524878, -5.3...","[-4.594681909532405, -5.083414903889928, -5.40...","[0.2406289017512884, 0.24210022511724086, 0.04..."
9,KNeighborsRegressor,"[-5.183333333333334, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.319999999999999, -5.503333333333333, -5.1...","[-5.101333333333334, -5.472, -5.368, -6.240415...","[0.20126930549225144, 0.18465102220134053, 0.1..."


In [26]:
result_df.to_csv('results/Monomeric/AAC_comp_const_rem_results_MDCK.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_const_rem_prediction_data_MDCK.csv')

In [27]:
#LVR column removal
df_mc_train = pd.read_csv('features/Monomeric/Train_aac_MDCK.csv')
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
X_train, const_col = remove_low_variance_columns(X_train)

y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_aac_MDCK.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_mc = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_mc, X_train,y_train, X_test,  y_test)
result_df

(51, 5)
(51,)
(13, 5)
(13,)
0.15769216486706372
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 0
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.694363
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of us



-1.8365009629725537




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.8601,0.7564,0.9274,-0.3296,0.0965,0.1326,0.4955,0.6429,0.7039,0.1577,0.4428,0.2317
LGBMRegressor,0.7142,0.6874,0.8451,-0.104,-0.0961,-0.0057,0.5734,0.6093,0.7572,0.0252,0.1643,0.1981
XGBRegressor,0.8357,0.7519,0.9142,-0.2919,0.1066,0.1575,0.5804,0.6872,0.7619,0.0133,0.3488,0.16
DecisionTreeRegressor,0.8939,0.7768,0.9455,-0.3819,0.0741,0.1111,0.5442,0.6802,0.7377,0.0749,0.3795,0.16
RandomForestRegressor,0.7738,0.7153,0.8797,-0.1962,0.1169,0.1626,0.4658,0.6137,0.6825,0.2081,0.4734,0.1986
GradientBoostingRegressor,0.8658,0.7665,0.9305,-0.3384,0.0702,0.0792,0.5154,0.6452,0.7179,0.1238,0.4243,0.2648
AdaBoostRegressor,0.642,0.6607,0.8013,0.0076,0.2842,0.218,0.401,0.5458,0.6333,0.3182,0.5672,0.3255
SVR,0.7514,0.6959,0.8668,-0.1615,0.1108,0.1974,0.5653,0.5999,0.7519,0.039,0.3426,0.32
LinearRegression,0.6437,0.65,0.8023,0.0049,0.2687,0.3193,0.6587,0.6629,0.8116,-0.1199,0.2071,0.171
KNeighborsRegressor,0.737,0.6871,0.8585,-0.1393,0.2059,0.2896,0.662,0.6279,0.8136,-0.1254,0.275,0.4359


In [28]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-5.2950000000000035, -6.221999999999991, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.493000000000006, -6.25, -5.29500000000000...","[-5.289147032051578, -6.01276316731743, -5.470...","[0.35837322109765557, 0.4744736653651401, 0.14..."
1,LGBMRegressor,"[-5.753098213672638, -5.753098213672638, -5.75...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.753098213672638, -5.753098213672638, -5.7...","[-5.563857796772694, -5.688173775878739, -5.56...","[0.15098156155020348, 0.09747574238919043, 0.1..."
2,XGBRegressor,"[-5.2943735, -6.2216816, -6.2216816, -5.882914...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.7308993, -6.2486463, -5.2943735, -6.92498...","[-4.8916063, -6.0391207, -5.469754, -6.6509256...","[0.3361239, 0.41782245, 0.14764787, 0.54866576..."
3,DecisionTreeRegressor,"[-5.295, -6.2219999999999995, -6.2219999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -5.295, -6.927245587, -5.37858...","[-5.146, -6.038, -5.470000000000001, -6.645208...","[0.6364149589693817, 0.4239999999999998, 0.147..."
4,RandomForestRegressor,"[-5.357729213069999, -6.193135364357865, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.187422741160974, -5.930343215755667, -5.3...","[-5.2584346971367015, -5.723190282761889, -5.5...","[0.18565477668203204, 0.2884345799868826, 0.11..."
5,GradientBoostingRegressor,"[-5.280803795917586, -6.212501921859376, -6.21...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.395883403028521, -6.232020096947168, -5.2...","[-5.128101961908328, -6.022487121988138, -5.45...","[0.5787362440737861, 0.42978622984557263, 0.13..."
6,AdaBoostRegressor,"[-5.416666666666667, -6.054603827840001, -6.05...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.592013923733333, -5.592013923733333, -5.4...","[-5.2540714516027744, -5.600426497564314, -5.5...","[0.36181938597529073, 0.12192205449916106, 0.1..."
7,SVR,"[-5.1255559504553245, -5.430416887688733, -5.4...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.482072747197859, -6.149482561085053, -5.1...","[-5.401018283756371, -5.959983045139651, -5.40...","[0.16916132598435032, 0.35717463096306273, 0.1..."
8,LinearRegression,"[-5.432670445883795, -5.81963513804729, -5.819...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.657723138447146, -5.031221947794083, -5.4...","[-4.57083102201134, -4.937827834893946, -5.368...","[0.26908157615036793, 0.2592305670934972, 0.07..."
9,KNeighborsRegressor,"[-5.136666666666667, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.319999999999999, -5.284768119, -5.1366666...","[-5.0920000000000005, -5.2613094257999995, -5....","[0.19610201426808416, 0.181834057691948, 0.136..."


In [29]:
result_df.to_csv('results/Monomeric/AAC_comp_LVR_results_MDCK.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_LVR_prediction_data_MDCK.csv')

In [30]:
#Atomic models
df_train = pd.read_csv('features/Atomic/Train_all_atomic_desc_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Atomic/Test_all_atomic_desc_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_degree = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_degree, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 23)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 23)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 1
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing 



0.21088004498339585




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6352,0.6581,0.797,0.018,0.2095,0.2556,0.4738,0.5765,0.6883,0.1946,0.4619,0.3928
DecisionTreeRegressor,0.977,0.7465,0.9885,-0.5104,0.1998,0.3301,0.5878,0.6842,0.7667,0.0007,0.5665,0.3503
RandomForestRegressor,0.7208,0.6718,0.849,-0.1142,0.2609,0.2915,0.4523,0.617,0.6725,0.231,0.585,0.2755
GradientBoostingRegressor,0.9244,0.733,0.9615,-0.429,0.1934,0.3197,0.4531,0.6099,0.6732,0.2297,0.641,0.3691
AdaBoostRegressor,0.6644,0.6473,0.8151,-0.0271,0.3346,0.3994,0.4028,0.5815,0.6347,0.3153,0.6216,0.3471
XGBRegressor,0.9284,0.7406,0.9635,-0.4351,0.2348,0.3847,0.5276,0.6344,0.7264,0.103,0.585,0.2424
ExtraTreesRegressor,0.8315,0.7196,0.9119,-0.2854,0.2574,0.3662,0.4578,0.6145,0.6766,0.2217,0.6482,0.281
LinearRegression,0.8011,0.6724,0.895,-0.2383,0.2747,0.3276,0.3999,0.544,0.6324,0.3201,0.5849,0.4573
KNeighborsRegressor,0.6035,0.6162,0.7769,0.067,0.4287,0.4489,0.4117,0.5695,0.6417,0.3,0.5788,0.4055
SVR,0.6565,0.6301,0.8102,-0.0148,0.2482,0.1837,0.3816,0.4913,0.6178,0.3512,0.6918,0.4738


In [31]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.965745329504426, -5.965745329504426, -5.96...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.965745329504426, -5.540451101521032, -5.5...","[-5.626978421003314, -5.367659698329064, -5.36...","[0.1708394140073482, 0.21696786900259085, 0.21..."
1,DecisionTreeRegressor,"[-5.295, -5.195, -6.97, -6.929917083, -6.92991...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.295, -6.25, -6.25, -6.927245587, -7.82, -...","[-4.983, -6.093999999999999, -6.09399999999999...","[0.2459999999999998, 0.3120000000000001, 0.312..."
2,RandomForestRegressor,"[-5.363379999999999, -5.349524999999997, -6.50...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.349423333333333, -5.994491570946668, -6.0...","[-5.261993585553428, -5.916150197676856, -5.90...","[0.09167580506668391, 0.21318781557600078, 0.2..."
3,GradientBoostingRegressor,"[-5.302816986363614, -5.210833578301628, -6.92...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.337353846375231, -6.228767526566608, -6.1...","[-5.24919859948122, -6.075529405460093, -6.055...","[0.10527411802820809, 0.3144815147967797, 0.34..."
4,AdaBoostRegressor,"[-5.295, -5.293333333333333, -6.61029337845454...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.295, -6.02614462275, -6.13, -6.7828334522...","[-5.313476190476189, -5.818762229842382, -5.83...","[0.21182479818169594, 0.21207596740944587, 0.2..."
5,XGBRegressor,"[-5.295471, -5.195101, -6.968063, -6.9289513, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3076005, -6.2483983, -5.9604754, -6.92772...","[-5.1549377, -6.093379, -5.9614635, -6.8631706...","[0.2368334, 0.31175116, 0.3512696, 0.12672764,..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -5.194999999999996, -6.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.282550000000003, -6.25, -6.13724999999999...","[-5.370626666666672, -6.0955400000000015, -6.0...","[0.17154458461610458, 0.30891999999999625, 0.3..."
7,LinearRegression,"[-5.365590535765415, -6.1816013389543, -6.2457...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.229704550628526, -5.607117279972588, -5.6...","[-5.343710306767623, -5.434628259905332, -5.66...","[0.16200289293830036, 0.3257056247751279, 0.24..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.6...","[-5.6259999999999994, -5.666, -5.666, -6.43727...","[0.16096100286853474, 0.14081350945290877, 0.1..."
9,SVR,"[-5.5500601501148354, -5.702291087924101, -5.7...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.49343417031825, -5.608575903802059, -5.64...","[-5.5997993801996415, -5.708491166221073, -5.7...","[0.07210610503450504, 0.11201598713907131, 0.1..."


In [32]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_MDCK.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_MDCK.csv')

In [7]:
#Atomic + monomeric_composition based features
df1 = pd.read_csv('features/Monomeric/Train_mon_comp_MDCK.csv')
df2 = pd.read_csv('features/Atomic/Train_all_atomic_desc_MDCK.csv')
df_train = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_train

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_S,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0,64,10,0,12,0,0,102,1,1
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0,64,10,0,12,0,0,102,1,1
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0,64,10,0,12,0,0,102,1,1
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,0.0,0.0,0.2,0.2,0.0,0.0,0.0,...,0,60,10,0,12,0,0,102,1,1
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0,66,10,0,6,0,0,98,1,1
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,66,10,0,6,0,0,98,1,1
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,66,10,0,6,0,0,94,1,1
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0,66,10,0,6,0,0,94,1,1
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,...,2,65,10,0,0,0,0,95,0,1
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,0.0,0.0,0.2,0.2,0.0,0.0,0.0,...,0,64,10,0,0,0,0,90,0,1


In [8]:
df1 = pd.read_csv('features/Monomeric/Test_mon_comp_MDCK.csv')
df2 = pd.read_csv('features/Atomic/Test_all_atomic_desc_MDCK.csv')
df_test = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_test

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_S,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0,66,10,0,12,0,0,102,1,1
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,0.0,0.0,0.4,0.0,0.0,0.0,0.0,...,0,60,10,0,12,0,0,102,1,1
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,60,10,0,12,0,0,102,1,1
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,...,2,64,10,0,0,0,0,94,0,1
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,...,2,62,10,0,0,0,0,95,0,1
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,...,2,61,10,0,0,0,0,94,0,1
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0,63,10,0,0,0,0,92,0,1
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,0.0,0.0,0.111111,0.222222,0.0,0.0,0.0,...,2,59,10,0,0,0,0,94,0,1
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,0.0,0.0,0.111111,0.222222,0.0,0.0,0.0,...,2,59,10,0,0,0,0,94,0,1
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,0.1,0.0,0.0,0.0,0.0,0.0,0.0,...,0,62,10,0,0,0,0,92,0,1


In [7]:
import re
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)

    df.columns = [clean_name(col) for col in df.columns]
    return df

In [8]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [11]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 408)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 408)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 3
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info



-0.6911929466188871




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5925,0.6462,0.7697,0.0842,0.2978,0.3452,0.4601,0.5649,0.6783,0.2178,0.4908,0.4242
DecisionTreeRegressor,0.8452,0.6014,0.9193,-0.3065,0.2377,0.3955,0.4299,0.4963,0.6557,0.2691,0.6696,0.3691
RandomForestRegressor,0.6196,0.6139,0.7872,0.0422,0.3374,0.3979,0.3423,0.4966,0.5851,0.418,0.663,0.4242
GradientBoostingRegressor,0.7917,0.6344,0.8898,-0.2238,0.242,0.326,0.418,0.5022,0.6465,0.2893,0.6244,0.292
AdaBoostRegressor,0.7316,0.6605,0.8553,-0.131,0.2356,0.3035,0.3009,0.4951,0.5486,0.4884,0.7113,0.3802
XGBRegressor,0.7512,0.6157,0.8667,-0.1613,0.3252,0.3985,0.4873,0.5501,0.6981,0.1716,0.6173,0.3471
ExtraTreesRegressor,0.7052,0.6066,0.8398,-0.0901,0.2956,0.3406,0.4109,0.5239,0.6411,0.3014,0.6292,0.2645
LinearRegression,5.6413,1.6446,2.3752,-7.7207,-0.2651,-0.1591,1.7674,1.0058,1.3294,-2.0046,0.5564,0.4876
KNeighborsRegressor,0.7792,0.6877,0.8827,-0.2045,0.1884,0.2396,0.31,0.4936,0.5568,0.473,0.7054,0.2869
SVR,0.7181,0.6804,0.8474,-0.11,0.0831,0.0334,0.3605,0.4868,0.6004,0.3871,0.6547,0.5785


In [12]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_and_mono_comp_MDCK.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_and_mono_comp_MDCK.csv')

In [13]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 44)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 44)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 3
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] 



-0.7336560360630215




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5925,0.6462,0.7697,0.0842,0.2978,0.3452,0.4601,0.5649,0.6783,0.2178,0.4908,0.4242
DecisionTreeRegressor,0.9522,0.6424,0.9758,-0.472,0.1765,0.3146,0.4552,0.5058,0.6747,0.2261,0.65,0.4298
RandomForestRegressor,0.6189,0.6175,0.7867,0.0433,0.3409,0.4012,0.3385,0.4894,0.5818,0.4245,0.6682,0.4187
GradientBoostingRegressor,0.7879,0.6359,0.8877,-0.218,0.2486,0.3507,0.4203,0.5029,0.6483,0.2855,0.6237,0.2975
AdaBoostRegressor,0.6813,0.6507,0.8254,-0.0532,0.254,0.3129,0.299,0.4981,0.5468,0.4917,0.7164,0.3967
XGBRegressor,0.7512,0.6157,0.8667,-0.1613,0.3252,0.3985,0.4873,0.5501,0.6981,0.1716,0.6173,0.3471
ExtraTreesRegressor,0.7336,0.6193,0.8565,-0.134,0.2663,0.3221,0.4042,0.5173,0.6358,0.3128,0.6337,0.2645
LinearRegression,5.6413,1.6446,2.3752,-7.7207,-0.2651,-0.1591,1.7674,1.0058,1.3294,-2.0046,0.5564,0.4876
KNeighborsRegressor,0.7792,0.6877,0.8827,-0.2045,0.1884,0.2396,0.2925,0.48,0.5409,0.5027,0.7262,0.3641
SVR,0.7181,0.6804,0.8474,-0.11,0.0831,0.0334,0.3605,0.4868,0.6004,0.3871,0.6548,0.5785


In [14]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.93773412153736, -5.983867338806448, -5.983...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.93773412153736, -5.522329091122372, -5.56...","[-5.513319307793336, -5.30371095852256, -5.390...","[0.23692144274442523, 0.22305643559392596, 0.2..."
1,DecisionTreeRegressor,"[-5.295, -6.97, -5.06, -6.927245587, -6.929917...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.295, -6.25, -6.34, -6.927245587, -6.92724...","[-4.8740000000000006, -5.6674000366, -6.753, -...","[0.3981130492711837, 0.3363303407234969, 0.947..."
2,RandomForestRegressor,"[-5.199322852306668, -6.1032, -5.9719156004853...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.2844938284800005, -5.5573819193866685, -5...","[-5.173279848922666, -5.690885473969334, -6.04...","[0.09513265885634099, 0.16224735896136933, 0.2..."
3,GradientBoostingRegressor,"[-5.340313349521261, -5.875076441062632, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.366950699318421, -6.022222154868427, -6.1...","[-5.140454034240314, -5.801680050946473, -6.04...","[0.20585295811257112, 0.21193960638716092, 0.4..."
4,AdaBoostRegressor,"[-5.25, -6.112082641571429, -6.075085482266667...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.2774981592307695, -5.840014536727272, -5....","[-5.232638352341153, -5.732087806195454, -5.83...","[0.10924366869176487, 0.2335388613917509, 0.32..."
5,XGBRegressor,"[-5.269669, -6.4541073, -5.929844, -5.891423, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.316853, -5.6233106, -5.8720407, -6.926082...","[-4.9429846, -5.847226, -5.946418, -6.6580553,...","[0.24036725, 0.19860332, 0.49665016, 0.5372004..."
6,ExtraTreesRegressor,"[-5.464314347820004, -6.478000000000008, -5.79...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.468499170830004, -5.891488861390001, -5.6...","[-5.4116388265180015, -5.763699381808001, -5.8...","[0.3533603053615431, 0.14542657087010974, 0.37..."
7,LinearRegression,"[-10.0, -5.114003259911044, -6.519430345382716...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -10.0, -10.0, -6.927245586999788, -10....","[-5.2, -7.719579234674027, -9.47307106647092, ...","[2.4, 1.2189088722922783, 1.0538578670581593, ..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -6.07...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.3...","[-5.4126666666666665, -5.638, -5.4946666666666...","[0.21437661564017046, 0.16956938927111156, 0.2..."
9,SVR,"[-5.603287247427174, -5.654331805907437, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.609303475363959, -5.621233619080112, -5.4...","[-5.492972801252818, -5.674890303016237, -5.51...","[0.1608086109511643, 0.03594912096921638, 0.15..."


In [15]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_and_mono_comp_const_rem_MDCK.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_and_mono_comp_const_rem_MDCK.csv')

In [16]:
const_col

['Ala_tBu_',
 'Ala_indol_2_yl_',
 'dAla_indol_2_yl_',
 'Me_Ala_indol_2_yl_',
 'Ala_5_Tet_',
 'Me_dAbu',
 'Me_Abu_morpholino_',
 '2Abz',
 'Aib',
 'Aoc_2_',
 '5_Ava',
 'Bal',
 'Me_Bal',
 'HOCOCH2_Bal',
 'Cys_EtO2H__NH2',
 'dCha',
 'Me_Cha',
 'D',
 'meD',
 'Asp_piperidide',
 'Asp_OMe_',
 'Asp_Ph_2_NH2__',
 'dAsp_pyrrol_1_yl_',
 'E',
 'Glu_NH2',
 'Glu_3R_Me_',
 'Glu_OMe_',
 'dGlu_OMe_',
 'Phe_4_F_',
 'dPhe_4_F_',
 'Phe_4_CF3_',
 'Phe_4_NO2_',
 'Phe_CHF2_',
 'dPhe_3_4_diF_',
 'Et_Phe',
 'H2NEt_Phe',
 'Me_Phe_3_Cl_',
 'Me_Phe_4_Cl_',
 'Me_Phe_a_b_dehydro_',
 'G',
 'Bn_Gly',
 'Bn_4_Cl__Gly',
 'Bn_4_OH__Gly',
 'Bu_Gly',
 'EtOEt_Gly',
 'HOCOCH2_Gly_ol',
 'MeOEt_Gly',
 'NH2Bu_Gly',
 'PhEt_Gly',
 'PhPr_Gly',
 'isoamyl_Gly',
 'pentyl_Gly',
 '3_pyridylethyl_Gly',
 '2_pyridylmethyl_Gly',
 'd_N__O_Gly_allyl_',
 'GABA',
 'H',
 'Hph',
 'Me_Hph',
 'bHph',
 'Hph_2_Cl_',
 'Hph_3_Cl_',
 'Hph_4_Cl_',
 'Hse_Et_',
 'dHyp',
 'Hyp_Et_',
 'dI',
 'meI',
 'Me_dI',
 '_N__O_xiIle',
 'd_N__O_aIle',
 'K',
 'dK',
 'meK

In [4]:
#Fingerprints models
#All fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/All_fingerprints_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/All_fingerprints_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 20188)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 20188)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 177
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 300
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 43
[LightGBM] [Info] Start training from score -5.673825
[LightG

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6859,0.7022,0.8282,-0.0604,0.1121,0.1744,0.4641,0.5902,0.6812,0.2111,0.4866,0.4408
DecisionTreeRegressor,1.2508,0.8872,1.1184,-0.9336,0.1248,0.1626,0.4415,0.5841,0.6645,0.2494,0.6629,0.4044
RandomForestRegressor,0.7375,0.7025,0.8588,-0.14,0.1544,0.1797,0.264,0.4502,0.5138,0.5511,0.7475,0.3384
GradientBoostingRegressor,0.9869,0.7881,0.9934,-0.5256,0.0853,0.1348,0.3372,0.51,0.5807,0.4267,0.7196,0.3494
AdaBoostRegressor,0.6947,0.68,0.8335,-0.0739,0.2326,0.2835,0.2603,0.4427,0.5102,0.5575,0.7586,0.3659
XGBRegressor,1.0337,0.7926,1.0167,-0.598,0.1139,0.1784,0.2862,0.4573,0.535,0.5135,0.7589,0.5254
ExtraTreesRegressor,1.0782,0.8126,1.0384,-0.6667,0.0936,0.1396,0.3686,0.5229,0.6071,0.3734,0.7054,0.3934
LinearRegression,1.2517,0.8227,1.1188,-0.935,0.1099,0.1722,0.6979,0.6548,0.8354,-0.1864,0.6201,0.4869
KNeighborsRegressor,0.7367,0.6927,0.8583,-0.1389,0.194,0.2564,0.3412,0.5067,0.5842,0.4199,0.6594,0.3604
SVR,0.7239,0.6942,0.8508,-0.119,0.0642,0.0709,0.3692,0.5095,0.6076,0.3723,0.6485,0.5144


In [5]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.682897517574019, -6.114074271195862, -6.33...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.682897517574019, -5.696941666398196, -5.9...","[-5.503795248512171, -5.506604078277007, -5.75...","[0.17432552151726882, 0.17727684879712866, 0.1..."
1,DecisionTreeRegressor,"[-4.94, -6.97, -5.06, -5.793469425, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -7.82, -6.927245587, -6.929917...","[-4.894, -5.9, -6.99, -6.521304152800001, -5.9...","[0.6797823180989632, 0.7, 1.0600000000000005, ..."
2,RandomForestRegressor,"[-5.2730352908908324, -6.218100000000004, -5.7...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.480949170829999, -5.693703562856668, -6.4...","[-5.355840829118435, -5.653452067015069, -6.39...","[0.16765190204929076, 0.2860020468905707, 0.38..."
3,GradientBoostingRegressor,"[-5.001145771268826, -6.103753715980014, -5.46...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.837844258263351, -5.859508342863473, -6.9...","[-5.159889235135806, -5.904519174531797, -6.74...","[0.4114737482275645, 0.3753212181355212, 0.592..."
4,AdaBoostRegressor,"[-5.290073526111111, -5.688648896041666, -5.86...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.114, -5.868222216653847, -5.9433462714999...","[-5.29443206796, -5.715445626647236, -6.035609...","[0.23157917717277252, 0.29272400833988077, 0.3..."
5,XGBRegressor,"[-5.0236025, -6.0459657, -5.1496067, -5.867244...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3382225, -5.579111, -6.318296, -6.9381027...","[-5.3215857, -5.763702, -6.736338, -6.595792, ...","[0.27006215, 0.50087476, 0.7288663, 0.30035895..."
6,ExtraTreesRegressor,"[-4.959199999999999, -6.840900000000009, -5.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.772100000000005, -6.23240082135, -7.59890...","[-5.043259164152005, -5.864540164270002, -7.03...","[0.4507059606812378, 0.6000287153602135, 0.752..."
7,LinearRegression,"[-4.412120833994302, -6.311838740957112, -4.62...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.897533518349219, -5.4847159888925425, -9....","[-5.147759128402, -5.22978135830437, -8.174240...","[1.3027994618818746, 0.6419525138669969, 0.701..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -5.1...","[-5.6546666666666665, -5.666000000000001, -5.3...","[0.14230796026770795, 0.14081350945290821, 0.1..."
9,SVR,"[-5.522848894407083, -5.669700115999046, -5.67...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.794299384358194, -5.810164240519107, -5.3...","[-5.696688171402978, -5.741273304873648, -5.40...","[0.28568219415239465, 0.2574184408049692, 0.05..."


In [6]:
result_df.to_csv('results/Fingerprints/Results_All_fingerprints_fp_MDCK.csv')
prediction_df.to_csv('results/Fingerprints/Prediction_data_All_fingerprints_fp_MDCK.csv')

In [7]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [8]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [9]:
#All fingerprints constant removal
df_train = pd.read_csv('features/Fingerprints/Train/All_fingerprints_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/All_fingerprints_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 2131)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 2131)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 177
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 300
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 43
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6859,0.7022,0.8282,-0.0604,0.1121,0.1744,0.4641,0.5902,0.6812,0.2111,0.4866,0.4408
DecisionTreeRegressor,1.3223,0.8992,1.1499,-1.0441,0.0921,0.1744,0.4442,0.602,0.6665,0.2448,0.6686,0.4044
RandomForestRegressor,0.7195,0.6877,0.8482,-0.1122,0.1795,0.2169,0.2572,0.4442,0.5072,0.5627,0.7549,0.3384
GradientBoostingRegressor,0.9941,0.7879,0.997,-0.5367,0.0889,0.1733,0.3477,0.5147,0.5897,0.4089,0.71,0.3494
AdaBoostRegressor,0.708,0.6711,0.8414,-0.0945,0.2424,0.296,0.2536,0.4308,0.5036,0.5688,0.7637,0.4044
XGBRegressor,1.0337,0.7926,1.0167,-0.598,0.1139,0.1784,0.2862,0.4573,0.535,0.5135,0.7589,0.5254
ExtraTreesRegressor,1.0775,0.8118,1.038,-0.6656,0.091,0.1533,0.3711,0.523,0.6092,0.3691,0.7013,0.3934
LinearRegression,1.2517,0.8227,1.1188,-0.935,0.1099,0.1722,0.6979,0.6548,0.8354,-0.1864,0.6201,0.4869
KNeighborsRegressor,0.7367,0.6927,0.8583,-0.1389,0.194,0.2564,0.3412,0.5067,0.5842,0.4199,0.6594,0.3604
SVR,0.7238,0.6942,0.8508,-0.119,0.0642,0.0709,0.3692,0.5095,0.6076,0.3724,0.6485,0.5144


In [10]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.682897517574019, -6.114074271195862, -6.33...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.682897517574019, -5.696941666398196, -5.9...","[-5.503795248512171, -5.506604078277007, -5.75...","[0.17432552151726882, 0.17727684879712866, 0.1..."
1,DecisionTreeRegressor,"[-4.94, -6.97, -5.06, -5.793469425, -5.7934694...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -7.82, -6.927245587, -6.929917...","[-4.962000000000001, -5.922, -7.004, -6.521304...","[0.6456748407673941, 0.6559999999999998, 1.111..."
2,RandomForestRegressor,"[-5.311991913136663, -6.02709578491, -5.757391...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.391189713396666, -5.672530490213334, -6.3...","[-5.360490696511768, -5.656821579548169, -6.38...","[0.17893768805714805, 0.2569776098740846, 0.37..."
3,GradientBoostingRegressor,"[-5.001145771268826, -5.958810528606914, -5.45...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.798097169834813, -5.859508342863473, -6.9...","[-5.112701245363833, -5.908847555856864, -6.69...","[0.37243758249407755, 0.3833700518428232, 0.62..."
4,AdaBoostRegressor,"[-5.282500000000001, -5.741428571428572, -5.86...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.173192133999999, -5.977412934681819, -6.0...","[-5.2676068478526314, -5.682049759195207, -6.0...","[0.17495638215281503, 0.37109269488091773, 0.4..."
5,XGBRegressor,"[-5.0236025, -6.0459657, -5.1496067, -5.867244...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3382225, -5.579111, -6.318296, -6.9381027...","[-5.3215857, -5.763702, -6.736338, -6.595792, ...","[0.27006215, 0.50087476, 0.7288663, 0.30035895..."
6,ExtraTreesRegressor,"[-4.933399999999999, -6.926500000000008, -5.23...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.8094000000000054, -6.219100000000001, -7....","[-5.004502175707505, -5.836185707168002, -7.00...","[0.47273804711970324, 0.6054635015919565, 0.76..."
7,LinearRegression,"[-4.412120833994424, -6.311838740957146, -4.62...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.897533518349335, -5.48471598889263, -9.04...","[-5.147759128402035, -5.229781358304389, -8.17...","[1.3027994618818817, 0.6419525138670001, 0.701..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -5.1...","[-5.6546666666666665, -5.666000000000001, -5.3...","[0.14230796026770795, 0.14081350945290821, 0.1..."
9,SVR,"[-5.523043330238771, -5.669801407333472, -5.67...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.794644222213698, -5.810487310164273, -5.3...","[-5.6967319987484855, -5.7412999999274605, -5....","[0.285552106577087, 0.25728693551069687, 0.058..."


In [11]:
result_df.to_csv('results/Fingerprints/Results_All_const_rem_fingerprints_MDCK.csv')
prediction_df.to_csv('results/Fingerprints/Prediction_data_All_const_rem_fingerprints_MDCK.csv')

In [12]:
#Morgan fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/morgan_fp_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/morgan_fp_test_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_fp

X_train shape:  (51, 2048)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 2048)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 4
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testi

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7445,0.7088,0.8629,-0.151,-0.0942,-0.0393,0.5459,0.5726,0.7389,0.0719,0.2686,0.2271
DecisionTreeRegressor,0.9207,0.7323,0.9595,-0.4233,0.1887,0.2807,0.3263,0.4973,0.5712,0.4453,0.738,0.4711
RandomForestRegressor,0.7561,0.6826,0.8696,-0.1689,0.2092,0.3127,0.2173,0.4196,0.4662,0.6305,0.7962,0.4601
GradientBoostingRegressor,0.8936,0.7018,0.9453,-0.3813,0.2221,0.3511,0.2983,0.4905,0.5462,0.4928,0.7724,0.5702
AdaBoostRegressor,0.7112,0.6818,0.8433,-0.0994,0.2119,0.2766,0.2247,0.3999,0.4741,0.6179,0.7919,0.5372
XGBRegressor,0.9075,0.6994,0.9527,-0.4029,0.2162,0.3746,0.3171,0.4895,0.5631,0.461,0.7264,0.5041
ExtraTreesRegressor,0.9418,0.7338,0.9704,-0.4558,0.1682,0.2768,0.3585,0.5194,0.5988,0.3905,0.7093,0.4601
LinearRegression,1.0265,0.708,1.0132,-0.5868,0.265,0.3239,0.5535,0.6196,0.744,0.059,0.595,0.3994
KNeighborsRegressor,0.7691,0.7171,0.877,-0.1889,0.2225,0.2974,0.4473,0.5219,0.6688,0.2395,0.5528,0.2893
SVR,0.7732,0.7225,0.8793,-0.1952,-0.006,0.0024,0.3507,0.5081,0.5922,0.4039,0.6622,0.5647


In [13]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.431899695363696, -5.431899695363696, -5.43...","[0.08096631896519323, 0.08096631896519323, 0.0..."
1,DecisionTreeRegressor,"[-6.929917083, -6.97, -5.195, -5.793469425, -5...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -7.82, -6.927245587, -5.453495...","[-5.361, -5.391, -6.861999999999999, -6.809796...","[0.7633505092681866, 0.599102662320908, 0.9452..."
2,RandomForestRegressor,"[-5.444057791020001, -6.835970000000008, -5.25...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.516890837496668, -5.688767504163336, -6.5...","[-5.405798983036735, -5.656295180114808, -6.45...","[0.23598314192653955, 0.21365990223656536, 0.2..."
3,GradientBoostingRegressor,"[-5.6248985009875545, -7.6553987686755685, -4....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.6473838477230425, -5.865850383798877, -6....","[-5.569206768627078, -5.863218633412728, -7.05...","[0.41422433770853745, 0.28472121221116664, 0.5..."
4,AdaBoostRegressor,"[-5.5322460303181815, -6.888333333333333, -5.5...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.52125, -5.5377225707777775, -6.09, -6.064...","[-5.334547903576611, -5.430791200095166, -6.07...","[0.13475896263685877, 0.12497547567278622, 0.2..."
5,XGBRegressor,"[-6.0731106, -7.8135104, -5.187934, -6.146343,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3840137, -6.2445383, -6.4294143, -6.89694...","[-5.112603, -6.02817, -6.604773, -6.6355524, -...","[0.47593063, 0.29658327, 0.6902337, 0.3530023,..."
6,ExtraTreesRegressor,"[-6.391680929090008, -6.970000000000017, -5.14...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.561200000000005, -6.25, -7.82000000000001...","[-5.164850000000003, -5.4750346852280005, -6.8...","[0.3518338698874813, 0.6009241337334386, 0.878..."
7,LinearRegression,"[-4.697589991787713, -6.738835677961945, -4.42...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.838188606117533, -5.554719750175392, -6.1...","[-5.196967516931039, -5.5426177065939735, -5.9...","[0.529316879399676, 0.5984889146578108, 0.4527..."
8,KNeighborsRegressor,"[-5.183333333333334, -5.786666666666666, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.503333333333333, -5.503333333333333, -4.8...","[-5.390666666666666, -5.483333333333333, -4.75...","[0.19764614845728687, 0.187936159373336, 0.181..."
9,SVR,"[-5.59323403953569, -5.954349517113198, -5.618...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.778955248416474, -5.736412124225234, -5.3...","[-5.558253001620082, -5.629575116892168, -5.28...","[0.35570124192838404, 0.22985032593113042, 0.0..."


In [14]:
df_morgan_fp.to_csv('results/Fingerprints/Results_Morgan_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Morgan_fp_MDCK.csv')

In [15]:
#Morgan count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/count_morgan_fp_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/count_morgan_fp_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_count_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_count_fp

X_train shape:  (51, 2048)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 2048)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 8
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 3
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test



0.028134297862200452


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7844,0.7307,0.8857,-0.2126,-0.1428,-0.0731,0.5262,0.5746,0.7254,0.1054,0.3337,0.2928
DecisionTreeRegressor,0.9775,0.7711,0.9887,-0.511,0.2115,0.2734,0.3713,0.5238,0.6094,0.3688,0.6759,0.3659
RandomForestRegressor,0.7518,0.7041,0.8671,-0.1622,0.1383,0.2032,0.2572,0.4463,0.5072,0.5627,0.7524,0.3769
GradientBoostingRegressor,1.0399,0.8056,1.0197,-0.6075,-0.0248,-0.0317,0.3848,0.5468,0.6203,0.3458,0.6833,0.4264
AdaBoostRegressor,0.8571,0.7587,0.9258,-0.325,0.0457,0.0214,0.2641,0.426,0.5139,0.551,0.7503,0.4209
XGBRegressor,1.128,0.8403,1.0621,-0.7437,0.0306,0.0482,0.3314,0.4894,0.5757,0.4365,0.6981,0.4649
ExtraTreesRegressor,0.8863,0.7433,0.9415,-0.3701,0.1371,0.1782,0.4085,0.5534,0.6392,0.3055,0.6728,0.3879
LinearRegression,1.3863,0.8953,1.1774,-1.1431,0.1079,0.1668,0.7527,0.6573,0.8676,-0.2796,0.5319,0.4374
KNeighborsRegressor,0.8014,0.743,0.8952,-0.2388,0.1485,0.2365,0.4625,0.5464,0.6801,0.2137,0.5326,0.3136
SVR,0.7349,0.6977,0.8573,-0.1361,0.0468,0.0245,0.3367,0.4951,0.5803,0.4276,0.6834,0.5695


In [16]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.424136649304684, -6.082059775932106, -5.99...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.424136649304684, -5.424136649304684, -5.6...","[-5.392722008630507, -5.392722008630507, -5.61...","[0.14859947763635886, 0.14859947763635886, 0.1..."
1,DecisionTreeRegressor,"[-6.22, -5.93, -5.195, -5.793469425, -5.547418...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -6.17, -6.927245587, -5.453495...","[-5.0961444334, -5.8691444334, -6.396, -6.5263...","[0.6531706507117631, 0.4709656090574217, 0.827..."
2,RandomForestRegressor,"[-5.545743328870002, -6.451074170830005, -5.55...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.456660870676669, -5.744730992470002, -6.5...","[-5.341749295657665, -5.642562720364613, -6.39...","[0.24515355135829134, 0.24293870734742698, 0.3..."
3,GradientBoostingRegressor,"[-5.781373743769475, -7.058643488327688, -4.99...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.45957738103695, -6.176454414422514, -7.39...","[-5.142652865284374, -5.814178953146951, -6.90...","[0.5088154694666068, 0.46103050476508345, 0.50..."
4,AdaBoostRegressor,"[-5.561668507615385, -6.736449117399999, -5.81...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.281999999999999, -5.819615094814815, -5.6...","[-5.273926052330476, -5.588487210795127, -6.04...","[0.1680713454303819, 0.2890412714430882, 0.409..."
5,XGBRegressor,"[-5.737909, -7.4206095, -5.037543, -5.8247414,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.051465, -6.243319, -6.221617, -6.936932, ...","[-5.0392733, -5.913934, -5.977878, -6.688245, ...","[0.50836474, 0.4824697, 0.13208126, 0.368149, ..."
6,ExtraTreesRegressor,"[-5.965666448120007, -6.6105000000000045, -5.3...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.865000000000008, -6.2281669404499995, -7....","[-5.041258393322006, -5.797182829229667, -6.82...","[0.4247036720301815, 0.3721761128851866, 0.416..."
7,LinearRegression,"[-4.0, -5.585596460555274, -4.0, -5.6089634708...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -4.225460828221529, -9.884734317384428...","[-4.0, -4.759746928757257, -7.4479125422094254...","[0.0, 0.8337546912336395, 1.701064455015078, 0..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -4.8...","[-5.4126666666666665, -5.666, -4.75, -6.220185...","[0.21437661564017046, 0.14081350945290877, 0.1..."
9,SVR,"[-5.609201158915962, -5.6646398202271175, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.670689936298147, -5.723245533802789, -5.5...","[-5.4817794010725205, -5.603897278043824, -5.5...","[0.25559520867088065, 0.16206752563565405, 0.0..."


In [17]:
df_morgan_count_fp.to_csv('results/Fingerprints/Results_Count_Morgan_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Count_Morgan_fp_MDCK.csv')

In [18]:
#AtomPairs2d fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/AtomPairs2D_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/AtomPairs2D_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2D_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2D_fp

X_train shape:  (51, 780)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 780)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 3
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the trai



0.1421587298869399




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.696,0.6868,0.8343,-0.0759,-0.1047,0.0305,0.5304,0.5976,0.7283,0.0984,0.4284,0.3717
DecisionTreeRegressor,0.7538,0.7188,0.8682,-0.1652,0.0282,0.0964,0.4854,0.562,0.6967,0.1748,0.4377,0.3379
RandomForestRegressor,0.7512,0.7178,0.8667,-0.1612,0.0264,0.0964,0.4867,0.5629,0.6977,0.1726,0.4381,0.3379
GradientBoostingRegressor,0.7538,0.7188,0.8682,-0.1652,0.0282,0.0964,0.4854,0.562,0.6967,0.1748,0.4377,0.3379
AdaBoostRegressor,0.8105,0.7518,0.9003,-0.253,-0.0227,0.0484,0.4782,0.5622,0.6915,0.187,0.4404,0.3379
XGBRegressor,0.7536,0.7187,0.8681,-0.1649,0.0282,0.0964,0.4854,0.562,0.6967,0.1748,0.4377,0.3379
ExtraTreesRegressor,0.7538,0.7188,0.8682,-0.1652,0.0282,0.0964,0.4854,0.562,0.6967,0.1748,0.4377,0.3379
LinearRegression,0.755,0.7188,0.8689,-0.1671,0.0294,0.0946,0.4854,0.562,0.6967,0.1748,0.4377,0.3379
KNeighborsRegressor,0.8827,0.7893,0.9395,-0.3645,-0.0576,0.0446,0.6854,0.6521,0.8279,-0.1652,0.0991,0.1824
SVR,0.7278,0.6642,0.8531,-0.125,0.0597,0.0846,0.5789,0.5773,0.7609,0.0158,0.3589,0.3619


In [19]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.567076712219281, -5.567076712219281, -5.56...","[0.1272708901920145, 0.1272708901920145, 0.127..."
1,DecisionTreeRegressor,"[-5.346875, -6.2219999999999995, -6.2219999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.346875, -5.346875, -5.346875, -5.85826198...","[-5.317460714285714, -5.317460714285714, -5.31...","[0.06052896970434374, 0.06052896970434374, 0.0..."
2,RandomForestRegressor,"[-5.3613644095540405, -6.193135364357863, -6.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.36136440955404, -5.36136440955404, -5.361...","[-5.315527625950532, -5.315527625950532, -5.31...","[0.06843983365593138, 0.06843983365593138, 0.0..."
3,GradientBoostingRegressor,"[-5.346885789857637, -6.221987545313435, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.346885789857637, -5.346885789857637, -5.3...","[-5.317470119496489, -5.317470119496489, -5.31...","[0.06052802111288864, 0.06052802111288864, 0.0..."
4,AdaBoostRegressor,"[-5.3088235294117645, -6.27, -6.27, -5.9211516...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3088235294117645, -5.3088235294117645, -5...","[-5.361861395570374, -5.361861395570374, -5.36...","[0.1080421892258728, 0.1080421892258728, 0.108..."
5,XGBRegressor,"[-5.3470435, -6.2218075, -6.2218075, -5.85834,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3470435, -5.3470435, -5.3470435, -5.85834...","[-5.3176765, -5.3176765, -5.3176765, -5.807809...","[0.060521808, 0.060521808, 0.060521808, 0.0728..."
6,ExtraTreesRegressor,"[-5.346875000000006, -6.221999999999991, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.346875000000006, -5.346875000000006, -5.3...","[-5.317460714285722, -5.317460714285722, -5.31...","[0.06052896970434286, 0.06052896970434286, 0.0..."
7,LinearRegression,"[-5.346875000000001, -6.222, -6.222, -5.858261...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.346875000000001, -5.346875000000001, -5.3...","[-5.317460714285714, -5.317460714285714, -5.31...","[0.06052896970434419, 0.06052896970434419, 0.0..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.6...","[-5.6259999999999994, -5.6259999999999994, -5....","[0.16096100286853474, 0.16096100286853474, 0.1..."
9,SVR,"[-5.19980221288724, -5.830341035767339, -5.830...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.19980221288724, -5.19980221288724, -5.199...","[-5.2063612582588, -5.2063612582588, -5.206361...","[0.04422598182623809, 0.04422598182623809, 0.0..."


In [20]:
df_AtomPairs2D_fp.to_csv('results/Fingerprints/Results_AtomPairs2D_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_AtomPairs2D_fp_MDCK.csv')

In [21]:
#AtomPairs2d Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/AtomPairs2DCount_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/AtomPairs2DCount_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2DCount_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2DCount_fp

X_train shape:  (51, 780)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 780)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 12
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 13
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of tes



0.5241691399897415




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6212,0.6565,0.7882,0.0397,0.228,0.1734,0.4333,0.5746,0.6583,0.2633,0.5732,0.3269
DecisionTreeRegressor,0.9301,0.7519,0.9644,-0.4378,0.2993,0.3055,0.3806,0.5518,0.6169,0.3529,0.6908,0.2779
RandomForestRegressor,0.717,0.6866,0.8467,-0.1083,0.2229,0.253,0.3329,0.4966,0.577,0.434,0.6984,0.3329
GradientBoostingRegressor,0.8801,0.7621,0.9381,-0.3605,0.1653,0.1856,0.407,0.541,0.638,0.308,0.6879,0.3164
AdaBoostRegressor,0.7017,0.6896,0.8377,-0.0848,0.2765,0.2401,0.3408,0.5079,0.5838,0.4206,0.7132,0.2944
XGBRegressor,0.7931,0.6889,0.8906,-0.226,0.2798,0.2669,0.3873,0.5453,0.6224,0.3415,0.6789,0.3549
ExtraTreesRegressor,0.8526,0.7379,0.9234,-0.318,0.2058,0.2421,0.3898,0.5471,0.6244,0.3373,0.6918,0.3109
LinearRegression,2.6488,1.2609,1.6275,-3.0946,0.2338,0.2745,1.1788,0.8583,1.0857,-1.0039,0.578,0.4484
KNeighborsRegressor,0.7623,0.677,0.8731,-0.1784,0.1943,0.1743,0.3902,0.4879,0.6247,0.3366,0.5946,0.2948
SVR,0.6557,0.6224,0.8098,-0.0136,0.2354,0.1637,0.3427,0.4588,0.5854,0.4174,0.7676,0.674


In [22]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.801092403684741, -6.279518273163124, -6.27...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.801092403684741, -5.801092403684741, -5.8...","[-5.721090726827581, -5.721090726827581, -5.72...","[0.13644640707471545, 0.13644640707471545, 0.1..."
1,DecisionTreeRegressor,"[-4.94, -5.33, -6.97, -5.793469425, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.34, -6.25, -5.93, -6.927245587, -6.569578...","[-5.438, -5.922, -5.921999999999999, -6.571015...","[0.6061484966573784, 0.6559999999999998, 0.625..."
2,RandomForestRegressor,"[-5.251199999999996, -5.390091569819997, -6.26...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.677299999999994, -5.872400000000002, -6.2...","[-5.708859999999998, -5.760440000000001, -6.02...","[0.09197144339413205, 0.3192603802541123, 0.21..."
3,GradientBoostingRegressor,"[-4.964881842643498, -5.128927195404449, -6.34...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.178081652417845, -6.249346628712105, -6.8...","[-5.555170691664529, -5.9747732929507364, -6.1...","[0.4346731395012083, 0.4860065778359095, 0.491..."
4,AdaBoostRegressor,"[-5.3725, -5.530000000000001, -6.4852695702857...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.504999999999999, -5.755, -6.2049999999999...","[-5.59079462009478, -5.80189561804183, -5.9725...","[0.2597673820200409, 0.3218838919915005, 0.313..."
5,XGBRegressor,"[-4.9411807, -5.273791, -6.770791, -5.6544137,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.7171683, -6.1566877, -5.577727, -6.942414...","[-5.528285, -5.9325666, -5.748015, -6.660899, ...","[0.2971646, 0.58796966, 0.24468806, 0.30170453..."
6,ExtraTreesRegressor,"[-4.957299999999998, -5.298899999999999, -6.68...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.805699999999996, -6.2083, -6.031668240780...","[-5.693159999999999, -5.965580000000001, -5.90...","[0.24348770482305576, 0.4639873162059496, 0.34..."
7,LinearRegression,"[-7.884570042809932, -4.0, -5.124744134176638,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -5.770441495156561, -10.0, -6.84710134...","[-5.169044447737309, -6.635723590784858, -8.80...","[1.5637243594864858, 1.9280850769377937, 1.469..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -6.07...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -5.9...","[-5.6259999999999994, -5.666000000000001, -6.0...","[0.16096100286853474, 0.14081350945290821, 0.2..."
9,SVR,"[-5.516176473156865, -5.522959185174364, -5.69...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.648025051124643, -5.4792949774874184, -5....","[-5.685719029310985, -5.556043578716791, -5.57...","[0.10578901169302979, 0.15160542404475955, 0.1..."


In [23]:
df_AtomPairs2DCount_fp.to_csv('results/Fingerprints/Results_AtomPairs2D_Count_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_df_AtomPairs2D_Count_fp_MDCK.csv')

In [24]:
#EState fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/EState_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/EState_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_estate_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_estate_fp

X_train shape:  (51, 79)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 79)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 0
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.694363
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.549798
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of d

  pearson_test, _ = pearsonr(y_test, predictions_test_mean)
  spearman_test, _ = spearmanr(y_test, predictions_test_mean)


0.17702918536074175
0.1793326935898093
0.17997508203356594
0.17936665437141053
0.17933222711882124
0.18322264527500676
-0.14410688305165542
0.04681620962351818
0.055771147711443425




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6869,0.6819,0.8288,-0.0618,-0.3293,-0.3067,0.5884,0.6227,0.767,-0.0002,,
DecisionTreeRegressor,0.7319,0.7177,0.8555,-0.1315,0.1132,0.1344,0.4827,0.573,0.6948,0.1793,0.4322,0.3037
RandomForestRegressor,0.6943,0.7055,0.8332,-0.0733,0.1474,0.1642,0.4841,0.5754,0.6958,0.177,0.4301,0.3037
GradientBoostingRegressor,0.7319,0.7177,0.8555,-0.1315,0.1132,0.1344,0.4827,0.573,0.6948,0.1793,0.4322,0.3037
AdaBoostRegressor,0.6977,0.702,0.8353,-0.0786,0.1971,0.1695,0.4824,0.571,0.6945,0.18,0.4336,0.3037
XGBRegressor,0.7314,0.7175,0.8552,-0.1307,0.1136,0.1344,0.4827,0.573,0.6948,0.1794,0.4322,0.3037
ExtraTreesRegressor,0.7319,0.7177,0.8555,-0.1315,0.1132,0.1344,0.4827,0.573,0.6948,0.1793,0.4322,0.3037
LinearRegression,0.6732,0.6916,0.8205,-0.0407,0.1887,0.1825,0.4804,0.5688,0.6931,0.1832,0.4355,0.3037
KNeighborsRegressor,0.9268,0.8008,0.9627,-0.4328,-0.0824,-0.0016,0.673,0.6327,0.8204,-0.1441,0.1582,0.2512
SVR,0.6976,0.6622,0.8353,-0.0785,0.1448,0.1258,0.5607,0.5876,0.7488,0.0468,0.4015,0.3271


In [25]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.753098213672638, -5.753098213672638, -5.75...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.753098213672638, -5.753098213672638, -5.7...","[-5.67155390815037, -5.67155390815037, -5.6715...","[0.06667572909639166, 0.06667572909639166, 0.0..."
1,DecisionTreeRegressor,"[-5.247999999999999, -6.357142857142857, -6.35...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.247999999999999, -5.247999999999999, -5.2...","[-5.227047474747474, -5.227047474747474, -5.22...","[0.0640910252723193, 0.0640910252723193, 0.064..."
2,RandomForestRegressor,"[-5.262657238594738, -6.305263512598512, -6.30...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.262657238594738, -5.262657238594738, -5.2...","[-5.228165137862138, -5.228165137862138, -5.22...","[0.06795565984511041, 0.06795565984511041, 0.0..."
3,GradientBoostingRegressor,"[-5.248013416115953, -6.3571268128729574, -6.3...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.248013416115953, -5.248013416115953, -5.2...","[-5.227059480539671, -5.227059480539671, -5.22...","[0.0640896010153116, 0.0640896010153116, 0.064..."
4,AdaBoostRegressor,"[-5.138000000000001, -6.359999999999999, -6.35...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.138000000000001, -5.138000000000001, -5.1...","[-5.233639393939393, -5.233639393939393, -5.23...","[0.08352817794349457, 0.08352817794349457, 0.0..."
5,XGBRegressor,"[-5.24825, -6.3566995, -6.3566995, -5.8583474,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.24825, -5.24825, -5.24825, -5.8583474, -5...","[-5.227351, -5.227351, -5.227351, -5.8077884, ...","[0.06406123, 0.06406123, 0.06406123, 0.0728661..."
6,ExtraTreesRegressor,"[-5.247999999999995, -6.357142857142854, -6.35...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.247999999999995, -5.247999999999995, -5.2...","[-5.227047474747475, -5.227047474747475, -5.22...","[0.06409102527231593, 0.06409102527231593, 0.0..."
7,LinearRegression,"[-5.235787878787878, -6.33969696969697, -6.339...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.235787878787878, -5.235787878787878, -5.2...","[-5.240701217086212, -5.240701217086212, -5.24...","[0.07628099169213029, 0.07628099169213029, 0.0..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.6...","[-5.6259999999999994, -5.6259999999999994, -5....","[0.16096100286853474, 0.16096100286853474, 0.1..."
9,SVR,"[-5.06042401618693, -5.829928750830033, -5.829...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.06042401618693, -5.06042401618693, -5.060...","[-5.099879802362196, -5.099879802362196, -5.09...","[0.057526732934047475, 0.057526732934047475, 0..."


In [26]:
df_estate_fp.to_csv('results/Fingerprints/Results_EState_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_EState_fp_MDCK.csv')

In [27]:
#Extended fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Extended_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Extended_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_extended_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_extended_fp

X_train shape:  (51, 1024)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1024)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 32
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7231,0.702,0.8503,-0.1177,-0.1799,-0.0669,0.5326,0.5985,0.7298,0.0946,0.4068,0.384
DecisionTreeRegressor,0.9944,0.8008,0.9972,-0.5372,0.0568,0.1389,0.6071,0.6793,0.7792,-0.0322,0.4472,0.251
RandomForestRegressor,0.839,0.7594,0.916,-0.297,0.0299,0.0834,0.4963,0.6026,0.7045,0.1563,0.477,0.3884
GradientBoostingRegressor,0.9208,0.7705,0.9596,-0.4234,0.0518,0.0865,0.5408,0.63,0.7354,0.0806,0.4484,0.3884
AdaBoostRegressor,0.8386,0.7663,0.9157,-0.2963,0.1442,0.1359,0.5007,0.5931,0.7076,0.1489,0.487,0.3779
XGBRegressor,0.9884,0.7985,0.9942,-0.528,0.0436,0.0835,0.5795,0.6521,0.7613,0.0148,0.4231,0.2645
ExtraTreesRegressor,0.9994,0.8021,0.9997,-0.5449,0.0617,0.1451,0.5955,0.6698,0.7717,-0.0124,0.4526,0.281
LinearRegression,1.1449,0.8181,1.07,-0.7698,-0.0286,0.0304,0.4158,0.547,0.6449,0.2931,0.5683,0.438
KNeighborsRegressor,0.8063,0.7114,0.8979,-0.2464,-0.0324,-0.006,0.5807,0.566,0.762,0.0128,0.3561,0.3945
SVR,0.7777,0.7149,0.8819,-0.2022,-0.0206,-0.0196,0.5095,0.5453,0.7138,0.1338,0.4619,0.416


In [28]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.555513440841483, -5.555513440841483, -5.55...","[0.13404624229798626, 0.13404624229798626, 0.1..."
1,DecisionTreeRegressor,"[-5.295, -6.2219999999999995, -6.2219999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.6850000000000005, -4.6850000000000005, -5...","[-4.67, -4.67, -5.470000000000001, -6.46671853...","[0.03000000000000007, 0.03000000000000007, 0.1..."
2,RandomForestRegressor,"[-5.300279999999999, -6.1931353643578655, -6.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.927544047619049, -4.925586938171549, -5.3...","[-4.972449409741874, -4.974512746134584, -5.46...","[0.03247505701970131, 0.04053999030841141, 0.1..."
3,GradientBoostingRegressor,"[-5.253212635373249, -6.2256605091155235, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.6937941323889, -4.6937941323889, -5.25321...","[-4.822774316153668, -4.858312933088546, -5.43...","[0.11862640051335409, 0.09625417857789582, 0.1..."
4,AdaBoostRegressor,"[-5.176666666666667, -6.4514491174, -6.4514491...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.75, -4.75, -5.176666666666667, -6.4514491...","[-4.846040621818181, -4.846040621818181, -5.40...","[0.2526009704735208, 0.2526009704735208, 0.154..."
5,XGBRegressor,"[-5.2943616, -6.222085, -6.222085, -5.886463, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.6900196, -4.6900196, -5.2943616, -6.57034...","[-4.698951, -4.7618384, -5.469215, -6.4876556,...","[0.07813407, 0.11411654, 0.14727867, 0.2549756..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -6.221999999999991, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.690500000000001, -4.690500000000001, -5.2...","[-4.6874800000000025, -4.686580000000003, -5.4...","[0.0031036430207112384, 0.002257565059970636, ..."
7,LinearRegression,"[-5.294999999999997, -6.222, -6.222, -5.705397...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.714005709253274, -5.7057083612022605, -5....","[-5.570407704285721, -5.5242652656712234, -5.4...","[0.6377077157958417, 0.5941530669253395, 0.147..."
8,KNeighborsRegressor,"[-5.183333333333334, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.206666666666667, -5.206666666666667, -5.1...","[-5.259333333333333, -5.259333333333333, -5.36...","[0.1053333333333331, 0.1053333333333331, 0.128..."
9,SVR,"[-5.040148986263137, -5.8301489931382715, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.564512836041758, -5.568422924631442, -5.0...","[-5.542985817643418, -5.54114243775402, -5.260...","[0.29250263674532545, 0.2860508910364153, 0.18..."


In [29]:
df_extended_fp.to_csv('results/Fingerprints/Results_Extended_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Extended_fp_MDCK.csv')

In [30]:
#Fingerprinter fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Fingerprinter_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Fingerprinter_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_fingerprinter_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_fingerprinter_fp

X_train shape:  (51, 1024)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1024)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 28
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7008,0.692,0.8371,-0.0833,-0.0272,0.0291,0.5073,0.5769,0.7122,0.1376,0.4284,0.4501
DecisionTreeRegressor,0.8757,0.7525,0.9358,-0.3537,0.0781,0.1715,0.5264,0.6152,0.7256,0.105,0.448,0.3679
RandomForestRegressor,0.8132,0.7496,0.9018,-0.2571,0.0604,0.1397,0.4877,0.5918,0.6983,0.171,0.4611,0.3287
GradientBoostingRegressor,0.8712,0.7638,0.9334,-0.3467,0.0685,0.1256,0.4791,0.5732,0.6922,0.1855,0.4889,0.3729
AdaBoostRegressor,0.8835,0.7908,0.9399,-0.3658,0.0628,0.1065,0.5008,0.6195,0.7077,0.1487,0.437,0.3232
XGBRegressor,0.8555,0.7449,0.925,-0.3225,0.0884,0.1529,0.5443,0.632,0.7378,0.0746,0.3897,0.1796
ExtraTreesRegressor,0.8567,0.7445,0.9256,-0.3244,0.0992,0.1849,0.5228,0.6026,0.7231,0.1112,0.4595,0.384
LinearRegression,1.0138,0.7989,1.0069,-0.5672,0.0275,0.0786,0.4819,0.5888,0.6942,0.1808,0.5008,0.4392
KNeighborsRegressor,0.7671,0.7,0.8759,-0.1859,0.0561,0.0518,0.6029,0.619,0.7765,-0.0249,0.2452,0.4122
SVR,0.7676,0.7088,0.8761,-0.1865,-0.0079,-0.0076,0.5144,0.5438,0.7172,0.1256,0.4485,0.4061


In [31]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.507383922852948, -5.507383922852948, -5.50...","[0.10397057457759268, 0.10397057457759268, 0.1..."
1,DecisionTreeRegressor,"[-4.99, -6.2219999999999995, -6.22199999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.99, -4.99, -4.99, -6.748412039, -5.739149...","[-4.9997, -4.9997, -5.1716999999999995, -6.524...","[0.21056675901005853, 0.21056675901005853, 0.1..."
2,RandomForestRegressor,"[-5.066547516082141, -6.193135364357865, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.135617619047621, -5.2175166082626205, -5....","[-5.221908540241519, -5.253101202717454, -5.20...","[0.1256489021265099, 0.10708916347565378, 0.15..."
3,GradientBoostingRegressor,"[-4.990807370843178, -6.222101638277233, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.172497143907629, -5.168416726688939, -4.9...","[-5.238239753817956, -5.268910910012542, -5.16...","[0.210662778459824, 0.1386778475973629, 0.1622..."
4,AdaBoostRegressor,"[-4.98, -6.359999999999999, -6.359999999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.98, -4.98, -4.98, -6.927245587, -5.943314...","[-5.136960557015874, -5.205432779238095, -5.17...","[0.14843203635954935, 0.1660036634709048, 0.15..."
5,XGBRegressor,"[-4.9899874, -6.2218585, -6.2218585, -5.739042...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.1014094, -5.087356, -4.9899874, -6.748446...","[-5.126996, -5.34199, -5.1715727, -6.584859, -...","[0.19015437, 0.25940844, 0.16897678, 0.3269155..."
6,ExtraTreesRegressor,"[-4.990000000000006, -6.221999999999991, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.990000000000006, -4.990000000000006, -4.9...","[-5.0083000000000055, -5.092540000000005, -5.1...","[0.19475974943503702, 0.08034877970448034, 0.1..."
7,LinearRegression,"[-4.9899999999999975, -6.222000000000003, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.794999999999992, -4.811511427408459, -4.9...","[-5.314792707758762, -5.250376887546194, -5.17...","[0.5420942432122171, 0.4727976910709499, 0.169..."
8,KNeighborsRegressor,"[-5.183333333333334, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.1...","[-5.666, -5.666, -5.368, -6.2329041285999995, ...","[0.14081350945290877, 0.14081350945290877, 0.1..."
9,SVR,"[-5.039878407664466, -5.8306326450491035, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.5142009354753565, -5.520836777903699, -5....","[-5.522728749783225, -5.521933631774096, -5.25...","[0.26250057407575483, 0.25646374229123237, 0.1..."


In [32]:
df_fingerprinter_fp.to_csv('results/Fingerprints/Results_Fingerprinter_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Fingerprinter_fp_MDCK.csv')

In [33]:
#GraphOnly fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Graphonly_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Graphonly_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_graph_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_graph_fp

X_train shape:  (51, 1024)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1024)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 18
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7037,0.6888,0.8388,-0.0877,-0.1242,0.0451,0.5315,0.5999,0.7291,0.0964,0.4084,0.3294
DecisionTreeRegressor,0.9848,0.7955,0.9924,-0.5223,-0.0,-0.023,0.3912,0.5149,0.6255,0.3349,0.5855,0.3923
RandomForestRegressor,0.8432,0.7617,0.9183,-0.3035,-0.0127,0.0584,0.4003,0.5372,0.6327,0.3194,0.5823,0.3591
GradientBoostingRegressor,0.9554,0.793,0.9775,-0.4769,-0.0295,-0.0026,0.3963,0.5316,0.6295,0.3263,0.5739,0.3923
AdaBoostRegressor,0.7409,0.7337,0.8607,-0.1452,0.1086,0.1708,0.3929,0.5297,0.6268,0.332,0.5847,0.3702
XGBRegressor,0.9296,0.7793,0.9641,-0.437,-0.0032,0.0038,0.4109,0.5528,0.641,0.3015,0.5504,0.2873
ExtraTreesRegressor,0.9829,0.7967,0.9914,-0.5194,-0.0251,-0.0291,0.399,0.5282,0.6316,0.3218,0.5711,0.3923
LinearRegression,0.8642,0.7506,0.9296,-0.3359,0.0659,0.1084,0.4189,0.5606,0.6472,0.2878,0.5534,0.4088
KNeighborsRegressor,0.8668,0.7656,0.931,-0.34,-0.0281,-0.0282,0.5632,0.5597,0.7505,0.0425,0.3367,0.3591
SVR,0.7731,0.711,0.8793,-0.1951,-0.0188,0.0118,0.4916,0.5312,0.7011,0.1643,0.4831,0.3978


In [34]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.552891276637582, -5.552891276637582, -5.55...","[0.11628391047220203, 0.11628391047220203, 0.1..."
1,DecisionTreeRegressor,"[-5.261666666666667, -6.2219999999999995, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -5.261666666666667, -6.7484120...","[-5.922, -5.922, -5.346666666666668, -6.609785...","[0.6559999999999998, 0.6559999999999998, 0.062..."
2,RandomForestRegressor,"[-5.3022464907867946, -6.193135364357865, -6.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.799674891774895, -5.799674891774895, -5.3...","[-5.68029762359589, -5.68029762359589, -5.3577...","[0.27015794046184377, 0.27015794046184377, 0.0..."
3,GradientBoostingRegressor,"[-5.25750871507881, -6.222580614444872, -6.222...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.2406161443326145, -6.2406161443326145, -5...","[-5.940087190573535, -5.940087190573535, -5.34...","[0.5969647341596444, 0.5969647341596444, 0.060..."
4,AdaBoostRegressor,"[-5.148333333333334, -6.359999999999999, -6.35...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.4275254784285725, -5.4275254784285725, -5...","[-5.644177282676623, -5.644177282676623, -5.37...","[0.30840309967909235, 0.30840309967909235, 0.1..."
5,XGBRegressor,"[-5.2615204, -6.2220006, -6.2220006, -5.887787...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.2485046, -6.2485046, -5.2615204, -6.73599...","[-5.965927, -5.965927, -5.346581, -6.555928, -...","[0.56540805, 0.56540805, 0.06268806, 0.3787084..."
6,ExtraTreesRegressor,"[-5.26166666666666, -6.221999999999991, -6.221...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -5.26166666666666, -6.74192791...","[-5.932270000000002, -5.932270000000002, -5.34...","[0.6354599999999963, 0.6354599999999963, 0.062..."
7,LinearRegression,"[-5.261666666666665, -6.222, -6.222, -5.657991...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.249999999999999, -6.249999999999999, -5.2...","[-5.964896012280277, -5.964896012280277, -5.34...","[0.570207975439447, 0.570207975439447, 0.06270..."
8,KNeighborsRegressor,"[-5.286666666666666, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.503333333333333, -5.503333333333333, -5.2...","[-5.472, -5.472, -5.430000000000001, -6.298406...","[0.18465102220134053, 0.18465102220134053, 0.1..."
9,SVR,"[-5.1700111837710025, -5.83019960096834, -5.83...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.5394878696579815, -5.5394878696579815, -5...","[-5.548150441912247, -5.548150441912247, -5.35...","[0.3393647411080058, 0.3393647411080058, 0.122..."


In [35]:
df_graph_fp.to_csv('results/Fingerprints/Results_Graphonly_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Graphonly_fp_MDCK.csv')

In [36]:
#KlekotaRoth fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/KlekotaRoth_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/KlekotaRoth_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRoth_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRoth_fp

X_train shape:  (51, 4860)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 4860)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 25
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test



0.668619522155389


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7764,0.7259,0.8811,-0.2002,-0.1743,-0.1977,0.5657,0.586,0.7521,0.0383,0.2047,0.2103
DecisionTreeRegressor,0.9239,0.6805,0.9612,-0.4282,0.2791,0.3538,0.249,0.4425,0.499,0.5767,0.7619,0.6298
RandomForestRegressor,0.7204,0.6628,0.8488,-0.1137,0.2405,0.2982,0.1941,0.3839,0.4406,0.67,0.8358,0.7228
GradientBoostingRegressor,0.9472,0.7335,0.9733,-0.4643,0.1228,0.1876,0.1961,0.3925,0.4428,0.6666,0.8203,0.6234
AdaBoostRegressor,0.788,0.7035,0.8877,-0.2181,0.1957,0.2251,0.1869,0.3746,0.4323,0.6823,0.8344,0.6179
XGBRegressor,0.9422,0.7118,0.9707,-0.4565,0.2095,0.2675,0.1979,0.389,0.4448,0.6636,0.8185,0.6566
ExtraTreesRegressor,0.9183,0.6607,0.9583,-0.4196,0.2457,0.3279,0.2383,0.4264,0.4882,0.5949,0.7739,0.7062
LinearRegression,1.4481,0.9169,1.2034,-1.2385,0.0009,0.0157,0.4075,0.5714,0.6384,0.3072,0.6499,0.5572
KNeighborsRegressor,0.7973,0.7288,0.8929,-0.2325,0.1925,0.2836,0.4118,0.5126,0.6417,0.3,0.6337,0.2873
SVR,0.7207,0.6888,0.8489,-0.1141,0.0698,0.0721,0.3877,0.5271,0.6227,0.3409,0.6145,0.6069


In [37]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.4384167708207585, -5.4384167708207585, -5....","[0.07334361034928548, 0.07334361034928548, 0.0..."
1,DecisionTreeRegressor,"[-5.295, -6.97, -5.195, -5.597095796, -6.92991...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -5.27, -6.927245587, -5.453495...","[-5.922, -5.922, -5.7763333333333335, -6.80979...","[0.6559999999999998, 0.6559999999999998, 0.918..."
2,RandomForestRegressor,"[-5.37972, -6.781705000000011, -5.301308330419...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.775941595740002, -5.81105392511928, -6.36...","[-5.668750459223013, -5.687732367614725, -6.20...","[0.3152534779723068, 0.32251412938011464, 0.27..."
3,GradientBoostingRegressor,"[-5.339806958570261, -6.904892676348301, -5.21...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.154907757913286, -6.1698402942863275, -6....","[-5.863404415832678, -5.867992960325095, -6.40...","[0.5869485313880611, 0.5886006848210488, 0.529..."
4,AdaBoostRegressor,"[-5.353803037375001, -6.922499999999999, -5.51...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.390746644052632, -5.546407395466668, -6.7...","[-5.504687798268892, -5.5358199485516995, -6.5...","[0.15339622565132824, 0.14252290656622219, 0.3..."
5,XGBRegressor,"[-5.2961392, -6.9689445, -5.142122, -6.027596,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.245496, -6.332196, -6.442806, -6.9264865,...","[-5.9783907, -5.9774256, -6.3995347, -6.644269...","[0.53710043, 0.46481588, 0.40066418, 0.5637051..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -6.970000000000017, -5.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -5.718499999999995, -6.9272455...","[-5.922000000000002, -5.920700000000002, -5.98...","[0.6559999999999959, 0.6553548351847225, 0.836..."
7,LinearRegression,"[-5.295000000000001, -6.9700000000000015, -4.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.250000000000002, -6.514546486530007, -8.0...","[-5.96530262896364, -6.521057604054944, -6.917...","[0.5693947420727226, 1.0214019602942757, 0.924..."
8,KNeighborsRegressor,"[-5.183333333333334, -6.906666666666666, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.503333333333333, -5.503333333333333, -4.8...","[-5.472, -5.472, -4.761333333333334, -6.220185...","[0.18465102220134053, 0.18465102220134053, 0.2..."
9,SVR,"[-5.313086057158264, -5.985939607960501, -5.48...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.7897277117251775, -5.792946768825151, -5....","[-5.638075979489695, -5.636847277089234, -5.29...","[0.4224325573178793, 0.415502513751978, 0.0580..."


In [38]:
df_KlekotaRoth_fp.to_csv('results/Fingerprints/Results_KlekotaRoth_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_KlekotaRoth_fp_MDCK.csv')

In [39]:
#KlekotaRoth Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/KlekotaRothCount_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/KlekotaRothCount_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRothCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRothCount_fp

X_train shape:  (51, 4860)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 4860)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 177
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 38
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 17
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 



0.34755488396138434


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7623,0.7334,0.8731,-0.1785,-0.0296,0.0596,0.4757,0.5762,0.6897,0.1913,0.455,0.4558
DecisionTreeRegressor,1.0967,0.7905,1.0472,-0.6953,0.2017,0.2662,0.5088,0.58,0.7133,0.135,0.6362,0.4738
RandomForestRegressor,0.6629,0.656,0.8142,-0.0247,0.2613,0.2825,0.315,0.492,0.5613,0.4645,0.6928,0.3471
GradientBoostingRegressor,0.8595,0.7102,0.9271,-0.3287,0.2016,0.2302,0.4566,0.5717,0.6757,0.2239,0.6511,0.4187
AdaBoostRegressor,0.6489,0.6456,0.8056,-0.0032,0.3153,0.3564,0.31,0.4914,0.5568,0.4729,0.7246,0.4077
XGBRegressor,0.8724,0.7256,0.934,-0.3487,0.2453,0.3061,0.3793,0.5184,0.6159,0.3551,0.6905,0.5289
ExtraTreesRegressor,0.847,0.7106,0.9203,-0.3094,0.1986,0.2199,0.4368,0.5635,0.6609,0.2574,0.6596,0.4518
LinearRegression,1.1626,0.8145,1.0782,-0.7972,0.0111,0.0509,1.2889,0.7265,1.1353,-1.1912,0.5051,0.4793
KNeighborsRegressor,0.741,0.6908,0.8608,-0.1454,0.2466,0.3258,0.418,0.5287,0.6465,0.2894,0.5666,0.3204
SVR,0.6763,0.6596,0.8223,-0.0454,0.1671,0.1417,0.3454,0.5253,0.5877,0.4128,0.6625,0.3802


In [40]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.646273967912512, -6.100853894779576, -6.34...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.646273967912512, -5.500581732532101, -5.7...","[-5.484328756914797, -5.377426208017633, -5.73...","[0.19271094120438179, 0.24682934479138555, 0.0..."
1,DecisionTreeRegressor,"[-5.295, -6.97, -4.73, -5.597095796, -6.929917...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -7.82, -6.927245587, -4.61, -5...","[-4.708, -5.9, -7.33, -6.809796469599999, -5.5...","[0.19599999999999973, 0.7, 0.6214177338956463,..."
2,RandomForestRegressor,"[-5.388479170829999, -6.096627910608894, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.13659546535203, -5.944299587750141, -6.70...","[-5.171218436726399, -5.778331449435433, -6.58...","[0.2297414553539115, 0.29074522217371146, 0.41..."
3,GradientBoostingRegressor,"[-5.307078762310983, -6.1433452090569665, -5.4...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.6942009558700875, -6.224347845846239, -7....","[-4.876018214373124, -5.934314785751728, -7.12...","[0.3726419259742494, 0.4817280750383375, 0.815..."
4,AdaBoostRegressor,"[-5.3731234502, -5.758, -5.853725935160001, -5...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.0333457856923065, -5.8608909024999996, -6...","[-5.1580099106256965, -5.79013358053692, -6.53...","[0.20916519510107348, 0.27089216134759325, 0.3..."
5,XGBRegressor,"[-5.2953153, -6.789957, -5.382924, -5.7190976,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.886671, -6.239877, -6.1954083, -6.926992,...","[-4.970583, -5.989621, -6.7077003, -6.8290367,...","[0.41299576, 0.46861646, 0.72191, 0.19605963, ..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -6.708700000000014, -5.3...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.631000000000009, -6.083473426140002, -7.6...","[-4.8463583197740085, -5.909614685228002, -7.0...","[0.4416890259014868, 0.5983225262748127, 0.694..."
7,LinearRegression,"[-5.571795709, -5.331886230831336, -4.02905688...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.396637152058657, -5.976127702592757, -10....","[-4.651433057414922, -5.382584089024481, -9.54...","[0.5141862702677118, 0.6177299295697716, 0.646..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.503333333333333, -5.503333333333333, -5.6...","[-5.472, -5.472, -5.482666666666667, -6.220185...","[0.18465102220134053, 0.18465102220134053, 0.0..."
9,SVR,"[-5.5499949765436405, -5.582080580038937, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.395633442130889, -5.883097707240995, -5.7...","[-5.399348638342914, -5.752008449919346, -5.68...","[0.3090633676309614, 0.29446790017240904, 0.10..."


In [41]:
df_KlekotaRothCount_fp.to_csv('results/Fingerprints/Results_KlekotaRoth_Count_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_KlekotaRoth_Count_fp_MDCK.csv')

In [42]:
#MACCS fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/MACCS_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/MACCS_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_MACCS_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_MACCS_fp

X_train shape:  (51, 166)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 166)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 1
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing 



0.35822785310430516




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7695,0.7186,0.8772,-0.1896,-0.1656,-0.182,0.5657,0.5871,0.7521,0.0383,0.2038,0.2103
DecisionTreeRegressor,1.0227,0.7926,1.0113,-0.5809,0.0742,0.1474,0.3916,0.5347,0.6258,0.3342,0.5928,0.469
RandomForestRegressor,0.8453,0.7543,0.9194,-0.3067,0.0604,0.1128,0.4321,0.5515,0.6573,0.2655,0.5342,0.3255
GradientBoostingRegressor,1.0106,0.7726,1.0053,-0.5623,-0.0217,0.0228,0.4066,0.528,0.6376,0.3088,0.571,0.4745
AdaBoostRegressor,0.81,0.7571,0.9,-0.2521,0.13,0.2009,0.44,0.5868,0.6634,0.2519,0.515,0.2703
XGBRegressor,0.9671,0.7906,0.9834,-0.495,0.0048,0.0533,0.4413,0.5647,0.6643,0.2498,0.5171,0.4359
ExtraTreesRegressor,1.0412,0.8008,1.0204,-0.6095,0.0473,0.1064,0.4131,0.5405,0.6428,0.2977,0.5653,0.4359
LinearRegression,0.8604,0.6987,0.9276,-0.33,0.1589,0.3216,0.5144,0.6177,0.7172,0.1256,0.5591,0.4248
KNeighborsRegressor,0.9168,0.7714,0.9575,-0.4173,0.0711,0.153,0.5381,0.5565,0.7335,0.0852,0.4555,0.2901
SVR,0.8291,0.739,0.9106,-0.2817,-0.0549,-0.0594,0.4931,0.525,0.7022,0.1617,0.4751,0.4028


In [43]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.444607587024519, -5.444607587024519, -5.44...","[0.07660067285868521, 0.07660067285868521, 0.0..."
1,DecisionTreeRegressor,"[-5.295, -6.906666666666666, -6.90666666666666...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.34, -4.6850000000000005, -6.748412...","[-5.922, -5.9399999999999995, -5.2882000000000...","[0.6559999999999998, 0.6659129072183537, 0.738..."
2,RandomForestRegressor,"[-5.3739799999999995, -6.8193583333333345, -6....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.785365000000001, -5.862533904681032, -4.8...","[-5.58041693282488, -5.615398868105131, -5.076...","[0.3727613415270713, 0.38368662633258865, 0.19..."
3,GradientBoostingRegressor,"[-5.252794031482575, -6.88253649380844, -6.882...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.222541999997616, -6.809201562448811, -4.7...","[-5.918385764352299, -6.128441812503406, -5.21...","[0.6356978156817632, 0.7839450381941758, 0.422..."
4,AdaBoostRegressor,"[-5.236666666666667, -6.875, -6.875, -5.757542...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.495, -6.12, -4.822, -6.199603009750001, -...","[-5.472389474650001, -5.737549046233334, -5.25...","[0.3609214205154662, 0.34136457342377896, 0.26..."
5,XGBRegressor,"[-5.294951, -6.9061813, -6.9061813, -5.72279, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.2486796, -6.2814302, -4.687504, -6.748048...","[-5.9768705, -5.9819894, -5.1174865, -6.494063...","[0.5436527, 0.54774797, 0.49094775, 0.50878865..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -6.906666666666649, -6.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.339999999999998, -4.68500000000000...","[-5.9251200000000015, -5.941560000000002, -5.1...","[0.6497599999999963, 0.6589628262656361, 0.598..."
7,LinearRegression,"[-5.321839713533342, -6.906666666666666, -6.90...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.196320572933323, -6.145328338738971, -5.3...","[-6.276144949011792, -6.369568718143008, -7.09...","[0.07998601035554936, 0.296464477933415, 0.887..."
8,KNeighborsRegressor,"[-5.286666666666666, -6.906666666666666, -6.90...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.503333333333333, -5.503333333333333, -4.7...","[-5.472, -5.472, -4.7, -6.155622903666667, -5....","[0.18465102220134053, 0.18465102220134053, 0.0..."
9,SVR,"[-5.339483766963183, -5.897711454315428, -5.89...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.09056301496793, -6.056839776128437, -4.98...","[-5.8431152276716345, -5.825878467151289, -4.9...","[0.40488388841953127, 0.37827187339151847, 0.0..."


In [44]:
df_MACCS_fp.to_csv('results/Fingerprints/Results_MACCS_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_MACCS_fp_MDCK.csv')

In [45]:
#PubChem fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/PubChem_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/PubChem_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_PubChem_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_PubChem_fp

X_train shape:  (51, 881)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 881)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 6
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the tr

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.696,0.6868,0.8343,-0.0759,-0.1047,0.0305,0.5304,0.5976,0.7283,0.0984,0.4284,0.3717
DecisionTreeRegressor,0.8744,0.7535,0.9351,-0.3516,0.1181,0.1702,0.397,0.5575,0.6301,0.325,0.5862,0.3873
RandomForestRegressor,0.7489,0.7101,0.8654,-0.1577,0.1443,0.1766,0.3776,0.5262,0.6145,0.3581,0.6056,0.3679
GradientBoostingRegressor,0.869,0.7483,0.9322,-0.3433,0.1101,0.1537,0.3844,0.5437,0.62,0.3465,0.5965,0.4011
AdaBoostRegressor,0.8041,0.7429,0.8967,-0.243,0.1033,0.1097,0.349,0.5033,0.5908,0.4066,0.6521,0.3679
XGBRegressor,0.8323,0.7166,0.9123,-0.2866,0.1487,0.2058,0.3932,0.5547,0.627,0.3316,0.5904,0.3873
ExtraTreesRegressor,0.865,0.7461,0.93,-0.3371,0.1263,0.1764,0.397,0.5575,0.6301,0.3251,0.5863,0.3873
LinearRegression,0.8498,0.7289,0.9218,-0.3137,0.1314,0.1413,0.3772,0.5189,0.6142,0.3587,0.604,0.3845
KNeighborsRegressor,0.7581,0.6792,0.8707,-0.1719,0.1431,0.2075,0.547,0.5862,0.7396,0.0701,0.4153,0.4066
SVR,0.718,0.6721,0.8474,-0.11,0.1221,0.0932,0.455,0.5218,0.6746,0.2264,0.5417,0.4398


In [46]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.558160996952212, -5.948035428178268, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.558160996952212, -5.558160996952212, -5.5...","[-5.567076712219281, -5.567076712219281, -5.56...","[0.1272708901920145, 0.1272708901920145, 0.127..."
1,DecisionTreeRegressor,"[-5.295, -6.2219999999999995, -6.2219999999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -5.295, -6.748412039, -5.73914...","[-5.948, -5.948, -5.470000000000001, -6.577568...","[0.604, 0.604, 0.1476143624448516, 0.341687398..."
2,RandomForestRegressor,"[-5.380913333333332, -6.193135364357865, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.880185143395239, -5.880185143395239, -5.3...","[-5.7037480742905755, -5.7037480742905755, -5....","[0.34439656858910195, 0.34439656858910195, 0.1..."
3,GradientBoostingRegressor,"[-5.33086499502761, -6.232040055044521, -6.232...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.205779559079003, -6.205779559079003, -5.3...","[-5.905872592757811, -5.905872592757811, -5.48...","[0.6112492768903168, 0.6112492768903168, 0.136..."
4,AdaBoostRegressor,"[-5.386670581, -6.359999999999999, -6.35999999...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.0956540701428565, -6.0956540701428565, -5...","[-5.701035575933333, -5.701035575933333, -5.53...","[0.5255181313142169, 0.5255181313142169, 0.191..."
5,XGBRegressor,"[-5.295287, -6.2218337, -6.2218337, -5.7391624...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.2474337, -6.2474337, -5.295287, -6.748368...","[-5.9309845, -5.9309845, -5.4700813, -6.545954...","[0.6340118, 0.6340118, 0.14750703, 0.4040401, ..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -6.221999999999991, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -5.2950000000000035, -6.748412...","[-5.947300000000001, -5.947300000000001, -5.47...","[0.6053999999999969, 0.6053999999999969, 0.147..."
7,LinearRegression,"[-5.6011764705882365, -6.221999999999999, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.63764705882353, -5.63764705882353, -5.601...","[-5.619219251336899, -5.619219251336899, -5.63...","[0.4910505110106937, 0.4910505110106937, 0.180..."
8,KNeighborsRegressor,"[-5.286666666666666, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.2...","[-5.666, -5.666, -5.430000000000001, -6.232904...","[0.14081350945290877, 0.14081350945290877, 0.1..."
9,SVR,"[-5.449844411179285, -5.830286437610772, -5.83...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.826164230663535, -5.826164230663535, -5.4...","[-5.684033741104497, -5.684033741104497, -5.54...","[0.2313210106073283, 0.2313210106073283, 0.136..."


In [47]:
df_PubChem_fp.to_csv('results/Fingerprints/Results_PubChem_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_PubChem_fp_MDCK.csv')

In [48]:
#Substructure fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Substructure_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Substructure_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_Substructure_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_Substructure_fp

X_train shape:  (51, 307)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 307)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 0
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.694363
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.549798
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of

  pearson_test, _ = pearsonr(y_test, predictions_test_mean)
  spearman_test, _ = spearmanr(y_test, predictions_test_mean)


0.16785552978927254
0.16683264642981266
0.15958016186535295
0.1703795471391767
0.1703896633692521
0.16194087316581762
-0.06517317244734078
-0.005025847259860994




0.10660882174864461




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6869,0.6819,0.8288,-0.0618,-0.3293,-0.3067,0.5884,0.6227,0.767,-0.0002,,
DecisionTreeRegressor,0.7918,0.6948,0.8898,-0.224,0.1116,0.1723,0.488,0.5595,0.6986,0.1704,0.4378,0.3563
RandomForestRegressor,0.7534,0.6977,0.868,-0.1646,0.0817,0.1408,0.4895,0.5637,0.6996,0.1679,0.4366,0.3037
GradientBoostingRegressor,0.7986,0.7077,0.8936,-0.2345,0.0834,0.1486,0.4901,0.5663,0.7001,0.1668,0.4338,0.3037
AdaBoostRegressor,0.8045,0.7372,0.8969,-0.2436,0.0083,0.0701,0.4944,0.5834,0.7031,0.1596,0.4085,0.3037
XGBRegressor,0.8102,0.7129,0.9001,-0.2525,0.088,0.131,0.488,0.5596,0.6986,0.1704,0.4378,0.3563
ExtraTreesRegressor,0.7941,0.7006,0.8911,-0.2276,0.1049,0.1588,0.488,0.5595,0.6986,0.1704,0.4378,0.3563
LinearRegression,0.7479,0.6872,0.8648,-0.1562,0.1189,0.1574,0.493,0.5694,0.7021,0.1619,0.4314,0.3037
KNeighborsRegressor,0.9029,0.7813,0.9502,-0.3958,-0.0022,0.089,0.6266,0.6128,0.7916,-0.0652,0.2363,0.3092
SVR,0.7302,0.6732,0.8545,-0.1289,0.1182,0.1394,0.5912,0.5839,0.7689,-0.005,0.3727,0.3271


In [49]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.753098213672638, -5.753098213672638, -5.75...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.753098213672638, -5.753098213672638, -5.7...","[-5.67155390815037, -5.67155390815037, -5.6715...","[0.06667572909639166, 0.06667572909639166, 0.0..."
1,DecisionTreeRegressor,"[-5.248000000000001, -6.2219999999999995, -6.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.248000000000001, -5.248000000000001, -5.2...","[-5.227047474747475, -5.227047474747475, -5.22...","[0.06409102527231963, 0.06409102527231963, 0.0..."
2,RandomForestRegressor,"[-5.262657238594738, -6.193135364357863, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.262657238594736, -5.262657238594736, -5.2...","[-5.228165137862137, -5.228165137862137, -5.22...","[0.06795565984511047, 0.06795565984511047, 0.0..."
3,GradientBoostingRegressor,"[-5.239946364820966, -6.205628348380743, -6.20...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.239946364820966, -5.239946364820966, -5.2...","[-5.222593143880853, -5.222593143880853, -5.22...","[0.0642889610917146, 0.0642889610917146, 0.064..."
4,AdaBoostRegressor,"[-5.141818181818182, -6.27, -6.27, -5.85171165...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.141818181818182, -5.141818181818182, -5.1...","[-5.227523410799726, -5.227523410799726, -5.22...","[0.11937436832431304, 0.11937436832431304, 0.1..."
5,XGBRegressor,"[-5.2481537, -6.221691, -6.221691, -5.8582783,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.2481537, -5.2481537, -5.2481537, -5.85827...","[-5.2272468, -5.2272468, -5.2272468, -5.807703...","[0.064050004, 0.064050004, 0.064050004, 0.0729..."
6,ExtraTreesRegressor,"[-5.247999999999995, -6.221999999999991, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.247999999999995, -5.247999999999995, -5.2...","[-5.227047474747475, -5.227047474747475, -5.22...","[0.06409102527231592, 0.06409102527231592, 0.0..."
7,LinearRegression,"[-5.200000000000001, -6.222, -6.222, -5.858261...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.200000000000001, -5.200000000000001, -5.2...","[-5.183047712235579, -5.183047712235579, -5.18...","[0.09545678219236824, 0.09545678219236824, 0.0..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.6...","[-5.6259999999999994, -5.6259999999999994, -5....","[0.16096100286853474, 0.16096100286853474, 0.1..."
9,SVR,"[-5.059962382312374, -5.829816096728328, -5.82...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.059962382312374, -5.059962382312374, -5.0...","[-5.100126817337005, -5.100126817337005, -5.10...","[0.057708193318498226, 0.057708193318498226, 0..."


In [50]:
df_Substructure_fp.to_csv('results/Fingerprints/Results_Substructure_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Substructure_fp_MDCK.csv')

In [51]:
#Substructure Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/SubstructureCount_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/SubstructureCount_test_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_SubstructureCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_SubstructureCount_fp

X_train shape:  (51, 307)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 307)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 3
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testin



0.324767405731352




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6813,0.6884,0.8254,-0.0532,0.0743,0.1752,0.4716,0.5574,0.6867,0.1983,0.5192,0.4101
DecisionTreeRegressor,0.9228,0.7283,0.9606,-0.4265,0.1862,0.2826,0.4878,0.5835,0.6984,0.1707,0.6104,0.3829
RandomForestRegressor,0.6837,0.662,0.8268,-0.0569,0.2588,0.2755,0.3344,0.4977,0.5783,0.4314,0.6754,0.3802
GradientBoostingRegressor,0.8236,0.7077,0.9075,-0.2731,0.198,0.2534,0.4014,0.5314,0.6335,0.3177,0.6659,0.5289
AdaBoostRegressor,0.7001,0.6874,0.8367,-0.0823,0.2339,0.1953,0.2652,0.4465,0.515,0.5491,0.7574,0.4573
XGBRegressor,0.9168,0.7407,0.9575,-0.4172,0.2182,0.3135,0.4759,0.5724,0.6898,0.191,0.6374,0.4187
ExtraTreesRegressor,0.8139,0.6934,0.9022,-0.2582,0.2424,0.2959,0.409,0.5417,0.6395,0.3047,0.6706,0.4408
LinearRegression,0.7815,0.6885,0.884,-0.208,0.2964,0.2859,0.7821,0.5948,0.8843,-0.3295,0.5789,0.573
KNeighborsRegressor,0.6696,0.634,0.8183,-0.0351,0.2981,0.3094,0.3977,0.5026,0.6306,0.3239,0.5834,0.3624
SVR,0.736,0.6888,0.8579,-0.1377,0.1127,0.0952,0.4415,0.5193,0.6644,0.2495,0.5425,0.4077


In [52]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.965745329504426, -5.965745329504426, -5.96...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.965745329504426, -5.540451101521032, -5.9...","[-5.719544843823677, -5.4137902827731335, -5.6...","[0.15359601056705807, 0.15486389547404905, 0.1..."
1,DecisionTreeRegressor,"[-5.295, -5.195, -6.97, -5.597095796, -6.92991...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.73, -6.25, -7.82, -6.927245587, -5.453495...","[-4.779999999999999, -6.244, -6.66800000000000...","[0.22645087767549038, 0.012000000000000099, 1...."
2,RandomForestRegressor,"[-5.408102455869999, -5.332574170829997, -6.52...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.092809278564007, -5.989568416857331, -6.7...","[-5.185977128007052, -5.983516601056534, -6.50...","[0.21500109989896082, 0.21131925066285692, 0.3..."
3,GradientBoostingRegressor,"[-5.326105804774353, -5.247943638226798, -6.89...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.935964109469702, -6.233585524594672, -6.9...","[-4.944951618875127, -6.098908743899988, -6.78...","[0.17561319844768833, 0.2869821943330493, 0.69..."
4,AdaBoostRegressor,"[-5.629859874000001, -5.88846153846154, -5.949...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.2292905595, -5.891419724941175, -5.949034...","[-5.314187798773334, -5.709613707686257, -6.03...","[0.10801011433693447, 0.26510914394588814, 0.2..."
5,XGBRegressor,"[-5.294992, -5.195351, -6.9684587, -5.719072, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.022625, -6.2498746, -7.047421, -6.9253483...","[-5.0143595, -6.145154, -6.9321012, -6.672248,...","[0.24009685, 0.20975846, 0.72851145, 0.5074491..."
6,ExtraTreesRegressor,"[-5.2950000000000035, -5.194999999999996, -6.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.9377000000000075, -6.25, -7.5051666666666...","[-5.090964350974342, -6.112917976365003, -6.81...","[0.2434378209427003, 0.27416404726999555, 0.76..."
7,LinearRegression,"[-5.31004418799253, -5.347058823529404, -6.076...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.042728462752941, -5.439568559636504, -8.2...","[-4.907157673926278, -5.2872154537543485, -8.3...","[0.4849634998391451, 0.49119349830531944, 0.30..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.78...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -5.6...","[-5.666, -5.666, -5.6259999999999994, -6.14811...","[0.14081350945290877, 0.14081350945290877, 0.1..."
9,SVR,"[-5.549524703515771, -5.8088465923891786, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.416662195074671, -5.766882434821846, -5.5...","[-5.514204265643601, -5.745455817431809, -5.65...","[0.1161309894522857, 0.14436930471275064, 0.09..."


In [53]:
df_SubstructureCount_fp.to_csv('results/Fingerprints/Results_Substructure_Count_fp_MDCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Substructure_Count_fp_MDCK.csv')

In [91]:
#Descriptors models
#2d RDKit descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2drdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2drdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 217)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 217)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 335
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 30
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 34
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of te



-1.549197709861739




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6144,0.633,0.7839,0.0502,0.26,0.321,0.3853,0.5321,0.6207,0.3451,0.6541,0.5096
DecisionTreeRegressor,1.2977,0.8937,1.1392,-1.0061,0.1172,0.2267,0.3965,0.5358,0.6297,0.326,0.6943,0.3274
RandomForestRegressor,0.7069,0.6854,0.8408,-0.0927,0.2157,0.2519,0.3114,0.4802,0.558,0.4706,0.713,0.3604
GradientBoostingRegressor,0.9661,0.7816,0.9829,-0.4934,0.129,0.1707,0.3572,0.4835,0.5977,0.3927,0.7209,0.4539
AdaBoostRegressor,0.7967,0.7064,0.8926,-0.2315,0.2161,0.2615,0.2981,0.4464,0.546,0.4933,0.7324,0.3879
XGBRegressor,1.0072,0.8138,1.0036,-0.5569,0.0438,0.0625,0.4038,0.54,0.6354,0.3136,0.7114,0.3879
ExtraTreesRegressor,0.8096,0.7086,0.8998,-0.2516,0.1956,0.2083,0.3799,0.5331,0.6164,0.3541,0.7062,0.3439
LinearRegression,3.1917,1.2767,1.7865,-3.9339,0.1092,0.0432,1.3923,0.8926,1.18,-1.367,0.3615,0.1953
KNeighborsRegressor,0.6755,0.6557,0.8219,-0.0442,0.283,0.3192,0.3357,0.4399,0.5794,0.4293,0.6572,0.4711
SVR,0.6259,0.6268,0.7911,0.0325,0.2918,0.2408,0.3195,0.4926,0.5652,0.4568,0.6937,0.3989


In [92]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.536143010992385, -6.156816999215223, -6.29...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.664411161024185, -5.792529113538625, -5.7...","[-5.666812029931583, -5.5713305480511535, -5.8...","[0.21835822613593237, 0.20002140796917076, 0.0..."
1,DecisionTreeRegressor,"[-5.65, -5.06, -6.97, -6.929917083, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.65, -4.94, -6.25, -6.929917083, -5.453495...","[-5.5760000000000005, -5.782, -6.3119999999999...","[0.3269617714657173, 0.482883008605604, 0.5778..."
2,RandomForestRegressor,"[-5.277899170829996, -5.309295784909999, -6.34...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.60912975092, -5.545679234792003, -6.35578...","[-5.568364454949996, -5.5855388292121635, -6.3...","[0.10992182033011405, 0.18689638979762557, 0.1..."
3,GradientBoostingRegressor,"[-5.10865391642442, -5.187262267679404, -6.903...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.895069707573125, -5.7681843646683815, -6....","[-5.693704748868479, -5.596495097337898, -6.70...","[0.2772084735419424, 0.2891247922495179, 0.423..."
4,AdaBoostRegressor,"[-5.242156934117649, -5.317665433749999, -6.82...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.30537681240909, -5.491019461714286, -5.65...","[-5.41521975647831, -5.479923217749817, -6.082...","[0.20463762584889672, 0.16790784110240162, 0.3..."
5,XGBRegressor,"[-4.922271, -5.8616314, -7.1029534, -5.7418523...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.7401524, -5.666066, -6.3228106, -6.709675...","[-5.6514807, -5.726673, -7.0044594, -6.63848, ...","[0.30661, 0.43052056, 0.7034079, 0.45411754, 0..."
6,ExtraTreesRegressor,"[-5.119283850859998, -5.1053, -6.7394941265700...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.15835, -5.91154917083, -7.550736713070008...","[-5.304833137838003, -5.851357010354502, -7.05...","[0.2612945064810428, 0.43699463695086216, 0.51..."
7,LinearRegression,"[-4.0, -7.956188319024213, -6.4103483481406025...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -9.254455419528936, -4.0, -6.840685888...","[-6.617857950485297, -7.527326511204049, -4.0,...","[2.7900054093189843, 1.9556044090199518, 0.0, ..."
8,KNeighborsRegressor,"[-5.3133333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.1...","[-5.6259999999999994, -5.4126666666666665, -6....","[0.16096100286853474, 0.2143766156401708, 0.24..."
9,SVR,"[-5.554470524037222, -5.224643213908656, -5.62...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.738491452700495, -5.999657212622324, -5.8...","[-5.658532807919006, -5.823951875610088, -5.74...","[0.07794986786521718, 0.2884268542234521, 0.08..."


In [93]:
result_df.to_csv('results/Descriptors/Results_2d_RDKit_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_RDKit_desc_MDCK.csv')

In [94]:
#2d Mordred descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df , prediction_df= train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 1436)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1436)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4840
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 360
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9013
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 666
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5689,0.6356,0.7543,0.1205,0.3732,0.4268,0.352,0.5043,0.5933,0.4017,0.6478,0.4374
DecisionTreeRegressor,1.0439,0.7474,1.0217,-0.6136,0.0828,0.1914,0.3952,0.5624,0.6286,0.3282,0.6884,0.3494
RandomForestRegressor,0.6209,0.6574,0.788,0.0401,0.2795,0.2783,0.3002,0.4847,0.5479,0.4896,0.7172,0.3329
GradientBoostingRegressor,0.6643,0.671,0.815,-0.0269,0.2614,0.2445,0.2856,0.4606,0.5345,0.5144,0.7596,0.3329
AdaBoostRegressor,0.6278,0.6222,0.7923,0.0296,0.327,0.3204,0.338,0.4779,0.5814,0.4254,0.7007,0.2889
XGBRegressor,0.7272,0.6806,0.8528,-0.1242,0.2845,0.2473,0.2722,0.4639,0.5217,0.5373,0.7764,0.3989
ExtraTreesRegressor,0.6327,0.656,0.7954,0.0219,0.342,0.3455,0.3436,0.5122,0.5862,0.4158,0.7151,0.3494
LinearRegression,1.4614,0.8733,1.2089,-1.2591,0.1213,0.2206,0.5443,0.5546,0.7377,0.0747,0.7306,0.5089
KNeighborsRegressor,0.6446,0.6277,0.8029,0.0036,0.3111,0.3154,0.3445,0.4684,0.5869,0.4143,0.6455,0.3834
SVR,0.6499,0.6327,0.8062,-0.0047,0.2437,0.2427,0.3151,0.4771,0.5613,0.4644,0.7162,0.4154


In [95]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.553189223277948, -5.762083386127124, -6.01...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.70104546213166, -5.56595266570999, -5.793...","[-5.529763782370577, -5.626879803377877, -6.14...","[0.20598848750017978, 0.17165051129096703, 0.2..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -4.73, -6.929917083, -6.9272455...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.73, -6.25, -5.93, -6.927245587, -6.927245...","[-5.3100000000000005, -6.164, -6.4866938849999...","[0.38590154184714004, 0.17199999999999988, 0.7..."
2,RandomForestRegressor,"[-5.651568240779997, -5.495541399969999, -5.73...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.446327593922331, -5.838984782265002, -5.9...","[-5.4983864986423, -5.789393364633268, -6.0415...","[0.14620118722672648, 0.17613586949685736, 0.0..."
3,GradientBoostingRegressor,"[-5.5954307166689485, -5.228433616335078, -5.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.006210706231372, -6.198551846962498, -6.0...","[-5.55715750776775, -5.989392374647627, -6.260...","[0.42810212182847185, 0.3053731089460274, 0.33..."
4,AdaBoostRegressor,"[-5.474897937454546, -5.36833919, -5.93, -5.98...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.2565051159583325, -6.048814726833334, -5....","[-5.267883186225952, -5.828556496673485, -6.18...","[0.2219154352238451, 0.21389420781883928, 0.33..."
5,XGBRegressor,"[-5.69612, -5.5184503, -4.8876624, -6.4128222,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.16211, -6.119285, -6.4525523, -6.9264083,...","[-5.6374235, -5.8563933, -6.106925, -6.6213274...","[0.31398037, 0.30552727, 0.3347825, 0.52454853..."
6,ExtraTreesRegressor,"[-5.690629635769994, -5.766462100400006, -6.12...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3981691634050035, -6.061690177500001, -6....","[-5.26019510979817, -5.877045608341501, -6.622...","[0.17984962935159388, 0.2655025927151172, 0.18..."
7,LinearRegression,"[-6.590483225494487, -5.513462344328166, -5.11...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-8.175032551649029, -7.201330799747298, -7.1...","[-6.270740945214986, -6.362757574720621, -7.91...","[1.065123527345538, 0.9775428087729081, 1.4649..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -5.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.713333333333333, -6.22...","[0.1707890186425604, 0.1707890186425604, 0.309..."
9,SVR,"[-5.578029705740392, -5.349600407183299, -5.65...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.501403493266325, -5.8910307255329135, -5....","[-5.536962996407043, -5.843872486173774, -5.80...","[0.15365119163203037, 0.21856428173859294, 0.1..."


In [96]:
result_df.to_csv('results/Descriptors/Results_2d_Mordred_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_Mordred_desc_MDCK.csv')

In [97]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [98]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [99]:
#2d RDKit descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 134)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 134)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 335
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 30
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 34
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of te



0.4985200968704806




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6144,0.633,0.7839,0.0502,0.26,0.321,0.3853,0.5321,0.6207,0.3451,0.6541,0.5096
DecisionTreeRegressor,1.06,0.8123,1.0296,-0.6386,0.1074,0.1519,0.3866,0.4691,0.6218,0.3427,0.6941,0.4484
RandomForestRegressor,0.6982,0.6791,0.8356,-0.0792,0.2282,0.2721,0.3064,0.4785,0.5536,0.4791,0.7159,0.3879
GradientBoostingRegressor,0.9427,0.7781,0.9709,-0.4573,0.1334,0.157,0.3377,0.4791,0.5811,0.4259,0.7391,0.4814
AdaBoostRegressor,0.776,0.7089,0.8809,-0.1996,0.2202,0.2704,0.3085,0.4474,0.5555,0.4755,0.7321,0.3329
XGBRegressor,1.0072,0.8138,1.0036,-0.5569,0.0438,0.0625,0.4038,0.54,0.6354,0.3136,0.7114,0.3879
ExtraTreesRegressor,0.7916,0.7043,0.8897,-0.2237,0.2105,0.2173,0.3716,0.5282,0.6096,0.3682,0.7173,0.3439
LinearRegression,3.1917,1.2767,1.7865,-3.9339,0.1092,0.0432,1.3923,0.8926,1.18,-1.367,0.3615,0.1953
KNeighborsRegressor,0.6755,0.6557,0.8219,-0.0442,0.283,0.3192,0.3357,0.4399,0.5794,0.4293,0.6572,0.4711
SVR,0.6259,0.6268,0.7911,0.0325,0.2918,0.2408,0.3195,0.4925,0.5652,0.4569,0.6937,0.3989


In [100]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.536143010992385, -6.156816999215223, -6.29...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.664411161024185, -5.792529113538625, -5.7...","[-5.666812029931583, -5.5713305480511535, -5.8...","[0.21835822613593237, 0.20002140796917076, 0.0..."
1,DecisionTreeRegressor,"[-5.65, -5.06, -6.97, -6.929917083, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.65, -4.94, -6.25, -6.927245587, -5.453495...","[-5.676, -5.74, -6.297449117399999, -6.7384734...","[0.39877813380374816, 0.6250119998848022, 0.41..."
2,RandomForestRegressor,"[-5.284349170829996, -5.3062113300750005, -6.3...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.604231192076643, -5.513535647592001, -6.3...","[-5.554722246491525, -5.5749159733453855, -6.3...","[0.13784766283058675, 0.17690268977790705, 0.1..."
3,GradientBoostingRegressor,"[-5.056555488020928, -5.199656544258757, -6.77...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.0276951123819735, -5.796909733677128, -6....","[-5.780324070898283, -5.6199896657596575, -6.6...","[0.25059911635927906, 0.2853438141874957, 0.38..."
4,AdaBoostRegressor,"[-5.28, -5.291249999999999, -6.419104859818183...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.646157390947368, -5.646157390947368, -6.3...","[-5.516144032722807, -5.56248130589114, -6.218...","[0.2613749018471754, 0.24580495804870453, 0.40..."
5,XGBRegressor,"[-4.922271, -5.8616314, -7.1029534, -5.7418523...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.7401524, -5.666066, -6.3228106, -6.709675...","[-5.6514807, -5.726673, -7.0044594, -6.63848, ...","[0.30661, 0.43052056, 0.7034079, 0.45411754, 0..."
6,ExtraTreesRegressor,"[-5.147999999999996, -5.079300000000002, -6.71...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.134937354730003, -6.08245, -7.34019074065...","[-5.337058525019502, -5.878268496912002, -7.10...","[0.2292245484979095, 0.44007863893686966, 0.44..."
7,LinearRegression,"[-4.0, -7.956188319024531, -6.410348348140615,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.0, -9.254455419529105, -4.0, -6.840685888...","[-6.617857950485292, -7.527326511204102, -4.0,...","[2.790005409318987, 1.955604409019984, 0.0, 0...."
8,KNeighborsRegressor,"[-5.3133333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.1...","[-5.6259999999999994, -5.4126666666666665, -6....","[0.16096100286853474, 0.2143766156401708, 0.24..."
9,SVR,"[-5.554476793179216, -5.224608127621062, -5.62...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.738480683323801, -5.999654458818311, -5.8...","[-5.658524315287802, -5.824015903459413, -5.74...","[0.07795456608354132, 0.28840360230986073, 0.0..."


In [101]:
result_df.to_csv('results/Descriptors/Results_2d_rdkit_const_rem_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_rdkit_const_rem_MDCK.csv')

In [102]:
#2d Mordred descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 1148)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1148)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4840
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 360
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9013
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 666
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5689,0.6356,0.7543,0.1205,0.3732,0.4268,0.352,0.5043,0.5933,0.4017,0.6478,0.4374
DecisionTreeRegressor,0.7908,0.7004,0.8893,-0.2225,0.2631,0.3437,0.523,0.5867,0.7232,0.1108,0.6004,0.3054
RandomForestRegressor,0.6339,0.6631,0.7962,0.02,0.251,0.2331,0.3067,0.4872,0.5538,0.4787,0.7127,0.3329
GradientBoostingRegressor,0.6524,0.667,0.8077,-0.0085,0.2847,0.272,0.2991,0.4615,0.5469,0.4915,0.7494,0.3439
AdaBoostRegressor,0.5955,0.6194,0.7717,0.0794,0.3695,0.3472,0.3019,0.4693,0.5495,0.4867,0.7435,0.3604
XGBRegressor,0.7272,0.6806,0.8528,-0.1242,0.2845,0.2473,0.2722,0.4639,0.5217,0.5373,0.7764,0.3989
ExtraTreesRegressor,0.6128,0.6407,0.7828,0.0528,0.3618,0.3453,0.333,0.5048,0.5771,0.4338,0.7253,0.3494
LinearRegression,1.4614,0.8733,1.2089,-1.2591,0.1213,0.2206,0.5443,0.5546,0.7377,0.0747,0.7306,0.5089
KNeighborsRegressor,0.6446,0.6277,0.8029,0.0036,0.3111,0.3154,0.3445,0.4684,0.5869,0.4143,0.6455,0.3834
SVR,0.6499,0.6327,0.8062,-0.0046,0.2437,0.2427,0.3151,0.4771,0.5613,0.4644,0.7162,0.4154


In [103]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.553189223277948, -5.762083386127124, -6.01...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.70104546213166, -5.56595266570999, -5.793...","[-5.529763782370577, -5.626879803377877, -6.14...","[0.20598848750017978, 0.17165051129096703, 0.2..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -5.1, -6.929917083, -6.92724558...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.61, -6.25, -6.85, -6.927245587, -6.927245...","[-5.438, -5.95, -6.4279156982000005, -6.532139...","[0.5309011207371858, 0.37947331922020555, 0.47..."
2,RandomForestRegressor,"[-5.609557636606427, -5.554604654275, -5.88675...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.51076090808, -5.862385785791669, -5.95240...","[-5.531368222817464, -5.7900167333438, -6.0475...","[0.1209440209332762, 0.16754963900400116, 0.09..."
3,GradientBoostingRegressor,"[-5.595618382103325, -5.239298104839103, -5.18...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.077696701712457, -6.154795266240203, -6.2...","[-5.6091619249240985, -5.939200936747936, -6.2...","[0.39686422781752306, 0.2622414307964284, 0.27..."
4,AdaBoostRegressor,"[-5.65, -5.291970879444444, -5.989595051454547...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.265958313761904, -5.619616091222222, -6.1...","[-5.454023363486295, -5.834459007663157, -6.31...","[0.19520960298612394, 0.24739244605639968, 0.1..."
5,XGBRegressor,"[-5.69612, -5.5184503, -4.8876624, -6.4128222,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.16211, -6.119285, -6.4525523, -6.9264083,...","[-5.6374235, -5.8563933, -6.106925, -6.6213274...","[0.31398037, 0.30552727, 0.3347825, 0.52454853..."
6,ExtraTreesRegressor,"[-5.673698365874995, -5.89139774658, -5.945885...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.429057026422499, -5.97715676172, -6.54777...","[-5.342987946713, -5.834944602552833, -6.61986...","[0.15193389815605052, 0.23651510206489035, 0.2..."
7,LinearRegression,"[-6.590483225494502, -5.513462344328177, -5.11...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-8.175032551649032, -7.201330799747323, -7.1...","[-6.270740945214962, -6.362757574720617, -7.91...","[1.065123527345553, 0.9775428087729097, 1.4649..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -5.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.713333333333333, -6.22...","[0.1707890186425604, 0.1707890186425604, 0.309..."
9,SVR,"[-5.578031046668349, -5.349593503063404, -5.65...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.5014291898712475, -5.891068502793287, -5....","[-5.53696304869614, -5.843868785348652, -5.802...","[0.15365230921208542, 0.21856237722668823, 0.1..."


In [104]:
result_df.to_csv('results/Descriptors/Results_2d_Mordred_const_rem_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_df_2d_Mordred_const_rem_MDCK.csv')

In [105]:
#2d RDKit descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_LVR_rdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_LVR_rdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 123)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 123)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 25
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 30
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [



-0.47806629881682805




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6795,0.6706,0.8243,-0.0504,0.134,0.2187,0.4107,0.5511,0.6409,0.3018,0.6337,0.3774
DecisionTreeRegressor,1.335,0.9054,1.1554,-1.0636,0.0841,0.1853,0.4186,0.5316,0.647,0.2884,0.6834,0.4099
RandomForestRegressor,0.6996,0.6832,0.8364,-0.0815,0.246,0.3046,0.3208,0.4891,0.5664,0.4547,0.702,0.3714
GradientBoostingRegressor,0.9325,0.7675,0.9657,-0.4415,0.1575,0.2121,0.359,0.4923,0.5992,0.3897,0.7172,0.3879
AdaBoostRegressor,0.7416,0.6837,0.8611,-0.1464,0.2566,0.322,0.3088,0.4459,0.5557,0.475,0.7244,0.3879
XGBRegressor,0.9844,0.7822,0.9922,-0.5218,0.0879,0.1402,0.401,0.5333,0.6333,0.3183,0.7091,0.3769
ExtraTreesRegressor,0.8288,0.7215,0.9104,-0.2812,0.1972,0.2321,0.3791,0.5344,0.6157,0.3555,0.7142,0.3439
LinearRegression,3.7071,1.368,1.9254,-4.7307,0.0383,0.054,0.8383,0.6388,0.9156,-0.4251,0.5393,0.542
KNeighborsRegressor,0.6909,0.6528,0.8312,-0.0681,0.2637,0.2795,0.3468,0.4659,0.5889,0.4104,0.6422,0.405
SVR,0.6231,0.6253,0.7894,0.0368,0.3002,0.2536,0.3195,0.4905,0.5653,0.4568,0.6935,0.4154


In [106]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.536143010992385, -6.156816999215223, -6.29...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.664411161024185, -5.792529113538625, -5.7...","[-5.6330422980809995, -5.632885645845008, -5.7...","[0.23685234576893643, 0.24203027799568494, 0.1..."
1,DecisionTreeRegressor,"[-4.94, -5.06, -5.93, -5.793469425, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.65, -5.65, -6.929917083, -5.793469425, -5...","[-5.562, -6.01, -6.6619668332, -6.5117182544, ...","[0.5296942514318991, 0.2939387691339812, 0.796..."
2,RandomForestRegressor,"[-5.261718559329996, -5.3724716266999994, -6.3...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.55181461126, -5.624769316419999, -6.32123...","[-5.529950919176185, -5.655219734989752, -6.32...","[0.11667761704561924, 0.21489849640850642, 0.1..."
3,GradientBoostingRegressor,"[-5.472807967724208, -5.181052058170596, -6.68...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.654375736074772, -5.612085049871201, -6.3...","[-5.665746381937298, -5.731151545358999, -6.55...","[0.21608019706210085, 0.33790986104865084, 0.3..."
4,AdaBoostRegressor,"[-5.28, -5.353444142529413, -6.7299834166, -6....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.258181818181819, -5.385, -6.1816799935714...","[-5.432285470340851, -5.490211166652814, -6.14...","[0.23123134159728428, 0.1636072816419407, 0.27..."
5,XGBRegressor,"[-4.92453, -5.9122243, -7.021858, -5.67806, -6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.732529, -5.738987, -6.4222336, -7.023955,...","[-5.6482544, -5.7494774, -7.0320344, -6.598863...","[0.40856382, 0.4536152, 0.6499573, 0.4812082, ..."
6,ExtraTreesRegressor,"[-5.069799999999999, -5.1142, -6.7781716267000...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.101350000000003, -5.95849578491, -7.51873...","[-5.313630160678004, -5.813896329955002, -7.17...","[0.3236274534874776, 0.48308935299843536, 0.57..."
7,LinearRegression,"[-6.185007936444608, -5.639479384692574, -6.58...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-10.0, -6.7463899214047185, -4.1272204282254...","[-6.4, -7.385993734634875, -6.485840113737058,...","[2.939387691339814, 2.323183415515883, 2.23661..."
8,KNeighborsRegressor,"[-5.3133333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.1...","[-5.6259999999999994, -5.6546666666666665, -6....","[0.16096100286853474, 0.14230796026770784, 0.2..."
9,SVR,"[-5.554014438214139, -5.205640452563704, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.726217474440237, -6.001449862936582, -5.8...","[-5.652744031977923, -5.836963844727428, -5.74...","[0.08030176667538513, 0.2861844633849581, 0.09..."


In [107]:
result_df.to_csv('results/Descriptors/Results_2d_rdkit_LVR_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_rdkit_LVR_MDCK.csv')

In [108]:
#2d Mordred descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
results_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
results_df

X_train shape:  (51, 769)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 769)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3168
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 236
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5888
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 435
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o



0.6723578691170898


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6351,0.6625,0.797,0.0182,0.2765,0.3301,0.355,0.5262,0.5958,0.3965,0.6574,0.5089
DecisionTreeRegressor,0.8087,0.7099,0.8993,-0.2501,0.3173,0.3547,0.3341,0.5172,0.5781,0.4319,0.7314,0.3659
RandomForestRegressor,0.6372,0.6654,0.7983,0.0149,0.2674,0.3016,0.311,0.4893,0.5576,0.4714,0.7155,0.3769
GradientBoostingRegressor,0.7982,0.7028,0.8934,-0.2339,0.2349,0.3294,0.386,0.5372,0.6213,0.3437,0.6976,0.3549
AdaBoostRegressor,0.6737,0.6573,0.8208,-0.0414,0.3061,0.2805,0.3234,0.4692,0.5687,0.4503,0.7205,0.3164
XGBRegressor,0.8373,0.7589,0.915,-0.2943,0.1962,0.2836,0.3578,0.5286,0.5982,0.3917,0.7174,0.3714
ExtraTreesRegressor,0.6605,0.6628,0.8127,-0.021,0.3014,0.318,0.3261,0.4995,0.571,0.4456,0.735,0.3494
LinearRegression,1.8336,0.9874,1.3541,-1.8344,-0.0161,0.1256,0.4838,0.5479,0.6955,0.1775,0.7426,0.5034
KNeighborsRegressor,0.6026,0.6162,0.7763,0.0684,0.3574,0.3727,0.3447,0.469,0.5871,0.414,0.6453,0.3834
SVR,0.6454,0.6358,0.8034,0.0023,0.2536,0.2482,0.3126,0.4695,0.5591,0.4685,0.7182,0.4209


In [109]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.536334600266275, -5.800070490585862, -5.79...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.674867140286591, -5.729655706549241, -6.1...","[-5.560723749408672, -5.760891408354489, -6.31...","[0.21328074346600934, 0.20087814245258503, 0.1..."
1,DecisionTreeRegressor,"[-5.65, -5.06, -4.94, -6.929917083, -5.4534952...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.73, -6.25, -5.93, -6.927245587, -6.569578...","[-5.33, -6.028293542049999, -6.4724391856, -6....","[0.43501724103763983, 0.28267907625068495, 0.3..."
2,RandomForestRegressor,"[-5.529595784909995, -5.668382901070001, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.451143759287333, -5.795538351944, -6.2418...","[-5.553947754013035, -5.797327275282467, -6.33...","[0.16261025029962267, 0.11341393266388204, 0.0..."
3,GradientBoostingRegressor,"[-5.683829602324278, -5.50648748716373, -5.099...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.037497427644231, -6.0824563930132864, -6....","[-5.472817582640283, -5.977716917554431, -6.51...","[0.2966781241909016, 0.24352436676583278, 0.24..."
4,AdaBoostRegressor,"[-5.449549484285713, -5.40780180375, -6.193087...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.036, -6.036666666666666, -6.0217347125, -...","[-5.381309963427077, -5.832756747893453, -6.19...","[0.28876656455100796, 0.20503960040015792, 0.2..."
5,XGBRegressor,"[-5.645642, -5.329582, -5.3889837, -6.4178996,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.319728, -6.036903, -6.3016453, -7.0434856...","[-5.4812574, -5.813936, -6.620009, -6.552999, ...","[0.09593115, 0.2658951, 0.39809063, 0.5484811,..."
6,ExtraTreesRegressor,"[-5.566999999999998, -5.884735054730003, -6.18...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.309784432580001, -6.102141764717499, -6.4...","[-5.424161645884835, -5.976622734714001, -6.69...","[0.15832369672300564, 0.20433303178798568, 0.4..."
7,LinearRegression,"[-6.547647100696484, -6.16724119836109, -4.681...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-9.113817059237686, -7.582885255139203, -6.9...","[-6.650955266440123, -6.1825085847037045, -7.7...","[1.4586895442816596, 1.2156490493841625, 1.691..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.713333333333333, -6.22...","[0.1707890186425604, 0.1707890186425604, 0.309..."
9,SVR,"[-5.5598566135288845, -5.360041381975508, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.570282041548942, -5.915425292105371, -5.8...","[-5.582650571975117, -5.851230286646432, -5.83...","[0.12973618164304945, 0.24311360800261486, 0.1..."


In [110]:
results_df.to_csv('results/Descriptors/Results_2d_Mordred_LVR_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_Mordred_LVR_MDCK.csv')

In [111]:
#2d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_MDCK.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-6.7922,46.133981,200.8102,123.721166,0,0,118,56,62,...,107.350908,1.916981,55.593344,24.924836,30.668508,13190.0,104.0,-0.452,276.0,1109
1,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.401998,1.970407,37.167232,17.862301,19.304931,10372.0,92.0,5.55,272.0,1018
2,0,-1.2947,1.676248,186.2062,119.10158,0,0,111,51,60,...,100.950171,1.979415,36.645624,17.922134,18.723489,9286.0,80.0,6.411,254.0,1017
3,0,-2.129,4.532641,277.7356,179.405714,0,0,172,74,98,...,142.514887,1.925877,56.524332,25.255908,31.268424,25829.0,128.0,8.158,360.0,1107
4,0,-8.4202,70.899768,189.3712,121.053994,0,0,114,56,58,...,109.785075,1.960448,56.48295,25.157078,31.325873,13235.0,98.0,-1.338,284.0,1112
5,0,-5.1194,26.208256,249.9738,164.364198,0,0,156,70,86,...,136.877717,1.955396,57.041638,25.360826,31.680811,22502.0,122.0,5.454,352.0,1115
6,0,-0.7316,0.535239,184.6007,118.29958,0,0,110,50,60,...,99.092833,1.981857,34.121681,15.400697,18.720983,8739.0,78.0,7.547,248.0,1840
7,0,-5.3114,28.21097,240.6548,158.177026,0,0,150,68,82,...,133.232246,1.959298,57.127492,25.386644,31.740847,21211.0,114.0,4.402,340.0,1111
8,0,-0.7316,0.535239,184.6007,118.29958,0,0,110,50,60,...,99.092833,1.981857,34.121681,15.400697,18.720983,8739.0,78.0,7.547,248.0,1844
9,0,-0.2899,0.084042,200.4822,122.245994,0,0,111,53,58,...,106.152092,2.00287,34.184233,15.422362,18.76187,10243.0,83.0,7.361,266.0,1841


In [112]:
df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_MDCK.csv')
df 


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,101.372024,2.447751,4.856077,...,11.210644,138.066274,1138.715439,6.469974,33296,140,416.0,486.0,34.055556,18.111111
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,101.370874,2.447978,4.860529,...,11.210644,138.066274,1138.715439,6.469974,33296,140,416.0,486.0,34.055556,18.111111
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,101.371868,2.44359,4.850728,...,11.210644,138.066274,1138.715439,6.469974,33296,140,416.0,486.0,34.055556,18.111111
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,97.295643,2.408298,4.816595,...,11.141644,120.163455,1114.715439,6.406411,31648,138,396.0,458.0,36.555556,17.777778
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.774478,2.436865,4.825853,...,11.126159,133.738666,1090.679054,6.492137,29794,130,394.0,458.0,31.222222,17.333333
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.685868,2.442093,4.836451,...,11.129393,133.743086,1090.679054,6.492137,29704,130,394.0,458.0,31.222222,17.333333
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.685868,2.442093,4.836451,...,11.129393,133.743086,1088.699789,6.404116,29704,130,394.0,458.0,31.222222,17.333333
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.774478,2.436865,4.825853,...,11.126159,133.738666,1088.699789,6.404116,29794,130,394.0,458.0,31.222222,17.333333
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,89.136837,2.437073,4.831019,...,11.025019,126.223556,1047.640226,6.466915,25439,124,358.0,415.0,31.75,16.611111
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,86.880063,2.403063,4.806126,...,11.046579,113.682027,1046.746739,6.085737,25829,128,360.0,414.0,38.055556,16.444444


In [113]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1109,C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N(C)C(...,-6.12,0,-6.7922,46.133981,200.8102,123.721166,0,0,...,6.732755,107.350908,1.916981,55.593344,24.924836,30.668508,13190.0,104.0,-0.452,276.0
1,1018,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-4.5,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.401998,1.970407,37.167232,17.862301,19.304931,10372.0,92.0,5.55,272.0
2,1017,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...,-6.55,0,-1.2947,1.676248,186.2062,119.10158,0,0,...,6.41849,100.950171,1.979415,36.645624,17.922134,18.723489,9286.0,80.0,6.411,254.0
3,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,0,-2.129,4.532641,277.7356,179.405714,0,0,...,6.085737,142.514887,1.925877,56.524332,25.255908,31.268424,25829.0,128.0,8.158,360.0
4,1112,C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](C)NC(=O)[...,-6.17,0,-8.4202,70.899768,189.3712,121.053994,0,0,...,6.933629,109.785075,1.960448,56.48295,25.157078,31.325873,13235.0,98.0,-1.338,284.0
5,1115,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.19,0,-5.1194,26.208256,249.9738,164.364198,0,0,...,6.324698,136.877717,1.955396,57.041638,25.360826,31.680811,22502.0,122.0,5.454,352.0
6,1840,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-4.48,0,-0.7316,0.535239,184.6007,118.29958,0,0,...,6.331431,99.092833,1.981857,34.121681,15.400697,18.720983,8739.0,78.0,7.547,248.0
7,1111,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C...,-6.22,0,-5.3114,28.21097,240.6548,158.177026,0,0,...,6.39081,133.232246,1.959298,57.127492,25.386644,31.740847,21211.0,114.0,4.402,340.0
8,1844,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-5.1,0,-0.7316,0.535239,184.6007,118.29958,0,0,...,6.331431,99.092833,1.981857,34.121681,15.400697,18.720983,8739.0,78.0,7.547,248.0
9,1841,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-4.41,0,-0.2899,0.084042,200.4822,122.245994,0,0,...,6.580557,106.152092,2.00287,34.184233,15.422362,18.76187,10243.0,83.0,7.361,266.0


In [114]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,0,-2.6816,7.190979,307.3786,190.818542,0,0,...,6.469974,162.717774,1.984363,57.439206,25.502169,31.937037,33296.0,140.0,7.952,416.0
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,0,-2.6816,7.190979,307.3786,190.818542,0,0,...,6.469974,162.720191,1.984393,57.435556,25.501133,31.934422,33296.0,140.0,7.952,416.0
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,0,-2.6816,7.190979,307.3786,190.818542,0,0,...,6.469974,162.7174,1.984359,57.440346,25.502649,31.937697,33296.0,140.0,7.952,416.0
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,0,-1.2456,1.551519,309.4986,187.298542,0,0,...,6.406411,156.626458,1.957831,56.630904,25.293944,31.33696,31648.0,138.0,7.786,396.0
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,0,-5.723,32.752729,279.5151,180.15337,0,0,...,6.492137,155.242276,1.990286,60.716736,28.525011,32.191725,29794.0,130.0,5.187,394.0
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,0,-5.723,32.752729,279.5151,180.15337,0,0,...,6.492137,155.232056,1.990155,60.685463,28.512533,32.17293,29704.0,130.0,5.187,394.0
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,0,-5.3689,28.825087,280.2868,182.444956,0,0,...,6.404116,155.232056,1.990155,57.684402,25.511472,32.17293,29704.0,130.0,7.084,394.0
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,0,-5.3689,28.825087,280.2868,182.444956,0,0,...,6.404116,155.242276,1.990286,57.714858,25.523133,32.191725,29794.0,130.0,7.084,394.0
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,0,-6.0254,36.305445,258.2126,172.486577,0,0,...,6.466915,143.466261,1.965291,59.644932,28.221894,28.405281,25439.0,124.0,7.222,358.0
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,0,-2.129,4.532641,277.7356,179.405714,0,0,...,6.085737,142.514887,1.925877,56.524332,25.255908,31.268424,25829.0,128.0,8.158,360.0


In [115]:
df_ordered.to_csv('features/Descriptors/Train_2d_padel_curated_MDCK.csv', index=False)

In [116]:
#2d test padel descriptors
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_MDCK.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-1.6634,2.7669,183.2267,118.29958,0,0,110,50,60,...,99.084602,1.981692,34.113502,15.397863,18.715639,8694.0,80.0,7.336,248.0,1842
1,0,-1.2217,1.492551,199.1082,122.245994,0,0,111,53,58,...,106.143859,2.002714,34.176054,15.419528,18.756526,10195.0,85.0,7.15,266.0,1843
2,0,-3.1134,9.69326,255.8998,162.779405,0,0,154,69,85,...,133.378413,1.93302,59.341751,28.115454,28.210938,22236.0,115.0,5.397,330.0,8168
3,0,-3.6834,13.567436,252.0938,160.844198,0,0,154,68,86,...,130.791399,1.923403,56.234345,25.152766,31.081579,21154.0,120.0,5.288,332.0,1108
4,0,-4.929,24.295041,252.2939,164.539405,0,0,155,70,85,...,136.601049,1.951444,59.784728,28.23528,28.53169,22971.0,116.0,5.752,338.0,8119
5,0,-3.9378,15.506269,249.8523,166.492991,0,0,158,71,87,...,139.131996,1.959606,56.808725,28.380436,28.428289,22927.0,118.0,7.404,354.0,6496
6,0,-4.0401,16.322408,244.1556,163.399405,0,0,155,70,85,...,137.31077,1.961582,56.609941,28.393162,28.216778,22212.0,114.0,7.691,348.0,6423
7,0,-3.1134,9.69326,255.8998,162.779405,0,0,154,69,85,...,133.381424,1.933064,59.345483,28.116662,28.212683,22278.0,115.0,5.397,330.0,8345
8,0,-5.0816,25.822659,254.9508,167.632991,0,0,158,71,87,...,138.417801,1.949546,59.630333,28.216709,28.395866,23619.0,120.0,6.424,344.0,8143
9,0,-1.2456,1.551519,309.4986,187.298542,0,0,174,80,94,...,156.622606,1.957783,56.617512,25.292701,31.324812,31648.0,138.0,7.786,396.0,1118


In [117]:
df = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_MDCK.csv')
df

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,103.838268,2.420071,4.840143,...,11.20091,126.480968,1198.80934,6.243799,37307,146,424.0,488.0,39.277778,19.111111
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,97.302139,2.412394,4.824787,...,11.141934,120.163745,1114.715439,6.406411,31648,138,396.0,458.0,36.555556,17.777778
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,95.621135,2.421653,4.790984,...,11.069213,133.614531,1082.652839,6.601542,30176,124,392.0,450.0,30.611111,17.222222
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,88.159241,2.431154,4.817857,...,10.985954,125.096326,1033.624576,6.500783,24767,120,352.0,406.0,30.888889,16.388889
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,85.381854,2.437654,4.83044,...,10.978917,124.00746,1021.624576,6.465978,23619,120,344.0,397.0,32.75,16.194444
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,83.965708,2.427767,4.806183,...,10.937792,122.874218,1007.608926,6.500703,22971,116,338.0,388.0,31.888889,15.972222
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,84.630944,2.429295,4.822808,...,11.018416,126.196276,1001.652505,6.339573,22927,118,354.0,409.0,32.472222,15.666667
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,81.585614,2.38103,4.762061,...,10.902997,108.14071,995.608926,6.464993,22236,115,330.0,376.0,33.75,15.638889
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,81.585744,2.384457,4.768915,...,10.902997,108.14071,995.608926,6.464993,22278,115,330.0,376.0,33.75,15.638889
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,83.165492,2.427826,4.818625,...,10.978746,125.067539,987.636855,6.371851,22212,114,348.0,400.0,31.611111,15.444444


In [118]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1842,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H...,-4.4,0,-1.6634,2.7669,183.2267,118.29958,0,0,...,6.331431,99.084602,1.981692,34.113502,15.397863,18.715639,8694.0,80.0,7.336,248.0
1,1843,CC[C@H](C)[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@...,-4.4,0,-1.2217,1.492551,199.1082,122.245994,0,0,...,6.580557,106.143859,2.002714,34.176054,15.419528,18.756526,10195.0,85.0,7.15,266.0
2,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,0,-3.1134,9.69326,255.8998,162.779405,0,0,...,6.464993,133.378413,1.93302,59.341751,28.115454,28.210938,22236.0,115.0,5.397,330.0
3,1108,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-5.18,0,-3.6834,13.567436,252.0938,160.844198,0,0,...,6.250992,130.791399,1.923403,56.234345,25.152766,31.081579,21154.0,120.0,5.288,332.0
4,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,0,-4.929,24.295041,252.2939,164.539405,0,0,...,6.500703,136.601049,1.951444,59.784728,28.23528,28.53169,22971.0,116.0,5.752,338.0
5,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,0,-3.9378,15.506269,249.8523,166.492991,0,0,...,6.339573,139.131996,1.959606,56.808725,28.380436,28.428289,22927.0,118.0,7.404,354.0
6,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,0,-4.0401,16.322408,244.1556,163.399405,0,0,...,6.371851,137.31077,1.961582,56.609941,28.393162,28.216778,22212.0,114.0,7.691,348.0
7,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,0,-3.1134,9.69326,255.8998,162.779405,0,0,...,6.464993,133.381424,1.933064,59.345483,28.116662,28.212683,22278.0,115.0,5.397,330.0
8,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,0,-5.0816,25.822659,254.9508,167.632991,0,0,...,6.465978,138.417801,1.949546,59.630333,28.216709,28.395866,23619.0,120.0,6.424,344.0
9,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,0,-1.2456,1.551519,309.4986,187.298542,0,0,...,6.406411,156.622606,1.957783,56.617512,25.292701,31.324812,31648.0,138.0,7.786,396.0


In [119]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,0,0.3088,0.095357,335.1404,205.860058,0,0,...,6.243799,168.348068,1.957536,56.90962,25.395733,31.513887,37307.0,146.0,10.656,424.0
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,0,-1.2456,1.551519,309.4986,187.298542,0,0,...,6.406411,156.622606,1.957783,56.617512,25.292701,31.324812,31648.0,138.0,7.786,396.0
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,0,-3.0908,9.553045,284.5918,178.444198,0,0,...,6.601542,155.443185,1.992861,56.706886,25.570597,31.136289,30176.0,124.0,9.1,392.0
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,0,-5.583,31.169889,254.9353,169.392991,0,0,...,6.500783,141.646739,1.967316,59.761039,28.23871,28.504056,24767.0,120.0,6.761,352.0
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,0,-5.0816,25.822659,254.9508,167.632991,0,0,...,6.465978,138.417801,1.949546,59.630333,28.216709,28.395866,23619.0,120.0,6.424,344.0
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,0,-4.929,24.295041,252.2939,164.539405,0,0,...,6.500703,136.601049,1.951444,59.784728,28.23528,28.53169,22971.0,116.0,5.752,338.0
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,0,-3.9378,15.506269,249.8523,166.492991,0,0,...,6.339573,139.131996,1.959606,56.808725,28.380436,28.428289,22927.0,118.0,7.404,354.0
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,0,-3.1134,9.69326,255.8998,162.779405,0,0,...,6.464993,133.378413,1.93302,59.341751,28.115454,28.210938,22236.0,115.0,5.397,330.0
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,0,-3.1134,9.69326,255.8998,162.779405,0,0,...,6.464993,133.381424,1.933064,59.345483,28.116662,28.212683,22278.0,115.0,5.397,330.0
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,0,-4.0401,16.322408,244.1556,163.399405,0,0,...,6.371851,137.31077,1.961582,56.609941,28.393162,28.216778,22212.0,114.0,7.691,348.0


In [120]:
df_ordered.to_csv('features/Descriptors/Test_2d_padel_curated_MDCK.csv', index=False)

In [121]:
#3d Train descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_padel_MDCK.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.256004,2.172194,3.034317,3.753694,4.466708,5.394142,6.339483,7.392947,8.425989,9.134625,...,0.430773,0.570059,0.561558,0.432793,42.100831,477.533777,1297.812522,0.430561,1.564411,1109
1,1.262165,2.177943,3.004435,3.729856,4.523804,5.39027,6.124069,6.792331,7.542445,8.406857,...,0.357485,0.515787,0.41317,0.367705,41.601416,492.190054,1941.736338,0.348469,1.296661,1115
2,1.259218,2.174627,3.008919,3.742963,4.547749,5.368845,6.1358,6.818897,7.644998,8.49268,...,0.406113,0.465361,0.44933,0.414936,45.650635,561.954227,1707.522217,0.421108,1.329627,1111
3,1.257218,2.181184,3.022483,3.737097,4.542256,5.410557,6.185571,6.926088,7.657778,8.342932,...,0.397144,0.491228,0.418054,0.41377,50.13167,708.259697,2879.776105,0.378069,1.323052,1119
4,1.261083,2.189218,3.014673,3.757476,4.586469,5.438855,6.191139,6.850095,7.534036,8.402071,...,0.236339,0.474576,0.468547,0.397864,49.473306,583.761805,2293.688495,0.517147,1.340987,1113
5,1.258727,2.171374,3.0054,3.737842,4.548808,5.285357,6.077064,6.788198,7.620392,8.450637,...,0.395429,0.498227,0.542829,0.40909,45.939045,605.192187,2479.875692,0.360048,1.450145,1116
6,1.262352,2.189819,3.015138,3.757234,4.579912,5.403821,6.152182,6.919833,7.736494,8.546811,...,0.363565,0.573106,0.495829,0.357174,50.483368,723.394093,3227.656451,0.354051,1.426109,1117
7,1.260828,2.173701,3.010288,3.742199,4.533928,5.297952,6.073064,6.812978,7.547346,8.206431,...,0.314569,0.610295,0.514721,0.380271,46.807509,602.937512,2571.793846,0.374928,1.505287,1110
8,1.252407,2.162436,2.998106,3.680852,4.473814,5.347391,6.113602,6.922488,7.721044,8.527176,...,0.312106,0.569435,0.564939,0.346947,49.411357,642.128957,2510.56182,0.412748,1.48132,1107
9,1.264146,2.203915,3.013835,3.768807,4.634108,5.419729,6.128406,6.790934,7.355254,7.943152,...,0.300821,0.45996,0.382785,0.407258,29.280678,257.101389,962.636979,0.295483,1.250003,1841


In [122]:
df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_MDCK.csv')
df 

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,101.372024,2.447751,4.856077,...,11.210644,138.066274,1138.715439,6.469974,33296,140,416.0,486.0,34.055556,18.111111
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,101.370874,2.447978,4.860529,...,11.210644,138.066274,1138.715439,6.469974,33296,140,416.0,486.0,34.055556,18.111111
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,101.371868,2.44359,4.850728,...,11.210644,138.066274,1138.715439,6.469974,33296,140,416.0,486.0,34.055556,18.111111
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,97.295643,2.408298,4.816595,...,11.141644,120.163455,1114.715439,6.406411,31648,138,396.0,458.0,36.555556,17.777778
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.774478,2.436865,4.825853,...,11.126159,133.738666,1090.679054,6.492137,29794,130,394.0,458.0,31.222222,17.333333
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.685868,2.442093,4.836451,...,11.129393,133.743086,1090.679054,6.492137,29704,130,394.0,458.0,31.222222,17.333333
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.685868,2.442093,4.836451,...,11.129393,133.743086,1088.699789,6.404116,29704,130,394.0,458.0,31.222222,17.333333
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,96.774478,2.436865,4.825853,...,11.126159,133.738666,1088.699789,6.404116,29794,130,394.0,458.0,31.222222,17.333333
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,89.136837,2.437073,4.831019,...,11.025019,126.223556,1047.640226,6.466915,25439,124,358.0,415.0,31.75,16.611111
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,86.880063,2.403063,4.806126,...,11.046579,113.682027,1046.746739,6.085737,25829,128,360.0,414.0,38.055556,16.444444


In [123]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1109,C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N(C)C(...,-6.12,1.256004,2.172194,3.034317,3.753694,4.466708,5.394142,6.339483,...,0.522934,0.430773,0.570059,0.561558,0.432793,42.100831,477.533777,1297.812522,0.430561,1.564411
1,1115,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.19,1.262165,2.177943,3.004435,3.729856,4.523804,5.39027,6.124069,...,0.541495,0.357485,0.515787,0.41317,0.367705,41.601416,492.190054,1941.736338,0.348469,1.296661
2,1111,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C...,-6.22,1.259218,2.174627,3.008919,3.742963,4.547749,5.368845,6.1358,...,0.541292,0.406113,0.465361,0.44933,0.414936,45.650635,561.954227,1707.522217,0.421108,1.329627
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,1.257218,2.181184,3.022483,3.737097,4.542256,5.410557,6.185571,...,0.521568,0.397144,0.491228,0.418054,0.41377,50.13167,708.259697,2879.776105,0.378069,1.323052
4,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,1.261083,2.189218,3.014673,3.757476,4.586469,5.438855,6.191139,...,0.678098,0.236339,0.474576,0.468547,0.397864,49.473306,583.761805,2293.688495,0.517147,1.340987
5,1116,CC(C)C[C@@H]1NC(=O)[C@H](C(C)C)N(C)C(=O)[C@H]2...,-5.22,1.258727,2.171374,3.0054,3.737842,4.548808,5.285357,6.077064,...,0.511269,0.395429,0.498227,0.542829,0.40909,45.939045,605.192187,2479.875692,0.360048,1.450145
6,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,1.262352,2.189819,3.015138,3.757234,4.579912,5.403821,6.152182,...,0.539136,0.363565,0.573106,0.495829,0.357174,50.483368,723.394093,3227.656451,0.354051,1.426109
7,1110,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]...,-4.73,1.260828,2.173701,3.010288,3.742199,4.533928,5.297952,6.073064,...,0.583285,0.314569,0.610295,0.514721,0.380271,46.807509,602.937512,2571.793846,0.374928,1.505287
8,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,1.252407,2.162436,2.998106,3.680852,4.473814,5.347391,6.113602,...,0.608499,0.312106,0.569435,0.564939,0.346947,49.411357,642.128957,2510.56182,0.412748,1.48132
9,1841,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-4.41,1.264146,2.203915,3.013835,3.768807,4.634108,5.419729,6.128406,...,0.530322,0.300821,0.45996,0.382785,0.407258,29.280678,257.101389,962.636979,0.295483,1.250003


In [124]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,1.263546,2.188964,3.017526,3.765209,4.577209,5.343044,6.14086,...,0.456915,0.389421,0.437125,0.423452,0.392749,41.336971,526.265213,2498.864653,0.269505,1.253326
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,1.261083,2.189218,3.014673,3.757476,4.586469,5.438855,6.191139,...,0.678098,0.236339,0.474576,0.468547,0.397864,49.473306,583.761805,2293.688495,0.517147,1.340987
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,1.262352,2.189819,3.015138,3.757234,4.579912,5.403821,6.152182,...,0.539136,0.363565,0.573106,0.495829,0.357174,50.483368,723.394093,3227.656451,0.354051,1.426109
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,1.257218,2.181184,3.022483,3.737097,4.542256,5.410557,6.185571,...,0.521568,0.397144,0.491228,0.418054,0.41377,50.13167,708.259697,2879.776105,0.378069,1.323052
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,1.26484,2.189118,3.004847,3.76319,4.611307,5.340708,6.016435,...,0.608814,0.291627,0.525725,0.587748,0.381538,45.9272,563.59345,2321.907016,0.413221,1.495011
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,1.26418,2.188773,3.002362,3.747652,4.601199,5.395908,6.172696,...,0.588852,0.300853,0.582896,0.574504,0.339708,49.847196,684.019529,3153.991837,0.383278,1.497107
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,1.263115,2.188576,2.99919,3.746247,4.588929,5.352004,6.062018,...,0.481735,0.407498,0.473671,0.471289,0.327613,46.294702,631.823027,2835.55069,0.33385,1.272573
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,1.264507,2.187167,3.002901,3.769268,4.60464,5.374375,6.105402,...,0.572955,0.326123,0.47087,0.386186,0.283351,48.802297,661.127915,2901.775303,0.359432,1.140407
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,1.263895,2.182221,3.001866,3.740031,4.497325,5.328217,6.110589,...,0.601371,0.349128,0.518144,0.545022,0.364424,51.96254,693.944167,2204.104224,0.425748,1.427589
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,1.252407,2.162436,2.998106,3.680852,4.473814,5.347391,6.113602,...,0.608499,0.312106,0.569435,0.564939,0.346947,49.411357,642.128957,2510.56182,0.412748,1.48132


In [125]:
df_ordered.to_csv('features/Descriptors/Train_3d_padel_curated_MDCK.csv', index=False)

In [126]:
#3d test padel descriptors
df_test = pd.read_csv('features/Descriptors/Test_3d_padel_MDCK.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.262071,2.179842,2.996935,3.740084,4.656446,5.416241,6.070195,6.751989,7.460681,8.290323,...,0.252612,0.485191,0.393955,0.404621,49.499572,564.175291,1883.827401,0.530571,1.283767,6496
1,1.254018,2.163124,3.006562,3.718435,4.523601,5.384969,6.020719,6.77312,7.723639,8.487681,...,0.388502,0.523799,0.502842,0.421705,44.313812,550.245259,2036.042466,0.379606,1.448347,1108
2,1.265653,2.207782,3.013447,3.777693,4.664874,5.433991,6.20765,6.875381,7.479235,8.131504,...,0.392542,0.472921,0.421803,0.368107,30.606329,264.951713,795.132003,0.372567,1.262831,1843
3,1.257288,2.179452,3.024774,3.753688,4.541741,5.324373,6.037414,6.722839,7.487353,8.231476,...,0.291189,0.444583,0.539683,0.357508,41.316066,490.042107,2188.2047,0.349545,1.341774,1118
4,1.265005,2.203042,3.003715,3.75551,4.643888,5.425595,6.130606,6.860433,7.569793,8.533427,...,0.343748,0.509549,0.449964,0.374446,51.553116,710.235475,2723.391067,0.393219,1.33396,1121
5,1.264406,2.182123,3.012432,3.737651,4.522408,5.376301,6.170496,6.902463,7.681886,8.593614,...,0.406031,0.519823,0.544516,0.426875,45.4719,589.208424,2306.37861,0.37063,1.491215,8143
6,1.260128,2.172456,3.012249,3.736571,4.547769,5.361418,6.089385,6.848274,7.795227,8.770288,...,0.363249,0.588852,0.553764,0.271499,51.26691,711.978056,2701.028559,0.395248,1.414115,8345
7,1.264178,2.181512,3.004424,3.733909,4.602144,5.439434,6.21296,6.988305,8.017577,8.725276,...,0.404704,0.497301,0.484967,0.300594,49.677484,690.78496,2675.200579,0.387593,1.282862,8119
8,1.261144,2.182792,2.986502,3.71428,4.627305,5.440193,6.181705,6.951036,7.668839,8.441794,...,0.3461,0.511976,0.468069,0.382243,44.726179,551.597017,2126.801755,0.369175,1.362288,6423
9,1.260512,2.173203,3.018396,3.732142,4.4996,5.331553,6.11562,6.824954,7.602464,8.399841,...,0.412405,0.532151,0.496837,0.370279,43.965756,563.139411,2324.612455,0.349052,1.399267,8168


In [127]:
df = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_MDCK.csv')
df

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,103.838268,2.420071,4.840143,...,11.20091,126.480968,1198.80934,6.243799,37307,146,424.0,488.0,39.277778,19.111111
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,97.302139,2.412394,4.824787,...,11.141934,120.163745,1114.715439,6.406411,31648,138,396.0,458.0,36.555556,17.777778
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,95.621135,2.421653,4.790984,...,11.069213,133.614531,1082.652839,6.601542,30176,124,392.0,450.0,30.611111,17.222222
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,88.159241,2.431154,4.817857,...,10.985954,125.096326,1033.624576,6.500783,24767,120,352.0,406.0,30.888889,16.388889
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,85.381854,2.437654,4.83044,...,10.978917,124.00746,1021.624576,6.465978,23619,120,344.0,397.0,32.75,16.194444
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,83.965708,2.427767,4.806183,...,10.937792,122.874218,1007.608926,6.500703,22971,116,338.0,388.0,31.888889,15.972222
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,84.630944,2.429295,4.822808,...,11.018416,126.196276,1001.652505,6.339573,22927,118,354.0,409.0,32.472222,15.666667
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,81.585614,2.38103,4.762061,...,10.902997,108.14071,995.608926,6.464993,22236,115,330.0,376.0,33.75,15.638889
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,81.585744,2.384457,4.768915,...,10.902997,108.14071,995.608926,6.464993,22278,115,330.0,376.0,33.75,15.638889
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,83.165492,2.427826,4.818625,...,10.978746,125.067539,987.636855,6.371851,22212,114,348.0,400.0,31.611111,15.444444


In [128]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,1.262071,2.179842,2.996935,3.740084,4.656446,5.416241,6.070195,...,0.687047,0.252612,0.485191,0.393955,0.404621,49.499572,564.175291,1883.827401,0.530571,1.283767
1,1108,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-5.18,1.254018,2.163124,3.006562,3.718435,4.523601,5.384969,6.020719,...,0.531236,0.388502,0.523799,0.502842,0.421705,44.313812,550.245259,2036.042466,0.379606,1.448347
2,1843,CC[C@H](C)[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@...,-4.4,1.265653,2.207782,3.013447,3.777693,4.664874,5.433991,6.20765,...,0.522503,0.392542,0.472921,0.421803,0.368107,30.606329,264.951713,795.132003,0.372567,1.262831
3,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,1.257288,2.179452,3.024774,3.753688,4.541741,5.324373,6.037414,...,0.566363,0.291189,0.444583,0.539683,0.357508,41.316066,490.042107,2188.2047,0.349545,1.341774
4,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,1.265005,2.203042,3.003715,3.75551,4.643888,5.425595,6.130606,...,0.585065,0.343748,0.509549,0.449964,0.374446,51.553116,710.235475,2723.391067,0.393219,1.33396
5,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,1.264406,2.182123,3.012432,3.737651,4.522408,5.376301,6.170496,...,0.507722,0.406031,0.519823,0.544516,0.426875,45.4719,589.208424,2306.37861,0.37063,1.491215
6,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,1.260128,2.172456,3.012249,3.736571,4.547769,5.361418,6.089385,...,0.566916,0.363249,0.588852,0.553764,0.271499,51.26691,711.978056,2701.028559,0.395248,1.414115
7,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,1.264178,2.181512,3.004424,3.733909,4.602144,5.439434,6.21296,...,0.520358,0.404704,0.497301,0.484967,0.300594,49.677484,690.78496,2675.200579,0.387593,1.282862
8,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,1.261144,2.182792,2.986502,3.71428,4.627305,5.440193,6.181705,...,0.566683,0.3461,0.511976,0.468069,0.382243,44.726179,551.597017,2126.801755,0.369175,1.362288
9,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,1.260512,2.173203,3.018396,3.732142,4.4996,5.331553,6.11562,...,0.486963,0.412405,0.532151,0.496837,0.370279,43.965756,563.139411,2324.612455,0.349052,1.399267


In [129]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,1.258625,2.178045,3.017671,3.72053,4.539254,5.370155,6.118056,...,0.538922,0.327869,0.524865,0.537605,0.374733,48.372354,683.620826,3396.096064,0.308384,1.437203
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,1.257288,2.179452,3.024774,3.753688,4.541741,5.324373,6.037414,...,0.566363,0.291189,0.444583,0.539683,0.357508,41.316066,490.042107,2188.2047,0.349545,1.341774
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,1.265005,2.203042,3.003715,3.75551,4.643888,5.425595,6.130606,...,0.585065,0.343748,0.509549,0.449964,0.374446,51.553116,710.235475,2723.391067,0.393219,1.33396
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,1.265596,2.183941,2.997808,3.74788,4.558068,5.387794,6.072208,...,0.510944,0.431114,0.440987,0.473812,0.40642,50.076001,689.240384,2342.004744,0.413087,1.321219
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,1.264406,2.182123,3.012432,3.737651,4.522408,5.376301,6.170496,...,0.507722,0.406031,0.519823,0.544516,0.426875,45.4719,589.208424,2306.37861,0.37063,1.491215
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,1.264178,2.181512,3.004424,3.733909,4.602144,5.439434,6.21296,...,0.520358,0.404704,0.497301,0.484967,0.300594,49.677484,690.78496,2675.200579,0.387593,1.282862
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,1.262071,2.179842,2.996935,3.740084,4.656446,5.416241,6.070195,...,0.687047,0.252612,0.485191,0.393955,0.404621,49.499572,564.175291,1883.827401,0.530571,1.283767
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,1.260512,2.173203,3.018396,3.732142,4.4996,5.331553,6.11562,...,0.486963,0.412405,0.532151,0.496837,0.370279,43.965756,563.139411,2324.612455,0.349052,1.399267
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,1.260128,2.172456,3.012249,3.736571,4.547769,5.361418,6.089385,...,0.566916,0.363249,0.588852,0.553764,0.271499,51.26691,711.978056,2701.028559,0.395248,1.414115
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,1.261144,2.182792,2.986502,3.71428,4.627305,5.440193,6.181705,...,0.566683,0.3461,0.511976,0.468069,0.382243,44.726179,551.597017,2126.801755,0.369175,1.362288


In [130]:
df_ordered.to_csv('features/Descriptors/Test_3d_padel_curated_MDCK.csv', index=False)

In [131]:
#2d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 1444)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1444)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 255
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7306
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 533
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5961,0.6385,0.7721,0.0785,0.3346,0.4345,0.3518,0.497,0.5932,0.4019,0.6416,0.4924
DecisionTreeRegressor,0.877,0.7149,0.9365,-0.3557,0.1436,0.1671,0.3547,0.5295,0.5956,0.397,0.7057,0.3769
RandomForestRegressor,0.6054,0.6416,0.7781,0.0641,0.3164,0.3461,0.3099,0.4942,0.5567,0.4732,0.707,0.3329
GradientBoostingRegressor,0.6708,0.6579,0.819,-0.037,0.2967,0.3218,0.3615,0.5231,0.6013,0.3854,0.6847,0.3164
AdaBoostRegressor,0.5981,0.623,0.7734,0.0754,0.3506,0.3781,0.3265,0.4916,0.5714,0.4449,0.702,0.3439
XGBRegressor,0.7197,0.6682,0.8483,-0.1125,0.2712,0.3061,0.4233,0.5436,0.6506,0.2804,0.6281,0.3219
ExtraTreesRegressor,0.6807,0.6676,0.825,-0.0522,0.282,0.3173,0.3478,0.5026,0.5897,0.4088,0.7047,0.3384
LinearRegression,1.3123,0.8872,1.1456,-1.0286,0.2001,0.1822,0.966,0.6423,0.9829,-0.6423,0.6406,0.6025
KNeighborsRegressor,0.5899,0.6026,0.7681,0.088,0.3845,0.3882,0.3567,0.4905,0.5972,0.3936,0.6362,0.449
SVR,0.6624,0.6444,0.8139,-0.024,0.2248,0.2369,0.3024,0.4747,0.5499,0.4859,0.7269,0.4209


In [132]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.403401796264953, -5.801362849910222, -6.40...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.578397389477615, -5.398929028023123, -5.3...","[-5.456002966533946, -5.589594080550133, -6.00...","[0.15829755552137664, 0.19194530482391176, 0.3..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -5.06, -5.400722167, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.25, -5.35, -5.453495214, -6.929917...","[-5.4334491173999995, -6.109483658749999, -6.1...","[0.7774683100598736, 0.2810326824999997, 0.680..."
2,RandomForestRegressor,"[-5.503513251674998, -5.390131088038332, -5.85...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.438158710462497, -5.756114955468336, -6.0...","[-5.474578210372668, -5.7609898221064, -5.9915...","[0.2297848378425105, 0.1734376116907267, 0.129..."
3,GradientBoostingRegressor,"[-5.3768402880168855, -5.5012534746316994, -5....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.035225263867772, -6.206340441327234, -5.7...","[-5.290232511373617, -5.949156646637054, -6.08...","[0.28422953163014747, 0.34495294695443174, 0.3..."
4,AdaBoostRegressor,"[-5.428174651999999, -5.339598512, -5.84428571...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.074999999999999, -6.144137553285714, -5.5...","[-5.277395854313333, -5.8449290582106785, -5.8...","[0.3020795118044788, 0.29665430089880984, 0.25..."
5,XGBRegressor,"[-5.268087, -5.429548, -5.2928333, -5.811682, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.187222, -6.2283454, -6.0013194, -6.294338...","[-5.1422596, -5.9502053, -6.1759214, -6.487588...","[0.40034124, 0.3403726, 0.2232477, 0.5249209, ..."
6,ExtraTreesRegressor,"[-5.518724182937495, -5.409638708249998, -6.15...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.105184952140003, -6.061422106417499, -6.3...","[-5.220390062361003, -5.853894964167167, -6.51...","[0.2877532063996415, 0.27915582197438066, 0.28..."
7,LinearRegression,"[-5.93357995743354, -5.581520558444441, -4.108...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.840275322159753, -5.898250267305669, -10....","[-6.245935243676091, -5.995156869402008, -9.18...","[0.38906781926790107, 0.6047810638525762, 0.77..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.98...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.4...","[-5.6546666666666665, -5.666000000000001, -6.5...","[0.14230796026770795, 0.14081350945290821, 0.2..."
9,SVR,"[-5.577049341417479, -5.319152653371145, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.668521468550275, -5.922926449935176, -5.7...","[-5.680455935240749, -5.884504614739118, -5.72...","[0.1555225405025675, 0.24280557670405, 0.11238..."


In [133]:
result_df.to_csv('results/Descriptors/Results_2D_padel_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_MDCK.csv')

In [134]:
#2d padel descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 992)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 992)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 255
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7306
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 533
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5961,0.6385,0.7721,0.0785,0.3346,0.4345,0.3518,0.497,0.5932,0.4019,0.6416,0.4924
DecisionTreeRegressor,1.0721,0.7976,1.0354,-0.6572,0.0551,0.0802,0.4454,0.5858,0.6674,0.2428,0.6718,0.3769
RandomForestRegressor,0.6095,0.6448,0.7807,0.0579,0.3096,0.3422,0.309,0.4946,0.5559,0.4747,0.7088,0.3329
GradientBoostingRegressor,0.6542,0.6464,0.8088,-0.0112,0.3167,0.3537,0.353,0.5242,0.5942,0.3999,0.6934,0.3164
AdaBoostRegressor,0.6284,0.6416,0.7927,0.0286,0.3176,0.3416,0.2956,0.4683,0.5437,0.4975,0.731,0.3494
XGBRegressor,0.7197,0.6682,0.8483,-0.1125,0.2712,0.3061,0.4233,0.5436,0.6506,0.2804,0.6281,0.3219
ExtraTreesRegressor,0.6603,0.656,0.8126,-0.0208,0.3052,0.334,0.3325,0.4885,0.5766,0.4348,0.7144,0.3769
LinearRegression,1.3123,0.8872,1.1456,-1.0286,0.2001,0.1822,0.966,0.6423,0.9829,-0.6423,0.6406,0.6025
KNeighborsRegressor,0.5899,0.6026,0.7681,0.088,0.3845,0.3882,0.3567,0.4905,0.5972,0.3936,0.6362,0.449
SVR,0.6623,0.6444,0.8138,-0.0239,0.225,0.2369,0.3024,0.4747,0.5499,0.4859,0.7269,0.4209


In [135]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.403401796264953, -5.801362849910222, -6.40...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.578397389477615, -5.398929028023123, -5.3...","[-5.456002966533946, -5.589594080550133, -6.00...","[0.15829755552137664, 0.19194530482391176, 0.3..."
1,DecisionTreeRegressor,"[-5.65, -5.06, -5.06, -5.793469425, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.25, -6.34, -5.309384159, -6.929917...","[-5.7845234873999996, -6.388, -6.4179667323999...","[1.2245132566979549, 0.13629380029920649, 0.75..."
2,RandomForestRegressor,"[-5.483237143234995, -5.443904415684999, -5.88...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.4674368109359985, -5.754115323200001, -5....","[-5.47905631107283, -5.758963871622236, -6.020...","[0.19462603774082476, 0.1557406356407126, 0.18..."
3,GradientBoostingRegressor,"[-5.376889113842746, -5.434854725263953, -5.43...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.0901367715997825, -6.167809427820997, -5....","[-5.379353544744598, -5.970190494912644, -6.02...","[0.36237626956475594, 0.28568023244925306, 0.3..."
4,AdaBoostRegressor,"[-5.3182048668, -5.458891233125001, -5.6885478...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.0825, -5.980655693846154, -5.616389587884...","[-5.341388842486364, -5.878873885856188, -5.95...","[0.2999271165116116, 0.2646538016568159, 0.322..."
5,XGBRegressor,"[-5.268087, -5.429548, -5.2928333, -5.811682, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.187222, -6.2283454, -6.0013194, -6.294338...","[-5.1422596, -5.9502053, -6.1759214, -6.487588...","[0.40034124, 0.3403726, 0.2232477, 0.5249209, ..."
6,ExtraTreesRegressor,"[-5.539766940449997, -5.464338857130002, -6.22...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.153200000000003, -5.8744348275375025, -6....","[-5.296561267994502, -5.845203012711002, -6.40...","[0.23135270253408194, 0.2506521521291158, 0.22..."
7,LinearRegression,"[-5.933579957433534, -5.581520558444448, -4.10...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.840275322159753, -5.8982502673056665, -10...","[-6.245935243676091, -5.995156869402008, -9.18...","[0.3890678192678994, 0.604781063852574, 0.7704..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -5.98...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.4...","[-5.6546666666666665, -5.666000000000001, -6.5...","[0.14230796026770795, 0.14081350945290821, 0.2..."
9,SVR,"[-5.577050067873874, -5.319139821801227, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.668565275900687, -5.922988097419806, -5.7...","[-5.680507704487146, -5.884555251776673, -5.72...","[0.15553094941624324, 0.24282524766983227, 0.1..."


In [136]:
result_df.to_csv('results/Descriptors/Results_2D_padel_const_rem_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_const_rem_MDCK.csv')

In [137]:
#2d padel descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 642)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 642)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2115
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 158
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4595
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 337
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6071,0.632,0.7792,0.0615,0.3153,0.4181,0.3582,0.532,0.5985,0.3911,0.644,0.4429
DecisionTreeRegressor,1.0897,0.8485,1.0439,-0.6846,-0.0984,-0.0872,0.3552,0.5178,0.596,0.3962,0.7409,0.4814
RandomForestRegressor,0.634,0.6681,0.7963,0.0199,0.2717,0.3219,0.3049,0.4866,0.5521,0.4817,0.7223,0.3769
GradientBoostingRegressor,0.7305,0.7206,0.8547,-0.1292,0.185,0.2543,0.3031,0.4884,0.5506,0.4847,0.7537,0.3659
AdaBoostRegressor,0.6037,0.6306,0.777,0.0668,0.3474,0.418,0.2878,0.4587,0.5365,0.5107,0.756,0.3824
XGBRegressor,0.7427,0.6901,0.8618,-0.1481,0.2269,0.2701,0.4035,0.5524,0.6353,0.314,0.6761,0.3604
ExtraTreesRegressor,0.6306,0.6453,0.7941,0.0251,0.3239,0.327,0.3474,0.5066,0.5894,0.4095,0.7188,0.3989
LinearRegression,1.7155,0.9672,1.3098,-1.6519,0.0494,0.072,1.1521,0.7444,1.0734,-0.9586,0.5977,0.5585
KNeighborsRegressor,0.6255,0.63,0.7909,0.033,0.342,0.3497,0.3479,0.473,0.5898,0.4086,0.6504,0.4441
SVR,0.6519,0.6428,0.8074,-0.0078,0.2441,0.2473,0.2998,0.4621,0.5476,0.4903,0.7397,0.3824


In [138]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.67049629146648, -5.805221112870568, -5.974...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.539770782456108, -5.8016739941095885, -6....","[-5.480996963787577, -5.740174259279463, -6.33...","[0.17991654081101616, 0.1389107118572342, 0.27..."
1,DecisionTreeRegressor,"[-5.65, -5.06, -5.93, -6.929917083, -5.4534952...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.25, -6.25, -7.82, -6.927245587, -6.22, -6...","[-5.9878313964, -6.164, -6.187915698199999, -6...","[0.5946257546140773, 0.17199999999999988, 0.94..."
2,RandomForestRegressor,"[-5.482292255343995, -5.494367730160002, -6.07...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.537773878516168, -5.722922395426667, -6.2...","[-5.536498807018068, -5.846024492133536, -6.28...","[0.16374334310249322, 0.18203203370318802, 0.1..."
3,GradientBoostingRegressor,"[-5.451700027605727, -5.288713339684267, -5.83...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.429106075033576, -5.596073813594208, -6.7...","[-5.623181096296291, -5.979260365304353, -6.47...","[0.16020219964190013, 0.36289724128792544, 0.2..."
4,AdaBoostRegressor,"[-5.6239874607600004, -5.481968892944445, -5.7...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.49, -6.070804529117647, -6.26, -6.8262354...","[-5.516, -5.913437180820452, -6.35789329294761...","[0.2979664410634193, 0.2768390658724569, 0.185..."
5,XGBRegressor,"[-5.717794, -5.180936, -5.9201336, -6.508095, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.402845, -5.97451, -7.2954636, -6.9263606,...","[-5.416144, -6.0192065, -6.530117, -6.6797028,...","[0.19568738, 0.26684746, 0.4905301, 0.4867136,..."
6,ExtraTreesRegressor,"[-5.404169138677499, -5.483422383457501, -6.17...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.335944082569998, -6.182733850860001, -6.4...","[-5.3114532831215, -6.026817914924, -6.7569244...","[0.2456982340651495, 0.2751763576727589, 0.370..."
7,LinearRegression,"[-5.313971086140253, -5.182319228102754, -4.38...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-7.1480375976584245, -5.764346225545939, -10...","[-6.024871027257919, -5.935492072391254, -9.44...","[0.5955075777816224, 1.04428875768273, 0.78134..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.6...","[-5.713333333333333, -5.713333333333333, -6.58...","[0.1707890186425604, 0.1707890186425604, 0.275..."
9,SVR,"[-5.5629726314868755, -5.274607552446739, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.6811635852896005, -5.933635474735576, -5....","[-5.67638554944171, -5.8658612303191635, -5.74...","[0.14498050761268588, 0.2546827263646525, 0.11..."


In [139]:
result_df.to_csv('results/Descriptors/Results_2D_padel_LVR_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_const_LVR_MDCK.csv')

In [140]:
#2d All descriptors
df_train_padel = pd.read_csv('features/Descriptors/Train_2d_padel_curated_MDCK.csv')
df_train_rdkit = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_MDCK.csv')
df_train_mordred = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_MDCK.csv')

df_2d_train = df_train_rdkit.merge(df_train_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_train

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,14.92293,14.92293,0.059501,-1.162678,0.23895,27.097561,1139.494,...,6.469974,162.717774,1.984363,57.439206,25.502169,31.937037,33296.0,140.0,7.952,416.0
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,15.128047,15.128047,0.075261,-1.145488,0.23895,27.097561,1139.494,...,6.469974,162.720191,1.984393,57.435556,25.501133,31.934422,33296.0,140.0,7.952,416.0
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,14.908525,14.908525,0.051146,-1.177248,0.23895,27.097561,1139.494,...,6.469974,162.7174,1.984359,57.440346,25.502649,31.937697,33296.0,140.0,7.952,416.0
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,14.724823,14.724823,0.030635,-1.196597,0.240803,26.125,1115.472,...,6.406411,156.626458,1.957831,56.630904,25.293944,31.33696,31648.0,138.0,7.786,396.0
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,14.918925,14.918925,0.008728,-1.166833,0.250056,27.358974,1091.406,...,6.492137,155.242276,1.990286,60.716736,28.525011,32.191725,29794.0,130.0,5.187,394.0
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,14.912312,14.912312,0.031806,-1.168185,0.250056,27.358974,1091.406,...,6.492137,155.232056,1.990155,60.685463,28.512533,32.17293,29704.0,130.0,5.187,394.0
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,14.926201,14.926201,0.060156,-1.148185,0.252488,27.358974,1089.434,...,6.404116,155.232056,1.990155,57.684402,25.511472,32.17293,29704.0,130.0,7.084,394.0
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,14.950175,14.950175,0.063054,-1.159021,0.252488,27.358974,1089.434,...,6.404116,155.242276,1.990286,57.714858,25.523133,32.191725,29794.0,130.0,7.084,394.0
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,14.598899,14.598899,0.013996,-1.178122,0.218577,28.506849,1048.403,...,6.466915,143.466261,1.965291,59.644932,28.221894,28.405281,25439.0,124.0,7.222,358.0
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,14.543197,14.543197,0.095971,-1.130815,0.222694,27.567568,1047.438,...,6.085737,142.514887,1.925877,56.524332,25.255908,31.268424,25829.0,128.0,8.158,360.0


In [141]:
df_2d_train.to_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv', index=False)

In [142]:
df_test_padel = pd.read_csv('features/Descriptors/Test_2d_padel_curated_MDCK.csv')
df_test_rdkit = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_MDCK.csv')
df_test_mordred = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_MDCK.csv')

df_2d_test = df_test_rdkit.merge(df_test_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_test

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,15.211581,15.211581,0.031713,-1.206465,0.159924,25.27907,1199.634,...,6.243799,168.348068,1.957536,56.90962,25.395733,31.513887,37307.0,146.0,10.656,424.0
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,14.834083,14.834083,0.037197,-1.184826,0.240803,26.125,1115.472,...,6.406411,156.622606,1.957783,56.617512,25.292701,31.324812,31648.0,138.0,7.786,396.0
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,14.571211,14.571211,0.032067,-1.213008,0.153969,27.307692,1083.386,...,6.601542,155.443185,1.992861,56.706886,25.570597,31.136289,30176.0,124.0,9.1,392.0
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,14.50746,14.50746,0.015782,-1.174377,0.21837,27.694444,1034.376,...,6.500783,141.646739,1.967316,59.761039,28.23871,28.504056,24767.0,120.0,6.761,352.0
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,14.432367,14.432367,0.010248,-1.18198,0.206072,27.492958,1022.365,...,6.465978,138.417801,1.949546,59.630333,28.216709,28.395866,23619.0,120.0,6.424,344.0
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,14.337271,14.337271,0.011275,-1.179506,0.205195,26.642857,1008.338,...,6.500703,136.601049,1.951444,59.784728,28.23528,28.53169,22971.0,116.0,5.752,338.0
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,14.798396,14.798396,0.082911,-1.653049,0.157962,27.464789,1002.309,...,6.339573,139.131996,1.959606,56.808725,28.380436,28.428289,22927.0,118.0,7.404,354.0
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,14.228063,14.228063,0.008269,-1.181029,0.216153,26.15942,996.327,...,6.464993,133.378413,1.93302,59.341751,28.115454,28.210938,22236.0,115.0,5.397,330.0
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,14.235704,14.235704,0.008614,-1.181469,0.216153,26.15942,996.327,...,6.464993,133.381424,1.933064,59.345483,28.116662,28.212683,22278.0,115.0,5.397,330.0
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,14.765338,14.765338,0.082043,-1.652577,0.137104,27.528571,988.282,...,6.371851,137.31077,1.961582,56.609941,28.393162,28.216778,22212.0,114.0,7.691,348.0


In [143]:
df_2d_test.to_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv', index=False)

In [144]:
#2d All descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
# X_test = X_test.select_dtypes(include=['number'])
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 3097)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 3097)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8625
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 645
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16751
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1233
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6042,0.6524,0.7773,0.066,0.3254,0.404,0.3575,0.5034,0.5979,0.3922,0.6329,0.4759
DecisionTreeRegressor,1.1485,0.8288,1.0717,-0.7755,0.0245,0.0547,0.4134,0.5915,0.643,0.2972,0.7378,0.4649
RandomForestRegressor,0.6194,0.6554,0.787,0.0426,0.2862,0.3142,0.3045,0.4863,0.5518,0.4823,0.717,0.3329
GradientBoostingRegressor,0.6149,0.6308,0.7842,0.0494,0.3491,0.383,0.3347,0.5123,0.5785,0.431,0.7343,0.3604
AdaBoostRegressor,0.6617,0.6437,0.8134,-0.0228,0.2891,0.2931,0.2876,0.4536,0.5363,0.511,0.7465,0.3879
XGBRegressor,0.7467,0.6846,0.8641,-0.1542,0.2347,0.2279,0.3926,0.5407,0.6265,0.3326,0.6928,0.4429
ExtraTreesRegressor,0.6596,0.6478,0.8122,-0.0197,0.3064,0.3163,0.3348,0.4973,0.5786,0.4309,0.7175,0.3494
LinearRegression,1.2503,0.8521,1.1182,-0.9328,0.2074,0.216,0.9142,0.6284,0.9561,-0.5542,0.651,0.5695
KNeighborsRegressor,0.6515,0.6256,0.8071,-0.0071,0.3031,0.3179,0.3412,0.4607,0.5841,0.42,0.6493,0.4441
SVR,0.655,0.6382,0.8093,-0.0126,0.2352,0.2358,0.31,0.4795,0.5568,0.4729,0.718,0.3934


In [145]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.398834542352998, -5.795523563234602, -6.36...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.585463287833846, -5.4145381101328525, -5....","[-5.476619920922344, -5.56661498267826, -5.999...","[0.1665891663924703, 0.16808800470005247, 0.29..."
1,DecisionTreeRegressor,"[-5.65, -5.65, -5.06, -6.97, -6.927245587, -6....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.25, -5.93, -6.927245587, -6.927245...","[-6.0434491174, -6.13, -6.515983416599999, -6....","[1.1240680806425183, 0.23999999999999985, 0.84..."
2,RandomForestRegressor,"[-5.553200507706662, -5.4985353854399985, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.445707530563093, -5.802792691306667, -5.9...","[-5.560778472209314, -5.788078864083781, -6.03...","[0.17738181744197687, 0.14882569425607337, 0.1..."
3,GradientBoostingRegressor,"[-5.590257603796829, -5.380282892026746, -5.29...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.2901638753288855, -6.232781485105157, -6....","[-5.602745047754733, -5.999065098833316, -6.23...","[0.31044032421438894, 0.30307227743267245, 0.2..."
4,AdaBoostRegressor,"[-5.485615466499999, -5.350972968, -5.85047193...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.452081935684212, -5.850471938681818, -6.1...","[-5.382674045262297, -5.785748822383507, -6.19...","[0.1646535072236347, 0.2403622750728152, 0.181..."
5,XGBRegressor,"[-5.426199, -5.6558065, -5.412433, -6.6082435,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3269997, -6.0302267, -7.1883125, -6.92458...","[-5.50333, -5.7673287, -6.5794244, -6.609311, ...","[0.2507678, 0.3387965, 0.6849785, 0.5494461, 0..."
6,ExtraTreesRegressor,"[-5.711576382314997, -5.680839695740002, -6.04...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3472217196425005, -5.859398327020002, -6....","[-5.291143905661835, -5.831583293602, -6.51034...","[0.23056017867042272, 0.24108760597925202, 0.2..."
7,LinearRegression,"[-6.236416024962559, -5.52986614368669, -4.327...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-7.23594085809661, -6.508919449657144, -10.0...","[-6.258621755568813, -6.077857718614071, -9.05...","[0.8120853763196212, 0.9792849689826595, 1.419..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.713333333333333, -6.16...","[0.1707890186425604, 0.1707890186425604, 0.368..."
9,SVR,"[-5.574691339590781, -5.323536379155233, -5.63...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.597179296276665, -5.919215563644628, -5.8...","[-5.607106067035657, -5.867991475465223, -5.75...","[0.12090419436010286, 0.23802966296425523, 0.1..."


In [146]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_MDCK.csv')

In [147]:
#2d All descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 2274)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 2274)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8625
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 645
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16751
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1233
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6042,0.6524,0.7773,0.066,0.3254,0.404,0.3575,0.5034,0.5979,0.3922,0.6329,0.4759
DecisionTreeRegressor,0.904,0.7264,0.9508,-0.3974,0.1806,0.2039,0.5131,0.6335,0.7163,0.1277,0.6648,0.3494
RandomForestRegressor,0.6202,0.6514,0.7876,0.0412,0.288,0.3323,0.3019,0.4846,0.5495,0.4867,0.7218,0.3439
GradientBoostingRegressor,0.6447,0.6467,0.8029,0.0034,0.3349,0.3709,0.3524,0.522,0.5936,0.4009,0.7207,0.3164
AdaBoostRegressor,0.6007,0.6351,0.7751,0.0714,0.3471,0.3793,0.3182,0.4813,0.5641,0.4591,0.7173,0.3439
XGBRegressor,0.7467,0.6846,0.8641,-0.1542,0.2347,0.2279,0.3926,0.5407,0.6265,0.3326,0.6928,0.4429
ExtraTreesRegressor,0.6608,0.6546,0.8129,-0.0215,0.3091,0.3083,0.3504,0.5136,0.5919,0.4043,0.7085,0.3384
LinearRegression,1.2503,0.8521,1.1182,-0.9328,0.2074,0.216,0.9142,0.6284,0.9561,-0.5542,0.651,0.5695
KNeighborsRegressor,0.6515,0.6256,0.8071,-0.0071,0.3031,0.3179,0.3412,0.4607,0.5841,0.42,0.6493,0.4441
SVR,0.655,0.6383,0.8093,-0.0126,0.2352,0.2358,0.31,0.4795,0.5568,0.4729,0.7179,0.3934


In [148]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.398834542352998, -5.795523563234602, -6.36...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.585463287833846, -5.4145381101328525, -5....","[-5.476619920922344, -5.56661498267826, -5.999...","[0.1665891663924703, 0.16808800470005247, 0.29..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -5.06, -6.929917083, -6.5695784...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.25, -6.85, -6.927245587, -6.569578...","[-5.720000000000001, -6.13, -6.8499667324, -6....","[1.080499884312812, 0.23999999999999985, 0.898..."
2,RandomForestRegressor,"[-5.52422900831333, -5.497882969735, -5.831207...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.361069749624762, -5.751979597910001, -6.0...","[-5.5782871018265485, -5.767366189096625, -6.0...","[0.20441446472621344, 0.12260881001284571, 0.1..."
3,GradientBoostingRegressor,"[-5.524701153267153, -5.368936213247834, -5.30...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.279559945661648, -6.221010606906194, -6.2...","[-5.583699685034971, -5.972095366631095, -6.22...","[0.3009049799065836, 0.3533322487085882, 0.231..."
4,AdaBoostRegressor,"[-5.437988247636364, -5.517905422333333, -5.92...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.33, -5.936171729555557, -5.75609981414285...","[-5.410123695047283, -5.853026274263493, -6.00...","[0.22498218253691746, 0.2576885874418354, 0.23..."
5,XGBRegressor,"[-5.426199, -5.6558065, -5.412433, -6.6082435,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3269997, -6.0302267, -7.1883125, -6.92458...","[-5.50333, -5.7673287, -6.5794244, -6.609311, ...","[0.2507678, 0.3387965, 0.6849785, 0.5494461, 0..."
6,ExtraTreesRegressor,"[-5.593799999999999, -5.59201741677, -6.036024...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.437719138677502, -6.0010197360075, -6.682...","[-5.295184672046334, -5.884049819499002, -6.54...","[0.19220333505250942, 0.2600573195935361, 0.21..."
7,LinearRegression,"[-6.236416024962551, -5.529866143686691, -4.32...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-7.235940858096592, -6.508919449657154, -10....","[-6.258621755568816, -6.077857718614073, -9.05...","[0.8120853763196096, 0.9792849689826618, 1.419..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.713333333333333, -6.16...","[0.1707890186425604, 0.1707890186425604, 0.368..."
9,SVR,"[-5.574682147532249, -5.3235259430087085, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.597206175103655, -5.919260725273355, -5.8...","[-5.607146731713048, -5.868059981146523, -5.75...","[0.12090097647892548, 0.2380574289997938, 0.11..."


In [149]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_const_rem_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_const_rem_MDCK.csv')

In [150]:
#2d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 1534)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1534)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5567
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 419
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10865
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 802
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6202,0.6549,0.7875,0.0413,0.298,0.4191,0.3628,0.5291,0.6024,0.3832,0.6366,0.4374
DecisionTreeRegressor,1.1278,0.8467,1.062,-0.7435,0.0881,0.0771,0.4273,0.6015,0.6537,0.2736,0.7205,0.4099
RandomForestRegressor,0.6388,0.6706,0.7992,0.0126,0.2598,0.2851,0.3109,0.5044,0.5576,0.4715,0.7269,0.3769
GradientBoostingRegressor,0.7691,0.7112,0.877,-0.1889,0.1861,0.2655,0.3247,0.5108,0.5699,0.4479,0.7452,0.3824
AdaBoostRegressor,0.6392,0.6399,0.7995,0.0119,0.3055,0.3204,0.2864,0.4559,0.5351,0.5132,0.7482,0.3549
XGBRegressor,0.7669,0.6984,0.8757,-0.1855,0.2632,0.3223,0.5071,0.6044,0.7121,0.138,0.6302,0.4154
ExtraTreesRegressor,0.6573,0.6615,0.8108,-0.0161,0.2911,0.285,0.3514,0.5109,0.5928,0.4027,0.7148,0.3604
LinearRegression,1.5107,0.9304,1.2291,-1.3353,0.1402,0.1425,0.9913,0.6984,0.9956,-0.6853,0.6566,0.619
KNeighborsRegressor,0.6246,0.624,0.7903,0.0344,0.3493,0.3468,0.336,0.4529,0.5796,0.4288,0.6563,0.4601
SVR,0.6481,0.6385,0.805,-0.0019,0.2497,0.2574,0.308,0.4708,0.555,0.4764,0.7238,0.4209


In [151]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.472616748078145, -5.709233727000539, -5.93...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.517975188264973, -5.695571364617792, -6.3...","[-5.511361046759125, -5.7354806977985024, -6.3...","[0.17524771543909062, 0.18003445083722755, 0.1..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -4.94, -6.929917083, -5.4007221...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.25, -5.93, -6.927245587, -6.927245...","[-5.927915698199999, -6.394, -6.43388243060000...","[1.0828239861196156, 0.28799999999999987, 0.78..."
2,RandomForestRegressor,"[-5.518439023949997, -5.592659368202501, -5.95...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.545579830323331, -5.8574180560233335, -6....","[-5.591887287326967, -5.8672337871178675, -6.3...","[0.16839517998077558, 0.16324390444848222, 0.1..."
3,GradientBoostingRegressor,"[-5.4230059651136235, -5.419638455641846, -5.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.281923325393852, -6.2331740472817385, -6....","[-5.673037845530611, -6.14261411905416, -6.503...","[0.25168091248566377, 0.2516192704814364, 0.28..."
4,AdaBoostRegressor,"[-5.536528333235293, -5.397920423312501, -5.93...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.22, -5.711712335857143, -6.21200000000000...","[-5.434631060228022, -5.801481927526995, -6.35...","[0.24830455517216377, 0.16656163215923694, 0.1..."
5,XGBRegressor,"[-5.545838, -5.4548984, -5.105486, -6.7403717,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3349123, -6.447411, -7.447191, -6.9274735...","[-5.415261, -5.878441, -6.8081627, -6.549703, ...","[0.2844535, 0.3828769, 0.61360866, 0.52515084,..."
6,ExtraTreesRegressor,"[-5.547721626699999, -5.708027148260004, -6.29...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3742241829375015, -5.9507209872675, -6.50...","[-5.320330680593001, -5.953831516577667, -6.72...","[0.22860814751030448, 0.24184585377215528, 0.3..."
7,LinearRegression,"[-5.555143134125656, -5.257041750450064, -4.51...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-7.350648060853656, -6.6020549318248625, -10...","[-6.938244370143295, -6.038778975207282, -9.12...","[1.641474628054934, 1.3157624250802908, 1.6327..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.6546666666666665, -6.2...","[0.1707890186425604, 0.14230796026770795, 0.30..."
9,SVR,"[-5.558808402299759, -5.284232892418047, -5.63...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.631340628677342, -5.941054707923339, -5.8...","[-5.622393674289135, -5.862704498277151, -5.78...","[0.09667155426660792, 0.25314773372649585, 0.1..."


In [152]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_LVR_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_LVR_MDCK.csv')

In [153]:
#2d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 1534)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1534)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5567
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 419
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10865
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 802
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6202,0.6549,0.7875,0.0413,0.298,0.4191,0.3628,0.5291,0.6024,0.3832,0.6366,0.4374
DecisionTreeRegressor,1.1278,0.8467,1.062,-0.7435,0.0881,0.0771,0.4273,0.6015,0.6537,0.2736,0.7205,0.4099
RandomForestRegressor,0.6388,0.6706,0.7992,0.0126,0.2598,0.2851,0.3109,0.5044,0.5576,0.4715,0.7269,0.3769
GradientBoostingRegressor,0.7691,0.7112,0.877,-0.1889,0.1861,0.2655,0.3247,0.5108,0.5699,0.4479,0.7452,0.3824
AdaBoostRegressor,0.6392,0.6399,0.7995,0.0119,0.3055,0.3204,0.2864,0.4559,0.5351,0.5132,0.7482,0.3549
XGBRegressor,0.7669,0.6984,0.8757,-0.1855,0.2632,0.3223,0.5071,0.6044,0.7121,0.138,0.6302,0.4154
ExtraTreesRegressor,0.6573,0.6615,0.8108,-0.0161,0.2911,0.285,0.3514,0.5109,0.5928,0.4027,0.7148,0.3604
LinearRegression,1.5107,0.9304,1.2291,-1.3353,0.1402,0.1425,0.9913,0.6984,0.9956,-0.6853,0.6566,0.619
KNeighborsRegressor,0.6246,0.624,0.7903,0.0344,0.3493,0.3468,0.336,0.4529,0.5796,0.4288,0.6563,0.4601
SVR,0.6481,0.6385,0.805,-0.0019,0.2497,0.2574,0.308,0.4708,0.555,0.4764,0.7238,0.4209


In [154]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.472616748078145, -5.709233727000539, -5.93...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.517975188264973, -5.695571364617792, -6.3...","[-5.511361046759125, -5.7354806977985024, -6.3...","[0.17524771543909062, 0.18003445083722755, 0.1..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -4.94, -6.929917083, -5.4007221...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.25, -5.93, -6.927245587, -6.927245...","[-5.927915698199999, -6.394, -6.43388243060000...","[1.0828239861196156, 0.28799999999999987, 0.78..."
2,RandomForestRegressor,"[-5.518439023949997, -5.592659368202502, -5.95...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.54557983032333, -5.8574180560233335, -6.3...","[-5.591887287326967, -5.8672337871178675, -6.3...","[0.16839517998077583, 0.16324390444848222, 0.1..."
3,GradientBoostingRegressor,"[-5.4230059651136235, -5.419638455641846, -5.1...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.281923325393852, -6.2331740472817385, -6....","[-5.673037845530611, -6.14261411905416, -6.503...","[0.25168091248566377, 0.2516192704814364, 0.28..."
4,AdaBoostRegressor,"[-5.536528333235293, -5.397920423312501, -5.93...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.22, -5.711712335857143, -6.21200000000000...","[-5.434631060228022, -5.801481927526995, -6.35...","[0.24830455517216377, 0.16656163215923694, 0.1..."
5,XGBRegressor,"[-5.545838, -5.4548984, -5.105486, -6.7403717,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3349123, -6.447411, -7.447191, -6.9274735...","[-5.415261, -5.878441, -6.8081627, -6.549703, ...","[0.2844535, 0.3828769, 0.61360866, 0.52515084,..."
6,ExtraTreesRegressor,"[-5.547721626699999, -5.708027148260003, -6.29...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.374224182937501, -5.950720987267499, -6.5...","[-5.320330680593001, -5.953831516577667, -6.72...","[0.22860814751030467, 0.24184585377215578, 0.3..."
7,LinearRegression,"[-5.555143134125656, -5.257041750450064, -4.51...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-7.350648060853656, -6.6020549318248625, -10...","[-6.938244370143295, -6.038778975207282, -9.12...","[1.641474628054934, 1.3157624250802908, 1.6327..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333333, -6.4...","[-5.713333333333333, -5.6546666666666665, -6.2...","[0.1707890186425604, 0.14230796026770795, 0.30..."
9,SVR,"[-5.558808402299759, -5.284232892418047, -5.63...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.631340628677342, -5.941054707923339, -5.8...","[-5.622393674289135, -5.862704498277151, -5.78...","[0.09667155426660792, 0.25314773372649585, 0.1..."


In [155]:
def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [156]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [157]:
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
X_train = df_train[selected_features] 
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv')
df_test =df_test.dropna()
X_test =  df_test[X_train.columns]
y_test =  df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 114)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 114)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 39
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 808
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 58
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of te



-0.6330695006446432




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6226,0.656,0.7891,0.0375,0.2877,0.3383,0.3476,0.4919,0.5896,0.409,0.6578,0.5254
DecisionTreeRegressor,1.0671,0.7701,1.033,-0.6496,0.2083,0.3295,0.3752,0.5203,0.6125,0.3622,0.7445,0.564
RandomForestRegressor,0.6419,0.6625,0.8012,0.0078,0.2704,0.3006,0.3152,0.4952,0.5614,0.4641,0.7171,0.3879
GradientBoostingRegressor,0.7878,0.6944,0.8876,-0.2179,0.2043,0.2876,0.3594,0.5258,0.5995,0.389,0.7213,0.3549
AdaBoostRegressor,0.5798,0.6209,0.7614,0.1037,0.3944,0.4145,0.2934,0.4753,0.5416,0.5013,0.7583,0.4429
XGBRegressor,0.7579,0.6859,0.8706,-0.1716,0.2641,0.3381,0.3923,0.5323,0.6264,0.333,0.7022,0.4594
ExtraTreesRegressor,0.574,0.6175,0.7576,0.1127,0.3987,0.4086,0.3549,0.5168,0.5957,0.3966,0.7249,0.3604
LinearRegression,1.6082,0.9624,1.2681,-1.486,0.207,0.1695,1.2126,0.8841,1.1012,-1.0615,0.6696,0.542
KNeighborsRegressor,0.7606,0.6888,0.8721,-0.1758,0.1972,0.1884,0.3584,0.4947,0.5987,0.3907,0.6323,0.3884
SVR,0.686,0.6628,0.8282,-0.0604,0.1776,0.1663,0.3053,0.4944,0.5525,0.481,0.7112,0.4814


In [158]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.548005856854436, -6.210156543053598, -6.16...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.548005856854436, -5.548005856854436, -6.3...","[-5.47724816891189, -5.598236636673891, -6.325...","[0.15491957815470736, 0.17661213959791738, 0.1..."
1,DecisionTreeRegressor,"[-4.82, -5.06, -5.1, -6.929917083, -6.85, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.65, -6.25, -7.82, -6.927245587, -5.309384...","[-5.886, -6.133915698200001, -7.054, -6.734347...","[0.2972944668169927, 0.41103590750392355, 0.70..."
2,RandomForestRegressor,"[-5.474668559329998, -5.5425625373599985, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.475277280235, -5.896353860989166, -6.5693...","[-5.5991971558295335, -5.902812703299766, -6.4...","[0.13982359958920706, 0.1548257564262451, 0.18..."
3,GradientBoostingRegressor,"[-5.26325773426252, -5.198277552465221, -5.182...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.18274530842576, -6.204779084159647, -7.02...","[-5.57058261422696, -6.047013222165445, -6.564...","[0.23180838479343693, 0.24323163023899508, 0.2..."
4,AdaBoostRegressor,"[-5.3116666666666665, -5.386910050214285, -5.8...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.08, -5.813744955375, -6.5886227935, -6.63...","[-5.488918470088571, -5.844599551275, -6.48321...","[0.22019870008053288, 0.29192962898354, 0.1924..."
5,XGBRegressor,"[-5.3968806, -5.194106, -5.0974054, -6.10519, ...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.3973656, -6.2309556, -6.8224926, -6.92582...","[-5.695837, -6.0262384, -6.5893965, -6.633462,...","[0.28172338, 0.22305664, 0.19155224, 0.3943546..."
6,ExtraTreesRegressor,"[-5.518493841589998, -5.577915665767501, -6.21...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.456367045267502, -6.10451744097, -6.94380...","[-5.4288558467231685, -6.014619735399, -6.9067...","[0.17119738183910374, 0.20203919661707137, 0.3..."
7,LinearRegression,"[-7.290057937284836, -6.011174879434157, -4.32...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-10.0, -7.379348057600826, -9.52464787639450...","[-8.075043486507795, -6.18436158787148, -8.466...","[2.474338730537475, 1.2613237426136972, 1.3218..."
8,KNeighborsRegressor,"[-5.613333333333333, -5.786666666666666, -6.07...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.3...","[-5.6259999999999994, -5.6546666666666665, -6....","[0.16096100286853474, 0.14230796026770784, 0.3..."
9,SVR,"[-5.611257330776977, -5.304271046195673, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.709943898132217, -6.117595258793534, -5.8...","[-5.658092948811175, -5.938993710063983, -5.82...","[0.09822627486864736, 0.28780911384719526, 0.1..."


In [159]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_LVR_remove_corr_features_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_LVRremove_corr_features_MDCK.csv')

In [160]:
#3d RDKit descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_RDKit_desc_MDCK.csv')
df_train = df_train.fillna(0)
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_RDKit_desc_MDCK.csv')
df_test = df_test.fillna(0)
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 11)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 11)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 3
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 10
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testin



-3.808580013206493




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7768,0.7398,0.8814,-0.2008,-0.2926,-0.2533,0.5624,0.6009,0.75,0.0439,0.2111,0.1376
DecisionTreeRegressor,1.4229,0.9007,1.1929,-1.1996,0.0124,0.0834,0.4126,0.5665,0.6423,0.2986,0.5843,0.2834
RandomForestRegressor,0.8687,0.7345,0.9321,-0.3429,0.0423,0.1092,0.4107,0.5506,0.6408,0.3018,0.5707,0.2091
GradientBoostingRegressor,1.2556,0.8804,1.1206,-0.941,-0.0511,0.0227,0.5155,0.6224,0.718,0.1237,0.4177,0.1953
AdaBoostRegressor,1.0699,0.7775,1.0343,-0.6539,-0.0427,0.1254,0.4363,0.5618,0.6605,0.2583,0.5174,0.3356
XGBRegressor,1.0532,0.775,1.0263,-0.6282,0.1043,0.1739,0.562,0.6448,0.7497,0.0445,0.3773,0.2036
ExtraTreesRegressor,1.0092,0.7886,1.0046,-0.56,0.0395,0.0895,0.4789,0.5916,0.692,0.1858,0.4426,0.2503
LinearRegression,0.9755,0.7751,0.9877,-0.508,0.0837,0.118,0.6175,0.6102,0.7858,-0.0498,0.3717,0.1155
KNeighborsRegressor,0.8682,0.743,0.9318,-0.3422,0.0884,0.0551,0.5154,0.6615,0.7179,0.1239,0.4066,0.2696
SVR,0.762,0.7033,0.8729,-0.178,0.0294,-0.006,0.5201,0.5702,0.7212,0.1158,0.517,0.2916


In [161]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.863188972880598, -5.643007454782493, -5.65...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.847361139563147, -5.785170772975546, -5.8...","[-5.776704030015402, -5.749025230670178, -5.82...","[0.1449470612082072, 0.1674137694644182, 0.146..."
1,DecisionTreeRegressor,"[-5.63, -7.76, -6.17, -5.400722167, -6.34, -5....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.27, -4.94, -5.65, -6.927245587, -5.241517...","[-5.6379156982, -5.016, -5.564439713600001, -6...","[0.4835895882584466, 0.3417952603533291, 0.186..."
2,RandomForestRegressor,"[-6.328722619590001, -6.803705458280001, -6.24...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.48231550354, -5.5531224875800005, -5.5927...","[-5.483814276568001, -5.5639481918980005, -5.6...","[0.17004579637783956, 0.08211744149626579, 0.1..."
3,GradientBoostingRegressor,"[-6.515545552123927, -7.650387024580412, -5.95...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.018757061986256, -5.468940987618433, -5.3...","[-5.335943392997036, -5.309824821708785, -5.23...","[0.18983678445029248, 0.13462463825258017, 0.2..."
4,AdaBoostRegressor,"[-6.4110811367857155, -7.119859497, -6.1092160...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.327142857142858, -5.27, -5.385, -6.057878...","[-5.421559090006349, -5.368887752662337, -5.42...","[0.200178571288708, 0.11629174163522532, 0.061..."
5,XGBRegressor,"[-6.0587716, -7.5935125, -5.782604, -5.475473,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.0637484, -5.1283937, -5.0460887, -6.00240...","[-5.2878046, -5.1547637, -5.2806525, -5.858015...","[0.15266897, 0.08986489, 0.19159171, 0.4680740..."
6,ExtraTreesRegressor,"[-6.246423757920001, -7.458844406520001, -6.19...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.273305700989998, -5.528882982480004, -5.5...","[-5.39958773046, -5.424152884638003, -5.528689...","[0.27671276593242744, 0.11752844789243957, 0.1..."
7,LinearRegression,"[-6.556498279021809, -5.810144106796543, -7.15...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.727734082963899, -5.68348841770661, -5.34...","[-4.8844879813655195, -5.610047807652637, -5.2...","[0.14381755797988482, 0.07437368912426277, 0.1..."
8,KNeighborsRegressor,"[-6.506639027666666, -6.88, -5.946666666666666...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.94, -6.053333333333334, -5.61162072033333...","[-5.184741895666667, -5.885667206533334, -5.74...","[0.48048421357202264, 0.27008056168897543, 0.2..."
9,SVR,"[-6.068087540599281, -5.564761655767922, -5.94...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-4.900907311403258, -5.264364958185834, -5.5...","[-5.141869119868303, -5.2525698349584395, -5.5...","[0.2180039633979658, 0.15869014465183331, 0.10..."


In [162]:
result_df.to_csv('results/Descriptors/Results_3D_RDKit_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_RDKit_desc_MDCK.csv')

In [163]:
#3d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_padel_curated_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_padel_curated_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 431)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 431)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1513
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 101
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5006
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 330
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7876,0.7662,0.8875,-0.2175,-0.0235,0.0038,0.4822,0.5304,0.6944,0.1802,0.4249,0.4512
DecisionTreeRegressor,0.6503,0.6583,0.8064,-0.0052,0.3902,0.3713,0.6434,0.6643,0.8021,-0.0937,0.3992,0.2448
RandomForestRegressor,0.4672,0.5377,0.6835,0.2777,0.5413,0.5753,0.3142,0.4644,0.5606,0.4658,0.7078,0.564
GradientBoostingRegressor,0.4946,0.5495,0.7033,0.2355,0.5029,0.5501,0.354,0.494,0.595,0.3982,0.6402,0.4649
AdaBoostRegressor,0.5108,0.5401,0.7147,0.2104,0.4737,0.5261,0.3492,0.4692,0.5909,0.4063,0.6402,0.5695
XGBRegressor,0.525,0.5823,0.7246,0.1884,0.461,0.4454,0.2593,0.4195,0.5092,0.5592,0.7639,0.597
ExtraTreesRegressor,0.4915,0.5646,0.7011,0.2401,0.4929,0.5137,0.3393,0.4629,0.5825,0.4232,0.6604,0.542
LinearRegression,0.9531,0.7966,0.9763,-0.4733,0.2097,0.133,1.0875,0.8369,1.0428,-0.8488,0.3468,0.3879
KNeighborsRegressor,0.5038,0.5542,0.7098,0.2212,0.4781,0.5236,0.4736,0.5421,0.6882,0.1949,0.4497,0.3576
SVR,0.5677,0.5932,0.7535,0.1224,0.3938,0.4004,0.4185,0.4899,0.6469,0.2886,0.5487,0.4154


In [164]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.837871221180832, -5.690961856159556, -5.89...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.733919084033116, -5.317926401298028, -6.0...","[-5.556334911315856, -5.569052437553351, -6.03...","[0.2865443556631922, 0.31667564973194223, 0.23..."
1,DecisionTreeRegressor,"[-6.85, -5.06, -5.06, -5.303671307, -5.3093841...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.35, -5.309384159, -6.85, -6.927245587, -5...","[-6.212000000000001, -5.8658768318000005, -7.0...","[0.852933760616849, 1.083107115533834, 0.68419..."
2,RandomForestRegressor,"[-5.962157405740004, -5.27284915855, -5.472032...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.918477825460001, -5.6472372448399994, -6....","[-5.851593753556001, -5.495281857982, -6.45764...","[0.11461876615617257, 0.08146468132159776, 0.2..."
3,GradientBoostingRegressor,"[-5.782370640549474, -5.167743135560243, -5.27...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.114134432615404, -5.414834497717565, -6.7...","[-6.022150271641392, -5.406778866567728, -6.57...","[0.21426536852519834, 0.09893074276342258, 0.2..."
4,AdaBoostRegressor,"[-5.9277957545, -5.20466782675, -5.10285714285...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.93, -5.396838867153846, -6.85526058257143...","[-5.7786284442380955, -5.344544115136324, -6.6...","[0.2244159001007405, 0.19362101911671056, 0.30..."
5,XGBRegressor,"[-5.8093762, -5.0663447, -5.059916, -6.033059,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.6907873, -5.6185784, -6.4229207, -6.24030...","[-5.885779, -5.4504633, -6.4145927, -6.2552977...","[0.36640412, 0.24192826, 0.34065518, 0.3414788..."
6,ExtraTreesRegressor,"[-5.891297472910002, -5.401496112600003, -5.62...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.796532035220002, -5.3343842321400015, -6....","[-5.738967335980001, -5.359579155846001, -6.47...","[0.21955466895030068, 0.053082903054313076, 0...."
7,LinearRegression,"[-6.432379302740648, -6.227333422319202, -6.06...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.110355040985546, -4.221564784213455, -7.4...","[-4.520505732874167, -4.489529245230671, -7.62...","[0.394551122922211, 0.32921359222547875, 0.311..."
8,KNeighborsRegressor,"[-5.583333333333333, -5.408287387000001, -5.53...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.206666666666666, -6.2...","[-5.560666666666666, -5.145999999999999, -6.40...","[0.14478182820291288, 0.059568821076503096, 0...."
9,SVR,"[-5.859151862543014, -5.596564111703541, -5.63...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.379117839060564, -5.244812879335735, -6.2...","[-5.43705106426585, -5.198781728036482, -6.148...","[0.09464240579825538, 0.08868458675167412, 0.1..."


In [165]:
result_df.to_csv('results/Descriptors/Results_3D_padel_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_padel_desc_MDCK.csv')

In [166]:
df_train_rdkit = pd.read_csv('features/Descriptors/Train_3d_RDKit_desc_MDCK.csv')
df_train_rdkit = df_train_rdkit.fillna(0)
df_train_padel = pd.read_csv('features/Descriptors/Train_3d_padel_curated_MDCK.csv')

df_3d_descriptors = df_train_rdkit.merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,19328.831547,34233.761153,44750.050869,0.431929,0.764999,6.568008,4e-05,...,0.456915,0.389421,0.437125,0.423452,0.392749,41.336971,526.265213,2498.864653,0.269505,1.253326
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,21639.60215,27307.094296,36849.470395,0.587243,0.741044,6.135683,3.4e-05,...,0.678098,0.236339,0.474576,0.468547,0.397864,49.473306,583.761805,2293.688495,0.517147,1.340987
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,17248.905161,39399.400463,51694.219247,0.333672,0.762163,6.894908,4.4e-05,...,0.539136,0.363565,0.573106,0.495829,0.357174,50.483368,723.394093,3227.656451,0.354051,1.426109
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,22581.208469,29452.518358,43118.710771,0.523699,0.683057,6.530789,3e-05,...,0.521568,0.397144,0.491228,0.418054,0.41377,50.13167,708.259697,2879.776105,0.378069,1.323052
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,16154.238685,38150.14025,49434.111276,0.326783,0.771737,6.893849,4.8e-05,...,0.608814,0.291627,0.525725,0.587748,0.381538,45.9272,563.59345,2321.907016,0.413221,1.495011
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,23639.124043,26977.427225,46733.28396,0.505831,0.577264,6.6782,2.4e-05,...,0.588852,0.300853,0.582896,0.574504,0.339708,49.847196,684.019529,3153.991837,0.383278,1.497107
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,23603.480714,31700.187785,47540.215575,0.496495,0.666808,6.87027,2.8e-05,...,0.481735,0.407498,0.473671,0.471289,0.327613,46.294702,631.823027,2835.55069,0.33385,1.272573
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,22427.323346,26243.804082,38022.131208,0.589849,0.690224,6.30779,3.1e-05,...,0.572955,0.326123,0.47087,0.386186,0.283351,48.802297,661.127915,2901.775303,0.359432,1.140407
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,16332.271101,28455.369883,38910.633626,0.419738,0.731301,6.317993,4.5e-05,...,0.601371,0.349128,0.518144,0.545022,0.364424,51.96254,693.944167,2204.104224,0.425748,1.427589
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,17940.055089,30254.745365,42764.314961,0.41951,0.707476,6.589371,3.9e-05,...,0.608499,0.312106,0.569435,0.564939,0.346947,49.411357,642.128957,2510.56182,0.412748,1.48132


In [167]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [168]:
df_3d_descriptors.to_csv('features/Descriptors/Train_3d_all_descriptors_MDCK.csv', index=False)

In [169]:
df_test_rdkit = pd.read_csv('features/Descriptors/Test_3d_RDKit_desc_MDCK.csv')
df_test_rdkit = df_test_rdkit.fillna(0)
df_test_padel = pd.read_csv('features/Descriptors/Test_3d_padel_curated_MDCK.csv')

df_3d_descriptors = df_test_rdkit.merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,22357.644558,37861.181847,52559.409996,0.425379,0.72035,6.856039,3.2e-05,...,0.538922,0.327869,0.524865,0.537605,0.374733,48.372354,683.620826,3396.096064,0.308384,1.437203
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,19794.004419,32295.38822,43015.692005,0.460158,0.750782,6.529163,3.8e-05,...,0.566363,0.291189,0.444583,0.539683,0.357508,41.316066,490.042107,2188.2047,0.349545,1.341774
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,18147.501441,36587.928085,50658.631132,0.358231,0.722245,6.974313,4e-05,...,0.585065,0.343748,0.509549,0.449964,0.374446,51.553116,710.235475,2723.391067,0.393219,1.33396
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,18985.055772,27600.224516,41557.897367,0.456834,0.664139,6.527399,3.5e-05,...,0.510944,0.431114,0.440987,0.473812,0.40642,50.076001,689.240384,2342.004744,0.413087,1.321219
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,18918.681724,24420.003395,37944.148485,0.498593,0.643578,6.304947,3.4e-05,...,0.507722,0.406031,0.519823,0.544516,0.426875,45.4719,589.208424,2306.37861,0.37063,1.491215
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,14567.155879,28123.521304,38667.611811,0.376728,0.727315,6.351596,5e-05,...,0.520358,0.404704,0.497301,0.484967,0.300594,49.677484,690.78496,2675.200579,0.387593,1.282862
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,11539.242278,31493.356816,36143.419934,0.319263,0.871344,6.284649,7.6e-05,...,0.687047,0.252612,0.485191,0.393955,0.404621,49.499572,564.175291,1883.827401,0.530571,1.283767
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,16026.114888,23552.590294,33895.660179,0.472807,0.694856,6.072283,4.3e-05,...,0.486963,0.412405,0.532151,0.496837,0.370279,43.965756,563.139411,2324.612455,0.349052,1.399267
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,15370.736859,22402.923302,28261.502638,0.543875,0.792701,5.756675,5.2e-05,...,0.566916,0.363249,0.588852,0.553764,0.271499,51.26691,711.978056,2701.028559,0.395248,1.414115
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,14373.054695,28219.289724,34294.814421,0.419103,0.822844,6.236939,5.7e-05,...,0.566683,0.3461,0.511976,0.468069,0.382243,44.726179,551.597017,2126.801755,0.369175,1.362288


In [170]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [171]:
df_3d_descriptors.to_csv('features/Descriptors/Test_3d_all_descriptors_MDCK.csv', index=False)

In [172]:
#3d All descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models_3dall = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_3dall, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 442)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 442)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 104
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 340
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7938,0.7692,0.8909,-0.2271,-0.0321,-0.0042,0.4823,0.5315,0.6944,0.1802,0.4247,0.4512
DecisionTreeRegressor,1.0377,0.7786,1.0187,-0.6041,0.1998,0.2571,0.5499,0.5483,0.7415,0.0652,0.4932,0.3824
RandomForestRegressor,0.465,0.5308,0.6819,0.2812,0.5458,0.5743,0.3092,0.4541,0.5561,0.4743,0.7124,0.5695
GradientBoostingRegressor,0.5197,0.5649,0.7209,0.1967,0.4713,0.5056,0.3566,0.499,0.5972,0.3937,0.6379,0.4374
AdaBoostRegressor,0.4328,0.5098,0.6579,0.331,0.5871,0.6294,0.3322,0.4646,0.5764,0.4352,0.6635,0.564
XGBRegressor,0.5388,0.5882,0.734,0.1671,0.4441,0.4449,0.2691,0.4322,0.5187,0.5425,0.7561,0.597
ExtraTreesRegressor,0.5175,0.5774,0.7194,0.2,0.4477,0.4319,0.3501,0.4705,0.5917,0.4048,0.6445,0.542
LinearRegression,0.8622,0.7315,0.9285,-0.3328,0.2747,0.2037,0.9529,0.8123,0.9762,-0.6199,0.3534,0.4044
KNeighborsRegressor,0.5288,0.5841,0.7272,0.1825,0.4437,0.4703,0.5038,0.572,0.7098,0.1435,0.4008,0.3466
SVR,0.5664,0.5945,0.7526,0.1245,0.3932,0.4216,0.4158,0.4825,0.6448,0.2932,0.5604,0.4429


In [173]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.837871221180832, -5.690961856159556, -5.89...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.733919084033116, -5.317926401298028, -6.0...","[-5.559022073543207, -5.574374334041286, -6.03...","[0.2891167107383283, 0.3159169028802354, 0.239..."
1,DecisionTreeRegressor,"[-6.569578491, -5.22, -5.22, -5.303671307, -5....",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.35, -5.35, -5.93, -6.927245587, -5.303671...","[-6.298, -5.397999999999999, -6.61399999999999...","[0.8304552968101295, 0.5004557922534216, 0.966..."
2,RandomForestRegressor,"[-5.886644928520001, -5.263258779780001, -5.36...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.958469749130002, -5.589211166320005, -6.5...","[-5.888140665762, -5.472325882242001, -6.44145...","[0.15035613854267982, 0.06166922607076245, 0.2..."
3,GradientBoostingRegressor,"[-5.930963039127117, -5.062173628323773, -5.16...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.1441924130801375, -5.537911862146065, -6....","[-5.976305596051986, -5.3634547539018, -6.5607...","[0.2964584079899877, 0.16082487882990454, 0.36..."
4,AdaBoostRegressor,"[-6.25, -5.137142857142856, -5.113333333333333...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.801368406076923, -5.253333333333333, -6.8...","[-5.78391633925348, -5.355111063227451, -6.461...","[0.12270308584541771, 0.1335543759903669, 0.34..."
5,XGBRegressor,"[-6.08846, -5.0673137, -5.0674872, -6.0345607,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.696285, -5.609829, -6.9147677, -6.2541738...","[-5.89615, -5.460052, -6.6644516, -6.19325, -5...","[0.26275367, 0.24402368, 0.5592756, 0.29224274..."
6,ExtraTreesRegressor,"[-6.028106844370004, -5.316536553679999, -5.59...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.79835102306, -5.365162132960003, -6.67743...","[-5.758603403784, -5.33152004009, -6.432154002...","[0.17869162197623314, 0.0583221513850547, 0.26..."
7,LinearRegression,"[-7.115310133291685, -6.028495464786285, -5.96...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.203364902647311, -4.3865801477960185, -6....","[-4.617870011492105, -4.435944686826875, -7.30...","[0.39443937274574814, 0.2726170324801912, 0.28..."
8,KNeighborsRegressor,"[-5.583333333333333, -5.408287387000001, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.206666666666666, -6.4...","[-5.560666666666666, -5.145999999999999, -6.44...","[0.14478182820291288, 0.059568821076503096, 0...."
9,SVR,"[-6.007534341177215, -5.546078596820685, -5.70...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.412896481687582, -5.29375446671756, -6.16...","[-5.449708881288661, -5.22091670614579, -6.107...","[0.09466492239606225, 0.08712203376711537, 0.1..."


In [174]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_MDCK.csv')

In [175]:
#3d All descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 442)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 442)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 104
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 340
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7938,0.7692,0.8909,-0.2271,-0.0321,-0.0042,0.4823,0.5315,0.6944,0.1802,0.4247,0.4512
DecisionTreeRegressor,1.0377,0.7786,1.0187,-0.6041,0.1998,0.2571,0.5499,0.5483,0.7415,0.0652,0.4932,0.3824
RandomForestRegressor,0.465,0.5308,0.6819,0.2812,0.5458,0.5743,0.3092,0.4541,0.5561,0.4743,0.7124,0.5695
GradientBoostingRegressor,0.5197,0.5649,0.7209,0.1967,0.4713,0.5056,0.3566,0.499,0.5972,0.3937,0.6379,0.4374
AdaBoostRegressor,0.4328,0.5098,0.6579,0.331,0.5871,0.6294,0.3322,0.4646,0.5764,0.4352,0.6635,0.564
XGBRegressor,0.5388,0.5882,0.734,0.1671,0.4441,0.4449,0.2691,0.4322,0.5187,0.5425,0.7561,0.597
ExtraTreesRegressor,0.5175,0.5774,0.7194,0.2,0.4477,0.4319,0.3501,0.4705,0.5917,0.4048,0.6445,0.542
LinearRegression,0.8622,0.7315,0.9285,-0.3328,0.2747,0.2037,0.9529,0.8123,0.9762,-0.6199,0.3534,0.4044
KNeighborsRegressor,0.5288,0.5841,0.7272,0.1825,0.4437,0.4703,0.5038,0.572,0.7098,0.1435,0.4008,0.3466
SVR,0.5664,0.5945,0.7526,0.1245,0.3932,0.4216,0.4158,0.4825,0.6448,0.2932,0.5604,0.4429


In [176]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_const_rem_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_const_rem_MDCK.csv')

In [177]:
#3d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_MDCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 370)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 370)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 87
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4199
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 277
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7618,0.7331,0.8728,-0.1777,0.0274,0.0781,0.4567,0.5233,0.6758,0.2236,0.4783,0.5089
DecisionTreeRegressor,0.9107,0.7261,0.9543,-0.4078,0.2503,0.2879,0.5351,0.5815,0.7315,0.0902,0.4241,0.3549
RandomForestRegressor,0.5201,0.5535,0.7212,0.1959,0.4427,0.5087,0.3236,0.4519,0.5688,0.4499,0.6973,0.6135
GradientBoostingRegressor,0.5481,0.57,0.7403,0.1527,0.4254,0.4812,0.4486,0.5183,0.6698,0.2374,0.5142,0.4209
AdaBoostRegressor,0.5602,0.563,0.7485,0.134,0.3917,0.433,0.3499,0.4555,0.5915,0.4051,0.638,0.5695
XGBRegressor,0.5146,0.5379,0.7174,0.2045,0.4868,0.5434,0.3611,0.512,0.601,0.386,0.6555,0.4429
ExtraTreesRegressor,0.5386,0.5921,0.7339,0.1674,0.4143,0.4467,0.3624,0.4777,0.602,0.3838,0.6264,0.542
LinearRegression,1.0833,0.8424,1.0408,-0.6746,0.1075,0.083,0.6974,0.6662,0.8351,-0.1856,0.5236,0.4842
KNeighborsRegressor,0.5425,0.6008,0.7365,0.1614,0.4197,0.4798,0.3886,0.4948,0.6234,0.3393,0.5979,0.5172
SVR,0.6104,0.6035,0.7813,0.0565,0.3219,0.3051,0.3735,0.4668,0.6111,0.3651,0.6408,0.5199


In [178]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.812772876329546, -5.677708137107258, -5.88...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.7087442118983045, -5.305590849064757, -6....","[-5.629838628331656, -5.544465162649217, -5.99...","[0.2432872968053441, 0.291300675157427, 0.1874..."
1,DecisionTreeRegressor,"[-6.34, -4.94, -5.06, -6.929917083, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.569578491, -5.65, -5.793469425, -6.569578...","[-6.2804391856, -5.3688608714, -6.4140587006, ...","[0.6406564117943924, 0.5115546360066238, 0.468..."
2,RandomForestRegressor,"[-5.953685100350001, -5.279664158810002, -5.37...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.053498501330004, -5.532893981240001, -6.5...","[-6.029058468714004, -5.4916381674179995, -6.3...","[0.09949515846933397, 0.03758754417144964, 0.2..."
3,GradientBoostingRegressor,"[-6.122727656868503, -5.0307587002566825, -5.2...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.307836033867727, -5.5420826846471325, -6....","[-6.202674990987974, -5.449856050004416, -6.38...","[0.3270602821766599, 0.19596624172390653, 0.27..."
4,AdaBoostRegressor,"[-6.153565379, -5.2410529627692295, -5.1408960...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.196596415166667, -5.329700690416665, -6.5...","[-5.896085469982539, -5.27768955111111, -6.378...","[0.29200737735487514, 0.04365256185844207, 0.2..."
5,XGBRegressor,"[-6.1222277, -5.1070056, -5.0919137, -5.780672...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.3238688, -5.67615, -7.0364385, -6.506613,...","[-6.0467625, -5.524417, -6.7239823, -6.4559145...","[0.27607512, 0.10554468, 0.46849626, 0.3791127..."
6,ExtraTreesRegressor,"[-5.958760617840004, -5.4144259057200035, -5.7...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.760247443460002, -5.423271837560004, -6.6...","[-5.786895090468001, -5.398370242066, -6.44544...","[0.17974459520193564, 0.07807718874141853, 0.2..."
7,LinearRegression,"[-6.6994995022378285, -5.783981668881271, -6.0...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.537847773985432, -4.244706561532601, -7.0...","[-4.889454706279375, -4.662136181411768, -7.43...","[0.55012986156203, 0.43246481831709493, 0.3196..."
8,KNeighborsRegressor,"[-5.583333333333333, -5.786666666666666, -5.44...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.206666666666666, -6.2...","[-5.6546666666666665, -5.167333333333334, -6.4...","[0.14230796026770795, 0.05836284967530473, 0.1..."
9,SVR,"[-5.8054395186444925, -5.462345637843317, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.438656631546392, -5.246895496558645, -6.1...","[-5.474902919457266, -5.197397646291375, -6.12...","[0.08590391269054908, 0.08966866510890902, 0.1..."


In [179]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_LVR_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_LVR_MDCK.csv')

In [180]:
#2d and 3d descriptors all
df_train_2d = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_MDCK.csv')
df_train_2d
df_train_3d = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_MDCK.csv')
df_train_3d

df_2d_3d_train = df_train_2d.merge(df_train_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_train.to_csv('features/Descriptors/Train_2d_3d_all_descriptors_MDCK.csv', index=False)
df_2d_3d_train

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1114,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.94,14.92293,14.92293,0.059501,-1.162678,0.23895,27.097561,1139.494,...,0.456915,0.389421,0.437125,0.423452,0.392749,41.336971,526.265213,2498.864653,0.269505,1.253326
1,1113,CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2...,-5.82,15.128047,15.128047,0.075261,-1.145488,0.23895,27.097561,1139.494,...,0.678098,0.236339,0.474576,0.468547,0.397864,49.473306,583.761805,2293.688495,0.517147,1.340987
2,1117,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.65,14.908525,14.908525,0.051146,-1.177248,0.23895,27.097561,1139.494,...,0.539136,0.363565,0.573106,0.495829,0.357174,50.483368,723.394093,3227.656451,0.354051,1.426109
3,1119,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.25,14.724823,14.724823,0.030635,-1.196597,0.240803,26.125,1115.472,...,0.521568,0.397144,0.491228,0.418054,0.41377,50.13167,708.259697,2879.776105,0.378069,1.323052
4,2428,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-5.35,14.918925,14.918925,0.008728,-1.166833,0.250056,27.358974,1091.406,...,0.608814,0.291627,0.525725,0.587748,0.381538,45.9272,563.59345,2321.907016,0.413221,1.495011
5,2446,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-6.85,14.912312,14.912312,0.031806,-1.168185,0.250056,27.358974,1091.406,...,0.588852,0.300853,0.582896,0.574504,0.339708,49.847196,684.019529,3153.991837,0.383278,1.497107
6,2445,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C(=O...,-5.27,14.926201,14.926201,0.060156,-1.148185,0.252488,27.358974,1089.434,...,0.481735,0.407498,0.473671,0.471289,0.327613,46.294702,631.823027,2835.55069,0.33385,1.272573
7,2427,CCN1CC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=...,-6.34,14.950175,14.950175,0.063054,-1.159021,0.252488,27.358974,1089.434,...,0.572955,0.326123,0.47087,0.386186,0.283351,48.802297,661.127915,2901.775303,0.359432,1.140407
8,8145,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-6.569578,14.598899,14.598899,0.013996,-1.178122,0.218577,28.506849,1048.403,...,0.601371,0.349128,0.518144,0.545022,0.364424,51.96254,693.944167,2204.104224,0.425748,1.427589
9,1107,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...,-4.61,14.543197,14.543197,0.095971,-1.130815,0.222694,27.567568,1047.438,...,0.608499,0.312106,0.569435,0.564939,0.346947,49.411357,642.128957,2510.56182,0.412748,1.48132


In [181]:
df_test_2d = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_MDCK.csv')
df_test_2d
df_test_3d = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_MDCK.csv')
df_test_3d

df_2d_3d_test = df_test_2d.merge(df_test_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_test.to_csv('features/Descriptors/Test_2d_3d_all_descriptors_MDCK.csv', index=False)
df_2d_3d_test

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1120,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-6.3,15.211581,15.211581,0.031713,-1.206465,0.159924,25.27907,1199.634,...,0.538922,0.327869,0.524865,0.537605,0.374733,48.372354,683.620826,3396.096064,0.308384,1.437203
1,1118,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](Cc2...,-5.35,14.834083,14.834083,0.037197,-1.184826,0.240803,26.125,1115.472,...,0.566363,0.291189,0.444583,0.539683,0.357508,41.316066,490.042107,2188.2047,0.349545,1.341774
2,1121,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-6.2,14.571211,14.571211,0.032067,-1.213008,0.153969,27.307692,1083.386,...,0.585065,0.343748,0.509549,0.449964,0.374446,51.553116,710.235475,2723.391067,0.393219,1.33396
3,8133,CCC[C@@H]1NC(=O)CN(CC)C(=O)[C@H](CC(C)C)NC(=O)...,-5.965681,14.50746,14.50746,0.015782,-1.174377,0.21837,27.694444,1034.376,...,0.510944,0.431114,0.440987,0.473812,0.40642,50.076001,689.240384,2342.004744,0.413087,1.321219
4,8143,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.676306,14.432367,14.432367,0.010248,-1.18198,0.206072,27.492958,1022.365,...,0.507722,0.406031,0.519823,0.544516,0.426875,45.4719,589.208424,2306.37861,0.37063,1.491215
5,8119,CCC[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)...,-5.51611,14.337271,14.337271,0.011275,-1.179506,0.205195,26.642857,1008.338,...,0.520358,0.404704,0.497301,0.484967,0.300594,49.677484,690.78496,2675.200579,0.387593,1.282862
6,6496,CC(=O)N1CCC[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[...,-5.06,14.798396,14.798396,0.082911,-1.653049,0.157962,27.464789,1002.309,...,0.687047,0.252612,0.485191,0.393955,0.404621,49.499572,564.175291,1883.827401,0.530571,1.283767
7,8168,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(C)[C@...,-5.9777,14.228063,14.228063,0.008269,-1.181029,0.216153,26.15942,996.327,...,0.486963,0.412405,0.532151,0.496837,0.370279,43.965756,563.139411,2324.612455,0.349052,1.399267
8,8345,CCOC(=O)[C@@H]1CSCC(=O)N[C@@H](CC)C(=O)N(CC(C)...,-6.342777,14.235704,14.235704,0.008614,-1.181469,0.216153,26.15942,996.327,...,0.566916,0.363249,0.588852,0.553764,0.271499,51.26691,711.978056,2701.028559,0.395248,1.414115
9,6423,CC(=O)N1CCC[C@@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)...,-7.21,14.765338,14.765338,0.082043,-1.652577,0.137104,27.528571,988.282,...,0.566683,0.3461,0.511976,0.468069,0.382243,44.726179,551.597017,2126.801755,0.369175,1.362288


In [182]:
#All 2d and 3d descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 3539)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 3539)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10183
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 749
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21909
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1573
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6551,0.6804,0.8094,-0.0127,0.2479,0.3164,0.3932,0.5127,0.627,0.3316,0.5787,0.3769
DecisionTreeRegressor,0.779,0.6813,0.8826,-0.2042,0.2808,0.2513,0.3609,0.4869,0.6008,0.3864,0.7259,0.5199
RandomForestRegressor,0.5214,0.5785,0.7221,0.194,0.4419,0.5111,0.2722,0.4447,0.5217,0.5373,0.757,0.575
GradientBoostingRegressor,0.4969,0.573,0.7049,0.2319,0.489,0.5061,0.3413,0.5061,0.5842,0.4198,0.6814,0.4594
AdaBoostRegressor,0.5126,0.5581,0.716,0.2076,0.4771,0.5435,0.269,0.4377,0.5186,0.5427,0.7431,0.6135
XGBRegressor,0.5723,0.6026,0.7565,0.1153,0.4061,0.4362,0.3584,0.507,0.5987,0.3906,0.6977,0.4924
ExtraTreesRegressor,0.5834,0.6217,0.7638,0.0982,0.379,0.4269,0.3152,0.4685,0.5614,0.4642,0.7085,0.4979
LinearRegression,0.7728,0.6965,0.8791,-0.1947,0.3376,0.2498,1.1598,0.7835,1.077,-0.9718,0.4587,0.4099
KNeighborsRegressor,0.6093,0.6176,0.7806,0.0581,0.3663,0.3544,0.3369,0.4582,0.5804,0.4272,0.656,0.5096
SVR,0.6213,0.6225,0.7882,0.0396,0.2998,0.272,0.342,0.4833,0.5848,0.4186,0.682,0.4154


In [183]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.385188685014141, -5.6070423083280865, -6.4...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.572068832530054, -5.398194853183809, -5.5...","[-5.542283309028968, -5.565932012645861, -5.90...","[0.18035447965365928, 0.2221526803914395, 0.19..."
1,DecisionTreeRegressor,"[-5.65, -5.33, -5.06, -5.793469425, -6.9299170...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.93, -6.25, -6.97, -6.927245587, -5.453495...","[-5.983915698200001, -6.0763305102, -6.6814491...","[0.41119974065844683, 0.2300002870641018, 0.81..."
2,RandomForestRegressor,"[-5.749899094070002, -5.431484174089998, -5.60...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.853891657530003, -5.5391950139400015, -6....","[-5.750280323782001, -5.485610486260001, -6.25...","[0.07942643670325907, 0.11938998875513067, 0.1..."
3,GradientBoostingRegressor,"[-5.721914710011272, -5.144315314957571, -5.27...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.250669991193625, -5.448801217398834, -6.4...","[-5.855233851259328, -5.527939220772208, -6.62...","[0.349039318273725, 0.22007150330142053, 0.348..."
4,AdaBoostRegressor,"[-5.650000000000001, -5.173333333333333, -5.30...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.65, -5.4648029049230775, -6.2526377125833...","[-5.516862187262499, -5.419313030922145, -6.44...","[0.19065124654119572, 0.15119961509681928, 0.2..."
5,XGBRegressor,"[-5.805778, -5.108463, -5.2194166, -6.3724723,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.1840267, -5.850236, -6.5278716, -6.856331...","[-5.809245, -5.5791035, -6.7244287, -6.4792223...","[0.31351328, 0.2342805, 0.42852277, 0.51039153..."
6,ExtraTreesRegressor,"[-5.6754582644000005, -5.51351238957, -5.74303...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.52496130623, -5.566031033970001, -6.79660...","[-5.474677365296001, -5.531012374278001, -6.57...","[0.1928856578959878, 0.1944379799505357, 0.123..."
7,LinearRegression,"[-6.306705139828235, -5.789430564074796, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.308278163647166, -5.138626863179043, -8.7...","[-5.214300693191923, -5.416699115715996, -8.85...","[0.296587525538262, 0.7078409193883037, 0.4117..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -5.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.6...","[-5.6546666666666665, -5.666000000000001, -6.4...","[0.14230796026770795, 0.14081350945290821, 0.1..."
9,SVR,"[-5.4976492771903125, -5.328093634502577, -5.6...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.466860485958558, -5.533279948765775, -5.8...","[-5.510218916926129, -5.566038306879546, -5.80...","[0.11877498263283152, 0.21783368413245263, 0.1..."


In [184]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_MDCK.csv')

In [185]:
#All 2d and 3d descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 2716)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 2716)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10183
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 749
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21909
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1573
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.6551,0.6804,0.8094,-0.0127,0.2479,0.3164,0.3932,0.5127,0.627,0.3316,0.5787,0.3769
DecisionTreeRegressor,0.9528,0.7685,0.9761,-0.473,0.2094,0.1735,0.5209,0.6305,0.7217,0.1145,0.5418,0.3989
RandomForestRegressor,0.521,0.5814,0.7218,0.1946,0.4438,0.5242,0.2656,0.4382,0.5154,0.5484,0.765,0.575
GradientBoostingRegressor,0.4639,0.552,0.6811,0.2828,0.5351,0.5562,0.3523,0.5124,0.5936,0.401,0.6729,0.4759
AdaBoostRegressor,0.5573,0.5859,0.7465,0.1385,0.4184,0.4662,0.284,0.4264,0.5329,0.5172,0.731,0.564
XGBRegressor,0.5723,0.6026,0.7565,0.1153,0.4061,0.4362,0.3584,0.507,0.5987,0.3906,0.6977,0.4924
ExtraTreesRegressor,0.619,0.6435,0.7868,0.0431,0.3234,0.3281,0.3053,0.4682,0.5526,0.4809,0.7124,0.4319
LinearRegression,0.7728,0.6965,0.8791,-0.1947,0.3376,0.2498,1.1598,0.7835,1.077,-0.9718,0.4587,0.4099
KNeighborsRegressor,0.6093,0.6176,0.7806,0.0581,0.3663,0.3544,0.3369,0.4582,0.5804,0.4272,0.656,0.5096
SVR,0.6213,0.6225,0.7882,0.0395,0.2998,0.272,0.342,0.4833,0.5848,0.4186,0.682,0.4154


In [186]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.385188685014141, -5.6070423083280865, -6.4...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.572068832530054, -5.398194853183809, -5.5...","[-5.542283309028968, -5.565932012645861, -5.90...","[0.18035447965365928, 0.2221526803914395, 0.19..."
1,DecisionTreeRegressor,"[-5.65, -5.06, -5.33, -6.929917083, -5.4007221...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.85, -6.25, -5.93, -6.927245587, -5.303671...","[-5.7659156982, -6.188, -6.651999999999999, -6...","[0.8071235786004356, 0.4569638935408353, 1.004..."
2,RandomForestRegressor,"[-5.7671244946, -5.47353290083, -5.56611199528...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.840858227130002, -5.47524905246, -6.25252...","[-5.757386442873999, -5.497085648210001, -6.25...","[0.10291923567494045, 0.09381271627035664, 0.0..."
3,GradientBoostingRegressor,"[-5.706851004899697, -5.11217875624988, -5.265...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.216939686838095, -5.435487836881435, -6.6...","[-5.896049542365603, -5.527895374787274, -6.69...","[0.3035200699225455, 0.2191428489343412, 0.330..."
4,AdaBoostRegressor,"[-5.5106911800625, -5.202832213687499, -5.3644...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.5106911800625, -5.507348526642856, -6.173...","[-5.556474800675, -5.449582230788852, -6.34286...","[0.19484934534479553, 0.12248377593261675, 0.2..."
5,XGBRegressor,"[-5.805778, -5.108463, -5.2194166, -6.3724723,...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-6.1840267, -5.850236, -6.5278716, -6.856331...","[-5.809245, -5.5791035, -6.7244287, -6.4792223...","[0.31351328, 0.2342805, 0.42852277, 0.51039153..."
6,ExtraTreesRegressor,"[-5.624187237, -5.489488923189999, -5.81584303...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.425353157690001, -5.370200240910002, -6.7...","[-5.438967877492, -5.5296410195500005, -6.5316...","[0.21678243841412795, 0.2204941854076667, 0.10..."
7,LinearRegression,"[-6.306705139828233, -5.789430564074801, -5.61...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.30827816364717, -5.138626863179046, -8.72...","[-5.2143006931919205, -5.4166991157159945, -8....","[0.2965875255382647, 0.7078409193883047, 0.411..."
8,KNeighborsRegressor,"[-5.6433333333333335, -5.786666666666666, -5.9...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.613333333333333, -5.613333333333334, -6.6...","[-5.6546666666666665, -5.666000000000001, -6.4...","[0.14230796026770795, 0.14081350945290821, 0.1..."
9,SVR,"[-5.49764705779118, -5.328094158535443, -5.641...",0 -6.300000 1 -5.350000 2 -6.200000 3...,"[[-5.466910041704217, -5.533331044797736, -5.8...","[-5.510241160085745, -5.566067659577447, -5.80...","[0.11877784788486262, 0.2178481789110824, 0.10..."


In [187]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_const_rem_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_const_rem_MDCK.csv')

In [188]:
#All 2d and 3d descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_MDCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_MDCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 1904)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 1904)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6872
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 506
[LightGBM] [Info] Start training from score -5.753098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15064
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1079
[LightGBM] [Info] Start training from score -5.673825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.7086,0.7043,0.8418,-0.0955,0.161,0.2433,0.4106,0.517,0.6408,0.302,0.5552,0.4429
DecisionTreeRegressor,1.0126,0.8244,1.0063,-0.5654,0.1473,0.1949,0.396,0.5095,0.6293,0.3269,0.655,0.4594
RandomForestRegressor,0.5504,0.6012,0.7419,0.1492,0.3907,0.4802,0.2802,0.4624,0.5293,0.5237,0.7537,0.6245
GradientBoostingRegressor,0.5385,0.5888,0.7339,0.1675,0.4348,0.5488,0.387,0.5208,0.6221,0.3421,0.6178,0.4704
AdaBoostRegressor,0.5484,0.5515,0.7405,0.1523,0.4332,0.5336,0.2681,0.4427,0.5178,0.5442,0.7463,0.6575
XGBRegressor,0.5697,0.5914,0.7548,0.1194,0.3995,0.4236,0.4767,0.5736,0.6904,0.1896,0.5752,0.3934
ExtraTreesRegressor,0.5428,0.6034,0.7367,0.161,0.42,0.4529,0.3044,0.4737,0.5517,0.4825,0.7095,0.586
LinearRegression,0.9834,0.7889,0.9916,-0.5201,0.188,0.1624,1.1075,0.7275,1.0524,-0.8828,0.5218,0.5695
KNeighborsRegressor,0.6187,0.6299,0.7866,0.0435,0.3527,0.3727,0.3545,0.4874,0.5954,0.3973,0.6462,0.4772
SVR,0.6326,0.6335,0.7953,0.0221,0.2873,0.264,0.3259,0.4675,0.5709,0.446,0.7177,0.564


In [189]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_LVR_MDCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_LVR_MDCK.csv')

In [54]:
#Stacked architecture model
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [55]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [56]:
from tqdm import tqdm
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Descriptors/Train_2d_3d_all_descriptors_MDCK.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Descriptors/Test_2d_3d_all_descriptors_MDCK.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Train/All_fingerprints_train_MDCK.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Test/All_fingerprints_test_MDCK.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_mdck.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_mdck.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Atomic/Train_all_atomic_desc_MDCK.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Atomic/Test_all_atomic_desc_MDCK.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
target_column = 'Permeability'
def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),   
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(51, 170)
(13, 170)
(51, 90)
(13, 90)
(51, 625)
(13, 625)
(51, 8)
(13, 8)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
      ID                                             SMILES  Permeability  \
47  1017  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...     -6.550000   
43  1018  CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...     -4.500000   
9   1107  CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N...     -4.610000   
41  1109  C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N(C)C(...     -6.120000   
11  1110  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[

In [57]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 5-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=5, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=5, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (5 fold CV)': mse_train,
        'Train MAE (5 fold CV)': mae_train,
        'Train RMSE (5 fold CV)': rmse_train,
        'Train R2 (5 fold CV)': r2_train,
        'Train PCC (5 fold CV)': pearson_train,
        'Train SCC (5 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A
Training models: 1it [00:00,  7.05it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 576
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 41
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1756
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 119
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1603
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 112
[LightGBM] [Info] Start training 


Training models: 2it [00:01,  1.01it/s][A
Training models: 3it [00:02,  1.07it/s][A
Training models: 4it [00:03,  1.31it/s][A
Training models: 5it [00:04,  1.08it/s][A
Training models: 6it [00:05,  1.00s/it][A
Training models: 10it [00:05,  1.70it/s][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:05<00:17,  5.89s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 6
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 9
[LightGBM] [Info] Start training from score


Training models: 1it [00:00,  2.90it/s][A




Training models: 2it [00:01,  1.04s/it][A
Training models: 3it [00:02,  1.45it/s][A
Training models: 4it [00:02,  1.87it/s][A
Training models: 5it [00:03,  1.74it/s][A
Training models: 6it [00:04,  1.30it/s][A
Training models: 10it [00:04,  2.18it/s][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:10<00:10,  5.12s/it]
Training models: 0it [00:00, ?it/s][A
Training models: 1it [00:00,  4.83it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2025
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 135
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7982
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 525
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7961
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 524
[LightGBM] [Info] Start trainin


Training models: 2it [00:01,  1.04it/s][A
Training models: 3it [00:04,  1.85s/it][A
Training models: 4it [00:05,  1.58s/it][A
Training models: 5it [00:07,  1.74s/it][A
Training models: 6it [00:09,  1.57s/it][A
Training models: 8it [00:09,  1.21it/s][A
Training models: 10it [00:10,  1.02s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [00:20<00:07,  7.46s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 0
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 0
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training from score -5.655243
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 1
[LightGBM] [Info] Start training


Training models: 2it [00:01,  1.13it/s][A
Training models: 3it [00:01,  1.69it/s][A
Training models: 5it [00:02,  2.46it/s][A

Training models: 10it [00:04,  2.47it/s][A
Processing dataframe pairs: 100%|██████████| 4/4 [00:24<00:00,  6.19s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (51, 40)
Dimensions of meta_features_test: (13, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 3
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 32
[LightGBM

Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.672759,0.67784,0.820219,-0.039984,0.173311,0.190018,0.4127,0.532822,0.642417,0.298399,0.588641,0.493113
DecisionTreeRegressor,1.007549,0.782618,1.003767,-0.557519,0.123583,0.246589,0.485365,0.620955,0.696681,0.174866,0.457657,0.277854
RandomForestRegressor,0.6439,0.650612,0.802434,0.004628,0.26917,0.358289,0.341112,0.528744,0.584048,0.4201,0.666757,0.3989
GradientBoostingRegressor,0.790305,0.730456,0.888991,-0.221692,0.171261,0.225797,0.40618,0.563063,0.637323,0.309481,0.558272,0.316369
AdaBoostRegressor,0.760666,0.690092,0.872162,-0.175875,0.209868,0.291342,0.359522,0.526723,0.599602,0.388802,0.63475,0.310867
XGBRegressor,0.755171,0.682201,0.869006,-0.16738,0.208131,0.290369,0.374868,0.511002,0.612265,0.362714,0.603567,0.3989
ExtraTreesRegressor,0.609198,0.627755,0.780511,0.058272,0.31577,0.398833,0.445883,0.577411,0.667745,0.241986,0.500408,0.261348
LinearRegression,8.711717,2.495084,2.951562,-12.466998,0.089163,0.043912,1.84042,1.114424,1.356621,-2.128767,-0.131687,-0.294361
KNeighborsRegressor,0.672736,0.671215,0.820205,-0.039949,0.185641,0.173522,0.468766,0.603475,0.684665,0.203084,0.456649,0.244842
SVR,0.703345,0.695584,0.838657,-0.087265,0.099546,0.066925,0.440483,0.525037,0.663689,0.251165,0.565083,0.277854


In [58]:
results_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Results_5_folds_stacked_archi_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Prediction_data_5_folds_stacked_archi_MDCK.csv')

In [59]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 10-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=10, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=10, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (10 fold CV)': mse_train,
        'Train MAE (10 fold CV)': mae_train,
        'Train RMSE (10 fold CV)': rmse_train,
        'Train R2 (10 fold CV)': r2_train,
        'Train PCC (10 fold CV)': pearson_train,
        'Train SCC (10 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2471
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 159
[LightGBM] [Info] Start training from score -5.697548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2614
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 160
[LightGBM] [Info] Start training from score -5.693573
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2543
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 159
[LightGBM] [Info] Start trainin


Training models: 1it [00:00,  2.92it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2541
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 159
[LightGBM] [Info] Start training from score -5.634214
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2553
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 159
[LightGBM] [Info] Start training from score -5.717778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2539
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 160
[LightGBM] [Info] Start trainin


Training models: 2it [00:03,  1.99s/it][A
Training models: 3it [00:05,  1.96s/it][A
Training models: 4it [00:06,  1.60s/it][A
Training models: 5it [00:08,  1.72s/it][A
Training models: 6it [00:10,  1.99s/it][A
Training models: 7it [00:11,  1.40s/it][A
Training models: 10it [00:11,  1.20s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:11<00:35, 11.97s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 15
[LightGBM] [Info] Start training from score -5.697548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 142
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 23
[LightGBM] [Info] Start training from score -5.693573
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 116
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 19
[LightGBM] [Info] Start training from


Training models: 1it [00:00,  2.91it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 119
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 19
[LightGBM] [Info] Start training from score -5.641280



Training models: 2it [00:03,  1.90s/it][A
Training models: 3it [00:03,  1.28s/it][A
Training models: 4it [00:04,  1.01s/it][A
Training models: 5it [00:05,  1.13s/it][A
Training models: 6it [00:08,  1.63s/it][A
Training models: 7it [00:08,  1.17s/it][A
Training models: 10it [00:09,  1.08it/s][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:21<00:20, 10.36s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10335
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 622
[LightGBM] [Info] Start training from score -5.697548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10574
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 622
[LightGBM] [Info] Start training from score -5.693573
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10574
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 622
[LightGBM] [Info] Start trai


Training models: 1it [00:00,  1.65it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10574
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 622
[LightGBM] [Info] Start training from score -5.717778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10574
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 622
[LightGBM] [Info] Start training from score -5.641280



Training models: 2it [00:03,  2.07s/it][A
Training models: 3it [00:10,  4.15s/it][A
Training models: 4it [00:12,  3.55s/it][A
Training models: 5it [01:04, 20.84s/it][A
Training models: 6it [01:13, 16.85s/it][A
Training models: 7it [01:14, 11.54s/it][A
Training models: 9it [01:16,  6.50s/it][A
Training models: 10it [01:16,  7.66s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [01:37<00:40, 40.60s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 1
[LightGBM] [Info] Start training from score -5.697548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 2
[LightGBM] [Info] Start training from score -5.693573
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 1
[LightGBM] [Info] Start training from score


Training models: 1it [02:23, 143.72s/it][A




Training models: 2it [02:31, 63.75s/it] [A
Training models: 3it [02:31, 34.80s/it][A
Training models: 4it [02:32, 21.13s/it][A
Training models: 5it [02:46, 18.60s/it][A

Training models: 10it [02:50, 17.05s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [04:28<00:00, 67.07s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (51, 40)
Dimensions of meta_features_test: (13, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 618
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 40
[LightGBM] [Info] Start training from score -5.697548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 642
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 40
[LightG



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 40
[LightGBM] [Info] Start training from score -5.641280
LGBMRegressor Evaluation completed: Test R2 score: 0.315969307144479
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.30938630240977083




RandomForestRegressor Evaluation completed: Test R2 score: 0.4968325536353675
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.30981015223818675
AdaBoostRegressor Evaluation completed: Test R2 score: 0.40945978620869417
XGBRegressor Evaluation completed: Test R2 score: 0.3211462673156895
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.37224168827583015
LinearRegression Evaluation completed: Test R2 score: -0.3331303192510191
KNeighborsRegressor Evaluation completed: Test R2 score: 0.26824945515748644
SVR Evaluation completed: Test R2 score: 0.24929234952007218
MLPRegressor Evaluation completed: Test R2 score: -0.48373968843332626


Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.551521,0.617126,0.742645,0.147432,0.393938,0.395538,0.402364,0.539872,0.634322,0.315969,0.580779,0.464925
DecisionTreeRegressor,1.019389,0.771138,1.009648,-0.575822,0.217659,0.40134,0.406236,0.547043,0.637367,0.309386,0.572756,0.453921
RandomForestRegressor,0.538691,0.581182,0.733956,0.167266,0.43813,0.481504,0.295976,0.481985,0.544037,0.496833,0.720522,0.431912
GradientBoostingRegressor,0.667545,0.631001,0.817034,-0.031924,0.344811,0.426661,0.405987,0.527839,0.637171,0.30981,0.62732,0.431912
AdaBoostRegressor,0.654182,0.630173,0.808815,-0.011267,0.325032,0.427893,0.347371,0.487988,0.589382,0.40946,0.723742,0.547456
XGBRegressor,0.648811,0.613275,0.805488,-0.002964,0.378363,0.425485,0.399319,0.521251,0.631917,0.321146,0.646088,0.492435
ExtraTreesRegressor,0.567393,0.603428,0.753255,0.122896,0.3962,0.47078,0.369263,0.491644,0.60767,0.372242,0.652011,0.42641
LinearRegression,2.92197,1.295671,1.709377,-3.516924,0.279559,0.279425,0.784181,0.68451,0.88554,-0.33313,0.586585,0.613481
KNeighborsRegressor,0.666412,0.63789,0.816341,-0.030173,0.202036,0.228738,0.430434,0.5719,0.656075,0.268249,0.553076,0.415406
SVR,0.64251,0.651104,0.801567,0.006777,0.251528,0.227924,0.441585,0.521238,0.664519,0.249292,0.617169,0.387896


In [60]:
results_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Results_10_folds_stacked_archi_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Prediction_data_10_folds_stacked_archi_MDCK.csv')

In [61]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 15-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=15, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=15, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (15 fold CV)': mse_train,
        'Train MAE (15 fold CV)': mae_train,
        'Train RMSE (15 fold CV)': rmse_train,
        'Train R2 (15 fold CV)': r2_train,
        'Train PCC (15 fold CV)': pearson_train,
        'Train SCC (15 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 159
[LightGBM] [Info] Start training from score -5.715950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2617
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 160
[LightGBM] [Info] Start training from score -5.679033
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2658
[LightGBM] [Info] Number of data points in the train set: 


Training models: 1it [00:00,  1.87it/s][A

[LightGBM] [Info] Total Bins 2641
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 160
[LightGBM] [Info] Start training from score -5.687277
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2660
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 160
[LightGBM] [Info] Start training from score -5.678070
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2644
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 160
[LightGBM] [Info] Start training from score -5.644560



Training models: 2it [00:13,  7.91s/it][A
Training models: 3it [00:16,  5.66s/it][A
Training models: 4it [00:18,  4.08s/it][A
Training models: 5it [06:39, 139.96s/it][A
Training models: 6it [06:54, 97.63s/it] [A
Training models: 7it [06:58, 66.99s/it][A
Training models: 9it [07:09, 37.39s/it][A
Training models: 10it [07:09, 43.00s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [07:09<21:29, 430.00s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 145
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 23
[LightGBM] [Info] Start training from score -5.715950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 138
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 23
[LightGBM] [Info] Start training from score -5.679033
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 138
[LightGBM] [Info] Number of data points in the train set: 47, n


Training models: 1it [00:00,  2.16it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 24
[LightGBM] [Info] Start training from score -5.644560



Training models: 2it [00:06,  3.49s/it][A
Training models: 3it [00:06,  2.27s/it][A
Training models: 4it [00:07,  1.74s/it][A
Training models: 5it [06:07, 130.94s/it][A
Training models: 6it [06:11, 87.76s/it] [A
Training models: 7it [06:11, 59.11s/it][A
Training models: 10it [06:12, 37.28s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [13:22<13:12, 396.38s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10785
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 622
[LightGBM] [Info] Start training from score -5.715950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10794
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 622
[LightGBM] [Info] Start training from score -5.679033
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 622
[LightGBM] [Info] Start trai


Training models: 1it [00:01,  1.03s/it][A




Training models: 2it [00:05,  3.14s/it][A
Training models: 3it [00:16,  6.45s/it][A
Training models: 4it [00:20,  5.52s/it][A
Training models: 5it [00:26,  5.91s/it][A
Training models: 6it [00:30,  5.20s/it][A
Training models: 7it [00:30,  3.55s/it][A
Training models: 8it [00:30,  2.45s/it][A
Training models: 9it [00:34,  2.74s/it][A
Training models: 10it [00:34,  3.45s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [13:57<03:51, 231.12s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 2
[LightGBM] [Info] Start training from score -5.715950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 2
[LightGBM] [Info] Start training from score -5.679033
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 2
[LightGBM] [Info] Start training from score


Training models: 1it [00:00,  2.75it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 2
[LightGBM] [Info] Start training from score -5.699143
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 1
[LightGBM] [Info] Start training from score -5.629351
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 2
[LightGBM] [Info] Start training from score


Training models: 2it [00:05,  2.94s/it][A
Training models: 3it [00:05,  1.83s/it][A
Training models: 4it [00:05,  1.19s/it][A
Training models: 5it [00:07,  1.43s/it][A

Training models: 10it [00:12,  1.28s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [14:10<00:00, 212.54s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (51, 40)
Dimensions of meta_features_test: (13, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 654
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 40
[LightGBM] [Info] Start training from score -5.715950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 661
[LightGBM] [Info] Number of data points in the train set: 47, number of used features: 40
[LightG



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 662
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from score -5.652255
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000464 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 662
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from score -5.699143
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from

Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.514498,0.586608,0.717285,0.204663,0.452797,0.445583,0.482803,0.589332,0.69484,0.179221,0.460191,0.253444
DecisionTreeRegressor,0.812082,0.687269,0.901156,-0.255356,0.29921,0.237343,0.351141,0.490043,0.592571,0.403051,0.753973,0.668501
RandomForestRegressor,0.575855,0.591858,0.758851,0.109815,0.381461,0.451277,0.271042,0.47872,0.520617,0.539221,0.757333,0.442917
GradientBoostingRegressor,0.594772,0.606723,0.771214,0.080573,0.398492,0.371502,0.339599,0.476943,0.582751,0.422673,0.743347,0.569464
AdaBoostRegressor,0.617556,0.613691,0.785847,0.045352,0.358927,0.344495,0.299102,0.499617,0.546902,0.491518,0.733347,0.376892
XGBRegressor,0.78139,0.663404,0.883963,-0.207911,0.287151,0.287292,0.304233,0.498019,0.551574,0.482795,0.713454,0.541954
ExtraTreesRegressor,0.487118,0.537305,0.697938,0.246989,0.503174,0.549062,0.358384,0.510474,0.598652,0.390736,0.682964,0.404402
LinearRegression,2.437114,1.23825,1.561126,-2.767409,0.302817,0.246561,0.591434,0.646079,0.769048,-0.005456,0.656641,0.508941
KNeighborsRegressor,0.464257,0.504093,0.681364,0.282329,0.537362,0.455033,0.402119,0.550019,0.634128,0.316386,0.637926,0.53095
SVR,0.596423,0.63954,0.772284,0.07802,0.325459,0.303763,0.43906,0.537055,0.662616,0.253585,0.633955,0.442917


In [62]:
results_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Results_15_folds_stacked_archi_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Prediction_data_15_folds_stacked_archi_MDCK.csv')

In [63]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 20-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=20, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=20, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (20 fold CV)': mse_train,
        'Train MAE (20 fold CV)': mae_train,
        'Train RMSE (20 fold CV)': rmse_train,
        'Train R2 (20 fold CV)': r2_train,
        'Train PCC (20 fold CV)': pearson_train,
        'Train SCC (20 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2656
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 160
[LightGBM] [Info] Start training from score -5.707480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2643
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 160
[LightGBM] [Info] Start training from score -5.659572
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2691
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 160
[LightGBM] [Info] Start trainin


Training models: 1it [00:00,  1.71it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2718
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 160
[LightGBM] [Info] Start training from score -5.677426
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2684
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 160
[LightGBM] [Info] Start training from score -5.672026
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2694
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 160
[LightGBM] [Info] Start trainin


Training models: 2it [00:07,  4.02s/it][A
Training models: 3it [00:11,  4.03s/it][A
Training models: 4it [00:13,  3.31s/it][A
Training models: 5it [00:17,  3.50s/it][A
Training models: 6it [00:22,  4.03s/it][A
Training models: 7it [00:22,  2.78s/it][A
Training models: 9it [00:23,  1.83s/it][A
Training models: 10it [00:24,  2.41s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:24<01:12, 24.11s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 166
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 27
[LightGBM] [Info] Start training from score -5.707480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 25
[LightGBM] [Info] Start training from score -5.659572
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 164
[LightGBM] [Info] Number of data points in the train set: 48, n


Training models: 1it [00:00,  1.75it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 158
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 26
[LightGBM] [Info] Start training from score -5.705283
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 28
[LightGBM] [Info] Start training from score -5.688109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 25
[LightGBM] [Info] Start training from


Training models: 2it [00:06,  3.99s/it][A
Training models: 3it [00:08,  2.67s/it][A
Training models: 4it [00:09,  2.09s/it][A
Training models: 5it [00:11,  2.24s/it][A
Training models: 6it [00:16,  3.19s/it][A
Training models: 7it [00:16,  2.19s/it][A
Training models: 10it [00:18,  1.82s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:42<00:41, 20.62s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10985
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 622
[LightGBM] [Info] Start training from score -5.707480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11017
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 622
[LightGBM] [Info] Start training from score -5.659572
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10985
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 622
[LightGBM] [Info] Start trai


Training models: 1it [00:00,  1.12it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11196
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 622
[LightGBM] [Info] Start training from score -5.627732



Training models: 2it [00:07,  4.05s/it][A
Training models: 3it [00:21,  8.64s/it][A
Training models: 4it [00:26,  7.44s/it][A
Training models: 5it [00:36,  8.20s/it][A
Training models: 6it [00:41,  7.15s/it][A
Training models: 7it [00:41,  4.87s/it][A
Training models: 8it [00:41,  3.37s/it][A
Training models: 9it [00:46,  3.69s/it][A
Training models: 10it [00:46,  4.66s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [01:28<00:32, 32.49s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 2
[LightGBM] [Info] Start training from score -5.707480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 2
[LightGBM] [Info] Start training from score -5.659572
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 2
[LightGBM] [Info] Start training from score


Training models: 1it [00:00,  2.08it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 1
[LightGBM] [Info] Start training from score -5.678344
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 2
[LightGBM] [Info] Start training from score -5.627732



Training models: 2it [00:06,  3.99s/it][A
Training models: 3it [00:07,  2.47s/it][A
Training models: 4it [00:07,  1.61s/it][A
Training models: 5it [00:09,  1.64s/it][A
Training models: 6it [00:14,  2.80s/it][A

Training models: 10it [00:16,  1.63s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [01:45<00:00, 26.31s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (51, 40)
Dimensions of meta_features_test: (13, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 663
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from score -5.707480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 664
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightG



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from score -5.651817
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 666
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from score -5.684306
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 663
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 40
[LightGBM] [Info] Start training from



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 689
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 40
[LightGBM] [Info] Start training from score -5.686916
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 691
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 40
[LightGBM] [Info] Start training from score -5.670997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 40
[LightGBM] [Info] Start training from



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 687
[LightGBM] [Info] Number of data points in the train set: 49, number of used features: 40
[LightGBM] [Info] Start training from score -5.627732
LGBMRegressor Evaluation completed: Test R2 score: 0.2769506741903335
DecisionTreeRegressor Evaluation completed: Test R2 score: -0.05905947796992872
RandomForestRegressor Evaluation completed: Test R2 score: 0.4470351741344034
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.24120233177403916
AdaBoostRegressor Evaluation completed: Test R2 score: 0.4686715620707932
XGBRegressor Evaluation completed: Test R2 score: 0.3219808117702513
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.3309441327917443
LinearRegression Evaluation completed: Test R2 score: 0.07313465814924291
KNeighborsRegressor Evaluation completed: Test R2 score: 

Unnamed: 0,Train MSE (20 fold CV),Train MAE (20 fold CV),Train RMSE (20 fold CV),Train R2 (20 fold CV),Train PCC (20 fold CV),Train SCC (20 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.635044,0.688536,0.796896,0.018318,0.237011,0.245481,0.425316,0.552818,0.652162,0.276951,0.533966,0.454545
DecisionTreeRegressor,0.913954,0.670114,0.956009,-0.412835,0.288062,0.447734,0.622966,0.658357,0.789282,-0.059059,0.280419,0.178817
RandomForestRegressor,0.536702,0.579232,0.732599,0.17034,0.444478,0.497025,0.325268,0.488703,0.570323,0.447035,0.672367,0.514443
GradientBoostingRegressor,0.630371,0.566901,0.793959,0.025542,0.399837,0.504627,0.446344,0.544581,0.66809,0.241202,0.517032,0.162311
AdaBoostRegressor,0.61508,0.608444,0.78427,0.049179,0.351791,0.363998,0.312541,0.485675,0.559053,0.468672,0.709441,0.453921
XGBRegressor,0.850271,0.720651,0.922101,-0.31439,0.256318,0.244168,0.398828,0.551599,0.631528,0.321981,0.575448,0.343879
ExtraTreesRegressor,0.520016,0.565823,0.721122,0.196133,0.463733,0.502048,0.393556,0.520277,0.62734,0.330944,0.600502,0.459423
LinearRegression,2.311653,1.196307,1.520412,-2.573467,0.412747,0.371756,0.545206,0.572366,0.73838,0.073135,0.67276,0.596974
KNeighborsRegressor,0.484336,0.51048,0.695942,0.25129,0.50964,0.512953,0.396439,0.530327,0.629634,0.326042,0.689736,0.453921
SVR,0.630563,0.649474,0.79408,0.025245,0.250612,0.232222,0.422743,0.517094,0.650187,0.281325,0.652071,0.376892


In [64]:
results_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Results_20_folds_stacked_archi_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/Stacked/Prediction_data_20_folds_stacked_archi_MDCK.csv')

In [65]:
#Saving the best model
#KR fingerprints (4680 fp)

import os
import joblib 


def train_and_test_predict(models, X_train, y_train, X_test, y_test, save_dir='models_MDCK'):
   
    os.makedirs(save_dir, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        fold_no = 1
        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            fold_model_path = os.path.join(save_dir, f"{model_name}_fold{fold_no}_mdck.joblib")
            joblib.dump(model, fold_model_path)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -4.0)
            test_predictions_folds.append(predictions_test_fold)

            fold_no += 1

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df

df_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Train/KlekotaRoth_train_MDCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Test/KlekotaRoth_test_MDCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Saving the scaler
joblib.dump(scaler, '/home/users/akshay/PCPpred/MDCK/models_MDCK/scaler_KR_fp_mdck.joblib')

models = [
    AdaBoostRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df


X_train shape:  (51, 4860)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 4860)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
AdaBoostRegressor,0.788,0.7035,0.8877,-0.2181,0.1957,0.2251,0.1869,0.3746,0.4323,0.6823,0.8344,0.6179


In [66]:
models_dir = '/home/users/akshay/PCPpred/MDCK/models_MDCK' 
scaler_path = '/home/users/akshay/PCPpred/MDCK/models_MDCK/scaler_KR_fp_mdck.joblib' 
model_base_name = 'AdaBoostRegressor'                   
n_folds = 5                                    


df_new_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Test/KlekotaRoth_test_MDCK.csv')

X_new_test_features = df_new_test.drop(columns=['ID', 'SMILES','Permeability'], errors='ignore')
y_test = df_new_test['Permeability']

scaler = joblib.load(scaler_path)
X_new_scaled = scaler.transform(X_new_test_features)
X_new_scaled = pd.DataFrame(X_new_scaled, columns=X_new_test_features.columns,index=X_new_test_features.index)

all_fold_preds = []

for fold in range(1, n_folds + 1):
    fold_model_path = os.path.join(models_dir, f"{model_base_name}_fold{fold}_mdck.joblib")
    fold_model = joblib.load(fold_model_path)
    preds = fold_model.predict(X_new_scaled)
    preds = np.clip(preds, -10, -4.0)  
    all_fold_preds.append(preds)


all_fold_preds = np.array(all_fold_preds)
mean_prediction = np.mean(all_fold_preds, axis=0)

mse_test = mean_squared_error(y_test, mean_prediction)
print(f"{mse_test:.4f}")
mae_test = mean_absolute_error(y_test, mean_prediction)
print(f"{mae_test:.4f}")
rmse_test = np.sqrt(mse_test)
print(f"{rmse_test:.4f}")
r2_test = r2_score(y_test, mean_prediction)
print(f"{r2_test:.4f}")
pearson_test, _ = pearsonr(y_test, mean_prediction)
print(f"{pearson_test:.4f}")
spearman_test, _ = spearmanr(y_test, mean_prediction)
print(f"{spearman_test:.4f}")

# df_new_test['Predicted_Permeability'] = mean_prediction
# df_new_test.to_csv('path_to_save_predictions.csv', index=False)

print("Prediction on new data complete.")


0.1869
0.3746
0.4323
0.6823
0.8344
0.6179
Prediction on new data complete.
