In [1]:
print('start')

start


In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -4.0)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [4]:
#Monomeric models
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)

    df.columns = [clean_name(col) for col in df.columns]
    return df

In [8]:
#Monomer composition
df_mc_train = pd.read_csv('features/Monomeric/Train_mon_comp_RRCK.csv')
df_mc_train = clean_feature_names(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_RRCK.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(140, 385)
(140,)
(36, 385)
(36,)
0.280044644383886
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 6
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 5
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [In



-0.1523863356331674




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.329,0.4135,0.5736,0.266,0.5733,0.6095,0.5204,0.4745,0.7214,0.28,0.58,0.6476
LGBMRegressor,0.4048,0.4958,0.6363,0.0969,0.324,0.3603,0.6288,0.6047,0.793,0.13,0.5633,0.594
XGBRegressor,0.3129,0.3986,0.5594,0.302,0.6115,0.6525,0.5367,0.4883,0.7326,0.2575,0.5682,0.6344
DecisionTreeRegressor,0.4315,0.4872,0.6569,0.0374,0.5098,0.5391,0.5973,0.5021,0.7729,0.1736,0.5013,0.5516
RandomForestRegressor,0.3136,0.4054,0.56,0.3005,0.5616,0.5891,0.5607,0.5015,0.7488,0.2243,0.543,0.603
GradientBoostingRegressor,0.3096,0.3922,0.5564,0.3094,0.57,0.6085,0.5183,0.476,0.7199,0.283,0.6211,0.7345
AdaBoostRegressor,0.3477,0.4777,0.5897,0.2244,0.4848,0.5234,0.5692,0.5464,0.7545,0.2125,0.6104,0.6589
SVR,0.3121,0.415,0.5586,0.3038,0.5562,0.5492,0.5708,0.5009,0.7555,0.2103,0.6531,0.7202
LinearRegression,0.514,0.5146,0.7169,-0.1465,0.374,0.3985,0.5939,0.5858,0.7706,0.1784,0.587,0.6572
KNeighborsRegressor,0.3399,0.4172,0.583,0.2419,0.5407,0.582,0.5766,0.4985,0.7593,0.2023,0.5856,0.6588


In [9]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-5.5688999999999975, -6.250800000000006, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0078, -6.779999999999983, -6.041100000000...","[-6.166559999999996, -6.588079999999986, -5.84...","[0.30527894522877996, 0.38383999999999396, 0.1..."
1,LGBMRegressor,"[-5.435194447807869, -5.921453487707277, -5.51...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.435194447807869, -5.435194447807869, -5.9...","[-5.509803594592938, -5.509803594592938, -5.89...","[0.03786140795581648, 0.03786140795581648, 0.0..."
2,XGBRegressor,"[-5.9810386, -5.500048, -5.1609173, -6.1795087...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.422107, -6.7394586, -6.1410713, -6.149780...","[-6.427549, -6.584222, -5.9452333, -6.2241383,...","[0.26102582, 0.3183277, 0.18391402, 0.10323646..."
3,DecisionTreeRegressor,"[-6.13, -6.62, -5.16, -6.62, -5.57, -4.4, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -6.78, -6.62, -6.46, -6.62, -5.16, -5...","[-6.354, -6.614, -5.806, -6.459999999999999, -...","[0.3539830504416844, 0.3320000000000001, 0.483..."
4,RandomForestRegressor,"[-5.924199999999997, -5.947850000000005, -5.21...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.164599999999999, -6.409699999999994, -5.8...","[-6.1583279999999965, -6.292819999999993, -5.8...","[0.1555075841751775, 0.25220180332423986, 0.13..."
5,GradientBoostingRegressor,"[-5.817499453013974, -5.838142326436363, -5.33...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.1689812224792, -6.233907101757087, -6.155...","[-6.183862133078874, -6.171055250076803, -5.91...","[0.18301260031667324, 0.16747660768436032, 0.1..."
6,AdaBoostRegressor,"[-5.877826086956522, -5.908867924528305, -5.64...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.055, -6.055, -5.908867924528305, -5.87185...","[-6.10849123817359, -6.1010860433683956, -5.92...","[0.12507011895366335, 0.1383020008478662, 0.09..."
7,SVR,"[-5.713091198547051, -6.179863522237482, -5.32...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.891126011806585, -6.20512562729422, -5.81...","[-5.916854618327262, -6.126788114997646, -5.85...","[0.05928054160495901, 0.19105987633333996, 0.0..."
8,LinearRegression,"[-5.23933658157549, -6.202358993197268, -5.667...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-7.261432253238778, -6.467908498202312, -7.3...","[-6.893803674706957, -6.263292777677549, -6.71...","[0.3294713874495542, 0.3656202288705681, 0.826..."
9,KNeighborsRegressor,"[-5.716666666666666, -6.19, -5.013333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.260000000000001, -6.260000000000001, -5.8...","[-6.173333333333334, -6.114000000000001, -5.91...","[0.12044362443345336, 0.2338698593472688, 0.07..."


In [10]:
result_df.to_csv('results/Monomeric/Monomer_comp_results_RRCK.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_prediction_data_RRCK.csv')

In [13]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [15]:
df_mc_train = pd.read_csv('features/Monomeric/Train_mon_comp_RRCK.csv')
df_mc_train = clean_feature_names(df_mc_train)
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_RRCK.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(140, 67)
(140,)
(36, 67)
(36,)
0.28493794847367626
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000012 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 6
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 5
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [In



-0.23079075675008176




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.3171,0.4029,0.5631,0.2926,0.5898,0.6217,0.5169,0.4706,0.7189,0.2849,0.5823,0.6483
LGBMRegressor,0.4048,0.4958,0.6363,0.0969,0.324,0.3603,0.6288,0.6047,0.793,0.13,0.5633,0.594
XGBRegressor,0.3129,0.3986,0.5594,0.302,0.6115,0.6525,0.5367,0.4883,0.7326,0.2575,0.5682,0.6344
DecisionTreeRegressor,0.408,0.4715,0.6388,0.0898,0.5321,0.57,0.6262,0.5108,0.7913,0.1337,0.4739,0.5542
RandomForestRegressor,0.3126,0.4063,0.5591,0.3026,0.5621,0.5936,0.5564,0.5006,0.7459,0.2303,0.549,0.6071
GradientBoostingRegressor,0.3105,0.3902,0.5572,0.3074,0.5699,0.6098,0.5193,0.475,0.7207,0.2815,0.62,0.7273
AdaBoostRegressor,0.3415,0.4718,0.5844,0.2383,0.5004,0.5083,0.5654,0.54,0.7519,0.2178,0.6111,0.628
SVR,0.3121,0.415,0.5587,0.3038,0.5562,0.5492,0.5709,0.5009,0.7555,0.2103,0.6531,0.7214
LinearRegression,0.514,0.5146,0.7169,-0.1465,0.374,0.3986,0.5939,0.5858,0.7706,0.1784,0.587,0.6572
KNeighborsRegressor,0.3363,0.4173,0.5799,0.2498,0.5469,0.5845,0.579,0.5005,0.7609,0.199,0.5841,0.6519


In [16]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-5.624399999999997, -6.334700000000006, -5.16...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9974, -6.779999999999983, -6.112600000000...","[-6.158539999999997, -6.587919999999985, -5.85...","[0.3089407036957032, 0.3841599999999943, 0.142..."
1,LGBMRegressor,"[-5.435194447807869, -5.921453487707277, -5.51...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.435194447807869, -5.435194447807869, -5.9...","[-5.509803594592938, -5.509803594592938, -5.89...","[0.03786140795581648, 0.03786140795581648, 0.0..."
2,XGBRegressor,"[-5.9810386, -5.500048, -5.1609173, -6.1795087...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.422107, -6.7394586, -6.1410713, -6.149780...","[-6.427549, -6.584222, -5.9452333, -6.2241383,...","[0.26102582, 0.3183277, 0.18391402, 0.10323646..."
3,DecisionTreeRegressor,"[-6.13, -6.62, -5.16, -5.58, -5.57, -4.4, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -6.78, -5.58, -6.46, -5.58, -5.16, -5...","[-6.354, -6.614, -5.68, -6.459999999999999, -5...","[0.3539830504416844, 0.3320000000000001, 0.394..."
4,RandomForestRegressor,"[-5.901899999999996, -5.951250000000007, -5.20...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.114299999999998, -6.403599999999995, -5.9...","[-6.140839999999996, -6.286539999999993, -5.85...","[0.18114299986474697, 0.25951306402568464, 0.1..."
5,GradientBoostingRegressor,"[-5.817499453013974, -5.855639926213294, -5.33...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.194981222479201, -6.233907101757087, -6.1...","[-6.189062133078874, -6.171055250076803, -5.91...","[0.18288523834479858, 0.16747660768436032, 0.1..."
6,AdaBoostRegressor,"[-5.953717948717944, -5.954999999999996, -5.79...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.010352112676055, -6.010352112676055, -5.9...","[-6.0679815336463205, -6.0679815336463205, -5....","[0.1803695346277304, 0.1803695346277304, 0.083..."
7,SVR,"[-5.713044593865235, -6.179886545929588, -5.32...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.891087559751947, -6.205122660812568, -5.8...","[-5.916845826101662, -6.126789610588135, -5.85...","[0.05928397270291572, 0.19106032390527164, 0.0..."
8,LinearRegression,"[-5.239336581575468, -6.202358993197267, -5.66...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-7.261432253238787, -6.467908498202323, -7.3...","[-6.893803674706964, -6.263292777677553, -6.71...","[0.32947138744955473, 0.3656202288705669, 0.82..."
9,KNeighborsRegressor,"[-5.716666666666666, -6.19, -5.013333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.260000000000001, -6.260000000000001, -5.8...","[-6.173333333333334, -6.114000000000001, -5.91...","[0.12044362443345336, 0.2338698593472688, 0.07..."


In [17]:
const_col

['Ala_indol_2_yl_',
 'dAla_indol_2_yl_',
 'Me_Ala_indol_2_yl_',
 'Ala_5_Tet_',
 '2Abz',
 'Aib',
 'Aoc_2_',
 '5_Ava',
 'Bal',
 'Me_Bal',
 'HOCOCH2_Bal',
 'Cys_EtO2H__NH2',
 'dCha',
 'D',
 'Asp_piperidide',
 'Asp_OMe_',
 'Asp_Ph_2_NH2__',
 'dAsp_pyrrol_1_yl_',
 'E',
 'Glu_NH2',
 'Glu_3R_Me_',
 'Glu_OMe_',
 'dGlu_OMe_',
 'Phe_4_F_',
 'dPhe_4_F_',
 'Phe_4_CF3_',
 'Phe_4_NO2_',
 'Phe_CHF2_',
 'dPhe_3_4_diF_',
 'Et_Phe',
 'H2NEt_Phe',
 'Me_Phe_3_Cl_',
 'Me_Phe_4_Cl_',
 'Me_Phe_a_b_dehydro_',
 'G',
 'Bn_Gly',
 'Bn_4_Cl__Gly',
 'Bu_Gly',
 'Et_Gly',
 'EtOEt_Gly',
 'HOCOCH2_Gly_ol',
 'MeOEt_Gly',
 'NH2Bu_Gly',
 'Pr_Gly',
 'PhEt_Gly',
 'cHexCH2_Gly',
 'isoamyl_Gly',
 'pentyl_Gly',
 '3_pyridylethyl_Gly',
 'd_N__O_Gly_allyl_',
 'GABA',
 'H',
 'Me_Hph',
 'bHph',
 'Hph_2_Cl_',
 'Hph_3_Cl_',
 'Hph_4_Cl_',
 'Hse_Et_',
 'dHyp',
 'Hyp_Et_',
 'dI',
 'meI',
 'Me_dI',
 '_N__O_xiIle',
 'd_N__O_aIle',
 'K',
 'dK',
 'Me_dK',
 'Lys_Ac_',
 'Lys_Cbz_',
 'Lys_iPr_',
 'Lys_Me_',
 'Lys_Me2_',
 'Lys_Tfa_',
 'aMeLeu',

In [18]:
result_df.to_csv('results/Monomeric/Monomer_comp_constRemoval_results_RRCK.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_constRemoval_prediction_data_RRCK.csv')

In [14]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [20]:
df_train = pd.read_csv('features/Monomeric/Train_mon_comp_RRCK.csv')
df_mc_train = clean_feature_names(df_train)
df_mc_train = df_mc_train.drop(['ID','SMILES','Permeability'],axis=1)
df_mc, const_col = remove_low_variance_columns(df_mc_train)
X_train = df_mc
y_train = df_train['Permeability']
print(X_train.shape)
print(y_train.shape)

df_mc_test = pd.read_csv('features/Monomeric/Test_mon_comp_RRCK.csv')
df_mc_test = clean_feature_names(df_mc_test)
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

(140, 10)
(140,)
(36, 10)
(36,)
0.09368244903816247
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 5
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 5
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 112, number of used 



-0.8037102097709128




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.3764,0.4626,0.6135,0.1604,0.5,0.5175,0.6551,0.5824,0.8094,0.0937,0.4381,0.5117
LGBMRegressor,0.4048,0.4958,0.6363,0.0969,0.324,0.3603,0.6288,0.6047,0.793,0.13,0.5633,0.594
XGBRegressor,0.4519,0.5061,0.6722,-0.008,0.4258,0.4475,0.6647,0.5804,0.8153,0.0804,0.4226,0.5123
DecisionTreeRegressor,0.4305,0.491,0.6561,0.0397,0.4621,0.474,0.6527,0.5669,0.8079,0.097,0.4318,0.5329
RandomForestRegressor,0.39,0.4805,0.6245,0.13,0.4428,0.4558,0.6268,0.5739,0.7917,0.1329,0.4608,0.5436
GradientBoostingRegressor,0.394,0.4811,0.6277,0.121,0.4321,0.4485,0.65,0.5813,0.8062,0.1008,0.4274,0.5646
AdaBoostRegressor,0.3669,0.4805,0.6057,0.1815,0.4278,0.4556,0.6216,0.5813,0.7884,0.14,0.5475,0.5918
SVR,0.3786,0.4721,0.6153,0.1554,0.4317,0.4591,0.6796,0.5812,0.8244,0.0598,0.4012,0.4766
LinearRegression,0.4287,0.5064,0.6548,0.0436,0.3137,0.3307,0.7635,0.6208,0.8738,-0.0563,0.2975,0.3447
KNeighborsRegressor,0.4235,0.5158,0.6508,0.0552,0.4067,0.4064,0.6903,0.6079,0.8308,0.0451,0.4102,0.4909


In [21]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.182500000000004, -5.54750000000001, -5.160...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.182500000000004, -6.182500000000004, -5.7...","[-6.2764333333333395, -6.2764333333333395, -5....","[0.06030382335400829, 0.06030382335400829, 0.1..."
1,LGBMRegressor,"[-5.435194447807869, -5.921453487707277, -5.51...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.435194447807869, -5.435194447807869, -5.9...","[-5.509803594592938, -5.509803594592938, -5.89...","[0.03786140795581648, 0.03786140795581648, 0.0..."
2,XGBRegressor,"[-6.1811376, -5.343238, -5.1590805, -6.388953,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.1811376, -6.1811376, -5.5787435, -6.06971...","[-6.2754636, -6.2754636, -5.6456575, -5.706023...","[0.060485173, 0.060485173, 0.3380656, 0.691104..."
3,DecisionTreeRegressor,"[-6.1825, -5.35, -5.16, -6.01, -5.461428571428...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.1825, -6.1825, -5.35, -6.46, -6.01, -5.16...","[-6.276433333333334, -6.276433333333334, -5.70...","[0.06030382335400576, 0.06030382335400576, 0.3..."
4,RandomForestRegressor,"[-6.143599523809526, -5.50570000000001, -5.194...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.143599523809526, -6.143599523809526, -5.7...","[-6.248704983405483, -6.248704983405483, -5.79...","[0.06472076353610155, 0.06472076353610155, 0.2..."
5,GradientBoostingRegressor,"[-6.082024192129701, -5.501094484382374, -5.20...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.082024192129701, -6.082024192129701, -5.7...","[-6.186388619039126, -6.186388619039126, -5.72...","[0.066904261945415, 0.066904261945415, 0.19542..."
6,AdaBoostRegressor,"[-6.01265625, -5.9071428571428575, -5.59070175...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.01265625, -6.01265625, -5.86142045454545,...","[-5.977907114779875, -5.977907114779875, -5.92...","[0.054015120548614265, 0.054015120548614265, 0..."
7,SVR,"[-5.769833116390693, -5.933091670841013, -5.30...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.769833116390693, -5.769833116390693, -5.6...","[-5.785732283117556, -5.785732283117556, -5.65...","[0.032079070297683455, 0.032079070297683455, 0..."
8,LinearRegression,"[-5.537915407041128, -5.70888468423566, -5.687...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.537915407041128, -5.537915407041128, -5.4...","[-5.589180640819773, -5.589180640819773, -5.46...","[0.043512307683369104, 0.043512307683369104, 0..."
9,KNeighborsRegressor,"[-6.286666666666666, -5.766666666666666, -5.93...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.286666666666666, -6.286666666666666, -5.7...","[-6.3533333333333335, -6.3533333333333335, -5....","[0.1167904105652515, 0.1167904105652515, 0.100..."


In [22]:
result_df.to_csv('results/Monomeric/Monomer_comp_LVR_results_RRCK.csv')
prediction_df.to_csv('results/Monomeric/Monomer_comp_LVR_prediction_data_RRCK.csv')

In [23]:
#AA composition
df_aac_train = pd.read_csv('features/Monomeric/Train_aac_RRCK.csv')
X_train = df_aac_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_aac_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_aac_test = pd.read_csv('features/Monomeric/Test_aac_RRCK.csv')
X_test = df_aac_test.drop(['ID','SMILES','Permeability'], axis=1)
y_test = df_aac_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
aac_comp,prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
aac_comp

(140, 21)
(140,)
(36, 21)
(36,)
0.21152972464058117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 60
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 8
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 7
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to rem



-1.8752243745538566




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.4865,0.535,0.6975,-0.0853,0.2871,0.3061,0.5699,0.5315,0.7549,0.2115,0.54,0.6021
LGBMRegressor,0.3765,0.4863,0.6136,0.1602,0.4093,0.4551,0.6052,0.5535,0.7779,0.1628,0.5577,0.6153
XGBRegressor,0.4549,0.5236,0.6745,-0.0147,0.3557,0.3768,0.656,0.5399,0.8099,0.0924,0.4372,0.548
DecisionTreeRegressor,0.4583,0.5243,0.677,-0.0223,0.399,0.3916,0.6988,0.5367,0.836,0.0332,0.4137,0.5036
RandomForestRegressor,0.423,0.5096,0.6503,0.0565,0.3578,0.3964,0.6695,0.5416,0.8182,0.0738,0.4413,0.5521
GradientBoostingRegressor,0.3899,0.4913,0.6245,0.1301,0.4263,0.45,0.6714,0.5433,0.8194,0.0712,0.4348,0.5538
AdaBoostRegressor,0.3715,0.49,0.6095,0.1712,0.4142,0.4522,0.6888,0.5843,0.8299,0.0471,0.4362,0.5114
SVR,0.4195,0.4884,0.6477,0.0642,0.3794,0.4044,0.7192,0.6009,0.8481,0.005,0.4441,0.4791
LinearRegression,0.3749,0.4868,0.6123,0.1636,0.4238,0.4152,0.782,0.6568,0.8843,-0.0819,0.3039,0.4169
KNeighborsRegressor,0.4902,0.5343,0.7002,-0.0936,0.2835,0.3369,0.685,0.6446,0.8277,0.0523,0.4798,0.4131


In [24]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-5.445449999999999, -6.079750000000003, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.1606, -6.365000000000006, -5.707621250000...","[-6.11379, -6.365, -5.7121053611111146, -6.367...","[0.06926147847108237, 0.26246904579397046, 0.0..."
1,LGBMRegressor,"[-5.959733429896936, -5.850785207180732, -5.51...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.084860559870292, -5.959733429896936, -5.8...","[-6.118872178235309, -6.051852551138583, -5.75...","[0.07920315170515813, 0.09407708752125336, 0.1..."
2,XGBRegressor,"[-5.9788427, -6.246716, -5.079993, -5.370795, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.129233, -6.364632, -5.430443, -6.4609346,...","[-6.179635, -6.3646026, -5.3644476, -6.5963073...","[0.14560117, 0.26127824, 0.19980119, 0.2718335..."
3,DecisionTreeRegressor,"[-6.365, -6.22, -5.08, -5.08, -6.05, -5.1, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.87, -6.365, -5.08, -6.46, -5.08, -5.08, -...","[-6.136, -6.365, -5.0725, -6.664, -5.1818, -5....","[0.2050707195091489, 0.2624690457939755, 0.100..."
4,RandomForestRegressor,"[-5.847411666666666, -6.115625000000008, -5.11...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.136050000000001, -6.264725000000001, -5.3...","[-6.1258365, -6.279342166666664, -5.4532602103...","[0.04986056077368814, 0.1532158502313333, 0.11..."
5,GradientBoostingRegressor,"[-5.819199211300774, -6.1570215442491465, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.167227238122383, -6.284029146743934, -5.4...","[-6.33441550854893, -6.281958413949489, -5.433...","[0.3349141307000285, 0.17081896150472703, 0.09..."
6,AdaBoostRegressor,"[-5.883111111111111, -5.977142857142859, -5.55...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.133846153846155, -6.133846153846155, -5.5...","[-6.148538461538462, -6.160126696832579, -5.53...","[0.13251245336140263, 0.12105412756167108, 0.2..."
7,SVR,"[-5.522594908354534, -5.860263707068752, -5.32...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.011452088036632, -6.050430877547241, -5.4...","[-5.994372703029188, -6.058943915570362, -5.50...","[0.06556461654402354, 0.030512062354142125, 0...."
8,LinearRegression,"[-5.552375842562193, -5.803340984923865, -5.65...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.926027265141383, -6.266721071876499, -5.4...","[-6.837752206181041, -6.269327159266599, -5.49...","[0.1492203173084247, 0.08490918876107027, 0.03..."
9,KNeighborsRegressor,"[-5.53, -6.246666666666667, -5.153333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.2, -6.2, -6.103333333333334, -5.02, -6.10...","[-6.208, -6.128666666666667, -5.96266666666666...","[0.02712931993250136, 0.17422335600538097, 0.3..."


In [25]:
aac_comp.to_csv('results/Monomeric/AAC_comp_results_RRCK.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_prediction_data_RRCK.csv')

In [27]:
#Constant column removal
df_mc_train = pd.read_csv('features/Monomeric/Train_aac_RRCK.csv')
df_mc_train, const_col = remove_constant_columns(df_mc_train)
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_aac_RRCK.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
aac_comp,prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
aac_comp

(140, 16)
(140,)
(36, 16)
(36,)
0.19096279552974194
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 60
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 8
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 7
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000024 seconds.
You can set `force_col_wise=true` to rem



-1.6096281368154113




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.481,0.5304,0.6935,-0.0729,0.2955,0.3176,0.5848,0.5387,0.7647,0.191,0.5256,0.5834
LGBMRegressor,0.3765,0.4863,0.6136,0.1602,0.4093,0.4551,0.6052,0.5535,0.7779,0.1628,0.5577,0.6153
XGBRegressor,0.4549,0.5236,0.6745,-0.0147,0.3557,0.3768,0.656,0.5399,0.8099,0.0924,0.4372,0.548
DecisionTreeRegressor,0.4491,0.5184,0.6702,-0.0019,0.4225,0.4169,0.6917,0.5335,0.8317,0.0431,0.4087,0.4853
RandomForestRegressor,0.4242,0.5101,0.6513,0.0538,0.3544,0.3981,0.6701,0.5408,0.8186,0.0729,0.4393,0.5497
GradientBoostingRegressor,0.3912,0.4926,0.6255,0.1272,0.4244,0.449,0.6773,0.5445,0.823,0.0631,0.4279,0.5395
AdaBoostRegressor,0.3623,0.4713,0.6019,0.1918,0.4391,0.4548,0.703,0.5829,0.8385,0.0274,0.4039,0.4899
SVR,0.4195,0.4883,0.6477,0.0643,0.3794,0.4044,0.7192,0.6009,0.8481,0.005,0.4441,0.4791
LinearRegression,0.3749,0.4868,0.6123,0.1636,0.4238,0.4152,0.782,0.6568,0.8843,-0.0819,0.3039,0.4169
KNeighborsRegressor,0.4905,0.5352,0.7003,-0.0941,0.2822,0.3368,0.685,0.6446,0.8277,0.0523,0.4798,0.4131


In [29]:
aac_comp.to_csv('results/Monomeric/AAC_comp_const_rem_results_RRCK.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_const_rem_prediction_data_RRCK.csv')

In [30]:
#LVR column removal
df_mc_train = pd.read_csv('features/Monomeric/Train_aac_RRCK.csv')
X_train = df_mc_train.drop(['ID','SMILES','Permeability'], axis=1)
X_train, const_col = remove_low_variance_columns(X_train)

y_train = df_mc_train['Permeability']
print(X_train.shape)
print(y_train.shape)
df_mc_test = pd.read_csv('features/Monomeric/Test_aac_RRCK.csv')
X_test = df_mc_test.drop(['ID','SMILES','Permeability'], axis=1)
X_test = X_test.drop(const_col, axis=1)
y_test = df_mc_test['Permeability']
print(X_test.shape)
print(y_test.shape)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_mc = [
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    xgb.XGBRegressor(random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    SVR(),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3), 
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_mc, X_train,y_train, X_test,  y_test)
result_df

(140, 7)
(140,)
(36, 7)
(36,)
0.19341181408286545
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 7
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 6
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remov



-1.5192921849176995




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
ExtraTreesRegressor,0.4342,0.5154,0.6589,0.0315,0.3809,0.4126,0.583,0.5592,0.7636,0.1934,0.4995,0.5794
LGBMRegressor,0.3755,0.4834,0.6128,0.1624,0.4112,0.4516,0.6874,0.5899,0.8291,0.049,0.4336,0.5221
XGBRegressor,0.4443,0.5228,0.6666,0.0089,0.4004,0.4212,0.7775,0.5957,0.8818,-0.0756,0.294,0.4028
DecisionTreeRegressor,0.4377,0.513,0.6616,0.0237,0.4266,0.4402,0.7876,0.588,0.8875,-0.0896,0.301,0.4203
RandomForestRegressor,0.4094,0.5041,0.6398,0.0868,0.4018,0.4282,0.7603,0.5917,0.872,-0.0518,0.3017,0.4071
GradientBoostingRegressor,0.3956,0.4992,0.6289,0.1176,0.4222,0.4487,0.6485,0.5418,0.8053,0.1028,0.4452,0.5529
AdaBoostRegressor,0.4106,0.5017,0.6408,0.0841,0.3258,0.3495,0.7166,0.5889,0.8465,0.0086,0.3853,0.47
SVR,0.4235,0.5085,0.6508,0.0553,0.3764,0.4069,0.6692,0.5688,0.818,0.0743,0.4777,0.4833
LinearRegression,0.4344,0.5305,0.6591,0.031,0.2643,0.3265,0.7726,0.6356,0.879,-0.0689,0.3271,0.397
KNeighborsRegressor,0.4834,0.5414,0.6953,-0.0784,0.3158,0.3341,0.7266,0.626,0.8524,-0.0052,0.4027,0.4447


In [31]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,ExtraTreesRegressor,"[-6.276100000000003, -6.003100000000007, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.08725, -6.365000000000006, -5.78661000000...","[-6.069866666666668, -6.365, -5.80375719047619...","[0.07233250613966397, 0.26246904579397046, 0.1..."
1,LGBMRegressor,"[-5.912808776851838, -5.819948965340556, -5.41...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.932633015093644, -5.912808776851838, -5.6...","[-6.024578784682684, -6.015399774427789, -5.57...","[0.08396016951161674, 0.09366470507500912, 0.1..."
2,XGBRegressor,"[-6.1519527, -6.2191067, -5.080466, -5.226611,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.167827, -6.3648834, -5.0976486, -6.462305...","[-6.1173487, -6.365303, -5.2530785, -6.6677756...","[0.19119553, 0.26052287, 0.18235432, 0.4141614..."
3,DecisionTreeRegressor,"[-6.365, -6.1, -5.08, -5.08, -5.91, -5.1, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -6.365, -5.08, -6.46, -5.08, -5.08, -...","[-6.042, -6.365, -5.039199999999999, -6.664, -...","[0.11070682002478427, 0.2624690457939755, 0.12..."
4,RandomForestRegressor,"[-6.245216666666667, -6.0409750000000075, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.084025000000001, -6.254675000000001, -5.3...","[-6.069848, -6.2641546666666645, -5.3402180569...","[0.07842518330938791, 0.17241789042065808, 0.0..."
5,GradientBoostingRegressor,"[-6.114012045283443, -6.0864352769053784, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.218462398795427, -6.2856600369179745, -5....","[-6.132020966277804, -6.299439858826505, -5.51...","[0.1203833140283602, 0.2038776572883081, 0.154..."
6,AdaBoostRegressor,"[-5.753684210526315, -5.909761904761905, -5.65...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.753684210526315, -5.753684210526315, -5.5...","[-5.956928508771929, -5.956928508771929, -5.51...","[0.12501810005612615, 0.12501810005612615, 0.1..."
7,SVR,"[-5.98992319355122, -5.972260664912005, -5.237...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8386451282676015, -5.853732722497016, -5....","[-5.843828489280463, -5.930674086585016, -5.79...","[0.09752068189665018, 0.06456168805123304, 0.1..."
8,LinearRegression,"[-5.418149858689573, -5.780950585809417, -5.62...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.4693359419534895, -5.4151910063339415, -5...","[-5.620607010366598, -5.5102855818128615, -5.5...","[0.12774697616740696, 0.08106265305873837, 0.0..."
9,KNeighborsRegressor,"[-6.06, -5.986666666666667, -5.153333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.2, -6.0, -5.830000000000001, -5.02, -5.43...","[-6.1033333333333335, -6.010666666666667, -5.7...","[0.16206994374857597, 0.07883597599517059, 0.2..."


In [32]:
result_df.to_csv('results/Monomeric/AAC_comp_LVR_results_RRCK.csv')
prediction_df.to_csv('results/Monomeric/AAC_comp_LVR_prediction_data_RRCK.csv')

In [33]:
#Atomic models
df_train = pd.read_csv('features/Atomic/Train_all_atomic_desc_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Atomic/Test_all_atomic_desc_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_degree = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_degree, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 23)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 23)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 146
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 9
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 148
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 9
[L



-1.63363847375732




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3421,0.4637,0.5849,0.2368,0.4873,0.4858,0.6364,0.5452,0.7978,0.1196,0.5592,0.6038
DecisionTreeRegressor,0.3953,0.4518,0.6287,0.1183,0.5309,0.584,0.3749,0.4312,0.6123,0.4813,0.7249,0.775
RandomForestRegressor,0.2706,0.4007,0.5202,0.3964,0.6348,0.6594,0.4167,0.4521,0.6455,0.4235,0.7715,0.8072
GradientBoostingRegressor,0.249,0.3752,0.499,0.4446,0.6705,0.689,0.3446,0.4103,0.5871,0.5232,0.7969,0.8081
AdaBoostRegressor,0.3068,0.4388,0.5539,0.3157,0.5716,0.6341,0.4961,0.5137,0.7043,0.3137,0.7565,0.8094
XGBRegressor,0.2726,0.4097,0.5221,0.392,0.6403,0.6462,0.379,0.4327,0.6156,0.4757,0.7633,0.7975
ExtraTreesRegressor,0.2224,0.3723,0.4716,0.5039,0.7155,0.7322,0.3485,0.4001,0.5904,0.5178,0.7723,0.8025
LinearRegression,0.5002,0.5224,0.7073,-0.1159,0.3531,0.4631,0.6137,0.5955,0.7834,0.1509,0.5205,0.5048
KNeighborsRegressor,0.3111,0.4221,0.5578,0.306,0.5945,0.6121,0.4769,0.489,0.6905,0.3403,0.698,0.7151
SVR,0.2766,0.4138,0.526,0.3829,0.6245,0.6147,0.5527,0.5113,0.7434,0.2354,0.615,0.6317


In [34]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.873665300227856, -5.8226616351027, -5.4099...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.04930585615838, -6.04930585615838, -5.438...","[-6.046574217109014, -6.046574217109014, -5.61...","[0.03122094867372097, 0.03122094867372097, 0.2..."
1,DecisionTreeRegressor,"[-6.13, -6.46, -5.08, -5.4, -5.32, -5.875, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -7.48, -6.46, -7.48, -5.08, -6...","[-6.516, -6.116, -6.5824, -6.412000000000001, ...","[0.40410889621486923, 0.332, 0.899452633549983..."
2,RandomForestRegressor,"[-5.995999999999998, -5.903415833333335, -5.12...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.103549999999998, -6.028899999999998, -6.2...","[-6.2518025, -6.127687499999997, -6.2816994761...","[0.1068714380693, 0.13607799096473877, 0.21659..."
3,GradientBoostingRegressor,"[-6.011366667190261, -5.865643415478956, -5.17...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.367825002993948, -6.048392355985687, -6.2...","[-6.493006530515169, -6.130452323495689, -6.30...","[0.1229687933832418, 0.19139470882583712, 0.19..."
4,AdaBoostRegressor,"[-6.083333333333333, -5.983939393939394, -5.34...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.123488372093023, -6.169142857142859, -6.4...","[-6.221943970714902, -6.250645238095238, -6.25...","[0.11786397921283219, 0.0939463174738281, 0.19..."
5,XGBRegressor,"[-6.1204615, -5.8016634, -5.0802007, -5.398436...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8901014, -5.952164, -6.7859526, -6.084969...","[-6.0394707, -6.102478, -6.426408, -5.978576, ...","[0.140527, 0.30000702, 0.440582, 0.2741827, 0...."
6,ExtraTreesRegressor,"[-6.097599999999996, -5.575700000000005, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.332800000000001, -5.949999999999999, -6.2...","[-6.40366, -6.005179999999999, -6.259808000000...","[0.0769905864375617, 0.11035999999999965, 0.08..."
7,LinearRegression,"[-6.014487641445765, -6.094860496431176, -6.00...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.821904275478004, -5.857693271027201, -6.4...","[-5.760712839409977, -5.875575458808629, -6.50...","[0.07513734048391776, 0.05603689378979565, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.733333333333333, -5.15...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.3...","[-6.118666666666667, -6.118666666666667, -6.10...","[0.11400974617208051, 0.11400974617208051, 0.3..."
9,SVR,"[-6.081208009466288, -5.724475285630279, -5.32...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.062151499957152, -5.987131520808791, -5.7...","[-6.07304236689686, -6.00975872929179, -5.7032...","[0.015138000671056244, 0.012220125351811708, 0..."


In [35]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_RRCK.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_RRCK.csv')

In [7]:
#Atomic + monomeric_composition based features
df1 = pd.read_csv('features/Monomeric/Train_mon_comp_RRCK.csv')
df2 = pd.read_csv('features/Atomic/Train_all_atomic_desc_RRCK.csv')
df_train = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_train

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,0.090909,0.090909,0.000000,0.0,0.0,0.0,0.0,...,1,74,12,0,0,0,0,106,0,1
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,0.090909,0.090909,0.000000,0.0,0.0,0.0,0.0,...,1,73,13,0,0,0,0,107,0,1
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,0.090909,0.090909,0.000000,0.0,0.0,0.0,0.0,...,1,73,12,0,0,0,0,106,0,1
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,0.090909,0.090909,0.000000,0.0,0.0,0.0,0.0,...,2,73,12,0,0,0,0,106,0,1
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,0.181818,0.090909,0.000000,0.0,0.0,0.0,0.0,...,1,72,12,0,0,0,0,106,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,1,36,6,0,6,0,0,60,1,1
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,0.000000,0.000000,0.333333,0.0,0.0,0.0,0.0,...,1,39,6,0,0,0,0,54,0,1
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,0.166667,0.166667,0.000000,0.0,0.0,0.0,0.0,...,1,34,6,0,6,0,0,60,1,1
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,0.000000,0.000000,0.333333,0.0,0.0,0.0,0.0,...,1,38,6,0,0,0,0,54,0,1


In [8]:
df1 = pd.read_csv('features/Monomeric/Test_mon_comp_RRCK.csv')
df2 = pd.read_csv('features/Atomic/Test_all_atomic_desc_RRCK.csv')
df_test = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_test

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,...,1,74,12,0,0,0,0,111,0,1
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,...,1,73,12,0,0,0,0,106,0,1
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,67,9,0,6,0,0,91,1,1
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,59,9,0,6,0,0,90,1,1
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,60,8,0,6,0,0,82,1,1
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,0.0,0.0,0.125,0.125,0.0,0.0,0.0,...,1,56,8,0,6,0,0,82,1,1
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,0.0,0.0,0.222222,0.111111,0.0,0.0,0.0,...,1,54,9,0,6,0,0,91,1,1
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,38,6,0,30,0,0,85,1,1
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,...,1,40,6,0,23,0,0,99,1,1
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,43,7,0,18,0,0,85,1,1


In [9]:
import re
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)

    df.columns = [clean_name(col) for col in df.columns]
    return df

In [10]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [11]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 408)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 408)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 183
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 15
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 178
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 14
[LightGBM] [Info] Start training from score -5.528750
[LightGB



0.07846311307106169




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3369,0.4381,0.5804,0.2485,0.5032,0.5278,0.6234,0.5624,0.7896,0.1375,0.5258,0.574
DecisionTreeRegressor,0.4584,0.5053,0.677,-0.0225,0.4617,0.4618,0.3415,0.414,0.5844,0.5276,0.7621,0.7666
RandomForestRegressor,0.2518,0.376,0.5018,0.4382,0.665,0.6791,0.4662,0.4635,0.6828,0.3551,0.6854,0.7495
GradientBoostingRegressor,0.2397,0.3567,0.4896,0.4652,0.6841,0.7,0.3667,0.4283,0.6055,0.4927,0.762,0.7971
AdaBoostRegressor,0.2947,0.4309,0.5429,0.3425,0.5868,0.6317,0.5245,0.516,0.7243,0.2743,0.6592,0.7512
XGBRegressor,0.2584,0.373,0.5083,0.4236,0.6699,0.6864,0.3824,0.4158,0.6184,0.471,0.7097,0.7729
ExtraTreesRegressor,0.247,0.3578,0.497,0.449,0.6805,0.6891,0.4328,0.4268,0.6579,0.4013,0.6854,0.7549
LinearRegression,2.5403,0.9458,1.5938,-4.6667,0.291,0.4448,1.0111,0.7189,1.0055,-0.3988,0.5185,0.5278
KNeighborsRegressor,0.3327,0.4041,0.5768,0.2578,0.5579,0.5941,0.5363,0.4968,0.7323,0.2581,0.6398,0.7133
SVR,0.2694,0.3887,0.5191,0.399,0.6349,0.6323,0.5337,0.4884,0.7306,0.2616,0.6879,0.7291


In [12]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_and_mono_comp_RRCK.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_and_mono_comp_RRCK.csv')

In [13]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 82)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 82)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 183
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 15
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 178
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 14
[LightGBM] [Info] Start training from score -5.528750
[LightGBM]



-0.1437962607050336




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3369,0.4381,0.5804,0.2485,0.5032,0.5278,0.6234,0.5624,0.7896,0.1375,0.5258,0.574
DecisionTreeRegressor,0.4836,0.5128,0.6954,-0.0788,0.4544,0.4538,0.3403,0.4065,0.5834,0.5292,0.7672,0.7889
RandomForestRegressor,0.2497,0.3751,0.4997,0.4429,0.6692,0.6837,0.4709,0.4614,0.6862,0.3486,0.6781,0.7564
GradientBoostingRegressor,0.2363,0.3523,0.4861,0.4728,0.6896,0.7067,0.3679,0.4271,0.6066,0.491,0.7626,0.7996
AdaBoostRegressor,0.2998,0.4363,0.5476,0.3311,0.5788,0.6115,0.5302,0.5282,0.7281,0.2665,0.675,0.7223
XGBRegressor,0.2584,0.373,0.5083,0.4236,0.6699,0.6864,0.3824,0.4158,0.6184,0.471,0.7097,0.7729
ExtraTreesRegressor,0.2564,0.3601,0.5063,0.4281,0.6669,0.6782,0.4348,0.4297,0.6594,0.3984,0.6848,0.7513
LinearRegression,2.5403,0.9458,1.5938,-4.6667,0.291,0.4448,1.0111,0.7189,1.0055,-0.3988,0.5185,0.5278
KNeighborsRegressor,0.3289,0.4048,0.5735,0.2663,0.5673,0.6015,0.5364,0.4958,0.7324,0.2579,0.6431,0.7064
SVR,0.2694,0.3887,0.5191,0.399,0.6349,0.6323,0.5337,0.4884,0.7306,0.2616,0.6879,0.7291


In [14]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.802444784213362, -5.7683475554749535, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9841070250323645, -5.9841070250323645, -5...","[-5.988971339351776, -5.988971339351776, -5.68...","[0.08530700411593888, 0.08530700411593888, 0.2..."
1,DecisionTreeRegressor,"[-6.13, -6.92, -5.16, -4.3, -6.96, -4.5, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -5.57, -6.46, -6.46, -5.16, -4...","[-6.101999999999999, -6.122, -6.618, -6.316, -...","[0.3202124294901746, 0.2752017441805193, 0.758..."
2,RandomForestRegressor,"[-6.074499999999998, -5.880700000000003, -5.19...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0875999999999975, -6.085099999999999, -5....","[-6.202589999999998, -6.1943699999999975, -6.0...","[0.10220774138977937, 0.14463749721285765, 0.0..."
3,GradientBoostingRegressor,"[-6.034910547722071, -5.763580225049631, -5.25...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.674966222650195, -6.279071012109364, -5.9...","[-6.647113969127818, -6.292290193009951, -6.22...","[0.21849797874464758, 0.20570930342056465, 0.2..."
4,AdaBoostRegressor,"[-6.205625, -6.008809523809527, -5.34617647058...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.2250000000000005, -6.2250000000000005, -6...","[-6.246907009247069, -6.242938291810631, -6.10...","[0.05323923436223835, 0.06633795634070008, 0.2..."
5,XGBRegressor,"[-6.1313257, -6.2896805, -5.159187, -5.0093913...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.5702524, -6.590028, -6.4245596, -6.563687...","[-6.389728, -6.504466, -6.257691, -6.4516745, ...","[0.21942674, 0.2808346, 0.18625596, 0.10250749..."
6,ExtraTreesRegressor,"[-6.045199999999999, -6.011050000000004, -5.16...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.015100000000001, -5.942, -6.0459000000000...","[-6.079960000000001, -6.070179999999998, -6.09...","[0.06778328997621759, 0.15078072025295275, 0.0..."
7,LinearRegression,"[-6.3037585200177535, -5.9892467985225935, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.609107085206263, -5.874403506311881, -7.3...","[-6.6529199513564, -6.7422066280623785, -7.790...","[1.9500868929488304, 1.6326147310561523, 1.237..."
8,KNeighborsRegressor,"[-5.716666666666666, -6.19, -5.013333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.260000000000001, -6.260000000000001, -6.1...","[-6.114000000000001, -6.114000000000001, -5.98...","[0.2338698593472688, 0.2338698593472688, 0.121..."
9,SVR,"[-5.771455216772495, -6.1926462687907, -5.3200...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.938547657012659, -6.103892850118804, -5.8...","[-5.956685313804375, -6.050411220384989, -5.89...","[0.05116242492971367, 0.12161697184591885, 0.0..."


In [15]:
result_df.to_csv('results/Atomic/Results_all_atomic_desc_and_mono_comp_const_rem_RRCK.csv')
prediction_df.to_csv('results/Atomic/Prediction_data_all_atomic_desc_and_mono_comp_const_rem_RRCK.csv')

In [16]:
const_col

['Ala_indol_2_yl_',
 'dAla_indol_2_yl_',
 'Me_Ala_indol_2_yl_',
 'Ala_5_Tet_',
 '2Abz',
 'Aib',
 'Aoc_2_',
 '5_Ava',
 'Bal',
 'Me_Bal',
 'HOCOCH2_Bal',
 'Cys_EtO2H__NH2',
 'dCha',
 'D',
 'Asp_piperidide',
 'Asp_OMe_',
 'Asp_Ph_2_NH2__',
 'dAsp_pyrrol_1_yl_',
 'E',
 'Glu_NH2',
 'Glu_3R_Me_',
 'Glu_OMe_',
 'dGlu_OMe_',
 'Phe_4_F_',
 'dPhe_4_F_',
 'Phe_4_CF3_',
 'Phe_4_NO2_',
 'Phe_CHF2_',
 'dPhe_3_4_diF_',
 'Et_Phe',
 'H2NEt_Phe',
 'Me_Phe_3_Cl_',
 'Me_Phe_4_Cl_',
 'Me_Phe_a_b_dehydro_',
 'G',
 'Bn_Gly',
 'Bn_4_Cl__Gly',
 'Bu_Gly',
 'Et_Gly',
 'EtOEt_Gly',
 'HOCOCH2_Gly_ol',
 'MeOEt_Gly',
 'NH2Bu_Gly',
 'Pr_Gly',
 'PhEt_Gly',
 'cHexCH2_Gly',
 'isoamyl_Gly',
 'pentyl_Gly',
 '3_pyridylethyl_Gly',
 'd_N__O_Gly_allyl_',
 'GABA',
 'H',
 'Me_Hph',
 'bHph',
 'Hph_2_Cl_',
 'Hph_3_Cl_',
 'Hph_4_Cl_',
 'Hse_Et_',
 'dHyp',
 'Hyp_Et_',
 'dI',
 'meI',
 'Me_dI',
 '_N__O_xiIle',
 'd_N__O_aIle',
 'K',
 'dK',
 'Me_dK',
 'Lys_Ac_',
 'Lys_Cbz_',
 'Lys_iPr_',
 'Lys_Me_',
 'Lys_Me2_',
 'Lys_Tfa_',
 'aMeLeu',

In [4]:
#Fingerprints models
#All fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/All_fingerprints_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)

df_test = pd.read_csv('features/Fingerprints/Test/All_fingerprints_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 20188)
y_train shape:  (140,)
X_test shape:  (36, 20188)
y_test shape:  (36,)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4032
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 824
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4069
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 853
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2378,0.3696,0.4876,0.4696,0.6896,0.7043,0.4385,0.4635,0.6622,0.3933,0.7142,0.7548
DecisionTreeRegressor,0.3471,0.4271,0.5892,0.2256,0.6154,0.6241,0.3048,0.363,0.5521,0.5784,0.7988,0.8241
RandomForestRegressor,0.2225,0.3573,0.4717,0.5037,0.7221,0.734,0.432,0.4399,0.6573,0.4024,0.7483,0.8036
GradientBoostingRegressor,0.2049,0.3387,0.4526,0.543,0.7443,0.7558,0.2751,0.3673,0.5245,0.6194,0.8462,0.859
AdaBoostRegressor,0.2491,0.3973,0.4991,0.4444,0.6799,0.675,0.4385,0.4606,0.6622,0.3933,0.7537,0.7739
XGBRegressor,0.2266,0.3542,0.476,0.4946,0.7095,0.7207,0.3413,0.3855,0.5842,0.5279,0.7853,0.8395
ExtraTreesRegressor,0.1939,0.3285,0.4404,0.5674,0.7617,0.7943,0.3812,0.4033,0.6174,0.4727,0.7636,0.8296
LinearRegression,2.6029,1.114,1.6134,-4.8064,0.299,0.3468,0.4754,0.548,0.6895,0.3423,0.7513,0.7382
KNeighborsRegressor,0.4021,0.4669,0.6341,0.103,0.4714,0.507,0.6052,0.5553,0.7779,0.1628,0.6094,0.6468
SVR,0.3455,0.4431,0.5878,0.2293,0.4947,0.5148,0.6347,0.5449,0.7967,0.1219,0.543,0.6292


In [5]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.293540351996311, -6.02616124772942, -5.358...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.298499033891243, -6.298499033891243, -6.3...","[-6.336842475995235, -6.333227415029346, -6.26...","[0.09854845378412454, 0.09610374466706918, 0.1..."
1,DecisionTreeRegressor,"[-6.13, -6.1850000000000005, -5.22, -5.85, -6....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -7.48, -6.46, -6.5, -4.72, -6....","[-6.093999999999999, -5.986, -7.14600000000000...","[0.07199999999999988, 0.07199999999999988, 0.4..."
2,RandomForestRegressor,"[-6.051099999999998, -5.943700000000003, -5.21...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.027599999999999, -5.9963, -6.265000000000...","[-6.104279999999998, -6.06102, -6.397950000000...","[0.0440218763798185, 0.053634369577725466, 0.2..."
3,GradientBoostingRegressor,"[-6.033879653681506, -5.627194696786993, -5.26...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.028705198645376, -5.966278047052107, -6.5...","[-6.0757133635549145, -6.001621650240036, -6.7...","[0.0739590727153139, 0.07138807121396391, 0.35..."
4,AdaBoostRegressor,"[-6.122105263157894, -5.791491228070174, -5.20...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.122105263157894, -6.122105263157894, -6.3...","[-6.181860306170832, -6.122222210932737, -6.33...","[0.09753793043681339, 0.10063872821770435, 0.1..."
5,XGBRegressor,"[-6.271129, -5.956721, -5.3172274, -6.066091, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.009154, -5.951429, -6.4343777, -6.45095, ...","[-6.038414, -5.9867706, -6.468236, -6.4193563,...","[0.08069204, 0.07263097, 0.2196782, 0.12366667..."
6,ExtraTreesRegressor,"[-6.077099999999997, -6.0423500000000026, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.013999999999998, -5.949999999999999, -6.2...","[-6.011849999999999, -5.962259999999999, -6.40...","[0.05485103463016728, 0.02452000000000041, 0.2..."
7,LinearRegression,"[-5.189094099708547, -4.0, -4.954305555556097,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-9.535351970228447, -5.949999999999609, -9.2...","[-7.035603984361797, -6.109271514281189, -8.18...","[2.2518727930742153, 0.3185430285623466, 1.090..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.19, -5.193333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.76...","[0.05153639490690065, 0.05153639490690065, 0.0..."
9,SVR,"[-5.8028702717892315, -5.821384462445965, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.996569010708649, -6.001608212426845, -5.8...","[-5.983802784638731, -5.983097819346977, -5.83...","[0.020481459017287713, 0.030386017509208635, 0..."


In [6]:
result_df.to_csv('results/Fingerprints/Results_All_fingerprints_fp_RRCK.csv')
prediction_df.to_csv('results/Fingerprints/Prediction_data_All_fingerprints_fp_RRCK.csv')

In [7]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [8]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [9]:
#All fingerprints constant removal
df_train = pd.read_csv('features/Fingerprints/Train/All_fingerprints_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/All_fingerprints_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 3768)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 3768)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4032
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 824
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4069
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 853
[LightGBM] [Info] Start training from score -5.528750
[L

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2378,0.3696,0.4876,0.4696,0.6896,0.7043,0.4385,0.4635,0.6622,0.3933,0.7142,0.7548
DecisionTreeRegressor,0.335,0.4341,0.5788,0.2527,0.6205,0.6232,0.3102,0.3768,0.5569,0.5709,0.7907,0.8245
RandomForestRegressor,0.2259,0.3575,0.4753,0.496,0.7171,0.7365,0.436,0.4409,0.6603,0.3968,0.746,0.801
GradientBoostingRegressor,0.2095,0.3444,0.4577,0.5327,0.7363,0.7434,0.2797,0.3684,0.5288,0.6131,0.8451,0.8554
AdaBoostRegressor,0.2677,0.4084,0.5174,0.4029,0.6368,0.6353,0.4436,0.4655,0.666,0.3863,0.7417,0.7634
XGBRegressor,0.2266,0.3542,0.476,0.4946,0.7095,0.7207,0.3413,0.3855,0.5842,0.5279,0.7853,0.8395
ExtraTreesRegressor,0.1959,0.3301,0.4426,0.5629,0.76,0.7815,0.3728,0.4006,0.6106,0.4842,0.7767,0.8268
LinearRegression,2.6029,1.114,1.6134,-4.8064,0.299,0.3468,0.4754,0.548,0.6895,0.3423,0.7513,0.7382
KNeighborsRegressor,0.4021,0.4669,0.6341,0.103,0.4714,0.507,0.6052,0.5553,0.7779,0.1628,0.6094,0.6468
SVR,0.3455,0.4431,0.5878,0.2293,0.4947,0.5148,0.6347,0.5448,0.7967,0.122,0.5431,0.6292


In [10]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.293540351996311, -6.02616124772942, -5.358...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.298499033891243, -6.298499033891243, -6.3...","[-6.336842475995235, -6.333227415029346, -6.26...","[0.09854845378412454, 0.09610374466706918, 0.1..."
1,DecisionTreeRegressor,"[-6.13, -5.87, -5.22, -5.85, -5.32, -5.91, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -7.48, -6.22, -6.5, -4.72, -6....","[-6.042, -6.0920000000000005, -7.1400000000000...","[0.11070682002478427, 0.284, 0.416413256273140..."
2,RandomForestRegressor,"[-6.059549999999998, -5.949200000000002, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.029399999999999, -6.007399999999999, -6.3...","[-6.084559999999999, -6.05533, -6.383720000000...","[0.03385596550092762, 0.05501010452635092, 0.2..."
3,GradientBoostingRegressor,"[-6.031506441714104, -5.649558793170612, -5.26...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.028705198645376, -5.966278047052107, -6.4...","[-6.065491109125154, -6.0030269327417, -6.7764...","[0.06744669391979767, 0.07405954786421517, 0.3..."
4,AdaBoostRegressor,"[-6.259500000000001, -5.791969696969696, -5.30...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.238431372549019, -6.238431372549019, -6.3...","[-6.2750434173669465, -6.251186274509804, -6.3...","[0.10269191036933618, 0.0889753301820639, 0.15..."
5,XGBRegressor,"[-6.271129, -5.956721, -5.3172274, -6.066091, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.009154, -5.951429, -6.4343777, -6.45095, ...","[-6.038414, -5.9867706, -6.468236, -6.4193563,...","[0.08069204, 0.07263097, 0.2196782, 0.12366667..."
6,ExtraTreesRegressor,"[-6.022699999999999, -6.076800000000003, -5.24...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0074000000000005, -5.949999999999999, -6....","[-6.0187, -5.9681299999999995, -6.456980000000...","[0.04046489836883238, 0.03626000000000005, 0.2..."
7,LinearRegression,"[-5.189094098659434, -4.0, -4.954305555555499,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-9.535351970627698, -5.950000000000425, -9.2...","[-7.035603984438912, -6.109271514281525, -8.18...","[2.251872793164774, 0.3185430285627709, 1.0907..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.19, -5.193333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.76...","[0.05153639490690065, 0.05153639490690065, 0.0..."
9,SVR,"[-5.802867837565931, -5.821387820331051, -5.38...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.996622928130576, -6.001608612480334, -5.8...","[-5.983824683357165, -5.9830867951849775, -5.8...","[0.020486912547836578, 0.0303808745722354, 0.0..."


In [11]:
result_df.to_csv('results/Fingerprints/Results_All_const_rem_fingerprints_RRCK.csv')
prediction_df.to_csv('results/Fingerprints/Prediction_data_All_const_rem_fingerprints_RRCK.csv')

In [12]:
#Morgan fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/morgan_fp_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/morgan_fp_test_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_fp

X_train shape:  (140, 2048)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 2048)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 53
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 112, number of used features:

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3783,0.4862,0.6151,0.1561,0.3987,0.3937,0.7074,0.614,0.841,0.0214,0.3838,0.4511
DecisionTreeRegressor,0.4295,0.4672,0.6554,0.0419,0.4949,0.5274,0.5801,0.4878,0.7617,0.1975,0.5294,0.5503
RandomForestRegressor,0.2902,0.3903,0.5387,0.3526,0.5987,0.6234,0.5822,0.5074,0.763,0.1946,0.5661,0.6106
GradientBoostingRegressor,0.2652,0.3582,0.5149,0.4085,0.6434,0.6607,0.5769,0.498,0.7595,0.2019,0.5584,0.633
AdaBoostRegressor,0.3376,0.4521,0.581,0.2469,0.5016,0.5328,0.6304,0.5785,0.794,0.1279,0.599,0.6114
XGBRegressor,0.3295,0.4095,0.574,0.2649,0.5883,0.618,0.6038,0.4775,0.777,0.1647,0.5237,0.6205
ExtraTreesRegressor,0.4113,0.4588,0.6414,0.0824,0.4972,0.5305,0.5686,0.4847,0.754,0.2134,0.5401,0.5675
LinearRegression,0.6455,0.5796,0.8034,-0.44,0.3567,0.3965,1.0545,0.7214,1.0269,-0.4589,0.0997,0.1768
KNeighborsRegressor,0.3553,0.4397,0.5961,0.2074,0.5231,0.5529,0.6787,0.556,0.8239,0.061,0.5246,0.5837
SVR,0.2954,0.4006,0.5435,0.3411,0.5909,0.6007,0.6517,0.5279,0.8073,0.0985,0.5211,0.6127


In [13]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.531297470505814, -5.711366959090028, -5.63...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.531297470505814, -5.7112121441245245, -5....","[-5.646703300474309, -5.701439679613568, -5.72...","[0.09524436964780744, 0.10091586369235374, 0.1..."
1,DecisionTreeRegressor,"[-6.13, -5.975, -5.3933333333333335, -6.62, -5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -5.975, -4.3, -6.62, -5.073333...","[-6.078, -5.909999999999999, -5.53700000000000...","[0.10399999999999991, 0.08000000000000007, 0.2..."
2,RandomForestRegressor,"[-5.973599999999998, -6.156198809523816, -5.35...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.019099999999997, -5.898099999999999, -5.6...","[-6.066566666666665, -5.9182766666666655, -5.6...","[0.049734811193412436, 0.023152469510711975, 0..."
3,GradientBoostingRegressor,"[-5.972068389176766, -6.183780872896161, -5.40...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.039598104633682, -5.963114848301747, -5.6...","[-6.07317950159807, -5.979787078482948, -5.692...","[0.022374509717948247, 0.028078845841266718, 0..."
4,AdaBoostRegressor,"[-5.873749999999999, -6.359999999999999, -5.13...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.873749999999999, -5.873749999999999, -5.8...","[-5.900670561454756, -5.856769718844985, -5.81...","[0.10571207292556929, 0.11048425107418011, 0.0..."
5,XGBRegressor,"[-5.9941473, -6.1927757, -5.390035, -6.620369,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.092258, -5.9514093, -5.2211976, -6.232823...","[-6.085742, -5.92769, -5.2932577, -5.981988, -...","[0.06650973, 0.044220783, 0.22009273, 0.699378..."
6,ExtraTreesRegressor,"[-5.916899999999999, -6.245900000000007, -5.39...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.129999999999996, -5.949999999999999, -5.9...","[-6.091619999999997, -5.909999999999999, -5.51...","[0.07675999999999697, 0.07999999999999971, 0.2..."
7,LinearRegression,"[-6.723197754770655, -6.215215993585295, -5.39...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.752145417372643, -5.949999999999997, -4.4...","[-5.36775096483848, -5.918128500986197, -4.999...","[0.8627431625145794, 0.06374299802760008, 0.65..."
8,KNeighborsRegressor,"[-5.9433333333333325, -6.19, -5.39333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.74...","[0.05153639490690065, 0.05153639490690065, 0.0..."
9,SVR,"[-5.7653970736065725, -5.918701900446836, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.873209832758758, -5.949423519958586, -5.7...","[-5.9115543492208875, -5.940738324416255, -5.7...","[0.03229220302843119, 0.027546787352388913, 0...."


In [14]:
df_morgan_fp.to_csv('results/Fingerprints/Results_Morgan_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Morgan_fp_RRCK.csv')

In [15]:
#Morgan count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/count_morgan_fp_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/count_morgan_fp_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_count_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_count_fp

X_train shape:  (140, 2048)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 2048)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 69
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 112, number of used features:



-0.16089629423586715


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2993,0.422,0.5471,0.3324,0.5781,0.5956,0.6148,0.55,0.7841,0.1495,0.5492,0.6293
DecisionTreeRegressor,0.4773,0.4682,0.6908,-0.0646,0.4955,0.542,0.5208,0.4467,0.7217,0.2795,0.6123,0.6913
RandomForestRegressor,0.2769,0.3787,0.5262,0.3823,0.6278,0.6628,0.5592,0.4813,0.7478,0.2264,0.6084,0.7333
GradientBoostingRegressor,0.2742,0.3819,0.5236,0.3884,0.6389,0.6622,0.52,0.4563,0.7211,0.2806,0.6343,0.7163
AdaBoostRegressor,0.2991,0.4388,0.5469,0.3328,0.582,0.6123,0.5769,0.5381,0.7596,0.2019,0.6046,0.664
XGBRegressor,0.2987,0.3863,0.5465,0.3337,0.6174,0.6623,0.5178,0.4366,0.7195,0.2837,0.6127,0.7064
ExtraTreesRegressor,0.2979,0.3954,0.5458,0.3354,0.6117,0.6458,0.4935,0.4353,0.7025,0.3173,0.6519,0.7333
LinearRegression,0.7021,0.5355,0.8379,-0.5663,0.4522,0.5695,0.6057,0.5876,0.7782,0.1621,0.5093,0.5841
KNeighborsRegressor,0.3712,0.4414,0.6093,0.1719,0.5498,0.5814,0.5471,0.5181,0.7396,0.2432,0.6621,0.7047
SVR,0.2785,0.3861,0.5277,0.3788,0.6207,0.624,0.5563,0.4946,0.7459,0.2303,0.6219,0.7005


In [16]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.695304983511653, -5.986359301270523, -5.63...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.695304983511653, -5.893698792108244, -5.8...","[-5.891479641086361, -5.971690945885567, -5.76...","[0.12951606373926675, 0.10232965863020972, 0.1..."
1,DecisionTreeRegressor,"[-6.58, -6.92, -5.09, -5.4, -6.05, -6.2, -6.05...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -6.1, -6.2, -5.4, -5.073333333...","[-6.002000000000001, -5.933999999999999, -5.8,...","[0.16129476122924755, 0.03200000000000003, 0.3..."
2,RandomForestRegressor,"[-5.986899999999999, -5.817466666666673, -5.13...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.999874999999999, -5.944399999999999, -5.6...","[-6.049084999999998, -5.935519999999998, -5.73...","[0.03417363603715502, 0.05647685012463111, 0.1..."
3,GradientBoostingRegressor,"[-5.977550832386751, -6.247992153573798, -5.17...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.03237579586137, -5.98024463625589, -5.730...","[-6.062282540872021, -5.9861973228796685, -5.8...","[0.06181435273702747, 0.03808285304530232, 0.1..."
4,AdaBoostRegressor,"[-5.897234042553189, -6.1167142857142895, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.897234042553189, -6.142954545454546, -6.0...","[-6.051963642487472, -6.074461682461683, -5.93...","[0.15758910106623844, 0.16405549220668797, 0.2..."
5,XGBRegressor,"[-6.1615295, -6.249744, -5.0908656, -5.473682,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.1060667, -5.950252, -5.6400223, -6.469012...","[-6.0654554, -5.99138, -5.7818804, -6.3692617,...","[0.0710769, 0.08284783, 0.21647198, 0.13671055..."
6,ExtraTreesRegressor,"[-6.076599999999997, -6.324633333333337, -5.08...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.049599999999999, -5.949999999999999, -5.8...","[-6.0327, -5.94368, -5.7431400000000075, -6.16...","[0.03751842214166105, 0.012639999999998961, 0...."
7,LinearRegression,"[-6.356217818897776, -6.030992714937686, -5.80...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.228509451763848, -5.950000000000001, -6.5...","[-5.62294590279239, -5.966277654960579, -6.469...","[1.0205705659380804, 0.03255530992115512, 0.25..."
8,KNeighborsRegressor,"[-5.9433333333333325, -6.19, -5.19333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.82...","[0.05153639490690065, 0.05153639490690065, 0.0..."
9,SVR,"[-5.821270995600824, -5.789733128135689, -5.32...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8954490999150835, -5.939444513348278, -5....","[-5.931270314379397, -5.937763902456055, -5.78...","[0.04436570949512518, 0.006955513126242185, 0...."


In [17]:
df_morgan_count_fp.to_csv('results/Fingerprints/Results_Count_Morgan_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Count_Morgan_fp_RRCK.csv')

In [18]:
#AtomPairs2d fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/AtomPairs2D_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/AtomPairs2D_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2D_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2D_fp

X_train shape:  (140, 780)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 780)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 19
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 21


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4308,0.5452,0.6564,0.039,0.2183,0.1929,0.7391,0.6434,0.8597,-0.0225,0.4641,0.4903
DecisionTreeRegressor,0.4272,0.5348,0.6536,0.047,0.2984,0.2461,0.7891,0.658,0.8883,-0.0916,0.3285,0.3223
RandomForestRegressor,0.4094,0.53,0.6398,0.0868,0.3231,0.2575,0.7872,0.6549,0.8873,-0.0891,0.328,0.3328
GradientBoostingRegressor,0.4068,0.5253,0.6378,0.0926,0.3404,0.271,0.784,0.6547,0.8854,-0.0846,0.3299,0.3316
AdaBoostRegressor,0.406,0.5238,0.6372,0.0943,0.3274,0.3123,0.7466,0.6438,0.8641,-0.0329,0.3653,0.3935
XGBRegressor,0.4045,0.5251,0.636,0.0976,0.3438,0.2626,0.7889,0.6577,0.8882,-0.0914,0.3288,0.3223
ExtraTreesRegressor,0.4155,0.529,0.6446,0.0732,0.3187,0.2607,0.7891,0.658,0.8883,-0.0916,0.3285,0.3223
LinearRegression,0.3968,0.5131,0.6299,0.1149,0.3767,0.2902,0.7938,0.6497,0.8909,-0.0981,0.3096,0.3278
KNeighborsRegressor,0.6859,0.6835,0.8282,-0.53,0.1761,0.1476,0.6014,0.6054,0.7755,0.168,0.4156,0.4692
SVR,0.4286,0.5353,0.6547,0.0439,0.2546,0.2301,0.785,0.6463,0.886,-0.086,0.3533,0.3215


In [19]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.286939751613404, -5.7675784994515995, -5.7...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.286939751613404, -5.286939751613404, -5.7...","[-5.422778328595897, -5.422778328595897, -5.74...","[0.08484817398042653, 0.08484817398042653, 0.0..."
1,DecisionTreeRegressor,"[-5.983333333333333, -5.723048780487803, -5.72...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.1525, -6.1525, -5.685047356996378, -5.1950...","[0.09175753072333853, 0.09175753072333853, 0.0..."
2,RandomForestRegressor,"[-5.956641309523811, -5.716945145636179, -5.71...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.956641309523811, -5.956641309523811, -5.7...","[-6.1295850337474125, -6.1295850337474125, -5....","[0.09353693557378943, 0.09353693557378943, 0.0..."
3,GradientBoostingRegressor,"[-5.953268549052178, -5.7233825568538395, -5.7...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.953268549052178, -5.953268549052178, -5.7...","[-6.107109555212862, -6.107109555212862, -5.68...","[0.08344055611904098, 0.08344055611904098, 0.0..."
4,AdaBoostRegressor,"[-5.5022222222222235, -5.723378378378379, -5.7...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.5022222222222235, -5.5022222222222235, -5...","[-6.000652094717669, -6.000652094717669, -5.75...","[0.3327445815131407, 0.3327445815131407, 0.030..."
5,XGBRegressor,"[-5.9834375, -5.723186, -5.723186, -5.723186, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9834375, -5.9834375, -5.723186, -5.122092...","[-6.1525064, -6.1525064, -5.6851683, -5.195113...","[0.09179446, 0.09179446, 0.0442632, 0.11001804..."
6,ExtraTreesRegressor,"[-5.98333333333334, -5.723048780487817, -5.723...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.98333333333334, -5.98333333333334, -5.723...","[-6.152500000000005, -6.152500000000005, -5.68...","[0.09175753072333921, 0.09175753072333921, 0.0..."
7,LinearRegression,"[-5.639013135138875, -5.724854164504859, -5.72...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.639013135138875, -5.639013135138875, -5.7...","[-5.860212302471344, -5.860212302471344, -5.69...","[0.12300519441467392, 0.12300519441467392, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.289999999999999, -6.28...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.2...","[-6.171333333333333, -6.171333333333333, -6.29...","[0.09918781287145202, 0.09918781287145202, 0.0..."
9,SVR,"[-5.814107793539646, -5.699921526874601, -5.69...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.814107793539646, -5.814107793539646, -5.6...","[-5.853396824836513, -5.853396824836513, -5.66...","[0.04813464823741424, 0.04813464823741424, 0.0..."


In [20]:
df_AtomPairs2D_fp.to_csv('results/Fingerprints/Results_AtomPairs2D_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_AtomPairs2D_fp_RRCK.csv')

In [21]:
#AtomPairs2d Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/AtomPairs2DCount_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/AtomPairs2DCount_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2DCount_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2DCount_fp

X_train shape:  (140, 780)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 780)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 762
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 60
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 740
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 6



0.08239028264906334




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2817,0.4111,0.5307,0.3717,0.6101,0.6149,0.4674,0.4722,0.6837,0.3533,0.6999,0.7521
DecisionTreeRegressor,0.441,0.5016,0.6641,0.0163,0.4644,0.4556,0.3751,0.4299,0.6124,0.4811,0.7763,0.7625
RandomForestRegressor,0.2559,0.3903,0.5059,0.4291,0.6595,0.657,0.4132,0.4298,0.6428,0.4284,0.7619,0.8032
GradientBoostingRegressor,0.2679,0.3981,0.5176,0.4024,0.6522,0.6507,0.3157,0.3835,0.5619,0.5632,0.8189,0.8308
AdaBoostRegressor,0.2735,0.4191,0.523,0.3899,0.6281,0.6348,0.4361,0.4778,0.6603,0.3967,0.771,0.7583
XGBRegressor,0.2774,0.3991,0.5267,0.3811,0.6439,0.6468,0.3464,0.403,0.5885,0.5208,0.7825,0.7986
ExtraTreesRegressor,0.2227,0.3522,0.472,0.5031,0.7153,0.7157,0.361,0.403,0.6009,0.5005,0.775,0.8075
LinearRegression,1.5068,0.7197,1.2275,-2.3613,0.384,0.5352,0.6221,0.5704,0.7887,0.1394,0.5535,0.6747
KNeighborsRegressor,0.2952,0.3925,0.5434,0.3414,0.6191,0.6489,0.607,0.511,0.7791,0.1603,0.5582,0.5984
SVR,0.2918,0.4151,0.5402,0.349,0.6053,0.6287,0.6085,0.5488,0.7801,0.1582,0.5906,0.6525


In [22]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.174722296925258, -5.811231849162886, -5.30...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.174722296925258, -6.174722296925258, -6.3...","[-6.202887160256114, -6.178989873364112, -6.27...","[0.07027520523501173, 0.08560694791903788, 0.0..."
1,DecisionTreeRegressor,"[-6.13, -5.57, -5.4, -6.22, -5.4, -6.58, -5.4,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.05, -5.95, -6.78, -6.46, -5.35, -4.72, -6...","[-5.85, -5.83, -7.0120000000000005, -6.048, -5...","[0.2529822128134705, 0.24000000000000024, 0.39..."
2,RandomForestRegressor,"[-6.1020999999999965, -5.86745, -5.2341, -5.92...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.102099999999998, -5.963299999999999, -6.5...","[-6.198464499999999, -6.0783231666666655, -6.5...","[0.05473067714728151, 0.07774882932737848, 0.1..."
3,GradientBoostingRegressor,"[-6.106200077986232, -6.344998862038213, -5.24...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.054609219869423, -5.981616493256371, -6.7...","[-6.105868140136444, -6.035693801833756, -6.88...","[0.06393240980493652, 0.08734882396807951, 0.3..."
4,AdaBoostRegressor,"[-6.13, -6.115, -5.238124999999999, -6.0917073...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.109999999999999, -6.109999999999999, -6.5...","[-6.1341871794871805, -6.081461234991423, -6.5...","[0.06338044513187786, 0.11799236321740199, 0.2..."
5,XGBRegressor,"[-6.129661, -5.9377437, -5.3395033, -6.1105967...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9515038, -5.9505663, -6.9154034, -6.45473...","[-5.9960513, -5.9922028, -6.9584756, -6.302946...","[0.08044295, 0.08206079, 0.3856949, 0.26830214..."
6,ExtraTreesRegressor,"[-6.129999999999996, -6.090199999999995, -5.30...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.090149999999999, -5.949999999999999, -6.9...","[-6.120649999999998, -5.992519999999999, -6.76...","[0.06404927009732307, 0.08503999999999935, 0.3..."
7,LinearRegression,"[-6.126034955452054, -6.3765561171432115, -5.9...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.0, -5.993924298526641, -6.861568642235872...","[-6.86240927341324, -6.102368433259525, -6.510...","[1.9012337489789244, 0.06786547733396986, 0.27..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -5.19...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.3...","[-6.171333333333333, -6.154, -6.22799999999999...","[0.09918781287145202, 0.09180655992054389, 0.1..."
9,SVR,"[-6.052887344882163, -5.66333786713414, -5.447...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.020565120422604, -6.006596091659966, -5.9...","[-6.069998682643051, -6.038159051164101, -5.88...","[0.03239908724185192, 0.01659351512624975, 0.0..."


In [23]:
df_AtomPairs2DCount_fp.to_csv('results/Fingerprints/Results_AtomPairs2D_Count_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_df_AtomPairs2D_Count_fp_RRCK.csv')

In [24]:
#EState fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/EState_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/EState_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_estate_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_estate_fp

X_train shape:  (140, 79)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 79)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info



-0.1682114631281033




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4208,0.5366,0.6487,0.0612,0.2491,0.1581,0.7594,0.6452,0.8714,-0.0505,0.3636,0.3484
DecisionTreeRegressor,0.415,0.5335,0.6442,0.0742,0.3202,0.2518,0.8022,0.6728,0.8956,-0.1097,0.315,0.2388
RandomForestRegressor,0.4112,0.5323,0.6413,0.0827,0.3115,0.2632,0.8006,0.6756,0.8947,-0.1075,0.3252,0.2399
GradientBoostingRegressor,0.4011,0.5221,0.6333,0.1053,0.3415,0.26,0.7958,0.6695,0.8921,-0.1009,0.3224,0.2328
AdaBoostRegressor,0.4063,0.5249,0.6374,0.0936,0.3114,0.2515,0.7519,0.6629,0.8671,-0.0403,0.4504,0.4342
XGBRegressor,0.4058,0.5256,0.637,0.0948,0.3358,0.2709,0.8,0.6697,0.8944,-0.1068,0.3142,0.2328
ExtraTreesRegressor,0.4123,0.5313,0.6421,0.0803,0.3261,0.2524,0.8022,0.6728,0.8956,-0.1097,0.315,0.2388
LinearRegression,0.4372,0.5478,0.6612,0.0247,0.2458,0.2366,0.7956,0.6715,0.8919,-0.1006,0.251,0.3041
KNeighborsRegressor,0.9205,0.7747,0.9594,-1.0534,0.0337,-0.0377,0.9062,0.7656,0.952,-0.2537,0.04,-0.1934
SVR,0.4605,0.5632,0.6786,-0.0274,0.2082,0.2355,0.8627,0.6752,0.9288,-0.1934,0.1749,0.1241


In [25]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.307054751846478, -5.733086951432175, -5.73...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.733086951432175, -5.733086951432175, -5.7...","[-5.716147069640357, -5.716147069640357, -5.71...","[0.027026951970261587, 0.027026951970261587, 0..."
1,DecisionTreeRegressor,"[-6.78, -5.761875000000001, -5.761875000000001...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.686, -5.686, -5.761875000000001, -5.02444...","[-5.6934, -5.6934, -5.7013710407239815, -5.124...","[0.09566060375677687, 0.09566060375677687, 0.0..."
2,RandomForestRegressor,"[-5.977085696248188, -5.7400533879357845, -5.7...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.672280984848485, -5.672280984848485, -5.7...","[-5.6899965858585855, -5.6899965858585855, -5....","[0.08684929683805728, 0.08684929683805728, 0.0..."
3,GradientBoostingRegressor,"[-6.635883161387998, -5.756717266510805, -5.75...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.685146179340739, -5.685146179340739, -5.7...","[-5.687809606570104, -5.687809606570104, -5.70...","[0.0901893473413318, 0.0901893473413318, 0.086..."
4,AdaBoostRegressor,"[-6.78, -5.79417808219178, -5.79417808219178, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.651810344827585, -5.651810344827585, -5.7...","[-5.619743887147336, -5.619743887147336, -5.78...","[0.12904152690784967, 0.12904152690784967, 0.0..."
5,XGBRegressor,"[-6.6676383, -5.7617073, -5.7617073, -5.761707...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.685873, -5.685873, -5.7617073, -5.0246015...","[-5.6932855, -5.6932855, -5.701349, -5.124449,...","[0.09572181, 0.09572181, 0.093075044, 0.092036..."
6,ExtraTreesRegressor,"[-6.779999999999983, -5.7618749999999945, -5.7...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.6859999999999955, -5.6859999999999955, -5...","[-5.693399999999996, -5.693399999999996, -5.70...","[0.09566060375678095, 0.09566060375678095, 0.0..."
7,LinearRegression,"[-5.489770500878781, -5.761875000000001, -5.76...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.862331605498172, -5.862331605498172, -5.7...","[-6.025502850452247, -6.025502850452247, -5.70...","[0.10029037407240912, 0.10029037407240912, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.289999999999999, -6.28...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.2...","[-5.816000000000001, -5.816000000000001, -6.29...","[0.16407857196409817, 0.16407857196409817, 0.0..."
9,SVR,"[-5.812803170309143, -5.500339343057642, -5.50...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.770129081178809, -5.770129081178809, -5.5...","[-5.878196169195936, -5.878196169195936, -5.54...","[0.10551241572577515, 0.10551241572577515, 0.1..."


In [26]:
df_estate_fp.to_csv('results/Fingerprints/Results_EState_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_EState_fp_RRCK.csv')

In [27]:
#Extended fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Extended_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Extended_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_extended_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_extended_fp

X_train shape:  (140, 1024)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1024)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 167
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 516
[LightGBM] [Info] Number of data points in the train set: 112, number of used features

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3734,0.4808,0.611,0.1671,0.4177,0.4227,0.6634,0.5923,0.8145,0.0822,0.5005,0.5295
DecisionTreeRegressor,0.479,0.5153,0.6921,-0.0686,0.3472,0.3605,0.8007,0.6588,0.8948,-0.1077,0.3231,0.3515
RandomForestRegressor,0.3885,0.4731,0.6233,0.1333,0.4162,0.4595,0.7107,0.6044,0.843,0.0168,0.4469,0.4802
GradientBoostingRegressor,0.3734,0.467,0.6111,0.1669,0.4491,0.4892,0.6858,0.5963,0.8281,0.0513,0.4618,0.4977
AdaBoostRegressor,0.3793,0.4856,0.6159,0.1539,0.4152,0.44,0.688,0.6254,0.8294,0.0482,0.4601,0.4929
XGBRegressor,0.4364,0.5056,0.6606,0.0266,0.37,0.3947,0.7109,0.6237,0.8432,0.0165,0.4282,0.4277
ExtraTreesRegressor,0.4792,0.5195,0.6922,-0.0689,0.3341,0.3542,0.7868,0.6499,0.887,-0.0884,0.3346,0.3478
LinearRegression,0.4911,0.5389,0.7008,-0.0956,0.3413,0.3629,0.6742,0.6226,0.8211,0.0672,0.4793,0.5304
KNeighborsRegressor,0.4372,0.488,0.6612,0.0248,0.4136,0.4502,0.7925,0.682,0.8902,-0.0964,0.3606,0.3261
SVR,0.4362,0.5082,0.6604,0.027,0.295,0.3471,0.8223,0.6776,0.9068,-0.1376,0.3446,0.3632


In [28]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.795313932255152, -5.914337442746655, -5.72...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.011919220081905, -6.011919220081905, -5.9...","[-6.0452010356377235, -6.0452010356377235, -5....","[0.02942150785768537, 0.02942150785768537, 0.0..."
1,DecisionTreeRegressor,"[-5.53, -5.975, -5.544444444444444, -5.975, -5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.04, -6.04, -5.975, -5.132, -5.975, -5.544...","[-6.04, -6.04, -5.827, -5.154733333333334, -5....","[0.05692099788303074, 0.05692099788303074, 0.0..."
2,RandomForestRegressor,"[-5.6659847619047605, -6.0401454474692, -5.565...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.984655, -5.995355000000001, -6.0401454474...","[-5.960914333333331, -6.012649333333331, -5.85...","[0.04782011773999092, 0.045212303839158134, 0...."
3,GradientBoostingRegressor,"[-5.678348178183033, -5.940574780044496, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.6119174091382895, -6.0075881098865676, -5...","[-5.871500198498891, -6.017608651039191, -5.84...","[0.14567814525322847, 0.045062581451394655, 0...."
4,AdaBoostRegressor,"[-5.746666666666667, -6.0486, -6.0486, -6.0486...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.995, -5.995, -6.0486, -5.27470588235294, ...","[-5.994884615384615, -5.994884615384615, -5.99...","[0.037026617644624005, 0.037026617644624005, 0..."
5,XGBRegressor,"[-5.501156, -6.2012396, -5.545777, -6.2012396,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.045834, -6.039532, -6.2012396, -5.1320395...","[-5.988635, -6.0396967, -5.8748627, -5.1548576...","[0.06601441, 0.056452252, 0.16321759, 0.284458..."
6,ExtraTreesRegressor,"[-5.529999999999986, -6.026216666666674, -5.54...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.184599999999994, -6.040000000000003, -6.0...","[-6.100819999999999, -6.040000000000001, -5.83...","[0.1208006357599135, 0.0569209978830299, 0.094..."
7,LinearRegression,"[-6.544099542651379, -5.94908973113055, -5.489...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.773934320076065, -6.039999999999998, -5.9...","[-5.805372481205483, -6.039999999999997, -5.84...","[0.09852917734372195, 0.05692099788303046, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.766666666666666, -6.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -6.118666666666667, -5.92...","[0.11400974617208051, 0.11400974617208051, 0.2..."
9,SVR,"[-5.736299097780264, -5.73893103046754, -5.500...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.975748739414786, -6.00980059943213, -5.73...","[-5.975094444485063, -6.0055719293821195, -5.6...","[0.03263860900915797, 0.036345227680330064, 0...."


In [29]:
df_extended_fp.to_csv('results/Fingerprints/Results_Extended_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Extended_fp_RRCK.csv')

In [30]:
#Fingerprinter fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Fingerprinter_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Fingerprinter_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_fingerprinter_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_fingerprinter_fp

X_train shape:  (140, 1024)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1024)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 468
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 156
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 112, number of used features

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3641,0.4872,0.6034,0.1878,0.4338,0.4587,0.653,0.6023,0.8081,0.0966,0.5085,0.4525
DecisionTreeRegressor,0.4422,0.4869,0.665,0.0136,0.4051,0.4353,0.7971,0.6589,0.8928,-0.1028,0.325,0.356
RandomForestRegressor,0.3778,0.4667,0.6147,0.1572,0.4408,0.4782,0.7392,0.6361,0.8598,-0.0227,0.4017,0.4045
GradientBoostingRegressor,0.4117,0.489,0.6416,0.0816,0.3948,0.4523,0.7127,0.6283,0.8442,0.0141,0.413,0.459
AdaBoostRegressor,0.3904,0.508,0.6248,0.1292,0.3803,0.385,0.7035,0.6213,0.8387,0.0268,0.4188,0.4587
XGBRegressor,0.4367,0.5021,0.6608,0.0259,0.3806,0.4054,0.7597,0.6416,0.8716,-0.051,0.3748,0.4014
ExtraTreesRegressor,0.427,0.4889,0.6535,0.0475,0.409,0.4448,0.7849,0.6531,0.8859,-0.0858,0.3397,0.3704
LinearRegression,0.4639,0.5203,0.6811,-0.0349,0.4034,0.4362,0.6849,0.6205,0.8276,0.0526,0.4718,0.4997
KNeighborsRegressor,0.445,0.486,0.6671,0.0072,0.4175,0.4502,0.7645,0.6489,0.8744,-0.0577,0.3671,0.3341
SVR,0.427,0.5018,0.6535,0.0475,0.3124,0.366,0.832,0.685,0.9122,-0.151,0.3254,0.334


In [31]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.776024623454233, -5.991102690129882, -5.76...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.877498673984496, -5.877498673984496, -5.9...","[-5.918891099140959, -5.918891099140959, -5.99...","[0.07230267776412432, 0.07230267776412432, 0.0..."
1,DecisionTreeRegressor,"[-5.132, -6.065833333333333, -5.54090909090909...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.04, -6.04, -6.065833333333333, -5.132, -6...","[-6.04, -6.04, -5.845166666666666, -5.15473333...","[0.05692099788303074, 0.05692099788303074, 0.1..."
2,RandomForestRegressor,"[-5.803215238095239, -6.052048095238101, -5.55...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.998255833333332, -5.991755833333332, -6.0...","[-6.027918944444442, -6.026912277777776, -5.87...","[0.06804502212188496, 0.06148568730782769, 0.0..."
3,GradientBoostingRegressor,"[-5.593305484751483, -5.906208942513288, -5.59...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.008439485280536, -6.008439485280536, -5.9...","[-6.020965683198359, -6.018656701497865, -5.82...","[0.057676335541644926, 0.056272817995200314, 0..."
4,AdaBoostRegressor,"[-6.005999999999999, -5.80904761904762, -5.809...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.005999999999999, -6.005999999999999, -5.8...","[-6.049044444444445, -6.049044444444445, -5.88...","[0.0996469719175935, 0.0996469719175935, 0.081..."
5,XGBRegressor,"[-5.380657, -6.138579, -5.5417986, -6.138579, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0392056, -6.039688, -6.138579, -5.1319304...","[-6.040491, -6.040538, -5.8633127, -5.154841, ...","[0.057073142, 0.056992337, 0.13767058, 0.28444..."
6,ExtraTreesRegressor,"[-5.132000000000004, -6.002891666666675, -5.54...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0401000000000025, -6.040000000000003, -6....","[-6.04002, -6.040000000000001, -5.832578333333...","[0.056921011937595536, 0.0569209978830299, 0.0..."
7,LinearRegression,"[-6.278044070703841, -6.00471131747885, -5.501...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.154156942433332, -6.040000000000002, -6.0...","[-6.069803236894624, -6.040000000000001, -5.87...","[0.10126283424988687, 0.05692099788303046, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.766666666666666, -6.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -6.118666666666667, -5.92...","[0.11400974617208051, 0.11400974617208051, 0.2..."
9,SVR,"[-5.745239836764945, -5.747006681054585, -5.49...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9812702988603705, -5.997582760741254, -5....","[-5.984255681294913, -5.998686926186851, -5.70...","[0.03716793137008103, 0.03712419933876382, 0.0..."


In [32]:
df_fingerprinter_fp.to_csv('results/Fingerprints/Results_Fingerprinter_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Fingerprinter_fp_RRCK.csv')

In [33]:
#GraphOnly fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Graphonly_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Graphonly_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_graph_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_graph_fp

X_train shape:  (140, 1024)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1024)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 62
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 219
[LightGBM] [Info] Number of data points in the train set: 112, number of used features:

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4177,0.5391,0.6463,0.0683,0.2792,0.2679,0.6729,0.5957,0.8203,0.0691,0.4805,0.489
DecisionTreeRegressor,0.451,0.5255,0.6716,-0.006,0.3286,0.2907,0.7092,0.6253,0.8421,0.0189,0.3712,0.483
RandomForestRegressor,0.4121,0.512,0.6419,0.0808,0.3462,0.3497,0.7129,0.6184,0.8443,0.0138,0.3905,0.464
GradientBoostingRegressor,0.4227,0.5214,0.6502,0.057,0.3333,0.3332,0.7184,0.62,0.8476,0.0061,0.3863,0.4572
AdaBoostRegressor,0.4057,0.5127,0.6369,0.095,0.3389,0.3425,0.6926,0.6187,0.8322,0.0418,0.4542,0.4602
XGBRegressor,0.433,0.5174,0.6581,0.034,0.35,0.3319,0.7381,0.6358,0.8591,-0.0211,0.3437,0.4411
ExtraTreesRegressor,0.4394,0.5242,0.6629,0.0198,0.3382,0.3024,0.7182,0.6286,0.8475,0.0064,0.3593,0.4851
LinearRegression,0.4474,0.5353,0.6689,0.0019,0.3252,0.2921,0.6766,0.6156,0.8225,0.064,0.4275,0.509
KNeighborsRegressor,0.5702,0.587,0.7551,-0.272,0.2481,0.2633,0.7313,0.643,0.8552,-0.0118,0.2684,0.3737
SVR,0.4868,0.5453,0.6977,-0.086,0.1551,0.225,0.7345,0.6308,0.857,-0.0161,0.3524,0.4156


In [34]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.811379245539471, -5.852372518820406, -5.85...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.876278070986813, -5.876278070986813, -5.8...","[-5.838158446264808, -5.838158446264808, -5.78...","[0.05987362058391206, 0.05987362058391206, 0.0..."
1,DecisionTreeRegressor,"[-5.787291666666666, -5.787291666666666, -5.78...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.04, -6.04, -5.787291666666666, -5.3533333...","[-5.93, -6.04, -5.724974793703625, -5.35386666...","[0.5472111109983055, 0.05692099788303074, 0.06..."
2,RandomForestRegressor,"[-5.925394422799423, -5.768635813211602, -5.76...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.859866785714286, -6.011935000000001, -5.7...","[-5.852463686147186, -6.058895333333331, -5.71...","[0.09748863643506192, 0.055694702295440865, 0...."
3,GradientBoostingRegressor,"[-6.002897653486404, -5.798143232239564, -5.79...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.930804932272198, -6.016327190930974, -5.7...","[-5.765616764178778, -6.1267179898776885, -5.7...","[0.38071230968489256, 0.10162582534535164, 0.0..."
4,AdaBoostRegressor,"[-5.921486486486484, -5.933088235294116, -5.93...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.921486486486484, -5.921486486486484, -5.9...","[-5.894608055811228, -6.043218349928876, -5.84...","[0.27437394505290913, 0.10117913772909091, 0.0..."
5,XGBRegressor,"[-5.9001036, -5.787367, -5.787367, -5.787367, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.934048, -6.0395107, -5.787367, -5.353075,...","[-5.6844106, -6.0414248, -5.7250185, -5.353698...","[0.3672762, 0.057444543, 0.06275605, 0.3128520..."
6,ExtraTreesRegressor,"[-5.871916666666665, -5.787291666666658, -5.78...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.040000000000003, -6.040000000000003, -5.7...","[-5.824143333333334, -6.040000000000001, -5.72...","[0.464783041333389, 0.0569209978830299, 0.0627..."
7,LinearRegression,"[-5.56409232302851, -5.7872916666666665, -5.78...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.838828079041079, -6.040000000000002, -5.7...","[-5.823639342277671, -6.040000000000004, -5.72...","[0.16326154785556354, 0.05692099788302793, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.289999999999999, -6.28...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.2...","[-6.171333333333333, -6.171333333333333, -6.29...","[0.09918781287145229, 0.09918781287145229, 0.0..."
9,SVR,"[-5.842141845123626, -5.74977046149222, -5.749...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.938478134971531, -5.96463676737004, -5.74...","[-5.974789674960382, -6.0042984499080925, -5.6...","[0.05808453431194321, 0.05568102486782862, 0.0..."


In [35]:
df_graph_fp.to_csv('results/Fingerprints/Results_Graphonly_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Graphonly_fp_RRCK.csv')

In [36]:
#KlekotaRoth fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/KlekotaRoth_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/KlekotaRoth_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRoth_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRoth_fp

X_train shape:  (140, 4860)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 4860)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 42
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 112, number of used features:



-0.5437019464996429


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3227,0.4361,0.5681,0.2802,0.5315,0.552,0.6877,0.6078,0.8292,0.0487,0.4443,0.3873
DecisionTreeRegressor,0.416,0.4444,0.645,0.0721,0.4915,0.5308,0.5255,0.5288,0.7249,0.273,0.6417,0.6231
RandomForestRegressor,0.3203,0.4071,0.5659,0.2856,0.5581,0.5874,0.615,0.5376,0.7843,0.1491,0.5796,0.6008
GradientBoostingRegressor,0.3407,0.4244,0.5837,0.24,0.5164,0.5384,0.6214,0.5334,0.7883,0.1403,0.5328,0.5554
AdaBoostRegressor,0.3661,0.4856,0.6051,0.1833,0.4366,0.4757,0.6673,0.6024,0.8169,0.0768,0.504,0.5169
XGBRegressor,0.3615,0.4116,0.6012,0.1937,0.5422,0.5701,0.6302,0.5391,0.7938,0.1282,0.5384,0.5438
ExtraTreesRegressor,0.3986,0.4282,0.6314,0.1107,0.5049,0.5499,0.5554,0.533,0.7453,0.2316,0.6148,0.6041
LinearRegression,0.4716,0.504,0.6867,-0.052,0.3978,0.4833,0.9346,0.6549,0.9668,-0.293,0.3438,0.3975
KNeighborsRegressor,0.4048,0.4544,0.6363,0.097,0.459,0.4857,0.581,0.5773,0.7622,0.1963,0.6259,0.6238
SVR,0.3666,0.4299,0.6054,0.1823,0.4621,0.4966,0.6739,0.5581,0.8209,0.0677,0.4775,0.4943


In [37]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.969401987098618, -5.966544106185766, -5.54...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.054758943099081, -6.105018619279476, -5.7...","[-6.087714552921868, -6.138406846098204, -5.77...","[0.06716214430496417, 0.09403039789805377, 0.0..."
1,DecisionTreeRegressor,"[-6.0, -5.975, -5.233333333333333, -6.62, -6.0...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0, -5.95, -5.975, -4.3, -6.62, -5.2333333...","[-6.0, -5.959999999999999, -6.133, -4.732, -6....","[0.08221921916437779, 0.019999999999999928, 0...."
2,RandomForestRegressor,"[-5.930873333333328, -6.027715476190484, -5.25...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.004720476190478, -5.9728666666666665, -5....","[-6.022380190476191, -5.990239428571426, -5.81...","[0.04539541659323426, 0.028686512942465988, 0...."
3,GradientBoostingRegressor,"[-6.426165625697367, -6.233332931024105, -5.33...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0231136062221875, -5.991233895031017, -5....","[-6.026414402025165, -6.014515572670147, -5.68...","[0.05090468091294241, 0.029799270534094414, 0...."
4,AdaBoostRegressor,"[-5.889444444444447, -6.532000000000001, -5.50...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.889444444444447, -5.889444444444447, -5.9...","[-5.951375017783469, -5.974069578766388, -5.82...","[0.05142891320465866, 0.1461814303840214, 0.18..."
5,XGBRegressor,"[-6.49573, -5.9755917, -5.233335, -6.6189723, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.999808, -5.9491425, -5.5855203, -4.302839...","[-5.9999094, -5.942426, -5.703902, -4.7131753,...","[0.0819541, 0.014704871, 0.24887095, 0.8196254..."
6,ExtraTreesRegressor,"[-5.798399999999995, -5.975000000000011, -5.23...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0, -5.949999999999999, -5.975000000000011...","[-6.0, -5.959999999999999, -6.001586666666664,...","[0.08221921916437555, 0.020000000000000285, 0...."
7,LinearRegression,"[-6.3487178711531635, -5.975000000000008, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.089269504883119, -5.771460990233754, -4.4...","[-6.086221737191485, -5.8037505101404205, -4.8...","[0.05492017798340823, 0.06268364946015938, 0.7..."
8,KNeighborsRegressor,"[-5.73, -6.19, -5.153333333333333, -6.19, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.88...","[0.05153639490690065, 0.05153639490690065, 0.1..."
9,SVR,"[-5.829689151644861, -5.950163670944254, -5.31...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.969917097529209, -5.973169581141942, -5.7...","[-5.982014581965722, -5.985191783472696, -5.81...","[0.024096993682865452, 0.023434124028062272, 0..."


In [38]:
df_KlekotaRoth_fp.to_csv('results/Fingerprints/Results_KlekotaRoth_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_KlekotaRoth_fp_RRCK.csv')

In [39]:
#KlekotaRoth Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/KlekotaRothCount_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/KlekotaRothCount_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRothCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRothCount_fp

X_train shape:  (140, 4860)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 4860)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 115
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 984
[LightGBM] [Info] Number of data points in the train set: 112, number of used features



-0.5060689921767687




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.281,0.4092,0.5301,0.3731,0.6146,0.6503,0.4425,0.4843,0.6652,0.3878,0.7216,0.7734
DecisionTreeRegressor,0.2195,0.3542,0.4685,0.5103,0.7426,0.7345,0.2151,0.3537,0.4638,0.7024,0.8769,0.8547
RandomForestRegressor,0.2301,0.3651,0.4797,0.4867,0.7108,0.7237,0.3899,0.4212,0.6244,0.4606,0.7948,0.8506
GradientBoostingRegressor,0.2297,0.3631,0.4793,0.4875,0.7094,0.7296,0.2797,0.3811,0.5289,0.613,0.8309,0.834
AdaBoostRegressor,0.271,0.42,0.5206,0.3955,0.6423,0.6612,0.4246,0.4771,0.6517,0.4125,0.7854,0.7922
XGBRegressor,0.2117,0.3596,0.4601,0.5278,0.7355,0.7368,0.2844,0.355,0.5333,0.6066,0.8429,0.8671
ExtraTreesRegressor,0.2296,0.3467,0.4791,0.4879,0.7085,0.7305,0.3636,0.395,0.603,0.497,0.7755,0.8381
LinearRegression,0.6652,0.6279,0.8156,-0.484,0.3858,0.4361,0.9922,0.7263,0.9961,-0.3726,0.2565,0.369
KNeighborsRegressor,0.3672,0.4302,0.6059,0.181,0.5326,0.5495,0.5784,0.5507,0.7605,0.1998,0.6653,0.685
SVR,0.2798,0.3916,0.529,0.3758,0.6183,0.6343,0.5265,0.4838,0.7256,0.2716,0.6727,0.7557


In [40]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.152365028269188, -5.880228473966994, -5.39...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.152365028269188, -6.152365028269188, -6.3...","[-6.1770176366852265, -6.167559636108885, -6.2...","[0.11604630767510929, 0.13007760341596117, 0.1..."
1,DecisionTreeRegressor,"[-6.13, -5.85, -5.08, -5.85, -6.05, -4.0, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.95, -5.95, -6.22, -6.46, -6.22, -5.08, -6...","[-5.97, -5.933999999999999, -6.884, -6.172, -6...","[0.08579044235810879, 0.03200000000000003, 0.5..."
2,RandomForestRegressor,"[-5.983187499999997, -5.860755000000004, -5.10...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.015487499999998, -5.972319999999999, -6.3...","[-6.0709775, -6.061763999999999, -6.2435835714...","[0.05420036000802919, 0.08568191631843941, 0.1..."
3,GradientBoostingRegressor,"[-6.148926619733519, -6.064779248425036, -5.18...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.132191573704119, -6.028819205528163, -6.7...","[-6.1182403563772585, -6.037239853550033, -6.6...","[0.017700181803187744, 0.05005084780900712, 0...."
4,AdaBoostRegressor,"[-6.106428571428572, -5.914482758620689, -5.39...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.103846153846154, -5.970833333333332, -6.4...","[-6.18847052947053, -6.116188515406163, -6.356...","[0.08801293849790826, 0.13565161908387485, 0.0..."
5,XGBRegressor,"[-6.552158, -6.0017843, -5.079845, -5.8452153,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.088332, -5.9509273, -6.5791316, -6.444233...","[-6.0781803, -6.005694, -6.4812117, -6.29535, ...","[0.06692856, 0.11055781, 0.09122321, 0.3989631..."
6,ExtraTreesRegressor,"[-6.037799999999996, -6.054500000000006, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.957400000000002, -5.949999999999999, -6.3...","[-5.994159999999999, -5.972599999999999, -6.24...","[0.0608133735291828, 0.04520000000000018, 0.11..."
7,LinearRegression,"[-7.72505647150127, -6.176072281030752, -5.679...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0741329519061, -5.857350247147322, -7.105...","[-6.681030325789566, -5.839376835037964, -6.58...","[0.9830724082689085, 0.08609001329407302, 0.35..."
8,KNeighborsRegressor,"[-5.816666666666666, -6.463333333333334, -5.15...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.76...","[0.05153639490690065, 0.05153639490690065, 0.0..."
9,SVR,"[-5.965903993055895, -5.973780843285969, -5.31...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.022863473027267, -5.9868188323950395, -5....","[-6.021602941339276, -5.979372992798597, -5.82...","[0.009937980847251091, 0.009221867661003259, 0..."


In [41]:
df_KlekotaRothCount_fp.to_csv('results/Fingerprints/Results_KlekotaRoth_Count_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_KlekotaRoth_Count_fp_RRCK.csv')

In [42]:
#MACCS fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/MACCS_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/MACCS_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_MACCS_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_MACCS_fp

X_train shape:  (140, 166)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 166)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 19
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 18
[LightGBM] [Info] Start training from score -5.528750
[LightGBM]



-0.6722431171466088




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3365,0.452,0.5801,0.2493,0.4995,0.4936,0.6684,0.5749,0.8176,0.0753,0.4528,0.4776
DecisionTreeRegressor,0.4445,0.4865,0.6667,0.0085,0.4461,0.4832,0.6634,0.5687,0.8145,0.0822,0.542,0.5997
RandomForestRegressor,0.3484,0.4296,0.5903,0.2228,0.5125,0.5563,0.6177,0.5285,0.7859,0.1454,0.5745,0.6323
GradientBoostingRegressor,0.3118,0.4066,0.5584,0.3045,0.5678,0.5939,0.6131,0.5098,0.783,0.1518,0.5523,0.6058
AdaBoostRegressor,0.3883,0.498,0.6231,0.1338,0.3785,0.4077,0.6555,0.5825,0.8096,0.0931,0.5412,0.5617
XGBRegressor,0.3958,0.4629,0.6291,0.1171,0.4873,0.5291,0.627,0.5429,0.7919,0.1325,0.5601,0.6357
ExtraTreesRegressor,0.4108,0.4729,0.641,0.0835,0.4695,0.5085,0.6384,0.5504,0.799,0.1168,0.5554,0.6131
LinearRegression,0.5911,0.5924,0.7688,-0.3185,0.3437,0.3969,0.7139,0.6271,0.8449,0.0124,0.399,0.4897
KNeighborsRegressor,0.388,0.4424,0.6229,0.1344,0.4704,0.4908,0.5233,0.5177,0.7234,0.2761,0.6583,0.6626
SVR,0.3688,0.453,0.6073,0.1773,0.4511,0.4618,0.6398,0.5113,0.7999,0.1149,0.5323,0.5644


In [43]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.714305257520165, -5.7351627215585195, -5.6...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.7229386857991065, -5.908164798495146, -5....","[-5.758408402046882, -5.8729129281157695, -5.7...","[0.05326518403918413, 0.0765128527445992, 0.09..."
1,DecisionTreeRegressor,"[-6.0, -6.62, -5.3933333333333335, -6.62, -6.0...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0, -5.95, -5.924000000000001, -4.915, -5....","[-6.0, -6.116, -5.8691, -4.946, -5.8691, -5.56...","[0.08221921916437779, 0.332, 0.254591515962335..."
2,RandomForestRegressor,"[-5.726783333333334, -6.3171074603174615, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.944150476190478, -5.927961666666665, -5.8...","[-5.999196857142858, -5.929596476190474, -5.88...","[0.08005322221322991, 0.028838569044276858, 0...."
3,GradientBoostingRegressor,"[-6.011125206017631, -6.24705565523601, -5.491...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.011125206017631, -5.9747449321520785, -5....","[-6.037799406362517, -6.03824610836174, -5.885...","[0.054443065118369965, 0.15226692707373737, 0...."
4,AdaBoostRegressor,"[-5.983684210526315, -5.701931818181817, -5.48...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983684210526315, -5.983684210526315, -6.0...","[-5.843443619128466, -5.705415971234392, -5.91...","[0.0984116014353851, 0.16957428376957479, 0.22..."
5,XGBRegressor,"[-6.0250797, -6.6187816, -5.3933473, -6.618781...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.999671, -5.950577, -5.9242854, -4.914399,...","[-6.0003157, -6.010991, -5.8691444, -4.945909,...","[0.08192728, 0.12242932, 0.2544323, 0.32176912..."
6,ExtraTreesRegressor,"[-6.0, -6.620000000000004, -5.393333333333327,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0, -5.949999999999999, -5.92399999999999,...","[-6.0, -6.0973599999999974, -5.869099999999994...","[0.08221921916437555, 0.29471999999999704, 0.2..."
7,LinearRegression,"[-5.697555622499471, -5.958346317257138, -5.49...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.996218598601658, -5.957562802796689, -6.0...","[-6.003146048317563, -5.880487546290653, -6.01...","[0.11616390984851654, 0.1708480014802227, 0.12..."
8,KNeighborsRegressor,"[-5.916666666666667, -5.6000000000000005, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.2...","[-6.118666666666667, -6.118666666666667, -6.24...","[0.11400974617208051, 0.11400974617208051, 0.0..."
9,SVR,"[-5.6890735282038225, -5.742176694664073, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.970022752433978, -5.932125605971171, -5.7...","[-6.011574804811504, -5.954565737367078, -5.74...","[0.035479089298743695, 0.0165812970343375, 0.1..."


In [44]:
df_MACCS_fp.to_csv('results/Fingerprints/Results_MACCS_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_MACCS_fp_RRCK.csv')

In [45]:
#PubChem fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/PubChem_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/PubChem_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_PubChem_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_PubChem_fp

X_train shape:  (140, 881)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 881)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 138
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 46
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 135
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 4



-0.006931325020985701


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3519,0.4817,0.5932,0.215,0.4641,0.4516,0.6873,0.607,0.829,0.0491,0.4513,0.5058
DecisionTreeRegressor,0.4732,0.5288,0.6879,-0.0556,0.3794,0.404,0.5937,0.5448,0.7705,0.1786,0.517,0.6273
RandomForestRegressor,0.3976,0.4878,0.6305,0.1131,0.421,0.4637,0.5814,0.5635,0.7625,0.1957,0.5528,0.6309
GradientBoostingRegressor,0.3715,0.4759,0.6095,0.1712,0.4564,0.4703,0.5872,0.5531,0.7663,0.1876,0.5475,0.6329
AdaBoostRegressor,0.3603,0.482,0.6003,0.1962,0.4506,0.4381,0.646,0.5951,0.8038,0.1062,0.498,0.516
XGBRegressor,0.4473,0.5099,0.6688,0.0023,0.4002,0.4278,0.5746,0.5408,0.7581,0.205,0.5342,0.6204
ExtraTreesRegressor,0.4566,0.5175,0.6757,-0.0186,0.3926,0.4143,0.5988,0.5481,0.7738,0.1717,0.5129,0.6119
LinearRegression,0.4199,0.5261,0.648,0.0633,0.4038,0.4071,0.7005,0.5771,0.837,0.0309,0.4059,0.5347
KNeighborsRegressor,0.5113,0.5552,0.715,-0.1405,0.2847,0.284,0.6487,0.6229,0.8054,0.1026,0.5277,0.5039
SVR,0.3797,0.4898,0.6162,0.1529,0.417,0.3994,0.7829,0.5988,0.8848,-0.0831,0.3836,0.5045


In [46]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.753866958463793, -5.854455918609657, -5.85...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.878149226881638, -5.878149226881638, -5.8...","[-5.923264893294471, -5.923264893294471, -5.79...","[0.07177404166900563, 0.07177404166900563, 0.0..."
1,DecisionTreeRegressor,"[-6.46, -6.039999999999999, -5.554285714285714...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.0...","[-5.983333333333333, -5.983333333333333, -5.96...","[0.04211096452627664, 0.04211096452627664, 0.0..."
2,RandomForestRegressor,"[-5.976008333333331, -5.970340008325008, -5.58...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.997853809523812, -5.997853809523812, -5.9...","[-5.989216984126985, -5.989216984126985, -5.97...","[0.018170567574966958, 0.018170567574966958, 0..."
3,GradientBoostingRegressor,"[-5.777679540299164, -6.002454654198335, -5.60...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.950060603605846, -5.950060603605846, -6.0...","[-6.0063293107505, -6.0063293107505, -5.920147...","[0.04453418056679399, 0.04453418056679399, 0.1..."
4,AdaBoostRegressor,"[-5.9513043478260865, -5.9513043478260865, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9513043478260865, -5.9513043478260865, -5...","[-5.981022159887798, -5.981022159887798, -5.88...","[0.21863556169254136, 0.21863556169254136, 0.1..."
5,XGBRegressor,"[-6.1351423, -6.0396934, -5.554266, -6.0396934...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983199, -5.983199, -6.0396934, -6.460977,...","[-5.9832635, -5.9832635, -5.9684286, -6.274962...","[0.042107865, 0.042107865, 0.095423594, 0.3668..."
6,ExtraTreesRegressor,"[-6.461849999999999, -6.040000000000003, -5.55...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.98333333333334, -5.98333333333334, -6.040...","[-5.983333333333337, -5.983333333333337, -5.96...","[0.0421109645262755, 0.0421109645262755, 0.095..."
7,LinearRegression,"[-5.7938634263049, -6.040000000000003, -5.7280...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333326, -5.983333333333326, -6.0...","[-5.983333333333332, -5.983333333333332, -5.94...","[0.04211096452627161, 0.04211096452627161, 0.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.289999999999999, -5.99...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.2...","[-5.806666666666667, -5.806666666666667, -6.15...","[0.14695426348206275, 0.14695426348206275, 0.2..."
9,SVR,"[-5.722729745617715, -5.555846132110769, -5.46...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.969635125484497, -5.969635125484497, -5.5...","[-5.95779285177184, -5.95779285177184, -5.5661...","[0.05884937467876745, 0.05884937467876745, 0.0..."


In [47]:
df_PubChem_fp.to_csv('results/Fingerprints/Results_PubChem_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_PubChem_fp_RRCK.csv')

In [48]:
#Substructure fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/Substructure_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/Substructure_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_Substructure_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_Substructure_fp

X_train shape:  (140, 307)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 307)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test



-0.2797587211086854




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.425,0.5387,0.6519,0.0519,0.2334,0.1666,0.7419,0.6609,0.8613,-0.0264,0.4574,0.5202
DecisionTreeRegressor,0.4208,0.5361,0.6487,0.0613,0.3087,0.2847,0.6855,0.6412,0.828,0.0517,0.5511,0.4765
RandomForestRegressor,0.4129,0.5295,0.6426,0.079,0.3117,0.3013,0.6807,0.6386,0.8251,0.0582,0.5566,0.4844
GradientBoostingRegressor,0.4143,0.532,0.6436,0.0759,0.3145,0.2874,0.6976,0.646,0.8353,0.0349,0.4899,0.3886
AdaBoostRegressor,0.3992,0.5176,0.6318,0.1096,0.3363,0.3266,0.7249,0.6583,0.8514,-0.0028,0.435,0.4351
XGBRegressor,0.4224,0.541,0.6499,0.0577,0.3031,0.298,0.6835,0.6332,0.8268,0.0544,0.52,0.4612
ExtraTreesRegressor,0.419,0.5366,0.6473,0.0653,0.3111,0.2908,0.6822,0.6364,0.826,0.0562,0.5536,0.4761
LinearRegression,0.4402,0.5435,0.6634,0.0181,0.2594,0.2699,0.7324,0.6704,0.8558,-0.0132,0.3784,0.38
KNeighborsRegressor,0.8065,0.6999,0.8981,-0.7992,-0.0873,-0.0468,0.8424,0.7184,0.9178,-0.1654,0.075,0.0868
SVR,0.4421,0.5516,0.6649,0.0139,0.2362,0.2501,0.7524,0.6544,0.8674,-0.0409,0.4729,0.4741


In [49]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.331757615267175, -5.822754608356731, -5.76...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.331757615267175, -5.331757615267175, -5.8...","[-5.347479367976341, -5.347479367976341, -5.78...","[0.028872876421998397, 0.028872876421998397, 0..."
1,DecisionTreeRegressor,"[-5.983333333333333, -6.04, -5.685, -6.04, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.0...","[-5.983333333333333, -5.983333333333333, -5.96...","[0.04211096452627664, 0.04211096452627664, 0.0..."
2,RandomForestRegressor,"[-5.7315751984127, -5.970340008325009, -5.6866...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.950953809523813, -5.950953809523813, -5.9...","[-5.935148515873016, -5.935148515873016, -5.96...","[0.04782697157945389, 0.04782697157945389, 0.0..."
3,GradientBoostingRegressor,"[-5.764487828854021, -6.011506158998895, -5.69...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.96439545084737, -5.96439545084737, -6.011...","[-5.956764585740325, -5.956764585740325, -5.93...","[0.05049147911454359, 0.05049147911454359, 0.0..."
4,AdaBoostRegressor,"[-5.803333333333334, -5.861666666666667, -5.68...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.803333333333334, -5.803333333333334, -5.8...","[-5.979066666666666, -5.979066666666666, -5.84...","[0.14132993706611166, 0.14132993706611166, 0.0..."
5,XGBRegressor,"[-5.7749543, -6.039609, -5.6852345, -6.039609,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983357, -5.983357, -6.039609, -5.1223154,...","[-5.9832067, -5.9832067, -5.968057, -5.1953177...","[0.042183686, 0.042183686, 0.09539337, 0.11009..."
6,ExtraTreesRegressor,"[-5.98333333333334, -6.040000000000003, -5.684...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.98333333333334, -5.98333333333334, -6.040...","[-5.983333333333337, -5.983333333333337, -5.96...","[0.0421109645262755, 0.0421109645262755, 0.095..."
7,LinearRegression,"[-4.645333333333331, -6.037526076491041, -5.68...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333331, -5.983333333333331, -6.0...","[-5.983333333333333, -5.983333333333333, -5.96...","[0.04211096452627709, 0.04211096452627709, 0.0..."
8,KNeighborsRegressor,"[-5.53, -6.289999999999999, -5.993333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.2...","[-5.806666666666667, -5.806666666666667, -6.15...","[0.14695426348206275, 0.14695426348206275, 0.2..."
9,SVR,"[-5.773177325907158, -5.840387052176771, -5.61...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9700794763955916, -5.9700794763955916, -5...","[-5.957876134059558, -5.957876134059558, -5.80...","[0.05886333837942849, 0.05886333837942849, 0.1..."


In [50]:
df_Substructure_fp.to_csv('results/Fingerprints/Results_Substructure_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Substructure_fp_RRCK.csv')

In [51]:
#Substructure Count fingerprints
df_train = pd.read_csv('features/Fingerprints/Train/SubstructureCount_train_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Fingerprints/Test/SubstructureCount_test_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_SubstructureCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_SubstructureCount_fp

X_train shape:  (140, 307)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 307)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 13
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 12
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o



-0.35108525340014984




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3076,0.4464,0.5547,0.3137,0.562,0.5827,0.6537,0.5759,0.8085,0.0957,0.4875,0.5324
DecisionTreeRegressor,0.4231,0.482,0.6504,0.0563,0.4961,0.4926,0.3684,0.4144,0.607,0.4903,0.7334,0.7848
RandomForestRegressor,0.2495,0.3876,0.4995,0.4434,0.6709,0.6956,0.4711,0.4611,0.6864,0.3483,0.6801,0.7202
GradientBoostingRegressor,0.2929,0.4182,0.5412,0.3465,0.6059,0.6208,0.3513,0.4074,0.5927,0.5139,0.7738,0.8029
AdaBoostRegressor,0.2884,0.4363,0.537,0.3567,0.6012,0.6251,0.5285,0.5253,0.727,0.2688,0.6486,0.7033
XGBRegressor,0.2545,0.3771,0.5045,0.4322,0.6722,0.6765,0.4329,0.4187,0.6579,0.4011,0.6899,0.7726
ExtraTreesRegressor,0.2656,0.3848,0.5153,0.4076,0.6527,0.6754,0.4532,0.4363,0.6732,0.3731,0.667,0.755
LinearRegression,0.4224,0.4943,0.6499,0.0578,0.4306,0.484,0.6415,0.599,0.801,0.1125,0.4877,0.5458
KNeighborsRegressor,0.3574,0.4462,0.5979,0.2027,0.534,0.5692,0.6021,0.5459,0.7759,0.1671,0.5712,0.6098
SVR,0.3372,0.4432,0.5807,0.2477,0.5189,0.5472,0.656,0.5876,0.8099,0.0925,0.5709,0.6176


In [52]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.944464865258419, -5.635052653466851, -5.51...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.944464865258419, -5.944464865258419, -5.6...","[-6.001314772131197, -6.001314772131197, -5.62...","[0.09468251270009752, 0.09468251270009752, 0.0..."
1,DecisionTreeRegressor,"[-6.13, -5.08, -5.08, -5.4, -6.05, -6.2, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -6.46, -6.46, -7.48, -5.08, -5...","[-6.2, -5.986, -6.117999999999999, -6.41200000...","[0.24033310217279685, 0.07199999999999988, 0.4..."
2,RandomForestRegressor,"[-6.010399999999999, -5.425216031746036, -5.12...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.023599999999998, -5.9536, -6.023783333333...","[-6.107889999999998, -6.0810699999999995, -6.0...","[0.049507983194632725, 0.08761111573310829, 0...."
3,GradientBoostingRegressor,"[-5.959694613336954, -5.274320184470579, -5.11...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.00217794382563, -5.9713448596144385, -6.1...","[-6.021231619808823, -6.005497102370802, -6.15...","[0.06712402227890822, 0.04491198232978034, 0.0..."
4,AdaBoostRegressor,"[-6.057083333333335, -5.713333333333335, -5.31...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.057083333333335, -6.057083333333335, -6.0...","[-6.1083956140350875, -6.1083956140350875, -6....","[0.051692814727876714, 0.051692814727876714, 0..."
5,XGBRegressor,"[-6.128929, -5.3090863, -5.080198, -5.4003363,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9565606, -5.9533124, -6.024044, -6.466758...","[-5.9826574, -5.944065, -5.942799, -6.3996835,...","[0.026419247, 0.01588873, 0.10191754, 0.128357..."
6,ExtraTreesRegressor,"[-6.047399999999997, -5.539200000000001, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0295999999999985, -5.949999999999999, -5....","[-6.034679999999999, -5.976859999999999, -5.87...","[0.044410737440397, 0.053720000000000205, 0.14..."
7,LinearRegression,"[-4.426033202701838, -5.929105145061196, -5.50...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.743472345835985, -6.012766707376365, -6.2...","[-6.801604983273843, -6.023761848286085, -6.33...","[0.28650014513941713, 0.05504217301855721, 0.1..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.533333333333334, -5.15...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-5.924666666666667, -5.924666666666667, -5.69...","[0.05153639490690065, 0.05153639490690065, 0.1..."
9,SVR,"[-5.905444524254662, -5.772972007201016, -5.39...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.994128292195161, -5.994709368797728, -6.0...","[-5.991491969725333, -5.987653453328037, -5.96...","[0.03180110496156662, 0.030703671806024926, 0...."


In [53]:
df_SubstructureCount_fp.to_csv('results/Fingerprints/Results_Substructure_Count_fp_RRCK.csv')
pred_df.to_csv('results/Fingerprints/Prediction_data_Substructure_Count_fp_RRCK.csv')

In [94]:
#Descriptors models
#2d RDKit descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2drdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2drdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 217)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 217)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2859
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 117
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2852
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 117
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



0.1245769361026926




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.237,0.3553,0.4868,0.4713,0.6887,0.7113,0.5246,0.476,0.7243,0.2742,0.622,0.7254
DecisionTreeRegressor,0.4035,0.4595,0.6353,0.0998,0.5743,0.5936,0.32,0.385,0.5657,0.5573,0.7732,0.7706
RandomForestRegressor,0.2195,0.3532,0.4685,0.5104,0.7249,0.7408,0.4632,0.4532,0.6806,0.3592,0.7093,0.7482
GradientBoostingRegressor,0.223,0.3457,0.4723,0.5025,0.7124,0.7188,0.3887,0.4009,0.6235,0.4623,0.752,0.8023
AdaBoostRegressor,0.2553,0.3935,0.5053,0.4304,0.6704,0.6673,0.5126,0.4873,0.716,0.2908,0.7002,0.7464
XGBRegressor,0.2335,0.3613,0.4832,0.4791,0.7009,0.7014,0.4798,0.4649,0.6927,0.3362,0.6585,0.6768
ExtraTreesRegressor,0.1947,0.3274,0.4412,0.5658,0.7608,0.7731,0.3913,0.4171,0.6256,0.4586,0.7401,0.7865
LinearRegression,3.2099,1.182,1.7916,-6.1605,0.3155,0.3398,1.0421,0.7823,1.0208,-0.4417,0.6623,0.622
KNeighborsRegressor,0.3124,0.4019,0.5589,0.3032,0.6184,0.6459,0.4943,0.5017,0.7031,0.3161,0.7443,0.7622
SVR,0.2056,0.3341,0.4534,0.5414,0.7449,0.7628,0.4234,0.4407,0.6507,0.4142,0.754,0.8111


In [95]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.93835987288455, -5.762486820527443, -5.244...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.348698928507222, -6.159982280669087, -6.0...","[-6.320802528332644, -6.251922628884051, -5.99...","[0.0371313667093177, 0.06321831468574877, 0.10..."
1,DecisionTreeRegressor,"[-6.13, -6.46, -5.22, -4.92, -6.15, -4.0, -6.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -7.48, -6.5, -7.48, -4.72, -6....","[-6.2299999999999995, -6.052, -6.8580000000000...","[0.28446440902158565, 0.20399999999999993, 0.7..."
2,RandomForestRegressor,"[-6.0016, -5.764133333333333, -5.1121000000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0619499999999995, -6.023399999999998, -6....","[-6.125453190476191, -6.058525714285713, -6.19...","[0.0975147506680879, 0.05968811142009827, 0.15..."
3,GradientBoostingRegressor,"[-5.9512253473473615, -5.558338989754418, -5.0...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.978721093957619, -5.9633574814749615, -6....","[-6.3160505443908175, -6.003442476614646, -6.4...","[0.1955404100609013, 0.0674073215586169, 0.238..."
4,AdaBoostRegressor,"[-5.9, -5.521836734693875, -5.210875, -5.37524...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.976666666666667, -5.976666666666667, -6.0...","[-6.090144081805252, -6.181590090090089, -6.05...","[0.09204693791553567, 0.13389765499980477, 0.1..."
5,XGBRegressor,"[-6.1026864, -5.3469954, -5.0746627, -5.12929,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.789037, -5.9500117, -6.0749774, -5.768519...","[-6.009329, -6.0117903, -6.089252, -5.234092, ...","[0.211859, 0.12290845, 0.34433013, 0.6226411, ..."
6,ExtraTreesRegressor,"[-6.053099999999997, -5.643450000000001, -5.05...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.060299999999997, -5.949999999999999, -6.3...","[-6.095261333333333, -5.9932799999999995, -6.4...","[0.07409313459273964, 0.0865599999999997, 0.17..."
7,LinearRegression,"[-4.0, -5.494905840954672, -5.084799965957416,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.188757338687253, -5.927155234796089, -5.4...","[-5.991807303864134, -5.553979930527265, -8.40...","[2.2472698578631456, 0.7770320298594786, 1.650..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.3...","[-5.924666666666667, -5.924666666666667, -6.19...","[0.05153639490690065, 0.05153639490690065, 0.1..."
9,SVR,"[-6.10967609215014, -5.705788111659985, -5.216...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.066532700789061, -5.969497994045415, -6.2...","[-6.0900001726033555, -5.968102157911325, -6.1...","[0.02041119487560847, 0.027503661844547247, 0...."


In [96]:
result_df.to_csv('results/Descriptors/Results_2d_RDKit_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_RDKit_desc_RRCK.csv')

In [97]:
#2d Mordred descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df , prediction_df= train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 1430)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1430)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38899
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1111
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38632
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1113
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1849,0.317,0.43,0.5876,0.775,0.8056,0.4376,0.4193,0.6615,0.3946,0.7041,0.7979
DecisionTreeRegressor,0.3915,0.4377,0.6257,0.1267,0.5221,0.5477,0.2967,0.3822,0.5447,0.5895,0.8008,0.792
RandomForestRegressor,0.1818,0.3186,0.4264,0.5944,0.7927,0.812,0.4509,0.447,0.6715,0.3762,0.718,0.793
GradientBoostingRegressor,0.18,0.3109,0.4243,0.5985,0.7783,0.7897,0.3722,0.3998,0.6101,0.4851,0.7508,0.823
AdaBoostRegressor,0.2116,0.3606,0.46,0.5279,0.7408,0.7538,0.4802,0.4566,0.693,0.3356,0.703,0.7752
XGBRegressor,0.2262,0.3495,0.4756,0.4955,0.7078,0.7196,0.4588,0.44,0.6774,0.3652,0.6773,0.7516
ExtraTreesRegressor,0.1733,0.3059,0.4162,0.6135,0.7921,0.8117,0.4354,0.4203,0.6598,0.3977,0.7045,0.7771
LinearRegression,0.7643,0.6221,0.8743,-0.705,0.4778,0.5478,0.426,0.4833,0.6527,0.4107,0.7495,0.6648
KNeighborsRegressor,0.2873,0.3885,0.536,0.3592,0.641,0.6614,0.5536,0.4957,0.744,0.2341,0.6273,0.6864
SVR,0.1892,0.3014,0.4349,0.578,0.7678,0.7889,0.4159,0.4107,0.6449,0.4246,0.7278,0.7808


In [98]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.910141696269743, -6.161793987607897, -5.15...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.91895199403101, -6.0594222330234695, -6.0...","[-6.05603627351805, -6.077932022591685, -6.114...","[0.11070492901527226, 0.09231332670294543, 0.1..."
1,DecisionTreeRegressor,"[-5.21, -5.85, -4.72, -5.02, -5.32, -6.46, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.6, -5.95, -7.48, -6.46, -7.48, -4.72, -6....","[-5.918, -5.933999999999999, -6.73799999999999...","[0.7186765614655874, 0.03200000000000003, 0.74..."
2,RandomForestRegressor,"[-5.994349999999998, -6.064000000000008, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.654166666666666, -5.96145, -6.29740000000...","[-6.042965833333333, -6.0453899999999985, -6.1...","[0.2052031476569825, 0.06524620601996699, 0.16..."
3,GradientBoostingRegressor,"[-6.09884285066795, -6.00455029676722, -4.9890...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.477661308181957, -5.958295064354016, -6.5...","[-6.00864009521185, -6.032885926086662, -6.303...","[0.3187051205024514, 0.15339208394983397, 0.34..."
4,AdaBoostRegressor,"[-6.0, -6.191428571428572, -5.1783333333333355...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -6.136666666666667, -5.886, -6.01475,...","[-6.2259932126696835, -6.0760700000000005, -6....","[0.1379009094159843, 0.07838777994333362, 0.08..."
5,XGBRegressor,"[-5.910527, -5.9518404, -4.9645486, -5.1917067...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.4869833, -5.9503827, -6.074064, -5.76294,...","[-5.9810205, -5.944532, -6.0122857, -5.697724,...","[0.33251268, 0.011460449, 0.21454109, 0.251155..."
6,ExtraTreesRegressor,"[-6.058599999999999, -6.025200000000002, -4.95...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.083399999999999, -5.949999999999999, -6.2...","[-6.2022900000000005, -6.002979999999999, -6.2...","[0.06937457315184002, 0.10595999999999997, 0.2..."
7,LinearRegression,"[-6.028067990745983, -5.426419979973455, -5.35...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.98251282097078, -5.950000000000013, -7.60...","[-5.652689665055912, -5.863085364244387, -7.46...","[1.168829365029526, 0.17382927151130348, 1.778..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.039999999999999, -5.02...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.997333333333334, -5.75...","[0.11400974617208051, 0.13330666399946667, 0.0..."
9,SVR,"[-6.05043537180491, -5.856851789129326, -5.226...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.037712785994414, -6.027298502190129, -6.0...","[-6.08354509709049, -6.062131013647689, -5.977...","[0.04393336812883024, 0.036540383792336824, 0...."


In [99]:
result_df.to_csv('results/Descriptors/Results_2d_Mordred_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_Mordred_desc_RRCK.csv')

In [100]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [101]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [102]:
#2d RDKit descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 160)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 160)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000487 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2859
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 117
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2852
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 117
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



0.19066528570698915




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.237,0.3553,0.4868,0.4713,0.6887,0.7113,0.5246,0.476,0.7243,0.2742,0.622,0.7254
DecisionTreeRegressor,0.4109,0.4679,0.641,0.0834,0.5841,0.5869,0.3413,0.4084,0.5842,0.5279,0.7582,0.7754
RandomForestRegressor,0.2208,0.353,0.4699,0.5075,0.7228,0.74,0.4666,0.4531,0.6831,0.3545,0.7069,0.7498
GradientBoostingRegressor,0.2199,0.3472,0.4689,0.5095,0.7176,0.7266,0.3985,0.4039,0.6313,0.4487,0.7435,0.8027
AdaBoostRegressor,0.2665,0.4022,0.5163,0.4054,0.65,0.6716,0.4792,0.4753,0.6923,0.337,0.7252,0.7552
XGBRegressor,0.2335,0.3613,0.4832,0.4791,0.7009,0.7014,0.4798,0.4649,0.6927,0.3362,0.6585,0.6768
ExtraTreesRegressor,0.191,0.3237,0.437,0.574,0.7648,0.7719,0.3781,0.415,0.6149,0.4769,0.7504,0.7894
LinearRegression,3.2099,1.182,1.7916,-6.1605,0.3155,0.3398,1.0421,0.7823,1.0208,-0.4417,0.6623,0.622
KNeighborsRegressor,0.3124,0.4019,0.5589,0.3032,0.6184,0.6459,0.4943,0.5017,0.7031,0.3161,0.7443,0.7622
SVR,0.2056,0.3341,0.4534,0.5414,0.7449,0.7627,0.4236,0.441,0.6509,0.4139,0.754,0.8111


In [103]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.93835987288455, -5.762486820527443, -5.244...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.348698928507222, -6.159982280669087, -6.0...","[-6.320802528332644, -6.251922628884051, -5.99...","[0.0371313667093177, 0.06321831468574877, 0.10..."
1,DecisionTreeRegressor,"[-6.13, -6.92, -4.72, -4.92, -6.05, -4.0, -6.0...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -7.48, -6.46, -7.48, -5.4, -6....","[-6.306, -6.0920000000000005, -6.728, -6.33799...","[0.2964186228967405, 0.284, 0.9344816745126684..."
2,RandomForestRegressor,"[-6.059800000000001, -5.779, -5.10570000000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.1035, -6.023399999999999, -6.188650000000...","[-6.1369886904761906, -6.069018333333333, -6.2...","[0.09233705042218272, 0.0649783182642059, 0.16..."
3,GradientBoostingRegressor,"[-5.950203698175421, -5.485281991097383, -5.00...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.081467942262058, -5.956408498940884, -6.6...","[-6.33020336225544, -5.9964755512598185, -6.32...","[0.1650208991254781, 0.05750759490218278, 0.29..."
4,AdaBoostRegressor,"[-5.830000000000001, -5.653571428571428, -5.26...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.958606557377045, -6.001666666666666, -6.2...","[-6.133591796425241, -6.20989090909091, -6.124...","[0.1504264077553014, 0.1068085567982552, 0.262..."
5,XGBRegressor,"[-6.1026864, -5.3469954, -5.0746627, -5.12929,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.789037, -5.9500117, -6.0749774, -5.768519...","[-6.009329, -6.0117903, -6.089252, -5.234092, ...","[0.211859, 0.12290845, 0.34433013, 0.6226411, ..."
6,ExtraTreesRegressor,"[-6.1362, -5.696700000000004, -5.0348000000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0559, -5.949999999999999, -6.444600000000...","[-6.071533333333333, -5.993079999999999, -6.51...","[0.04600661305121625, 0.08615999999999956, 0.2..."
7,LinearRegression,"[-4.0, -5.494905840962645, -5.084799965956051,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.188757338418725, -5.927155234874078, -5.4...","[-5.991807304015241, -5.553979930541504, -8.40...","[2.2472698579532855, 0.7770320298662645, 1.650..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.3...","[-5.924666666666667, -5.924666666666667, -6.19...","[0.05153639490690065, 0.05153639490690065, 0.1..."
9,SVR,"[-6.109675600467604, -5.70578819712073, -5.216...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.066532351116907, -5.969497843816715, -6.2...","[-6.089940263045203, -5.968082768247543, -6.17...","[0.020487311288602424, 0.027527694330146978, 0..."


In [104]:
result_df.to_csv('results/Descriptors/Results_2d_rdkit_const_rem_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_rdkit_const_rem_RRCK.csv')

In [105]:
#2d Mordred descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 1186)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1186)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38899
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1111
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38632
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1113
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1849,0.317,0.43,0.5876,0.775,0.8056,0.4376,0.4193,0.6615,0.3946,0.7041,0.7979
DecisionTreeRegressor,0.3933,0.4364,0.6272,0.1226,0.5121,0.5311,0.3496,0.4232,0.5913,0.5163,0.7697,0.7598
RandomForestRegressor,0.186,0.3222,0.4313,0.5851,0.7873,0.8054,0.4519,0.4471,0.6723,0.3748,0.7186,0.7992
GradientBoostingRegressor,0.1821,0.3149,0.4267,0.5938,0.7752,0.7904,0.3793,0.405,0.6158,0.4753,0.7473,0.816
AdaBoostRegressor,0.2041,0.3555,0.4518,0.5447,0.7571,0.7655,0.4695,0.4658,0.6852,0.3505,0.7292,0.7852
XGBRegressor,0.2262,0.3495,0.4756,0.4955,0.7078,0.7196,0.4588,0.44,0.6774,0.3652,0.6773,0.7516
ExtraTreesRegressor,0.1667,0.2992,0.4083,0.6281,0.8033,0.8215,0.4305,0.4157,0.6562,0.4044,0.7078,0.7776
LinearRegression,0.7643,0.6221,0.8743,-0.705,0.4778,0.5478,0.426,0.4833,0.6527,0.4107,0.7495,0.6648
KNeighborsRegressor,0.2757,0.3747,0.5251,0.385,0.6532,0.6751,0.5319,0.4807,0.7293,0.2642,0.6406,0.7055
SVR,0.1892,0.3014,0.4349,0.578,0.7677,0.7889,0.4159,0.4107,0.6449,0.4246,0.7278,0.7808


In [106]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.910141696269743, -6.161793987607897, -5.15...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.91895199403101, -6.0594222330234695, -6.0...","[-6.05603627351805, -6.077932022591685, -6.114...","[0.11070492901527226, 0.09231332670294543, 0.1..."
1,DecisionTreeRegressor,"[-5.27, -5.85, -4.96, -5.02, -5.4, -6.46, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.6, -5.95, -7.48, -6.46, -5.85, -4.96, -6....","[-6.002000000000001, -5.933999999999999, -6.74...","[0.7757937870336422, 0.03200000000000003, 0.63..."
2,RandomForestRegressor,"[-5.982149999999997, -6.063800000000003, -5.12...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.662833333333333, -5.983850000000001, -6.3...","[-6.023353333333333, -6.055689999999999, -6.09...","[0.19818948996688224, 0.06029200112784414, 0.1..."
3,GradientBoostingRegressor,"[-6.123367042345114, -5.959793107475603, -4.99...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.486373418633038, -5.958295064354016, -6.5...","[-5.973699818006027, -6.017190649701306, -6.31...","[0.2979198630293542, 0.12204393959575797, 0.45..."
4,AdaBoostRegressor,"[-6.050000000000001, -6.165428571428572, -5.19...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.111590909090908, -6.017272727272727, -6.1...","[-6.178018181818181, -6.056378837752109, -6.03...","[0.09626257393450458, 0.10060528948648971, 0.1..."
5,XGBRegressor,"[-5.910527, -5.9518404, -4.9645486, -5.1917067...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.4869833, -5.9503827, -6.074064, -5.76294,...","[-5.9810205, -5.944532, -6.0122857, -5.697724,...","[0.33251268, 0.011460449, 0.21454109, 0.251155..."
6,ExtraTreesRegressor,"[-6.051399999999998, -5.808100000000004, -4.98...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.089849999999999, -5.949999999999999, -6.4...","[-6.190266666666667, -5.99142, -6.311750000000...","[0.05793824969549737, 0.08284000000000055, 0.1..."
7,LinearRegression,"[-6.028067990745973, -5.4264199799735655, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.982512820970865, -5.950000000000016, -7.6...","[-5.652689665056057, -5.863085364244375, -7.46...","[1.1688293650294197, 0.17382927151133237, 1.77..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.039999999999999, -5.02...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.997333333333334, -5.75...","[0.11400974617208051, 0.13330666399946667, 0.0..."
9,SVR,"[-6.050435408053784, -5.856851649250858, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.037712829140469, -6.0272985594992665, -6....","[-6.083501292361208, -6.0620855037124395, -5.9...","[0.04391504148235846, 0.03655566913657778, 0.0..."


In [107]:
result_df.to_csv('results/Descriptors/Results_2d_Mordred_const_rem_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_df_2d_Mordred_const_rem_RRCK.csv')

In [108]:
#2d RDKit descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_LVR_rdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_LVR_rdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 151)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 151)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2523
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 108
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2517
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 108
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



-0.2262621277749297




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2452,0.3639,0.4951,0.4531,0.6747,0.6982,0.489,0.4767,0.6993,0.3235,0.6662,0.7537
DecisionTreeRegressor,0.3452,0.4397,0.5875,0.2299,0.6204,0.6232,0.3637,0.4106,0.6031,0.4968,0.7378,0.769
RandomForestRegressor,0.2203,0.3515,0.4694,0.5085,0.7237,0.7374,0.4415,0.4502,0.6645,0.3891,0.7249,0.7516
GradientBoostingRegressor,0.2266,0.343,0.4761,0.4944,0.7067,0.7151,0.374,0.3991,0.6116,0.4826,0.7613,0.8118
AdaBoostRegressor,0.2712,0.4034,0.5208,0.3949,0.6389,0.6588,0.4518,0.4686,0.6722,0.375,0.7563,0.7721
XGBRegressor,0.225,0.3612,0.4744,0.498,0.7143,0.7135,0.43,0.4491,0.6558,0.4051,0.7047,0.6848
ExtraTreesRegressor,0.1838,0.321,0.4287,0.5901,0.7765,0.7851,0.3853,0.4189,0.6207,0.467,0.7427,0.782
LinearRegression,2.5034,0.9827,1.5822,-4.5843,0.3177,0.3847,0.8626,0.6234,0.9287,-0.1933,0.6385,0.651
KNeighborsRegressor,0.3101,0.3955,0.5569,0.3082,0.6163,0.6531,0.4958,0.4984,0.7041,0.3141,0.723,0.7439
SVR,0.2111,0.3393,0.4594,0.5292,0.737,0.754,0.4244,0.4454,0.6515,0.4128,0.758,0.818


In [109]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.845539719965485, -5.782674456035341, -5.24...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.331797070419781, -6.126348515483489, -6.1...","[-6.261330418542597, -6.202585219997639, -6.13...","[0.11962600530635428, 0.044499290272702635, 0...."
1,DecisionTreeRegressor,"[-6.13, -5.85, -5.22, -4.96, -5.335, -4.0, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -7.48, -6.46, -7.48, -5.4, -4....","[-6.290000000000001, -6.0920000000000005, -6.7...","[0.3166701754191576, 0.284, 0.9344816745126684..."
2,RandomForestRegressor,"[-6.014699999999998, -5.708850000000001, -5.11...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0349499999999985, -6.05252857142857, -6.4...","[-6.0808505, -6.081905714285712, -6.3542800000...","[0.05862866416788905, 0.05046412716586148, 0.1..."
3,GradientBoostingRegressor,"[-5.9292780754812116, -5.417244788096263, -5.0...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.049708089975782, -5.972039532934815, -6.6...","[-6.2602871564537805, -6.00528483274836, -6.43...","[0.16618750814625413, 0.06514601868553792, 0.2..."
4,AdaBoostRegressor,"[-6.153235294117647, -6.23, -5.382857142857143...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.912631578947368, -6.23, -6.43883720930232...","[-6.0741929824561405, -6.19352380952381, -6.29...","[0.11187465505018472, 0.05110355854678931, 0.0..."
5,XGBRegressor,"[-6.090651, -5.271743, -4.99249, -5.347108, -5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.794349, -5.950972, -6.407552, -5.6887646,...","[-5.9727006, -5.996957, -6.377507, -5.201325, ...","[0.09466652, 0.09276803, 0.27976292, 0.6452367..."
6,ExtraTreesRegressor,"[-6.1151, -5.631850000000001, -5.0400000000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.100899999999999, -5.949999999999999, -6.4...","[-6.0821966666666665, -5.984219999999999, -6.4...","[0.07326158641774289, 0.06844000000000038, 0.1..."
7,LinearRegression,"[-10.0, -5.489937817158172, -5.097898522698841...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.322694572590507, -5.8981446787330185, -6....","[-5.628798277714884, -5.545863734817898, -8.17...","[2.0498866287015742, 0.7731477879100248, 1.033..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -6.3...","[-5.924666666666667, -5.924666666666667, -6.19...","[0.05153639490690065, 0.05153639490690065, 0.1..."
9,SVR,"[-6.1029741034208325, -5.702227741023407, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.070823768649641, -5.978907368213092, -6.2...","[-6.09002197281381, -5.969587374748736, -6.191...","[0.018909837286720663, 0.022670879132164316, 0..."


In [110]:
result_df.to_csv('results/Descriptors/Results_2d_rdkit_LVR_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_rdkit_LVR_RRCK.csv')

In [111]:
#2d Mordred descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
results_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
results_df

X_train shape:  (140, 852)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 852)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26830
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 783
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26679
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 785
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2058,0.336,0.4537,0.5408,0.7412,0.7653,0.4434,0.4295,0.6659,0.3865,0.6952,0.7804
DecisionTreeRegressor,0.3688,0.4425,0.6073,0.1772,0.5424,0.5594,0.3839,0.4445,0.6196,0.4688,0.7314,0.7543
RandomForestRegressor,0.2014,0.3285,0.4488,0.5508,0.7608,0.7796,0.4641,0.4586,0.6813,0.3579,0.7004,0.7656
GradientBoostingRegressor,0.2131,0.3373,0.4616,0.5247,0.7271,0.7355,0.4081,0.4067,0.6388,0.4355,0.7196,0.7887
AdaBoostRegressor,0.2339,0.3629,0.4836,0.4782,0.6957,0.7142,0.4667,0.4656,0.6832,0.3543,0.7164,0.7701
XGBRegressor,0.2402,0.3638,0.4901,0.4642,0.6846,0.6902,0.4924,0.4419,0.7017,0.3188,0.6391,0.7177
ExtraTreesRegressor,0.1744,0.3046,0.4176,0.6109,0.7919,0.8128,0.4373,0.4242,0.6613,0.395,0.7026,0.7737
LinearRegression,1.025,0.7253,1.0124,-1.2864,0.4238,0.4663,0.585,0.5361,0.7649,0.1906,0.717,0.6416
KNeighborsRegressor,0.2931,0.3903,0.5414,0.3462,0.6235,0.6601,0.5655,0.4978,0.752,0.2177,0.6103,0.6686
SVR,0.206,0.3185,0.4539,0.5405,0.7411,0.7712,0.4466,0.4325,0.6683,0.3821,0.7014,0.7807


In [112]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.093634008837347, -6.13794839523806, -5.093...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.869023607183717, -5.9830588695675075, -6....","[-5.926783790589573, -6.081498373399899, -6.08...","[0.13499380262429966, 0.09530227603083953, 0.1..."
1,DecisionTreeRegressor,"[-6.78, -5.85, -4.96, -5.4, -6.05, -6.49, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.85, -5.95, -7.48, -6.46, -7.48, -4.72, -6...","[-5.814, -5.933999999999999, -6.35, -6.088, -6...","[0.5840753376063744, 0.03200000000000003, 0.57..."
2,RandomForestRegressor,"[-6.019499999999999, -6.046500000000005, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.6323383333333314, -5.999850000000001, -6....","[-6.042284333333332, -6.067059999999999, -6.11...","[0.21090203200538446, 0.05596180304457581, 0.1..."
3,GradientBoostingRegressor,"[-6.0792922448898725, -6.008668359529459, -4.9...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.546151244669781, -5.9680187484965, -6.355...","[-5.920546894150406, -5.991734707053967, -6.22...","[0.22488893221132691, 0.06504708081258444, 0.4..."
4,AdaBoostRegressor,"[-6.1433333333333335, -6.140000000000001, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.972173913043478, -6.0649999999999995, -6....","[-6.203188410101452, -6.133919254658385, -6.23...","[0.13556274442733743, 0.06082546838644596, 0.3..."
5,XGBRegressor,"[-6.0197544, -5.763892, -5.1152167, -5.2285433...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.6473403, -5.950748, -6.146745, -5.5296054...","[-5.9843483, -5.9277716, -5.9350777, -5.649126...","[0.24958701, 0.044840433, 0.15002201, 0.292244..."
6,ExtraTreesRegressor,"[-6.080999999999997, -5.9159500000000005, -4.9...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.017049999999999, -5.949999999999999, -6.3...","[-6.179711999999999, -6.002659999999999, -6.28...","[0.09659386717592336, 0.10531999999999897, 0.2..."
7,LinearRegression,"[-6.795902324133721, -5.372767575496746, -5.48...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-7.151668633006642, -5.950000000000012, -8.1...","[-5.953250356112609, -5.849747792271744, -8.03...","[1.6142557849049197, 0.20050441545651473, 1.58..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -5.02...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -6.134666666666667, -5.92...","[0.11400974617208051, 0.12515945385343025, 0.2..."
9,SVR,"[-6.047027703173879, -5.812193768520956, -5.24...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.03162723770627, -6.023091935472627, -5.95...","[-6.087716870109624, -6.062322945420763, -5.94...","[0.048509352783831845, 0.039632720263368164, 0..."


In [113]:
results_df.to_csv('results/Descriptors/Results_2d_Mordred_LVR_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2d_Mordred_LVR_RRCK.csv')

In [114]:
#2d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_RRCK.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-2.5450,6.477025,193.2906,119.101580,0,0,111,51,60,...,100.950923,1.979430,38.389707,17.922198,20.467508,9274.0,80.0,1.183,254.0,1040
1,0,-1.9676,3.871450,190.2914,119.101580,0,0,111,51,60,...,100.952062,1.979452,37.702286,17.921964,19.780322,9274.0,80.0,3.523,254.0,1039
2,0,0.2800,0.078400,202.2946,127.250959,0,0,119,56,63,...,110.218442,1.968186,42.006878,15.364330,19.497913,11674.0,94.0,6.461,284.0,1042
3,0,-1.3788,1.901089,199.0992,125.288752,0,0,117,53,64,...,104.584374,1.973290,37.327689,17.880877,19.446812,9956.0,88.0,4.667,266.0,1034
4,0,-1.2765,1.629452,204.7959,128.382338,0,0,120,54,66,...,106.403540,1.970436,37.505267,17.862423,19.642844,10348.0,92.0,4.380,272.0,1032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0,-2.3982,5.751363,253.4017,141.852408,0,0,121,65,56,...,133.210563,2.049393,43.712429,15.474678,25.255779,18261.0,99.0,5.308,332.0,1851
136,0,-2.2959,5.271157,259.0984,144.945994,0,0,124,66,58,...,135.026472,2.045856,43.876374,15.452959,25.442427,18776.0,103.0,5.021,338.0,1860
137,0,-2.1936,4.811881,264.7951,148.039580,0,0,127,67,60,...,136.842632,2.042427,44.044180,15.431464,25.631764,19280.0,107.0,4.734,344.0,1861
138,0,-1.7280,2.985984,262.2301,148.226373,0,0,128,67,61,...,137.323641,2.049607,37.713596,15.473904,22.239692,19914.0,103.0,6.777,340.0,1850


In [115]:
df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_RRCK.csv')
df 


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.888651,2.446436,4.892748,...,11.228478,126.484444,1215.857018,6.109834,38268,152,418.0,484.0,44.111111,19.277778
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.888651,2.446436,4.892748,...,11.228478,126.484444,1213.841368,6.161631,38268,152,418.0,484.0,44.111111,19.277778
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.134733,2.444935,4.889759,...,11.208585,125.402718,1201.841368,6.131844,37337,150,412.0,477.0,43.250000,19.166667
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,100.268636,2.427280,4.854560,...,11.184019,125.341576,1201.841368,6.131844,37826,148,410.0,473.0,42.638889,19.277778
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,98.550044,2.444219,4.888335,...,11.198475,124.353468,1187.825718,6.154537,36408,148,408.0,472.0,43.000000,18.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,56.766056,2.420826,4.789951,...,10.523715,96.878346,640.394833,6.534641,7064,74,228.0,262.0,18.277778,10.388889
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,52.776707,2.440186,4.848090,...,10.592903,94.894677,620.426134,6.204261,6071,77,220.0,257.0,21.111111,9.722222
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,53.640751,2.419606,4.784653,...,10.483550,94.692583,612.363533,6.656125,6341,70,220.0,252.0,17.777778,9.722222
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,52.045329,2.439503,4.844153,...,10.570008,93.767368,606.410483,6.251654,5725,76,214.0,251.0,20.250000,9.638889


In [116]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1040,CC(C)CN1CC(=O)N(CC(C)C)CC(=O)N(CC(C)C)CC(=O)N2...,-6.40,0,-2.5450,6.477025,193.2906,119.101580,0,0,...,6.418490,100.950923,1.979430,38.389707,17.922198,20.467508,9274.0,80.0,1.183,254.0
1,1039,CC(C)C[C@@H]1NC(=O)CN(Cc2ccc(O)cc2)C(=O)[C@H]2...,-5.54,0,-1.9676,3.871450,190.2914,119.101580,0,0,...,6.418490,100.952062,1.979452,37.702286,17.921964,19.780322,9274.0,80.0,3.523,254.0
2,1042,CC(C)C[C@@H]1NC(=O)CN(Cc2cccc(C(F)(F)F)c2)C(=O...,-6.00,0,0.2800,0.078400,202.2946,127.250959,0,0,...,6.659463,110.218442,1.968186,42.006878,15.364330,19.497913,11674.0,94.0,6.461,284.0
3,1034,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-5.51,0,-1.3788,1.901089,199.0992,125.288752,0,0,...,6.328920,104.584374,1.973290,37.327689,17.880877,19.446812,9956.0,88.0,4.667,266.0
4,1032,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H...,-5.26,0,-1.2765,1.629452,204.7959,128.382338,0,0,...,6.287494,106.403540,1.970436,37.505267,17.862423,19.642844,10348.0,92.0,4.380,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,1851,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-6.15,0,-2.3982,5.751363,253.4017,141.852408,0,0,...,7.408300,133.210563,2.049393,43.712429,15.474678,25.255779,18261.0,99.0,5.308,332.0
136,1860,CN1C(=O)[C@@H](Cc2ccccc2)NC(=O)[C@H](CCCc2cccc...,-5.57,0,-2.2959,5.271157,259.0984,144.945994,0,0,...,7.342097,135.026472,2.045856,43.876374,15.452959,25.442427,18776.0,103.0,5.021,338.0
137,1861,CN1C(=O)[C@@H](Cc2ccccc2)N(C)C(=O)[C@H](CCCc2c...,-5.40,0,-2.1936,4.811881,264.7951,148.039580,0,0,...,7.279021,136.842632,2.042427,44.044180,15.431464,25.631764,19280.0,107.0,4.734,344.0
138,1850,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.74,0,-1.7280,2.985984,262.2301,148.226373,0,0,...,7.058346,137.323641,2.049607,37.713596,15.473904,22.239692,19914.0,103.0,6.777,340.0


In [117]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,0,-2.3118,5.344419,324.4169,207.951609,0,0,...,6.109834,165.660911,1.926290,64.869552,30.340460,34.529092,38268.0,152.0,8.704,418.0
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,0,-1.9035,3.623312,324.6192,206.618023,0,0,...,6.161631,165.660911,1.926290,64.869552,30.340460,34.529092,38268.0,152.0,7.824,418.0
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,0,-3.1214,9.743138,318.9594,204.858023,0,0,...,6.131844,163.824465,1.927347,64.851242,30.333765,34.517477,37337.0,150.0,8.407,412.0
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,0,-2.5378,6.440429,318.5450,204.858023,0,0,...,6.131844,164.030378,1.929769,64.941671,30.873015,34.068656,37826.0,148.0,8.690,410.0
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,0,-2.8334,8.028156,316.0478,201.764437,0,0,...,6.154537,161.805840,1.926260,64.748500,30.296198,34.452302,36408.0,148.0,8.049,408.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,0,-2.8620,8.191044,164.7821,105.925236,0,0,...,6.534641,91.402117,1.987003,33.999411,15.357748,18.641663,7064.0,74.0,5.393,228.0
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,0,-2.4208,5.860273,160.1675,105.072408,0,0,...,6.204261,85.761816,1.949132,34.302616,15.219294,19.083322,6071.0,77.0,4.002,220.0
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,0,-2.2860,5.225796,158.9589,99.738064,0,0,...,6.656125,87.363105,1.985525,33.788693,15.284672,18.504021,6341.0,70.0,4.677,220.0
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,0,-3.7740,14.243076,153.1698,101.978822,0,0,...,6.251654,83.927240,1.951796,34.294802,15.216440,19.078362,5725.0,76.0,3.494,214.0


In [118]:
df_ordered.to_csv('features/Descriptors/Train_2d_padel_curated_RRCK.csv', index=False)

In [119]:
#2d test padel descriptors
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_RRCK.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-3.198,10.227204,147.3466,95.79165,0,0,91,41,50,...,79.902837,1.94885,34.163259,15.1684,18.994859,5079.0,73.0,2.567,206.0,2303
1,0,-2.2671,5.139742,178.9606,112.112408,0,0,104,48,56,...,94.840191,1.975837,34.45279,15.271386,19.181404,7643.0,84.0,4.174,242.0,2308
2,0,-1.2947,1.676248,186.2062,119.10158,0,0,111,51,60,...,100.950923,1.97943,36.64389,17.922198,18.721692,9274.0,80.0,6.411,254.0,2295
3,0,-1.5766,2.485668,194.4885,122.195166,0,0,114,52,62,...,102.770477,1.976355,37.538804,17.903031,19.635773,9622.0,84.0,4.406,260.0,1037
4,0,-1.2947,1.676248,186.2062,119.10158,0,0,111,51,60,...,100.950923,1.97943,36.64389,17.922198,18.721692,9274.0,80.0,6.411,254.0,2300
5,0,-2.574,6.625476,161.8705,102.83165,0,0,95,45,50,...,89.382185,1.986271,33.894145,15.321219,18.572926,6693.0,72.0,5.035,224.0,2335
6,0,-2.8302,8.010032,180.5661,112.914408,0,0,105,49,56,...,96.698242,1.973434,36.974399,17.792362,19.182037,8136.0,86.0,3.038,248.0,2323
7,0,-5.1686,26.714426,208.1255,124.66558,0,0,116,56,60,...,109.523983,1.955785,48.253608,22.597529,25.656078,11989.0,108.0,-1.084,288.0,1867
8,0,-9.7766,95.581908,254.7111,174.162956,0,0,164,72,92,...,141.68827,1.967893,49.102754,22.908473,26.194281,22351.0,130.0,6.754,352.0,1877
9,0,-1.2019,1.444564,188.8699,118.29958,0,0,110,50,60,...,98.681938,1.973639,34.501384,15.289963,19.211421,8468.0,86.0,5.251,252.0,2312


In [120]:
df = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_RRCK.csv')
df

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.888651,2.446436,4.892748,...,11.228478,126.484444,1217.836283,6.18191,38268,152,418.0,484.0,44.111111,19.277778
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.134733,2.444935,4.889759,...,11.208585,125.402718,1201.841368,6.131844,37337,150,412.0,477.0,43.25,19.166667
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,98.89382,2.465436,4.900312,...,11.191962,133.797536,1121.78279,6.129961,29757,145,390.0,461.0,34.722222,18.75
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,90.763285,2.389081,4.777099,...,10.86874,112.161925,1041.666046,6.351622,24888,117,340.0,386.0,29.583333,17.388889
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,89.057864,2.465877,4.910374,...,11.087191,125.281494,1008.698727,6.150602,22351,130,352.0,416.0,30.888889,16.833333
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,82.78409,2.467211,4.910268,...,11.091041,121.222371,952.636126,6.267343,19197,124,344.0,406.0,32.333333,15.0
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,82.790578,2.460889,4.892004,...,11.110939,120.217648,939.57934,6.524857,19137,129,338.0,406.0,31.472222,15.25
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,88.894075,2.433472,4.814442,...,10.886296,120.845208,917.483983,7.003695,20899,104,344.0,394.0,19.0,15.222222
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,80.676276,2.436117,4.821552,...,10.92646,118.602287,901.380838,7.638821,17333,100,332.0,382.0,22.034722,13.875
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,80.821298,2.447625,4.854854,...,10.949525,116.794005,879.489462,6.817748,16719,108,326.0,380.0,24.055556,14.222222


In [121]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2303,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-5.31,0,-3.198,10.227204,147.3466,95.79165,0,0,...,6.355815,79.902837,1.94885,34.163259,15.1684,18.994859,5079.0,73.0,2.567,206.0
1,2308,CC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C...,-4.62,0,-2.2671,5.139742,178.9606,112.112408,0,0,...,6.427174,94.840191,1.975837,34.45279,15.271386,19.181404,7643.0,84.0,4.174,242.0
2,2295,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](...,-6.44,0,-1.2947,1.676248,186.2062,119.10158,0,0,...,6.41849,100.950923,1.97943,36.64389,17.922198,18.721692,9274.0,80.0,6.411,254.0
3,1037,CC(C)C[C@@H]1NC(=O)CN(Cc2ccc(O)cc2)C(=O)[C@H]2...,-5.14,0,-1.5766,2.485668,194.4885,122.195166,0,0,...,6.372526,102.770477,1.976355,37.538804,17.903031,19.635773,9622.0,84.0,4.406,260.0
4,2300,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...,-5.55,0,-1.2947,1.676248,186.2062,119.10158,0,0,...,6.41849,100.950923,1.97943,36.64389,17.922198,18.721692,9274.0,80.0,6.411,254.0
5,2335,CC[C@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2cccc...,-5.53,0,-2.574,6.625476,161.8705,102.83165,0,0,...,6.593465,89.382185,1.986271,33.894145,15.321219,18.572926,6693.0,72.0,5.035,224.0
6,2323,CC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C...,-5.35,0,-2.8302,8.010032,180.5661,112.914408,0,0,...,6.518296,96.698242,1.973434,36.974399,17.792362,19.182037,8136.0,86.0,3.038,248.0
7,1867,C[C@H]1C(=O)N(C)[C@H](C)C(=O)N(C)[C@H](C)C(=O)...,-7.64,0,-5.1686,26.714426,208.1255,124.66558,0,0,...,6.762486,109.523983,1.955785,48.253608,22.597529,25.656078,11989.0,108.0,-1.084,288.0
8,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,0,-9.7766,95.581908,254.7111,174.162956,0,0,...,6.150602,141.68827,1.967893,49.102754,22.908473,26.194281,22351.0,130.0,6.754,352.0
9,2312,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.64,0,-1.2019,1.444564,188.8699,118.29958,0,0,...,6.331431,98.681938,1.973639,34.501384,15.289963,19.211421,8468.0,86.0,5.251,252.0


In [122]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,0,-3.3441,11.183005,322.0103,205.660023,0,0,...,6.18191,165.660911,1.92629,67.281618,32.752526,34.529092,38268.0,152.0,7.806,418.0
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,0,-3.1214,9.743138,318.9594,204.858023,0,0,...,6.131844,163.824465,1.927347,64.851242,30.333765,34.517477,37337.0,150.0,8.407,412.0
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,0,-10.9909,120.799883,282.894,193.959679,0,0,...,6.129961,157.176722,1.964709,54.876101,25.444335,29.431767,29757.0,145.0,7.521,390.0
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,0,-9.7664,95.382569,255.8125,175.736163,0,0,...,6.351622,143.564265,1.966634,53.678034,22.918108,27.745284,24888.0,117.0,9.471,340.0
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,0,-9.7766,95.581908,254.7111,174.162956,0,0,...,6.150602,141.68827,1.967893,49.102754,22.908473,26.194281,22351.0,130.0,6.754,352.0
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,0,-2.0598,4.242776,259.4091,161.788612,0,0,...,6.267343,132.96274,1.955334,48.807355,22.800364,26.006991,19197.0,124.0,4.656,344.0
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,0,-7.2469,52.51756,245.0432,153.743061,0,0,...,6.524857,131.06302,1.956164,54.288367,25.22917,29.059198,19137.0,129.0,0.757,338.0
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,0,-2.016,4.064256,265.1417,151.319959,0,0,...,7.003695,139.322655,2.048863,37.708724,15.472218,22.236506,20899.0,104.0,7.346,344.0
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,0,-0.1663,0.027656,240.8474,135.809822,0,0,...,7.638821,129.48801,2.02325,48.231158,15.458945,22.213781,17333.0,100.0,4.901,332.0
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,0,-1.8728,3.50738,245.6888,143.697545,0,0,...,6.817748,128.3312,2.005175,42.76517,20.418364,22.346806,16719.0,108.0,4.568,326.0


In [123]:
df_ordered.to_csv('features/Descriptors/Test_2d_padel_curated_RRCK.csv', index=False)

In [124]:
#3d Train descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_padel_RRCK.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.260126,2.168178,2.985414,3.703081,4.572657,5.287775,5.971320,6.600131,7.308308,7.949725,...,0.372674,0.395227,0.414936,0.366470,27.705985,239.114381,882.761686,0.245173,1.176633,1043
1,1.261929,2.183718,3.022837,3.765240,4.602265,5.340855,6.057819,6.792770,7.619819,8.431458,...,0.394478,0.518283,0.482646,0.467701,32.523701,294.735295,866.959016,0.387571,1.468629,1035
2,1.260152,2.179188,3.016652,3.748089,4.609748,5.394682,5.995122,6.705751,7.484998,8.119839,...,0.325168,0.400722,0.449515,0.371177,28.987882,256.206923,962.139853,0.258833,1.221414,1032
3,1.260469,2.175286,3.023047,3.751040,4.584167,5.239129,5.956658,6.707209,7.145874,7.895867,...,0.317821,0.460343,0.508610,0.424126,29.554902,246.248308,809.952598,0.351179,1.393079,1040
4,1.266721,2.187884,3.011559,3.736029,4.581380,5.280846,5.885916,6.604646,7.208371,7.831496,...,0.337734,0.488826,0.459264,0.390336,30.205968,260.245984,863.471017,0.332230,1.338426,1042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,1.262636,2.185491,3.021505,3.760947,4.607480,5.407457,6.113102,6.815279,7.531610,8.182586,...,0.379354,0.445017,0.396838,0.307687,28.273510,247.318593,909.546204,0.258821,1.149542,5670
136,1.260372,2.184931,3.011288,3.729456,4.584852,5.351579,6.027140,6.781283,7.452356,8.186011,...,0.305299,0.407700,0.362010,0.364220,30.492034,265.493473,931.787768,0.346891,1.133930,5671
137,1.263251,2.184426,3.010162,3.744587,4.554198,5.366878,6.097769,6.782195,7.631979,8.378278,...,0.358796,0.423582,0.468075,0.449225,35.468099,340.606285,1027.537753,0.392870,1.340882,46
138,1.256794,2.165785,2.997731,3.718485,4.571866,5.356003,6.129648,7.011115,7.773337,8.483838,...,0.421759,0.542612,0.495238,0.403767,36.220082,366.528479,1114.400589,0.395228,1.441617,977


In [125]:
df = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_RRCK.csv')
df 

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.888651,2.446436,4.892748,...,11.228478,126.484444,1215.857018,6.109834,38268,152,418.0,484.0,44.111111,19.277778
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.888651,2.446436,4.892748,...,11.228478,126.484444,1213.841368,6.161631,38268,152,418.0,484.0,44.111111,19.277778
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.134733,2.444935,4.889759,...,11.208585,125.402718,1201.841368,6.131844,37337,150,412.0,477.0,43.250000,19.166667
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,100.268636,2.427280,4.854560,...,11.184019,125.341576,1201.841368,6.131844,37826,148,410.0,473.0,42.638889,19.277778
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,98.550044,2.444219,4.888335,...,11.198475,124.353468,1187.825718,6.154537,36408,148,408.0,472.0,43.000000,18.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,56.766056,2.420826,4.789951,...,10.523715,96.878346,640.394833,6.534641,7064,74,228.0,262.0,18.277778,10.388889
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,52.776707,2.440186,4.848090,...,10.592903,94.894677,620.426134,6.204261,6071,77,220.0,257.0,21.111111,9.722222
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,53.640751,2.419606,4.784653,...,10.483550,94.692583,612.363533,6.656125,6341,70,220.0,252.0,17.777778,9.722222
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,52.045329,2.439503,4.844153,...,10.570008,93.767368,606.410483,6.251654,5725,76,214.0,251.0,20.250000,9.638889


In [126]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1043,CC(C)C[C@@H]1NC(=O)CN(CN2CCOCC2)C(=O)[C@H]2CCC...,-5.090,1.260126,2.168178,2.985414,3.703081,4.572657,5.287775,5.971320,...,0.457441,0.372674,0.395227,0.414936,0.366470,27.705985,239.114381,882.761686,0.245173,1.176633
1,1035,CC(C)C[C@@H]1NC(=O)CN(Cc2ccc(O)cc2)C(=O)[C@H]2...,-5.200,1.261929,2.183718,3.022837,3.765240,4.602265,5.340855,6.057819,...,0.530569,0.394478,0.518283,0.482646,0.467701,32.523701,294.735295,866.959016,0.387571,1.468629
2,1032,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H...,-5.260,1.260152,2.179188,3.016652,3.748089,4.609748,5.394682,5.995122,...,0.505889,0.325168,0.400722,0.449515,0.371177,28.987882,256.206923,962.139853,0.258833,1.221414
3,1040,CC(C)CN1CC(=O)N(CC(C)C)CC(=O)N(CC(C)C)CC(=O)N2...,-6.400,1.260469,2.175286,3.023047,3.751040,4.584167,5.239129,5.956658,...,0.567453,0.317821,0.460343,0.508610,0.424126,29.554902,246.248308,809.952598,0.351179,1.393079
4,1042,CC(C)C[C@@H]1NC(=O)CN(Cc2cccc(C(F)(F)F)c2)C(=O...,-6.000,1.266721,2.187884,3.011559,3.736029,4.581380,5.280846,5.885916,...,0.550420,0.337734,0.488826,0.459264,0.390336,30.205968,260.245984,863.471017,0.332230,1.338426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,5670,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-5.225,1.262636,2.185491,3.021505,3.760947,4.607480,5.407457,6.113102,...,0.459860,0.379354,0.445017,0.396838,0.307687,28.273510,247.318593,909.546204,0.258821,1.149542
136,5671,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-5.255,1.260372,2.184931,3.011288,3.729456,4.584852,5.351579,6.027140,...,0.564594,0.305299,0.407700,0.362010,0.364220,30.492034,265.493473,931.787768,0.346891,1.133930
137,46,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)C(=O...,-5.920,1.263251,2.184426,3.010162,3.744587,4.554198,5.366878,6.097769,...,0.569785,0.358796,0.423582,0.468075,0.449225,35.468099,340.606285,1027.537753,0.392870,1.340882
138,977,C/C1=C\[C@@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C...,-4.960,1.256794,2.165785,2.997731,3.718485,4.571866,5.356003,6.129648,...,0.508392,0.421759,0.542612,0.495238,0.403767,36.220082,366.528479,1114.400589,0.395228,1.441617


In [127]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,1.254180,2.166274,3.007142,3.700183,4.513361,5.316459,5.976700,...,0.584742,0.289275,0.526481,0.467149,0.355332,51.237177,733.133564,3650.811196,0.377113,1.348962
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,1.254432,2.166147,3.010365,3.691038,4.507068,5.364784,6.037048,...,0.562959,0.330424,0.573577,0.533312,0.392941,52.061975,762.354776,3612.998411,0.344438,1.499831
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,1.253592,2.162722,3.007509,3.718467,4.525201,5.252066,5.960072,...,0.505864,0.386967,0.530722,0.572866,0.283984,52.691795,809.151889,3930.912570,0.339246,1.387572
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,1.254398,2.163519,3.003287,3.701928,4.523709,5.373313,6.109207,...,0.543224,0.390435,0.549748,0.553013,0.367643,60.025118,987.347841,4090.444574,0.400488,1.470405
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,1.254467,2.167174,3.008550,3.695675,4.528104,5.365112,6.076403,...,0.559155,0.365398,0.567980,0.530505,0.368965,55.664600,849.215301,3563.631478,0.386829,1.467450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,1.259866,2.193760,2.990089,3.742024,4.661808,5.479539,6.218067,...,0.480821,0.450969,0.447622,0.401417,0.374652,28.991239,235.667908,625.054603,0.397685,1.223691
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,1.258750,2.168440,3.010535,3.753370,4.529155,5.339332,6.100139,...,0.587350,0.351181,0.540411,0.549950,0.480266,27.093819,193.764285,473.031633,0.407795,1.570627
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,1.263336,2.200888,3.010521,3.768007,4.659284,5.461330,6.179759,...,0.603934,0.302221,0.478983,0.563029,0.379015,26.877338,193.282773,552.730401,0.405901,1.421027
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,1.258069,2.171925,2.998728,3.734575,4.547098,5.378378,6.132871,...,0.573506,0.364677,0.497225,0.533337,0.460216,26.843623,192.495908,469.419500,0.407274,1.490778


In [128]:
df_ordered.to_csv('features/Descriptors/Train_3d_padel_curated_RRCK.csv', index=False)

In [129]:
#3d test padel descriptors
df_test = pd.read_csv('features/Descriptors/Test_3d_padel_RRCK.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.261799,2.186755,3.020058,3.756445,4.529348,5.352167,6.150114,6.928809,7.779645,8.58412,...,0.394757,0.497851,0.54297,0.398595,29.938817,245.575146,646.850845,0.402696,1.439416,2308
1,1.284135,2.238134,3.059687,3.813058,4.667183,5.41641,6.164273,6.862916,7.462712,8.138391,...,0.35707,0.434281,0.561102,0.351591,32.29146,330.378307,1411.879635,0.207975,1.346974,1856
2,1.270762,2.233409,3.036703,3.836601,4.654813,5.419781,6.132307,6.780357,7.446888,8.096509,...,0.413066,0.490229,0.356166,0.318931,40.910881,492.776011,2003.890601,0.336964,1.165325,1849
3,1.263047,2.186342,3.024238,3.763724,4.579603,5.397958,6.117764,6.859126,7.473837,8.092249,...,0.358262,0.460196,0.434979,0.435413,29.645739,241.654474,691.817758,0.379654,1.330588,2312
4,1.263868,2.200103,3.00471,3.7771,4.72887,5.516489,6.218313,6.902843,7.576359,8.342238,...,0.373398,0.510431,0.394462,0.389608,27.714935,218.75133,650.410705,0.356417,1.294501,2335
5,1.261647,2.188305,3.022598,3.75681,4.541332,5.311648,6.120919,6.926018,7.714005,8.497336,...,0.402075,0.502233,0.582808,0.399787,29.913352,245.658248,642.848345,0.404143,1.484828,2323
6,1.260551,2.175014,3.035593,3.793677,4.492913,5.263191,6.041626,6.979851,7.728414,8.383715,...,0.331118,0.541763,0.498942,0.284356,38.037688,416.581031,1665.382796,0.321306,1.325061,1878
7,1.259174,2.171089,2.999404,3.773822,4.545005,5.33392,5.960234,6.736934,7.518592,8.161337,...,0.36771,0.524581,0.567214,0.404237,41.840369,500.515903,1980.84397,0.349375,1.496032,1877
8,1.267424,2.201333,2.978891,3.757086,4.610687,5.46914,6.193896,6.939528,7.667159,8.357169,...,0.318526,0.486564,0.535319,0.298116,34.354155,333.403645,1212.744754,0.348766,1.319999,2342
9,1.263156,2.195266,2.984549,3.721556,4.594043,5.402468,6.192703,6.865545,7.439082,7.935941,...,0.368571,0.428115,0.482597,0.354128,29.234281,257.402162,917.012637,0.291471,1.26484,2340


In [130]:
df = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_RRCK.csv')
df

Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.888651,2.446436,4.892748,...,11.228478,126.484444,1217.836283,6.18191,38268,152,418.0,484.0,44.111111,19.277778
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,100.134733,2.444935,4.889759,...,11.208585,125.402718,1201.841368,6.131844,37337,150,412.0,477.0,43.25,19.166667
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,98.89382,2.465436,4.900312,...,11.191962,133.797536,1121.78279,6.129961,29757,145,390.0,461.0,34.722222,18.75
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,90.763285,2.389081,4.777099,...,10.86874,112.161925,1041.666046,6.351622,24888,117,340.0,386.0,29.583333,17.388889
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,89.057864,2.465877,4.910374,...,11.087191,125.281494,1008.698727,6.150602,22351,130,352.0,416.0,30.888889,16.833333
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,82.78409,2.467211,4.910268,...,11.091041,121.222371,952.636126,6.267343,19197,124,344.0,406.0,32.333333,15.0
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,82.790578,2.460889,4.892004,...,11.110939,120.217648,939.57934,6.524857,19137,129,338.0,406.0,31.472222,15.25
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,88.894075,2.433472,4.814442,...,10.886296,120.845208,917.483983,7.003695,20899,104,344.0,394.0,19.0,15.222222
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,80.676276,2.436117,4.821552,...,10.92646,118.602287,901.380838,7.638821,17333,100,332.0,382.0,22.034722,13.875
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,80.821298,2.447625,4.854854,...,10.949525,116.794005,879.489462,6.817748,16719,108,326.0,380.0,24.055556,14.222222


In [131]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2308,CC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C...,-4.62,1.261799,2.186755,3.020058,3.756445,4.529348,5.352167,6.150114,...,0.540374,0.394757,0.497851,0.54297,0.398595,29.938817,245.575146,646.850845,0.402696,1.439416
1,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,1.284135,2.238134,3.059687,3.813058,4.667183,5.41641,6.164273,...,0.448246,0.35707,0.434281,0.561102,0.351591,32.29146,330.378307,1411.879635,0.207975,1.346974
2,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,1.270762,2.233409,3.036703,3.836601,4.654813,5.419781,6.132307,...,0.478243,0.413066,0.490229,0.356166,0.318931,40.910881,492.776011,2003.890601,0.336964,1.165325
3,2312,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C...,-4.64,1.263047,2.186342,3.024238,3.763724,4.579603,5.397958,6.117764,...,0.561507,0.358262,0.460196,0.434979,0.435413,29.645739,241.654474,691.817758,0.379654,1.330588
4,2335,CC[C@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2cccc...,-5.53,1.263868,2.200103,3.00471,3.7771,4.72887,5.516489,6.218313,...,0.53088,0.373398,0.510431,0.394462,0.389608,27.714935,218.75133,650.410705,0.356417,1.294501
5,2323,CC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N2CCC[C@@H]2C...,-5.35,1.261647,2.188305,3.022598,3.75681,4.541332,5.311648,6.120919,...,0.53402,0.402075,0.502233,0.582808,0.399787,29.913352,245.658248,642.848345,0.404143,1.484828
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,1.260551,2.175014,3.035593,3.793677,4.492913,5.263191,6.041626,...,0.547537,0.331118,0.541763,0.498942,0.284356,38.037688,416.581031,1665.382796,0.321306,1.325061
7,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,1.259174,2.171089,2.999404,3.773822,4.545005,5.33392,5.960234,...,0.531873,0.36771,0.524581,0.567214,0.404237,41.840369,500.515903,1980.84397,0.349375,1.496032
8,2342,CCC[C@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccc...,-5.46,1.267424,2.201333,2.978891,3.757086,4.610687,5.46914,6.193896,...,0.565844,0.318526,0.486564,0.535319,0.298116,34.354155,333.403645,1212.744754,0.348766,1.319999
9,2340,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.02,1.263156,2.195266,2.984549,3.721556,4.594043,5.402468,6.192703,...,0.49241,0.368571,0.428115,0.482597,0.354128,29.234281,257.402162,917.012637,0.291471,1.26484


In [132]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,1.254423,2.166133,3.004162,3.702798,4.517434,5.280055,6.003941,...,0.615473,0.315508,0.587862,0.481708,0.41975,54.418494,765.343933,2979.646029,0.423209,1.48932
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,1.252975,2.16423,3.007035,3.698789,4.498476,5.322587,6.062136,...,0.53163,0.381441,0.595506,0.578602,0.384673,59.364061,994.351177,4741.587052,0.369605,1.55878
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,1.258134,2.168095,3.005712,3.781677,4.5014,5.263147,5.900071,...,0.562651,0.292498,0.404332,0.490048,0.357058,43.246459,539.46316,2510.832727,0.343976,1.251438
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,1.259025,2.181485,2.987621,3.750304,4.515982,5.352705,6.112851,...,0.538203,0.316007,0.401819,0.490964,0.334962,42.515208,532.522256,2480.513318,0.307305,1.227745
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,1.259174,2.171089,2.999404,3.773822,4.545005,5.33392,5.960234,...,0.531873,0.36771,0.524581,0.567214,0.404237,41.840369,500.515903,1980.84397,0.349375,1.496032
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,1.258385,2.170534,3.028905,3.75075,4.482363,5.254493,5.962375,...,0.493579,0.344873,0.529573,0.541,0.350437,37.662157,433.577101,1940.275168,0.257679,1.421011
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,1.260551,2.175014,3.035593,3.793677,4.492913,5.263191,6.041626,...,0.547537,0.331118,0.541763,0.498942,0.284356,38.037688,416.581031,1665.382796,0.321306,1.325061
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,1.270762,2.233409,3.036703,3.836601,4.654813,5.419781,6.132307,...,0.478243,0.413066,0.490229,0.356166,0.318931,40.910881,492.776011,2003.890601,0.336964,1.165325
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,1.284135,2.238134,3.059687,3.813058,4.667183,5.41641,6.164273,...,0.448246,0.35707,0.434281,0.561102,0.351591,32.29146,330.378307,1411.879635,0.207975,1.346974
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,1.264707,2.204852,3.047072,3.823369,4.589295,5.380218,6.174379,...,0.568751,0.361967,0.501315,0.546106,0.373435,36.699473,364.122716,1105.825587,0.396077,1.420856


In [133]:
df_ordered.to_csv('features/Descriptors/Test_3d_padel_curated_RRCK.csv', index=False)

In [134]:
#2d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 1444)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1444)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33389
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 965
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33201
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 967
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ov

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1922,0.3182,0.4384,0.5712,0.7648,0.796,0.4447,0.4282,0.6668,0.3848,0.6956,0.766
DecisionTreeRegressor,0.3292,0.4309,0.5738,0.2656,0.6121,0.6103,0.3936,0.4422,0.6274,0.4554,0.7221,0.7451
RandomForestRegressor,0.1938,0.329,0.4402,0.5678,0.7738,0.806,0.4571,0.4513,0.6761,0.3676,0.7164,0.7983
GradientBoostingRegressor,0.188,0.3134,0.4336,0.5806,0.7667,0.7933,0.3514,0.4,0.5928,0.5139,0.7708,0.8244
AdaBoostRegressor,0.2093,0.3483,0.4575,0.5332,0.7415,0.7653,0.4556,0.4689,0.675,0.3697,0.7038,0.7757
XGBRegressor,0.231,0.3564,0.4806,0.4846,0.7035,0.7259,0.3669,0.4091,0.6057,0.4924,0.7754,0.844
ExtraTreesRegressor,0.1786,0.3053,0.4226,0.6016,0.7881,0.8156,0.3999,0.4167,0.6324,0.4467,0.7456,0.8057
LinearRegression,7.2656,2.1847,2.6955,-15.2076,0.055,0.0541,3.4996,1.3951,1.8707,-3.8415,0.1365,0.1348
KNeighborsRegressor,0.2941,0.3863,0.5423,0.344,0.637,0.6662,0.5338,0.4701,0.7306,0.2615,0.6569,0.7369
SVR,0.1949,0.3066,0.4415,0.5652,0.7592,0.7785,0.4611,0.4322,0.6791,0.362,0.6793,0.7645


In [135]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.9561471228459055, -6.091054642002129, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.78780820950081, -5.981651754671699, -6.21...","[-6.071802287545527, -6.079849210765806, -6.04...","[0.1716574347422476, 0.09348883081174193, 0.16..."
1,DecisionTreeRegressor,"[-5.75, -5.85, -5.22, -5.1, -5.58, -6.46, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.96, -5.95, -6.1, -6.46, -6.1, -5.4, -6.92...","[-5.656, -5.909999999999999, -6.48799999999999...","[0.653990825623724, 0.08000000000000007, 0.541..."
2,RandomForestRegressor,"[-5.9642, -5.995000000000005, -5.2262000000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.735899999999999, -5.989400000000001, -6.2...","[-6.037091666666668, -6.05651, -6.088372666666...","[0.15884169880034074, 0.07253411886829621, 0.1..."
3,GradientBoostingRegressor,"[-6.012002592790874, -6.016243124628508, -5.04...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.6697531591805665, -5.952955103077377, -6....","[-6.025299593371571, -5.982801238491815, -6.35...","[0.20921485662237696, 0.06524950245829574, 0.2..."
4,AdaBoostRegressor,"[-6.0675, -6.137272727272727, -5.2786206896551...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.025, -6.043333333333333, -6.3435714285714...","[-6.158384920634921, -6.078541666666666, -6.20...","[0.0811917519954377, 0.06486979480295772, 0.21..."
5,XGBRegressor,"[-6.366995, -6.308828, -4.9082127, -5.4593754,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.7993255, -5.950652, -6.185632, -6.156246,...","[-6.0891733, -5.950171, -6.3495092, -6.0568247...","[0.32636833, 0.00052849064, 0.22824572, 0.2263..."
6,ExtraTreesRegressor,"[-6.147549999999996, -5.860500000000001, -4.97...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.127799999999999, -5.949999999999999, -6.4...","[-6.161899999999999, -5.976379999999999, -6.34...","[0.02680559643059607, 0.05275999999999996, 0.2..."
7,LinearRegression,"[-4.0, -9.647014368519224, -5.897624791841157,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.0, -5.949999999999498, -10.0, -4.86863962...","[-7.220374138271107, -5.744041432940826, -7.6,...","[1.9074627469816234, 0.41191713411790876, 2.93..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -6.118666666666667, -5.79...","[0.11400974617208051, 0.11400974617208051, 0.1..."
9,SVR,"[-6.083389918450342, -5.822177075737425, -5.23...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.055962985191872, -6.003501047103598, -5.9...","[-6.119912882097452, -6.037356642908497, -5.91...","[0.032460989969283614, 0.017408845449402216, 0..."


In [136]:
result_df.to_csv('results/Descriptors/Results_2D_padel_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_RRCK.csv')

In [137]:
#2d padel descriptors const removal
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 1032)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1032)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33389
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 965
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33201
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 967
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ov

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1922,0.3182,0.4384,0.5712,0.7648,0.796,0.4447,0.4282,0.6668,0.3848,0.6956,0.766
DecisionTreeRegressor,0.3313,0.4405,0.5756,0.2609,0.6057,0.6086,0.4045,0.4472,0.636,0.4404,0.7076,0.767
RandomForestRegressor,0.196,0.3306,0.4428,0.5627,0.7704,0.8012,0.4608,0.4523,0.6788,0.3625,0.7113,0.79
GradientBoostingRegressor,0.1886,0.3145,0.4343,0.5793,0.7662,0.796,0.3592,0.405,0.5993,0.5031,0.7642,0.8175
AdaBoostRegressor,0.2251,0.3516,0.4745,0.4978,0.7124,0.7349,0.4491,0.4632,0.6701,0.3787,0.7101,0.7884
XGBRegressor,0.231,0.3564,0.4806,0.4846,0.7035,0.7259,0.3669,0.4091,0.6057,0.4924,0.7754,0.844
ExtraTreesRegressor,0.1787,0.3095,0.4227,0.6014,0.7878,0.8142,0.4118,0.419,0.6417,0.4303,0.7366,0.7985
LinearRegression,7.2656,2.1847,2.6955,-15.2076,0.055,0.0541,3.4996,1.3951,1.8707,-3.8415,0.1365,0.1348
KNeighborsRegressor,0.2941,0.3863,0.5423,0.344,0.637,0.6662,0.5338,0.4701,0.7306,0.2615,0.6569,0.7369
SVR,0.1949,0.3066,0.4415,0.5652,0.7593,0.7783,0.4612,0.4322,0.6791,0.362,0.6793,0.7645


In [138]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.9561471228459055, -6.091054642002129, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.78780820950081, -5.981651754671699, -6.21...","[-6.071802287545527, -6.079849210765806, -6.04...","[0.1716574347422476, 0.09348883081174193, 0.16..."
1,DecisionTreeRegressor,"[-5.75, -5.95, -5.22, -4.96, -5.58, -6.49, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.1, -5.95, -6.1, -6.46, -6.1, -5.4, -6.92,...","[-5.801, -5.933999999999999, -6.37799999999999...","[0.5466113793180674, 0.03200000000000003, 0.59..."
2,RandomForestRegressor,"[-5.9412499999999975, -5.9485500000000044, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.727620833333332, -5.9965, -6.266200000000...","[-6.053627499999999, -6.067299999999999, -6.11...","[0.1699601023361403, 0.08935751227512959, 0.11..."
3,GradientBoostingRegressor,"[-6.007734794007788, -6.016243124628508, -5.04...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.685261442711315, -5.952955103077377, -6.4...","[-6.017868816390469, -5.98284368185833, -6.333...","[0.1994044053197292, 0.06533415492825123, 0.23..."
4,AdaBoostRegressor,"[-6.101000000000001, -6.073194444444446, -5.24...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0059999999999985, -6.0059999999999985, -6...","[-6.168892602134707, -6.114985294117647, -6.16...","[0.09823580382068528, 0.08031658487652753, 0.0..."
5,XGBRegressor,"[-6.366995, -6.308828, -4.9082127, -5.4593754,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.7993255, -5.950652, -6.185632, -6.156246,...","[-6.0891733, -5.950171, -6.3495092, -6.0568247...","[0.32636833, 0.00052849064, 0.22824572, 0.2263..."
6,ExtraTreesRegressor,"[-6.069249999999998, -5.859900000000003, -5.02...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.047250000000002, -5.949999999999999, -6.3...","[-6.16845, -5.978369999999999, -6.325360000000...","[0.07534626069022878, 0.05673999999999957, 0.2..."
7,LinearRegression,"[-4.0, -9.647014368519471, -5.897624791841887,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.0, -5.949999999999547, -10.0, -4.86863962...","[-7.220374138266202, -5.744041432941204, -7.6,...","[1.9074627469813683, 0.41191713411716735, 2.93..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -6.118666666666667, -5.79...","[0.11400974617208051, 0.11400974617208051, 0.1..."
9,SVR,"[-6.0833898899563135, -5.8221764643379394, -5....",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.055962829491517, -6.003500984275534, -5.9...","[-6.119789816070937, -6.037217738658109, -5.91...","[0.03238799639269755, 0.01730779554560215, 0.0..."


In [139]:
result_df.to_csv('results/Descriptors/Results_2D_padel_const_rem_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_const_rem_RRCK.csv')

In [140]:
#2d padel descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_padel_curated_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_padel_curated_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 718)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 718)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22157
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 662
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22058
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 664
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1916,0.3253,0.4377,0.5727,0.7655,0.7987,0.4555,0.4401,0.6749,0.3698,0.6909,0.7832
DecisionTreeRegressor,0.3248,0.3988,0.5699,0.2755,0.6236,0.6692,0.2964,0.4125,0.5444,0.5899,0.8068,0.8063
RandomForestRegressor,0.1985,0.3315,0.4455,0.5573,0.7657,0.7994,0.4511,0.4541,0.6716,0.3759,0.7222,0.7952
GradientBoostingRegressor,0.2037,0.325,0.4513,0.5457,0.7496,0.7722,0.3773,0.4211,0.6143,0.478,0.7581,0.7972
AdaBoostRegressor,0.2022,0.3455,0.4497,0.549,0.7513,0.7767,0.4816,0.4676,0.694,0.3338,0.6706,0.7514
XGBRegressor,0.2321,0.3453,0.4818,0.4823,0.7057,0.7451,0.4228,0.4255,0.6502,0.4151,0.7189,0.7774
ExtraTreesRegressor,0.168,0.3045,0.4098,0.6253,0.8017,0.8266,0.43,0.4282,0.6557,0.4052,0.7181,0.7772
LinearRegression,6.9977,2.1456,2.6453,-14.6101,0.162,0.1548,3.4251,1.352,1.8507,-3.7384,0.1685,0.1904
KNeighborsRegressor,0.3018,0.3944,0.5494,0.3267,0.6228,0.6553,0.5478,0.473,0.7401,0.2422,0.6477,0.7345
SVR,0.202,0.3132,0.4495,0.5494,0.749,0.7724,0.4988,0.4485,0.7063,0.3099,0.6377,0.7425


In [141]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.938703787296294, -6.232094304780872, -5.28...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8044514849747735, -6.001548586065553, -6....","[-6.0146456503875685, -6.088986174357419, -6.0...","[0.1925185448293264, 0.0972410311133326, 0.143..."
1,DecisionTreeRegressor,"[-5.75, -5.87, -5.4, -5.1, -5.57, -6.46, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.16, -5.95, -6.1, -6.46, -6.1, -5.4, -6.92...","[-5.717999999999999, -5.909999999999999, -6.72...","[0.37145120810141397, 0.08000000000000007, 0.6..."
2,RandomForestRegressor,"[-5.943299999999999, -6.018803333333335, -5.20...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.7148166666666675, -5.976599999999999, -6....","[-6.044983333333333, -6.04512, -6.159120000000...","[0.19473983784640558, 0.05953158489407159, 0.1..."
3,GradientBoostingRegressor,"[-6.027730617410095, -6.080286155832058, -5.09...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.442522304790104, -5.950168933851341, -6.3...","[-5.927893881724126, -5.994967907058017, -6.38...","[0.3610784030293262, 0.08020965851631814, 0.19..."
4,AdaBoostRegressor,"[-6.123534482758622, -6.123780487804877, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.04, -6.075083333333338, -6.18270833333333...","[-6.183201190476192, -6.107500183150184, -6.16...","[0.11346621851354799, 0.0712714388881046, 0.13..."
5,XGBRegressor,"[-5.877562, -6.1120234, -4.950026, -5.4803996,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.865114, -5.9497194, -6.2232904, -5.925979...","[-6.2035356, -5.959027, -6.1810255, -6.0425653...","[0.34634006, 0.017979342, 0.17707849, 0.123275..."
6,ExtraTreesRegressor,"[-6.111399999999998, -5.927500000000001, -5.00...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.088600000000001, -5.949999999999999, -6.3...","[-6.160036666666668, -5.973139999999999, -6.31...","[0.09105661633169813, 0.046280000000000855, 0...."
7,LinearRegression,"[-10.0, -10.0, -5.244022848044642, -8.87204520...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.0, -5.949999999999579, -10.0, -7.35448179...","[-6.8989335519801385, -6.4993748385513, -7.6, ...","[2.6554483218931138, 1.0987496771029484, 2.939..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -6.118666666666667, -5.92...","[0.11400974617208051, 0.11400974617208051, 0.2..."
9,SVR,"[-6.090387156372305, -5.77209850864664, -5.239...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0504420658890545, -5.999385066557721, -5....","[-6.107608347173828, -6.0205758618288145, -5.8...","[0.029073301287539308, 0.017412944329614422, 0..."


In [142]:
result_df.to_csv('results/Descriptors/Results_2D_padel_LVR_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_padel_const_LVR_RRCK.csv')

In [143]:
#2d All descriptors
df_train_padel = pd.read_csv('features/Descriptors/Train_2d_padel_curated_RRCK.csv')
df_train_rdkit = pd.read_csv('features/Descriptors/Train_2d_RDKit_des_RRCK.csv')
df_train_mordred = pd.read_csv('features/Descriptors/Train_2d_Mordred_desc_RRCK.csv')

df_2d_train = df_train_rdkit.merge(df_train_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_train

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,15.193873,15.193873,0.130769,-1.621791,0.147476,26.802326,1216.662,...,6.109834,165.660911,1.926290,64.869552,30.340460,34.529092,38268.0,152.0,8.704,418.0
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,15.152762,15.152762,0.128114,-1.816236,0.134993,26.372093,1214.646,...,6.161631,165.660911,1.926290,64.869552,30.340460,34.529092,38268.0,152.0,7.824,418.0
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,15.129540,15.129540,0.022871,-1.609940,0.147925,26.905882,1202.635,...,6.131844,163.824465,1.927347,64.851242,30.333765,34.517477,37337.0,150.0,8.407,412.0
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,15.028142,15.028142,0.097424,-1.231351,0.116062,27.411765,1202.635,...,6.131844,164.030378,1.929769,64.941671,30.873015,34.068656,37826.0,148.0,8.690,410.0
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,15.068092,15.068092,0.128760,-1.611421,0.157205,27.083333,1188.608,...,6.154537,161.805840,1.926260,64.748500,30.296198,34.452302,36408.0,148.0,8.049,408.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,13.940180,13.940180,0.023137,-1.031903,0.289440,27.260870,640.826,...,6.534641,91.402117,1.987003,33.999411,15.357748,18.641663,7064.0,74.0,5.393,228.0
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,13.929912,13.929912,0.048761,-0.934108,0.462849,28.954545,620.836,...,6.204261,85.761816,1.949132,34.302616,15.219294,19.083322,6071.0,77.0,4.002,220.0
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,13.805528,13.805528,0.023859,-1.030988,0.318688,27.954545,612.772,...,6.656125,87.363105,1.985525,33.788693,15.284672,18.504021,6341.0,70.0,4.677,220.0
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,13.850263,13.850263,0.054388,-0.925132,0.468423,29.209302,606.809,...,6.251654,83.927240,1.951796,34.294802,15.216440,19.078362,5725.0,76.0,3.494,214.0


In [144]:
df_2d_train.to_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv', index=False)

In [145]:
df_test_padel = pd.read_csv('features/Descriptors/Test_2d_padel_curated_RRCK.csv')
df_test_rdkit = pd.read_csv('features/Descriptors/Test_2d_RDKit_des_RRCK.csv')
df_test_mordred = pd.read_csv('features/Descriptors/Test_2d_Mordred_desc_RRCK.csv')

df_2d_test = df_test_rdkit.merge(df_test_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_test

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,15.14449,15.14449,0.113131,-1.744209,0.128505,27.116279,1218.634,...,6.18191,165.660911,1.92629,67.281618,32.752526,34.529092,38268.0,152.0,7.806,418.0
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,15.12954,15.12954,0.022871,-1.60994,0.147925,26.905882,1202.635,...,6.131844,163.824465,1.927347,64.851242,30.333765,34.517477,37337.0,150.0,8.407,412.0
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,15.256898,15.256898,0.00053,-1.187472,0.146159,25.6625,1122.548,...,6.129961,157.176722,1.964709,54.876101,25.444335,29.431767,29757.0,145.0,7.521,390.0
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,14.719159,14.719159,0.02623,-1.109554,0.098749,24.273973,1042.443,...,6.351622,143.564265,1.966634,53.678034,22.918108,27.745284,24888.0,117.0,9.471,340.0
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,15.077973,15.077973,0.0257,-1.150959,0.182069,25.569444,1009.388,...,6.150602,141.68827,1.967893,49.102754,22.908473,26.194281,22351.0,130.0,6.754,352.0
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,15.069391,15.069391,0.013388,-1.165871,0.363344,26.720588,953.28,...,6.267343,132.96274,1.955334,48.807355,22.800364,26.006991,19197.0,124.0,4.656,344.0
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,14.916791,14.916791,0.00378,-1.18176,0.419137,28.313433,940.197,...,6.524857,131.06302,1.956164,54.288367,25.22917,29.059198,19137.0,129.0,0.757,338.0
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,14.898914,14.898914,0.085369,-1.142615,0.10396,20.911765,918.152,...,7.003695,139.322655,2.048863,37.708724,15.472218,22.236506,20899.0,104.0,7.346,344.0
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,14.676408,14.676408,0.002283,-1.335295,0.172783,21.9375,902.053,...,7.638821,129.48801,2.02325,48.231158,15.458945,22.213781,17333.0,100.0,4.901,332.0
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,14.782868,14.782868,0.013476,-1.206234,0.239281,24.21875,880.1,...,6.817748,128.3312,2.005175,42.76517,20.418364,22.346806,16719.0,108.0,4.568,326.0


In [146]:
df_2d_test.to_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv', index=False)

In [147]:
#2d All descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
# X_test = X_test.select_dtypes(include=['number'])
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 3091)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 3091)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75145
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2193
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74680
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2197
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1893,0.3141,0.4351,0.5778,0.7653,0.8017,0.4412,0.4219,0.6642,0.3896,0.6939,0.7594
DecisionTreeRegressor,0.3379,0.4285,0.5813,0.2461,0.6011,0.628,0.3858,0.4403,0.6211,0.4663,0.7339,0.7747
RandomForestRegressor,0.1948,0.3307,0.4413,0.5655,0.7705,0.7986,0.4578,0.4507,0.6766,0.3666,0.7126,0.7967
GradientBoostingRegressor,0.1787,0.3027,0.4228,0.6013,0.7839,0.8164,0.3838,0.4075,0.6195,0.4691,0.7472,0.8208
AdaBoostRegressor,0.2098,0.3435,0.4581,0.5319,0.7399,0.7698,0.4689,0.4647,0.6848,0.3512,0.6879,0.7725
XGBRegressor,0.2196,0.3458,0.4686,0.5102,0.7199,0.7439,0.4221,0.437,0.6497,0.416,0.7364,0.8023
ExtraTreesRegressor,0.173,0.3035,0.4159,0.6142,0.7951,0.8165,0.3945,0.4104,0.6281,0.4542,0.7499,0.8055
LinearRegression,6.9854,2.141,2.643,-14.5825,0.0987,0.0713,3.841,1.5482,1.9598,-4.3138,0.07,0.104
KNeighborsRegressor,0.2996,0.3888,0.5473,0.3317,0.6263,0.658,0.5297,0.4764,0.7278,0.2672,0.6769,0.7475
SVR,0.1879,0.3024,0.4335,0.5808,0.7703,0.7885,0.4277,0.4199,0.654,0.4083,0.7149,0.7832


In [148]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.970369177318453, -6.132392137741762, -5.17...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9658917935958025, -5.998107739358907, -6....","[-6.144806789454829, -6.1018733508854535, -6.1...","[0.12607544189411532, 0.13832945963000004, 0.1..."
1,DecisionTreeRegressor,"[-5.27, -5.85, -5.22, -4.79, -5.4, -6.65, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.72, -5.95, -6.96, -6.65, -6.96, -5.4, -6....","[-5.838, -5.933999999999999, -6.626, -6.432, -...","[0.6555730317821198, 0.03200000000000003, 0.52..."
2,RandomForestRegressor,"[-5.920199999999997, -5.948350000000004, -5.19...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.759412500000001, -5.9544999999999995, -6....","[-6.064482499999999, -6.051379999999999, -6.09...","[0.15420117128284025, 0.09414965533659743, 0.1..."
3,GradientBoostingRegressor,"[-5.882595138241643, -6.098838380233682, -5.01...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.794753250371877, -5.966750021959288, -6.7...","[-6.041528340792622, -5.973687524696051, -6.22...","[0.21477861369986082, 0.02955278799078915, 0.4..."
4,AdaBoostRegressor,"[-5.95, -6.178181818181819, -5.145714285714286...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.955, -6.0042857142857144, -6.021521739130...","[-6.159155397543633, -6.088994548588667, -6.02...","[0.1137633154378726, 0.10439863285736561, 0.27..."
5,XGBRegressor,"[-6.0313396, -6.393152, -4.9681315, -5.866849,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8122888, -5.9500756, -6.1149626, -6.35422...","[-6.0518813, -5.948022, -6.1634297, -5.9530115...","[0.26566187, 0.004322454, 0.39732587, 0.258586..."
6,ExtraTreesRegressor,"[-5.898099999999997, -5.975550000000002, -4.95...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9818999999999996, -5.949999999999999, -6....","[-6.141050000000001, -5.9746, -6.3648700000000...","[0.09643472403652163, 0.04920000000000009, 0.1..."
7,LinearRegression,"[-4.0, -9.893764330543565, -4.0, -6.6463292364...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-8.166542568299457, -5.949999999999946, -9.8...","[-8.994984114726659, -5.560000000000005, -6.37...","[1.3036665572481307, 0.7800000000000027, 2.907..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.039999999999999, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.924666666666667, -5.79...","[0.11400974617208051, 0.05153639490690065, 0.1..."
9,SVR,"[-6.065991221478246, -5.837746879033769, -5.25...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.048915611556839, -6.016429888761645, -5.9...","[-6.096416932880845, -6.03834784835753, -5.979...","[0.029458314550472927, 0.01217190585715695, 0...."


In [149]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_RRCK.csv')

In [150]:
#2d All descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 2378)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 2378)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75145
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2193
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74680
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2197
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1893,0.3141,0.4351,0.5778,0.7653,0.8017,0.4412,0.4219,0.6642,0.3896,0.6939,0.7594
DecisionTreeRegressor,0.289,0.4085,0.5376,0.3554,0.6494,0.6584,0.4446,0.4387,0.6668,0.3849,0.6804,0.7533
RandomForestRegressor,0.1955,0.3309,0.4421,0.564,0.7696,0.7983,0.4494,0.4491,0.6704,0.3783,0.7214,0.8046
GradientBoostingRegressor,0.1777,0.3004,0.4215,0.6037,0.7851,0.8174,0.3894,0.4096,0.624,0.4613,0.7393,0.813
AdaBoostRegressor,0.2184,0.3468,0.4673,0.5128,0.7242,0.7511,0.4742,0.4615,0.6886,0.344,0.6794,0.7799
XGBRegressor,0.2196,0.3458,0.4686,0.5102,0.7199,0.7439,0.4221,0.437,0.6497,0.416,0.7364,0.8023
ExtraTreesRegressor,0.1671,0.2987,0.4088,0.6272,0.8068,0.8413,0.4167,0.4194,0.6456,0.4235,0.7305,0.7891
LinearRegression,6.9854,2.141,2.643,-14.5825,0.0987,0.0713,3.841,1.5482,1.9598,-4.3138,0.07,0.104
KNeighborsRegressor,0.2996,0.3888,0.5473,0.3317,0.6263,0.658,0.5297,0.4764,0.7278,0.2672,0.6769,0.7475
SVR,0.1879,0.3024,0.4335,0.5808,0.7703,0.7885,0.4277,0.4199,0.654,0.4083,0.7149,0.7832


In [151]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.970369177318453, -6.132392137741762, -5.17...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9658917935958025, -5.998107739358907, -6....","[-6.144806789454829, -6.1018733508854535, -6.1...","[0.12607544189411532, 0.13832945963000004, 0.1..."
1,DecisionTreeRegressor,"[-5.06, -5.85, -5.22, -5.1, -5.57, -6.46, -5.5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.04, -5.95, -5.9, -6.46, -5.9, -5.4, -6.92...","[-5.848, -5.933999999999999, -6.31600000000000...","[0.515418276742298, 0.03200000000000003, 0.275..."
2,RandomForestRegressor,"[-5.9144999999999985, -5.958300000000004, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.778566666666666, -5.9626, -6.285050000000...","[-6.059755952380952, -6.051929999999999, -6.13...","[0.14220358951827505, 0.07092159473672335, 0.1..."
3,GradientBoostingRegressor,"[-5.858083603547437, -6.090725928202099, -5.01...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.880550537402026, -5.966750021959288, -6.5...","[-6.06835169001967, -5.971738427290516, -6.244...","[0.20824653529098228, 0.025849448771656496, 0...."
4,AdaBoostRegressor,"[-6.154038461538462, -6.105925925925924, -5.17...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0875, -6.105925925925924, -6.240357142857...","[-6.1290733333333325, -6.07913201776437, -6.03...","[0.09863502510659018, 0.09144519338557816, 0.1..."
5,XGBRegressor,"[-6.0313396, -6.393152, -4.9681315, -5.866849,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8122888, -5.9500756, -6.1149626, -6.35422...","[-6.0518813, -5.948022, -6.1634297, -5.9530115...","[0.26566187, 0.004322454, 0.39732587, 0.258586..."
6,ExtraTreesRegressor,"[-6.061199999999999, -5.890700000000001, -4.97...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.063950000000001, -5.949999999999999, -6.3...","[-6.14544, -5.980459999999999, -6.309740000000...","[0.06385295138049563, 0.06091999999999977, 0.1..."
7,LinearRegression,"[-4.0, -9.893764330554976, -4.0, -6.6463292364...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-8.166542568292709, -5.950000000000008, -9.8...","[-8.994984114737571, -5.55999999999998, -6.374...","[1.3036665572284245, 0.7799999999999903, 2.907..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.039999999999999, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.924666666666667, -5.79...","[0.11400974617208051, 0.05153639490690065, 0.1..."
9,SVR,"[-6.065991176813832, -5.837746800094085, -5.25...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.048915546498334, -6.0164298353037236, -5....","[-6.09638127541406, -6.038309374361353, -5.979...","[0.02942561150665576, 0.012142830020049095, 0...."


In [152]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_const_rem_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_const_rem_RRCK.csv')

In [153]:
#2d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 1721)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1721)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51508
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1553
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51249
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1557
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1987,0.3243,0.4458,0.5567,0.753,0.7906,0.4459,0.4316,0.6677,0.3832,0.6889,0.7824
DecisionTreeRegressor,0.3129,0.4013,0.5594,0.302,0.6178,0.6328,0.3964,0.4544,0.6296,0.4516,0.72,0.7151
RandomForestRegressor,0.198,0.3312,0.445,0.5583,0.7641,0.7923,0.457,0.4541,0.676,0.3677,0.7116,0.7957
GradientBoostingRegressor,0.2013,0.3182,0.4487,0.5509,0.7493,0.7808,0.3429,0.3916,0.5856,0.5256,0.7838,0.8393
AdaBoostRegressor,0.207,0.3404,0.455,0.5383,0.7429,0.7744,0.4642,0.467,0.6813,0.3579,0.6922,0.7611
XGBRegressor,0.2459,0.3683,0.4959,0.4515,0.6777,0.713,0.4076,0.4359,0.6384,0.4362,0.7417,0.7779
ExtraTreesRegressor,0.1714,0.3047,0.414,0.6177,0.7972,0.8248,0.4311,0.4268,0.6566,0.4036,0.7169,0.7839
LinearRegression,6.6726,2.0579,2.5831,-13.8849,0.18,0.1753,4.0516,1.6252,2.0129,-4.6051,0.0361,0.0598
KNeighborsRegressor,0.3046,0.3934,0.5519,0.3205,0.6161,0.6548,0.5622,0.4797,0.7498,0.2222,0.6474,0.7203
SVR,0.1982,0.3131,0.4452,0.558,0.7555,0.7809,0.4577,0.4387,0.6765,0.3668,0.6874,0.7891


In [154]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.042740855827409, -6.130015020955236, -5.12...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.054159473685268, -6.006641824153537, -6.3...","[-6.053750049081272, -6.086965147509413, -6.09...","[0.1545513555943005, 0.11752660344468674, 0.15..."
1,DecisionTreeRegressor,"[-5.75, -5.85, -5.4, -4.96, -5.39, -6.5, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.96, -5.95, -7.48, -6.22, -6.96, -5.4, -6....","[-6.166, -5.933999999999999, -6.45999999999999...","[0.6312400494265237, 0.03200000000000003, 0.57..."
2,RandomForestRegressor,"[-5.97545, -5.960400000000002, -5.189500000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.6920375000000005, -5.983499999999998, -6....","[-6.035511166666666, -6.041489999999999, -6.10...","[0.18724740209264357, 0.04970521501814553, 0.1..."
3,GradientBoostingRegressor,"[-6.098484066049467, -6.250006105051718, -5.01...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.776933808058342, -5.953281568330001, -6.6...","[-6.099437579592107, -5.955640067017364, -6.51...","[0.24984169222556724, 0.011022839891349587, 0...."
4,AdaBoostRegressor,"[-6.183541666666666, -6.183541666666666, -5.19...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.256929824561404, -6.2327499999999985, -5....","[-6.201460499073771, -6.177264046822742, -6.13...","[0.09536881361803985, 0.12313561243654257, 0.3..."
5,XGBRegressor,"[-6.0916457, -6.236822, -5.117636, -5.336735, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9109235, -5.9507823, -6.506944, -5.875622...","[-6.0789757, -5.9413376, -6.2795434, -5.771903...","[0.28696758, 0.017888887, 0.3709551, 0.2487358..."
6,ExtraTreesRegressor,"[-6.146099999999998, -5.9387000000000025, -4.9...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.981000000000002, -5.949999999999999, -6.4...","[-6.128196666666668, -5.98042, -6.292200000000...","[0.08918066706286591, 0.0608399999999996, 0.21..."
7,LinearRegression,"[-10.0, -10.0, -4.0, -9.141282098854738, -4.0,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9485167983413, -5.949999999999967, -10.0,...","[-9.189703359668261, -5.92722042369775, -6.4, ...","[1.6205932806634797, 0.04555915260454953, 2.93..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.924666666666667, -5.92...","[0.11400974617208051, 0.05153639490690065, 0.2..."
9,SVR,"[-6.0683822629088215, -5.797269607299271, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0462166925957765, -6.013816070526946, -5....","[-6.090290802825197, -6.0299482448349035, -5.9...","[0.027256844176875527, 0.012804813812981944, 0..."


In [155]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_LVR_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_LVR_RRCK.csv')

In [156]:
#2d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 1721)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1721)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51508
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1553
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51249
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1557
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1987,0.3243,0.4458,0.5567,0.753,0.7906,0.4459,0.4316,0.6677,0.3832,0.6889,0.7824
DecisionTreeRegressor,0.3129,0.4013,0.5594,0.302,0.6178,0.6328,0.3964,0.4544,0.6296,0.4516,0.72,0.7151
RandomForestRegressor,0.198,0.3312,0.445,0.5583,0.7641,0.7923,0.457,0.4541,0.676,0.3677,0.7116,0.7957
GradientBoostingRegressor,0.2013,0.3182,0.4487,0.5509,0.7493,0.7808,0.3429,0.3916,0.5856,0.5256,0.7838,0.8393
AdaBoostRegressor,0.207,0.3404,0.455,0.5383,0.7429,0.7744,0.4642,0.467,0.6813,0.3579,0.6922,0.7611
XGBRegressor,0.2459,0.3683,0.4959,0.4515,0.6777,0.713,0.4076,0.4359,0.6384,0.4362,0.7417,0.7779
ExtraTreesRegressor,0.1714,0.3047,0.414,0.6177,0.7972,0.8248,0.4311,0.4268,0.6566,0.4036,0.7169,0.7839
LinearRegression,6.6726,2.0579,2.5831,-13.8849,0.18,0.1753,4.0516,1.6252,2.0129,-4.6051,0.0361,0.0598
KNeighborsRegressor,0.3046,0.3934,0.5519,0.3205,0.6161,0.6548,0.5622,0.4797,0.7498,0.2222,0.6474,0.7203
SVR,0.1982,0.3131,0.4452,0.558,0.7555,0.7809,0.4577,0.4387,0.6765,0.3668,0.6874,0.7891


In [157]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.042740855827409, -6.130015020955236, -5.12...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.054159473685268, -6.006641824153537, -6.3...","[-6.053750049081272, -6.086965147509413, -6.09...","[0.1545513555943005, 0.11752660344468674, 0.15..."
1,DecisionTreeRegressor,"[-5.75, -5.85, -5.4, -4.96, -5.39, -6.5, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.96, -5.95, -7.48, -6.22, -6.96, -5.4, -6....","[-6.166, -5.933999999999999, -6.45999999999999...","[0.6312400494265237, 0.03200000000000003, 0.57..."
2,RandomForestRegressor,"[-5.9754499999999995, -5.960400000000002, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.692037499999999, -5.983499999999998, -6.2...","[-6.035511166666666, -6.041489999999999, -6.10...","[0.18724740209264407, 0.04970521501814566, 0.1..."
3,GradientBoostingRegressor,"[-6.098484066049467, -6.250006105051718, -5.01...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.776933808058342, -5.953281568330001, -6.6...","[-6.099437579592107, -5.955640067017364, -6.51...","[0.24984169222556724, 0.011022839891349587, 0...."
4,AdaBoostRegressor,"[-6.183541666666666, -6.183541666666666, -5.19...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.256929824561404, -6.2327499999999985, -5....","[-6.201460499073771, -6.177264046822742, -6.13...","[0.09536881361803985, 0.12313561243654257, 0.3..."
5,XGBRegressor,"[-6.0916457, -6.236822, -5.117636, -5.336735, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9109235, -5.9507823, -6.506944, -5.875622...","[-6.0789757, -5.9413376, -6.2795434, -5.771903...","[0.28696758, 0.017888887, 0.3709551, 0.2487358..."
6,ExtraTreesRegressor,"[-6.146099999999998, -5.938700000000003, -4.99...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.981, -5.949999999999999, -6.4587000000000...","[-6.128196666666666, -5.98042, -6.292200000000...","[0.0891806670628665, 0.0608399999999996, 0.219..."
7,LinearRegression,"[-10.0, -10.0, -4.0, -9.141282098854738, -4.0,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9485167983413, -5.949999999999967, -10.0,...","[-9.189703359668261, -5.92722042369775, -6.4, ...","[1.6205932806634797, 0.04555915260454953, 2.93..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.890000000000001, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.924666666666667, -5.92...","[0.11400974617208051, 0.05153639490690065, 0.2..."
9,SVR,"[-6.0683822629088215, -5.797269607299271, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0462166925957765, -6.013816070526946, -5....","[-6.090290802825197, -6.0299482448349035, -5.9...","[0.027256844176875527, 0.012804813812981944, 0..."


In [158]:
def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [159]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [160]:
df_train = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
X_train = df_train[selected_features] 
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv')
df_test =df_test.dropna()
X_test =  df_test[X_train.columns]
y_test =  df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 200)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 200)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6368
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 183
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6372
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 183
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2137,0.3283,0.4622,0.5234,0.7266,0.7498,0.4561,0.4375,0.6753,0.369,0.6751,0.7531
DecisionTreeRegressor,0.3933,0.463,0.6271,0.1226,0.5375,0.5785,0.5516,0.4584,0.7427,0.2369,0.6094,0.7116
RandomForestRegressor,0.2099,0.3391,0.4581,0.5319,0.7449,0.7776,0.4756,0.454,0.6896,0.3421,0.6942,0.7826
GradientBoostingRegressor,0.2042,0.3199,0.4519,0.5445,0.7423,0.763,0.3961,0.4113,0.6293,0.4521,0.7336,0.8048
AdaBoostRegressor,0.2125,0.3475,0.461,0.526,0.7321,0.7661,0.4589,0.4713,0.6775,0.3651,0.7091,0.7837
XGBRegressor,0.2296,0.3399,0.4791,0.4879,0.7084,0.7374,0.4629,0.4377,0.6804,0.3596,0.6716,0.7568
ExtraTreesRegressor,0.1674,0.2896,0.4092,0.6265,0.8039,0.8286,0.398,0.4004,0.6308,0.4494,0.7362,0.8177
LinearRegression,10.3504,2.7293,3.2172,-22.0891,-0.055,-0.0578,3.6156,1.4842,1.9015,-4.002,0.4389,0.4211
KNeighborsRegressor,0.2475,0.3469,0.4975,0.4479,0.7125,0.7413,0.5002,0.4372,0.7072,0.308,0.6956,0.799
SVR,0.1831,0.2984,0.4279,0.5915,0.7788,0.8017,0.4177,0.4272,0.6463,0.4222,0.7191,0.7725


In [161]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.964825361867922, -6.077557099554694, -5.37...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.062920239971026, -6.046056711494199, -6.1...","[-6.2123817498350595, -6.099024971665787, -6.0...","[0.08899109390533108, 0.08366785956757977, 0.0..."
1,DecisionTreeRegressor,"[-5.75, -6.92, -4.72, -5.35, -5.57, -6.2, -5.3...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -5.95, -6.46, -6.96, -6.46, -4.72, -6...","[-6.2299999999999995, -5.909999999999999, -5.6...","[0.40630038149132963, 0.08000000000000007, 0.6..."
2,RandomForestRegressor,"[-5.938850000000002, -6.0618000000000025, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.021049999999999, -5.952899999999999, -6.0...","[-6.1039, -6.015899999999999, -5.9589600000000...","[0.06550886962847223, 0.046350965469988445, 0...."
3,GradientBoostingRegressor,"[-5.957059228573685, -5.974052517076113, -5.17...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.877567455757229, -5.9472873677240115, -6....","[-6.068035855570859, -5.9624130098537576, -6.2...","[0.16102305285259572, 0.021021184693871285, 0...."
4,AdaBoostRegressor,"[-5.960999999999999, -6.340000000000001, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.928461538461539, -5.984196428571429, -6.2...","[-6.003092307692308, -5.991048809523809, -6.15...","[0.10442120440309986, 0.06718877866472982, 0.1..."
5,XGBRegressor,"[-5.987833, -6.585021, -5.0471787, -5.843211, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.207873, -5.9503126, -6.5326333, -6.522365...","[-6.221828, -5.981726, -6.391731, -6.4018874, ...","[0.12702148, 0.06331007, 0.22075535, 0.0991331..."
6,ExtraTreesRegressor,"[-6.043899999999997, -6.035900000000002, -4.99...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.034799999999998, -5.949999999999999, -6.2...","[-6.12787, -5.9624, -6.15050333333334, -6.2563...","[0.05718154947183715, 0.02480000000000082, 0.1..."
7,LinearRegression,"[-10.0, -10.0, -10.0, -10.0, -4.0, -4.0, -4.0,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-10.0, -5.949995756149292, -10.0, -4.0, -10....","[-8.8, -6.7600058685056865, -8.8, -8.8, -7.6, ...","[2.4, 1.6199970657997123, 2.4, 2.4, 2.93938769..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.956666666666666, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.6...","[-6.118666666666667, -5.924666666666667, -5.59...","[0.11400974617208051, 0.05153639490690065, 0.0..."
9,SVR,"[-6.086433033735662, -6.003214623674186, -5.14...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.038152242726986, -6.004717886322186, -6.1...","[-6.08329100909606, -6.006315510035184, -6.033...","[0.028653453042295433, 0.007113596218060493, 0..."


In [162]:
result_df.to_csv('results/Descriptors/Results_2D_All_desc_LVR_remove_corr_features_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_All_desc_LVRremove_corr_features_RRCK.csv')

In [163]:
#3d RDKit descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_RDKit_desc_RRCK.csv')
df_train = df_train.fillna(0)
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_RDKit_desc_RRCK.csv')
df_test = df_test.fillna(0)
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 11)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 11)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 429
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 11
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 429
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 11
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 



-4.102362540664324




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4171,0.5321,0.6459,0.0695,0.3434,0.3533,0.7117,0.6103,0.8436,0.0154,0.5157,0.5045
DecisionTreeRegressor,0.7283,0.6369,0.8534,-0.6246,0.1887,0.2321,0.6613,0.6065,0.8132,0.0851,0.3956,0.3891
RandomForestRegressor,0.4255,0.523,0.6523,0.0509,0.3475,0.3355,0.6616,0.5853,0.8134,0.0848,0.5223,0.5197
GradientBoostingRegressor,0.4802,0.5527,0.693,-0.0712,0.2986,0.3163,0.7728,0.655,0.8791,-0.0691,0.3654,0.4007
AdaBoostRegressor,0.4075,0.5227,0.6384,0.0909,0.3669,0.3532,0.7177,0.6009,0.8472,0.0071,0.4682,0.5293
XGBRegressor,0.5549,0.5911,0.7449,-0.2378,0.2782,0.2638,0.7189,0.6086,0.8479,0.0054,0.4615,0.4777
ExtraTreesRegressor,0.4267,0.5125,0.6533,0.048,0.3579,0.3593,0.6589,0.6012,0.8117,0.0884,0.5526,0.5881
LinearRegression,0.3937,0.51,0.6275,0.1217,0.3908,0.3933,0.7267,0.6479,0.8525,-0.0054,0.4093,0.4205
KNeighborsRegressor,0.4776,0.5526,0.6911,-0.0653,0.272,0.2538,0.6708,0.6277,0.819,0.072,0.496,0.4646
SVR,0.4128,0.5032,0.6425,0.0792,0.3576,0.377,0.7149,0.618,0.8455,0.011,0.4411,0.4682


In [164]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.872003524851464, -5.942129055819188, -5.31...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.755940552295383, -5.856191468088004, -5.9...","[-5.795272979695544, -5.953710163105655, -5.81...","[0.09840001832947044, 0.11618448601482735, 0.1..."
1,DecisionTreeRegressor,"[-6.78, -6.05, -5.42, -6.05, -6.2, -6.2, -5.15...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.78, -5.84, -5.27, -6.46, -6.46, -5.42, -6...","[-6.628, -6.014, -6.222, -6.736, -6.5179999999...","[0.14400000000000013, 0.32927799805027974, 0.4..."
2,RandomForestRegressor,"[-6.299199999999996, -5.87615, -5.365150000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.523199999999998, -5.798650000000001, -6.2...","[-6.503379999999998, -5.898920000000002, -6.12...","[0.0908385689010977, 0.11954100802653297, 0.23..."
3,GradientBoostingRegressor,"[-5.904479547823848, -5.773721757026033, -5.30...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.903864880670385, -5.915039155870065, -6.2...","[-6.63451808711072, -5.8174296957989595, -6.03...","[0.3195249532101394, 0.22835808247725112, 0.25..."
4,AdaBoostRegressor,"[-6.26777777777778, -5.799999999999999, -4.885...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.26777777777778, -6.0, -6.05, -6.267777777...","[-6.379655555555556, -5.91436234931315, -5.960...","[0.19669131797794523, 0.11880587095018216, 0.2..."
5,XGBRegressor,"[-6.2667065, -6.00524, -5.41745, -6.151754, -5...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8767705, -5.5623827, -6.2302837, -6.46231...","[-6.4135537, -5.591857, -6.3050094, -6.5786386...","[0.31780335, 0.2158499, 0.16549943, 0.14937702..."
6,ExtraTreesRegressor,"[-6.149899999999994, -5.91865, -5.193099999999...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.551199999999995, -5.759200000000001, -6.3...","[-6.537019999999998, -5.839330000000002, -6.36...","[0.12561077023885894, 0.17077601002482506, 0.0..."
7,LinearRegression,"[-6.179801415444179, -5.618190818597676, -5.53...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.786285968466434, -6.030062181191648, -5.9...","[-6.69649226219903, -5.929780987620515, -6.046...","[0.21159755836590985, 0.09627386048396676, 0.1..."
8,KNeighborsRegressor,"[-6.1000000000000005, -5.373333333333332, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.396666666666667, -5.816666666666666, -6.6...","[-6.462666666666666, -6.003333333333333, -6.44...","[0.0945727233402952, 0.10654159333852271, 0.20..."
9,SVR,"[-5.966884806434886, -5.645558420755609, -5.13...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.409146270334343, -5.717958608151982, -6.3...","[-6.446659390877706, -5.924377573856949, -6.24...","[0.22078497324284418, 0.14375954333557256, 0.1..."


In [165]:
result_df.to_csv('results/Descriptors/Results_3D_RDKit_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_RDKit_desc_RRCK.csv')

In [166]:
#3d Padel descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_padel_curated_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_padel_curated_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 431)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 431)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000850 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16797
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 431
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16796
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 431
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over



-0.6656756991038764


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2801,0.4068,0.5293,0.3751,0.6133,0.6163,0.5844,0.4899,0.7645,0.1915,0.5505,0.6311
DecisionTreeRegressor,0.5784,0.5636,0.7605,-0.2902,0.3273,0.3475,0.6083,0.5419,0.7799,0.1585,0.5109,0.5802
RandomForestRegressor,0.2948,0.4193,0.543,0.3424,0.5914,0.5812,0.5902,0.5144,0.7682,0.1835,0.568,0.6517
GradientBoostingRegressor,0.2808,0.3969,0.5299,0.3735,0.6125,0.6244,0.5733,0.4886,0.7572,0.2069,0.5368,0.6383
AdaBoostRegressor,0.3015,0.4214,0.5491,0.3274,0.573,0.5966,0.553,0.4869,0.7436,0.235,0.5864,0.67
XGBRegressor,0.3327,0.4403,0.5768,0.2578,0.5315,0.5241,0.6013,0.4877,0.7754,0.1681,0.5124,0.6199
ExtraTreesRegressor,0.2491,0.3897,0.4991,0.4442,0.6719,0.6764,0.54,0.4828,0.7349,0.2529,0.6062,0.692
LinearRegression,0.9267,0.7425,0.9626,-1.0672,0.3621,0.3981,0.5654,0.6081,0.7519,0.2178,0.5372,0.5596
KNeighborsRegressor,0.2747,0.3981,0.5242,0.3871,0.6353,0.6394,0.603,0.5397,0.7765,0.1658,0.5611,0.6382
SVR,0.2898,0.4083,0.5384,0.3535,0.5977,0.6075,0.5958,0.5037,0.7719,0.1757,0.5688,0.6578


In [167]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.083883518850135, -5.527057402178294, -5.53...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8292324717894095, -6.086972703160939, -5....","[-6.119379305882473, -6.265640295321556, -5.43...","[0.20211151015170611, 0.21118330209146374, 0.1..."
1,DecisionTreeRegressor,"[-5.87, -6.0, -4.72, -4.72, -5.32, -6.15, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -6.78, -5.4, -6.2, -4.72, -4.0, -6.92...","[-6.418000000000001, -6.332, -5.258, -6.226000...","[0.6548098960767165, 0.4363209827638366, 0.513..."
2,RandomForestRegressor,"[-6.025799999999996, -5.931700000000002, -5.54...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9885, -6.352349999999999, -5.412949999999...","[-6.054090000000002, -6.31396, -5.475569999999...","[0.08850796800288632, 0.10720117256821184, 0.1..."
3,GradientBoostingRegressor,"[-6.003805726670968, -5.676413695862027, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.046919364450175, -6.761120253690717, -5.1...","[-6.19881270627492, -6.564540918707979, -5.408...","[0.2037177583456372, 0.2660755471137488, 0.303..."
4,AdaBoostRegressor,"[-6.080000000000001, -6.176756756756755, -5.70...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.121666666666667, -6.176756756756755, -5.4...","[-6.1878404220779215, -6.296777858071977, -5.5...","[0.11380767642821006, 0.11916346305000047, 0.1..."
5,XGBRegressor,"[-6.1610446, -5.70738, -5.0716186, -5.33462, -...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.979779, -6.515469, -5.46173, -6.58951, -5...","[-6.068549, -6.4852037, -5.175097, -6.12424, -...","[0.32355437, 0.22838202, 0.30195123, 0.3099596..."
6,ExtraTreesRegressor,"[-6.007500000000001, -5.693450000000005, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.015900000000001, -6.267299999999998, -5.7...","[-6.105640000000003, -6.368019999999999, -5.83...","[0.08206737719703305, 0.06039335725061076, 0.1..."
7,LinearRegression,"[-6.581426525696719, -6.10630925923904, -5.337...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.270765796533121, -5.6361231459939924, -7....","[-4.740603282211241, -6.172600226959391, -6.89...","[0.6439077156744848, 0.8156642804422526, 0.838..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.62, -5.013333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8566666666666665, -6.2, -5.83333333333333...","[-6.115333333333333, -6.194, -5.94133333333333...","[0.1347854261821771, 0.03349295116554826, 0.21..."
9,SVR,"[-5.960503535423703, -5.349923221552294, -5.38...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.869916628005446, -6.183782287558216, -5.6...","[-6.039845328072169, -6.1699153499668125, -5.7...","[0.0899343155732827, 0.13173625856950746, 0.07..."


In [168]:
result_df.to_csv('results/Descriptors/Results_3D_padel_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_padel_desc_RRCK.csv')

In [169]:
df_train_rdkit = pd.read_csv('features/Descriptors/Train_3d_RDKit_desc_RRCK.csv')
df_train_rdkit = df_train_rdkit.fillna(0)
df_train_padel = pd.read_csv('features/Descriptors/Train_3d_padel_curated_RRCK.csv')

df_3d_descriptors = df_train_rdkit.merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,25557.550768,38719.861789,59078.304374,0.432605,0.655399,7.119995,0.000026,...,0.584742,0.289275,0.526481,0.467149,0.355332,51.237177,733.133564,3650.811196,0.377113,1.348962
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,25195.527353,38098.222481,54940.535676,0.458596,0.693445,6.976408,0.000028,...,0.562959,0.330424,0.573577,0.533312,0.392941,52.061975,762.354776,3612.998411,0.344438,1.499831
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,27673.061526,37624.458338,57474.870399,0.481481,0.654625,7.144444,0.000024,...,0.505864,0.386967,0.530722,0.572866,0.283984,52.691795,809.151889,3930.912570,0.339246,1.387572
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,27053.583076,35398.003591,54685.502768,0.494712,0.647301,6.978552,0.000024,...,0.543224,0.390435,0.549748,0.553013,0.367643,60.025118,987.347841,4090.444574,0.400488,1.470405
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,25621.505581,37274.558430,58032.092029,0.441506,0.642309,7.132297,0.000025,...,0.559155,0.365398,0.567980,0.530505,0.368965,55.664600,849.215301,3563.631478,0.386829,1.467450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,6396.621513,8859.792939,12249.856286,0.522179,0.723257,4.632664,0.000113,...,0.480821,0.450969,0.447622,0.401417,0.374652,28.991239,235.667908,625.054603,0.397685,1.223691
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,5807.131702,8158.649035,11412.673395,0.508832,0.714876,4.520944,0.000123,...,0.587350,0.351181,0.540411,0.549950,0.480266,27.093819,193.764285,473.031633,0.407795,1.570627
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,6143.538968,9226.806179,12259.349342,0.501131,0.752634,4.748141,0.000123,...,0.603934,0.302221,0.478983,0.563029,0.379015,26.877338,193.282773,552.730401,0.405901,1.421027
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,5497.580476,7708.634196,10921.176551,0.503387,0.705843,4.458761,0.000128,...,0.573506,0.364677,0.497225,0.533337,0.460216,26.843623,192.495908,469.419500,0.407274,1.490778


In [170]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [171]:
df_3d_descriptors.to_csv('features/Descriptors/Train_3d_all_descriptors_RRCK.csv', index=False)

In [172]:
df_test_rdkit = pd.read_csv('features/Descriptors/Test_3d_RDKit_desc_RRCK.csv')
df_test_rdkit = df_test_rdkit.fillna(0)
df_test_padel = pd.read_csv('features/Descriptors/Test_3d_padel_curated_RRCK.csv')

df_3d_descriptors = df_test_rdkit.merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,29235.649823,34655.335992,53592.422815,0.545518,0.646646,6.942832,2.2e-05,...,0.615473,0.315508,0.587862,0.481708,0.41975,54.418494,765.343933,2979.646029,0.423209,1.48932
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,24087.94386,41198.827889,58197.846259,0.413898,0.70791,7.165137,2.9e-05,...,0.53163,0.381441,0.595506,0.578602,0.384673,59.364061,994.351177,4741.587052,0.369605,1.55878
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,22337.453461,27133.737115,43092.072922,0.518366,0.629669,6.420988,2.8e-05,...,0.562651,0.292498,0.404332,0.490048,0.357058,43.246459,539.46316,2510.832727,0.343976,1.251438
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,21040.064822,23207.945553,36960.965663,0.569251,0.627904,6.241096,3e-05,...,0.538203,0.316007,0.401819,0.490964,0.334962,42.515208,532.522256,2480.513318,0.307305,1.227745
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,16994.956964,24094.700818,35431.89834,0.479651,0.680029,6.156698,4e-05,...,0.531873,0.36771,0.524581,0.567214,0.404237,41.840369,500.515903,1980.84397,0.349375,1.496032
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,17472.947386,17989.322133,28544.470331,0.612131,0.630221,5.794122,3.6e-05,...,0.493579,0.344873,0.529573,0.541,0.350437,37.662157,433.577101,1940.275168,0.257679,1.421011
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,13628.705114,21498.8852,30535.503048,0.446323,0.704062,5.909303,5.2e-05,...,0.547537,0.331118,0.541763,0.498942,0.284356,38.037688,416.581031,1665.382796,0.321306,1.325061
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,14236.954483,23107.885866,33197.287392,0.428859,0.696078,6.198006,4.9e-05,...,0.478243,0.413066,0.490229,0.356166,0.318931,40.910881,492.776011,2003.890601,0.336964,1.165325
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,14226.80433,17829.943731,27524.068649,0.516886,0.647795,5.746749,4.6e-05,...,0.448246,0.35707,0.434281,0.561102,0.351591,32.29146,330.378307,1411.879635,0.207975,1.346974
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,11077.747263,22031.587026,31047.11511,0.356804,0.709618,6.03725,6.4e-05,...,0.568751,0.361967,0.501315,0.546106,0.373435,36.699473,364.122716,1105.825587,0.396077,1.420856


In [173]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [174]:
df_3d_descriptors.to_csv('features/Descriptors/Test_3d_all_descriptors_RRCK.csv', index=False)

In [175]:
#3d All descriptors
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_RRCK.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models_3dall = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_3dall, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 442)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 442)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17226
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 442
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17225
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 442
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over



-0.9620996450272525




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2815,0.4166,0.5306,0.372,0.61,0.617,0.5896,0.4864,0.7679,0.1843,0.5498,0.6456
DecisionTreeRegressor,0.5402,0.5613,0.735,-0.205,0.3819,0.4014,0.6216,0.551,0.7884,0.1401,0.506,0.5632
RandomForestRegressor,0.2887,0.4169,0.5374,0.3559,0.6044,0.5985,0.5965,0.5106,0.7723,0.1748,0.5609,0.6615
GradientBoostingRegressor,0.282,0.4093,0.531,0.371,0.6095,0.6173,0.5547,0.4756,0.7448,0.2326,0.5684,0.6759
AdaBoostRegressor,0.2868,0.4103,0.5356,0.3601,0.6025,0.6143,0.602,0.4988,0.7759,0.1671,0.5419,0.6662
XGBRegressor,0.3386,0.446,0.5819,0.2447,0.524,0.5119,0.5615,0.4785,0.7494,0.2231,0.5625,0.6649
ExtraTreesRegressor,0.2447,0.3869,0.4947,0.4541,0.6826,0.6916,0.5433,0.4873,0.7371,0.2484,0.6071,0.699
LinearRegression,0.8107,0.6945,0.9004,-0.8085,0.3745,0.3949,0.5279,0.5985,0.7266,0.2697,0.5557,0.5181
KNeighborsRegressor,0.3083,0.4118,0.5552,0.3123,0.586,0.5926,0.5707,0.5204,0.7555,0.2104,0.5839,0.625
SVR,0.2828,0.4112,0.5318,0.3691,0.6116,0.6289,0.5813,0.4905,0.7624,0.1958,0.5898,0.6658


In [176]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.061280204336053, -5.642516435070985, -5.63...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.9279174118521665, -6.118134729855113, -5....","[-6.125998388780452, -6.276067685891588, -5.41...","[0.14383906188286094, 0.1537500222151755, 0.14..."
1,DecisionTreeRegressor,"[-6.13, -6.0, -4.96, -4.92, -5.32, -6.1, -5.64...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.57, -6.78, -5.52, -6.2, -5.255, -4.0, -6....","[-6.248, -6.578, -5.2780000000000005, -6.15799...","[0.5553161261839962, 0.601544678307439, 0.5262..."
2,RandomForestRegressor,"[-6.049149999999998, -5.893850000000001, -5.49...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0302000000000024, -6.325849999999999, -5....","[-6.099100000000002, -6.2869, -5.426, -5.93548...","[0.10746236550532345, 0.11825796801907063, 0.1..."
3,GradientBoostingRegressor,"[-5.877677273988712, -5.641289130325236, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.974122401074046, -6.606321263160108, -5.2...","[-6.196383999264733, -6.497362121604473, -5.38...","[0.2783375982040261, 0.24256085435893085, 0.16..."
4,AdaBoostRegressor,"[-6.075999999999999, -5.79909090909091, -5.354...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.075999999999999, -6.386666666666666, -5.4...","[-6.1279564102564095, -6.287823333333334, -5.4...","[0.07019450932034847, 0.07619581032518243, 0.1..."
5,XGBRegressor,"[-5.9296527, -5.7853117, -5.1196985, -5.320116...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.923534, -6.290812, -5.238607, -6.2919836,...","[-5.977851, -6.294189, -5.295204, -6.0961604, ...","[0.31703836, 0.29552066, 0.22809951, 0.2183583..."
6,ExtraTreesRegressor,"[-6.076999999999998, -5.634750000000006, -5.22...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.125200000000001, -6.153999999999999, -5.8...","[-6.2767800000000005, -6.27912, -5.90778999999...","[0.15398305621073985, 0.12019216946207216, 0.1..."
7,LinearRegression,"[-6.704425652392967, -5.942027064982862, -5.23...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-4.7115342758379395, -6.427760858178813, -7....","[-5.019749770770046, -6.595834625294442, -6.81...","[0.48009957083890836, 0.8044439316210611, 0.70..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.62, -5.013333333333333...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8566666666666665, -6.2, -5.83333333333333...","[-6.115333333333333, -6.194, -5.94133333333333...","[0.1347854261821771, 0.03349295116554826, 0.21..."
9,SVR,"[-5.982090047018158, -5.356810667292101, -5.37...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.902489096417722, -6.200665883282355, -5.7...","[-6.062269622460747, -6.176095126637981, -5.79...","[0.08652478856876819, 0.1287023318712948, 0.06..."


In [177]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_RRCK.csv')

In [178]:
#3d All descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models_3dall = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_3dall, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 442)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 442)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17226
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 442
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17225
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 442
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over



-0.9620996450272525




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2815,0.4166,0.5306,0.372,0.61,0.617,0.5896,0.4864,0.7679,0.1843,0.5498,0.6456
DecisionTreeRegressor,0.5402,0.5613,0.735,-0.205,0.3819,0.4014,0.6216,0.551,0.7884,0.1401,0.506,0.5632
RandomForestRegressor,0.2887,0.4169,0.5374,0.3559,0.6044,0.5985,0.5965,0.5106,0.7723,0.1748,0.5609,0.6615
GradientBoostingRegressor,0.282,0.4093,0.531,0.371,0.6095,0.6173,0.5547,0.4756,0.7448,0.2326,0.5684,0.6759
AdaBoostRegressor,0.2868,0.4103,0.5356,0.3601,0.6025,0.6143,0.602,0.4988,0.7759,0.1671,0.5419,0.6662
XGBRegressor,0.3386,0.446,0.5819,0.2447,0.524,0.5119,0.5615,0.4785,0.7494,0.2231,0.5625,0.6649
ExtraTreesRegressor,0.2447,0.3869,0.4947,0.4541,0.6826,0.6916,0.5433,0.4873,0.7371,0.2484,0.6071,0.699
LinearRegression,0.8107,0.6945,0.9004,-0.8085,0.3745,0.3949,0.5279,0.5985,0.7266,0.2697,0.5557,0.5181
KNeighborsRegressor,0.3083,0.4118,0.5552,0.3123,0.586,0.5926,0.5707,0.5204,0.7555,0.2104,0.5839,0.625
SVR,0.2828,0.4112,0.5318,0.3691,0.6116,0.6289,0.5813,0.4905,0.7624,0.1958,0.5898,0.6658


In [179]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_const_rem_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_const_rem_RRCK.csv')

In [180]:
#3d All descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_RRCK.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_RRCK.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 375)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 375)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14615
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 375
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14614
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 375
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over



-1.328406480175389




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2541,0.3914,0.504,0.4333,0.6607,0.6582,0.5417,0.4767,0.736,0.2506,0.5895,0.6807
DecisionTreeRegressor,0.57,0.5722,0.755,-0.2716,0.3438,0.3582,0.6585,0.5753,0.8115,0.089,0.4575,0.5163
RandomForestRegressor,0.2854,0.4162,0.5343,0.3632,0.6099,0.6142,0.5851,0.5145,0.7649,0.1906,0.5672,0.6498
GradientBoostingRegressor,0.2791,0.3982,0.5283,0.3774,0.6168,0.6402,0.5392,0.468,0.7343,0.254,0.5781,0.6665
AdaBoostRegressor,0.2913,0.4134,0.5397,0.3503,0.5945,0.6432,0.5759,0.5172,0.7589,0.2033,0.5747,0.6592
XGBRegressor,0.3499,0.449,0.5916,0.2194,0.507,0.5026,0.5191,0.4726,0.7205,0.2818,0.5993,0.6678
ExtraTreesRegressor,0.2562,0.3966,0.5061,0.4286,0.6595,0.6767,0.543,0.4858,0.7369,0.2488,0.6013,0.6759
LinearRegression,0.855,0.7445,0.9247,-0.9074,0.3518,0.3479,0.5394,0.6218,0.7344,0.2538,0.5717,0.5878
KNeighborsRegressor,0.2788,0.4025,0.528,0.378,0.6275,0.6282,0.5709,0.4953,0.7555,0.2103,0.5771,0.6335
SVR,0.266,0.3919,0.5158,0.4065,0.6457,0.6634,0.6085,0.5161,0.78,0.1582,0.55,0.625


In [181]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.1430883691767, -5.395675073910066, -5.3797...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.833089740429352, -6.147505760880303, -5.3...","[-6.17016398474901, -6.205525816150542, -5.503...","[0.2094826804359627, 0.22297928816455892, 0.13..."
1,DecisionTreeRegressor,"[-6.13, -6.1, -5.1, -4.72, -5.32, -6.2, -5.4, ...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.13, -7.48, -5.52, -6.2, -5.1, -4.0, -6.92...","[-6.324, -6.562, -5.3, -5.8260000000000005, -5...","[0.5791925413884405, 0.7581662086904166, 0.430..."
2,RandomForestRegressor,"[-6.107700000000001, -5.803900000000001, -5.45...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.055650000000002, -6.247500000000001, -5.5...","[-6.1729899999999995, -6.3099099999999995, -5....","[0.10599152041554974, 0.1448530096339017, 0.09..."
3,GradientBoostingRegressor,"[-6.1238104017940085, -5.517853282453478, -5.1...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0299184172677505, -6.699628349472598, -5....","[-6.316615901649781, -6.477687587788493, -5.42...","[0.1906780016462925, 0.2656656208368843, 0.343..."
4,AdaBoostRegressor,"[-5.937142857142857, -5.58, -5.31235294117647,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.01, -6.3741666666666665, -5.3305263157894...","[-6.172385073260073, -6.260632420091325, -5.53...","[0.12451918280468363, 0.12144402524065999, 0.2..."
5,XGBRegressor,"[-6.1512623, -5.803018, -4.960959, -5.4792514,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0198164, -6.4950795, -5.431509, -6.380409...","[-6.2488866, -6.2577353, -5.5189705, -6.422532...","[0.16434918, 0.23736133, 0.47906342, 0.1811814..."
6,ExtraTreesRegressor,"[-6.065499999999998, -5.597900000000003, -5.20...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.062449999999999, -6.256399999999998, -5.8...","[-6.26399, -6.3061799999999995, -5.87854, -6.1...","[0.1598229720659706, 0.12200885869476596, 0.18..."
7,LinearRegression,"[-6.703641805614134, -5.184239037233611, -5.29...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.430048763013518, -5.152499054380396, -9.6...","[-5.284210067342322, -5.731950659807234, -7.46...","[0.2867808558912039, 0.8393200286291204, 1.269..."
8,KNeighborsRegressor,"[-5.983333333333333, -5.663333333333334, -5.01...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -6.133333333333333, -5.8...","[-6.1659999999999995, -6.154, -5.9239999999999...","[0.0956475009837916, 0.04133333333333375, 0.18..."
9,SVR,"[-5.969112754584967, -5.322032241719517, -5.34...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.905727313418048, -6.209875646430859, -5.7...","[-6.074561637265738, -6.16348140681811, -5.790...","[0.09191746328543957, 0.144159932290712, 0.076..."


In [182]:
result_df.to_csv('results/Descriptors/Results_3D_All_desc_LVR_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_3D_All_desc_LVR_RRCK.csv')

In [183]:
#2d and 3d descriptors all
df_train_2d = pd.read_csv('features/Descriptors/Train_2d_all_descriptors_RRCK.csv')
df_train_2d
df_train_3d = pd.read_csv('features/Descriptors/Train_3d_all_descriptors_RRCK.csv')
df_train_3d

df_2d_3d_train = df_train_2d.merge(df_train_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_train.to_csv('features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv', index=False)
df_2d_3d_train

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2358,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C(...,-6.13,15.193873,15.193873,0.130769,-1.621791,0.147476,26.802326,1216.662,...,0.584742,0.289275,0.526481,0.467149,0.355332,51.237177,733.133564,3650.811196,0.377113,1.348962
1,2359,C/C=C/C[C@@H](C)C(=O)[C@H]1C(=O)N[C@@H](C(C)C)...,-6.66,15.152762,15.152762,0.128114,-1.816236,0.134993,26.372093,1214.646,...,0.562959,0.330424,0.573577,0.533312,0.392941,52.061975,762.354776,3612.998411,0.344438,1.499831
2,2357,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.95,15.129540,15.129540,0.022871,-1.609940,0.147925,26.905882,1202.635,...,0.505864,0.386967,0.530722,0.572866,0.283984,52.691795,809.151889,3930.912570,0.339246,1.387572
3,2360,C/C=C/C[C@@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(...,-6.78,15.028142,15.028142,0.097424,-1.231351,0.116062,27.411765,1202.635,...,0.543224,0.390435,0.549748,0.553013,0.367643,60.025118,987.347841,4090.444574,0.400488,1.470405
4,2353,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](C)...,-5.87,15.068092,15.068092,0.128760,-1.611421,0.157205,27.083333,1188.608,...,0.559155,0.365398,0.567980,0.530505,0.368965,55.664600,849.215301,3563.631478,0.386829,1.467450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2336,CC[C@@H]1NC(=O)[C@@H](CC)NC(=O)[C@H](CC(C)C)NC...,-5.39,13.940180,13.940180,0.023137,-1.031903,0.289440,27.260870,640.826,...,0.480821,0.450969,0.447622,0.401417,0.374652,28.991239,235.667908,625.054603,0.397685,1.223691
136,2306,CC(C)C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H]2CCCN...,-4.75,13.929912,13.929912,0.048761,-0.934108,0.462849,28.954545,620.836,...,0.587350,0.351181,0.540411,0.549950,0.480266,27.093819,193.764285,473.031633,0.407795,1.570627
137,2334,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]...,-5.58,13.805528,13.805528,0.023859,-1.030988,0.318688,27.954545,612.772,...,0.603934,0.302221,0.478983,0.563029,0.379015,26.877338,193.282773,552.730401,0.405901,1.421027
138,2305,CCC[C@@H]1C(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)...,-4.85,13.850263,13.850263,0.054388,-0.925132,0.468423,29.209302,606.809,...,0.573506,0.364677,0.497225,0.533337,0.460216,26.843623,192.495908,469.419500,0.407274,1.490778


In [184]:
df_test_2d = pd.read_csv('features/Descriptors/Test_2d_all_descriptors_RRCK.csv')
df_test_2d
df_test_3d = pd.read_csv('features/Descriptors/Test_3d_all_descriptors_RRCK.csv')
df_test_3d

df_2d_3d_test = df_test_2d.merge(df_test_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_test.to_csv('features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv', index=False)
df_2d_3d_test

Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2352,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H]([C...,-6.34,15.14449,15.14449,0.113131,-1.744209,0.128505,27.116279,1218.634,...,0.615473,0.315508,0.587862,0.481708,0.41975,54.418494,765.343933,2979.646029,0.423209,1.48932
1,5669,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...,-5.76,15.12954,15.12954,0.022871,-1.60994,0.147925,26.905882,1202.635,...,0.53163,0.381441,0.595506,0.578602,0.384673,59.364061,994.351177,4741.587052,0.369605,1.55878
2,1881,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,15.256898,15.256898,0.00053,-1.187472,0.146159,25.6625,1122.548,...,0.562651,0.292498,0.404332,0.490048,0.357058,43.246459,539.46316,2510.832727,0.343976,1.251438
3,5666,CCCC[C@@H]1NC(=O)[C@H](CCCC)NC(=O)[C@H](CCCC)N...,-6.46,14.719159,14.719159,0.02623,-1.109554,0.098749,24.273973,1042.443,...,0.538203,0.316007,0.401819,0.490964,0.334962,42.515208,532.522256,2480.513318,0.307305,1.227745
4,1877,CCCC[C@H]1C(=O)N(C)[C@H](CCCC)C(=O)N(C)[C@H](C...,-8.0,15.077973,15.077973,0.0257,-1.150959,0.182069,25.569444,1009.388,...,0.531873,0.36771,0.524581,0.567214,0.404237,41.840369,500.515903,1980.84397,0.349375,1.496032
5,1873,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N2CCC[C...,-4.62,15.069391,15.069391,0.013388,-1.165871,0.363344,26.720588,953.28,...,0.493579,0.344873,0.529573,0.541,0.350437,37.662157,433.577101,1940.275168,0.257679,1.421011
6,1878,CCC[C@H]1C(=O)N(C)[C@H](CC)C(=O)N(C)[C@@H](C)C...,-7.3,14.916791,14.916791,0.00378,-1.18176,0.419137,28.313433,940.197,...,0.547537,0.331118,0.541763,0.498942,0.284356,38.037688,416.581031,1665.382796,0.321306,1.325061
7,1849,O=C1CN(CCCc2ccccc2)C(=O)[C@H]2CCCN2C(=O)[C@H](...,-5.92,14.898914,14.898914,0.085369,-1.142615,0.10396,20.911765,918.152,...,0.478243,0.413066,0.490229,0.356166,0.318931,40.910881,492.776011,2003.890601,0.336964,1.165325
8,1856,CC(C)(C)C[C@@H]1NC(=O)[C@@H](Cc2ccccc2)NC(=O)[...,-5.12,14.676408,14.676408,0.002283,-1.335295,0.172783,21.9375,902.053,...,0.448246,0.35707,0.434281,0.561102,0.351591,32.29146,330.378307,1411.879635,0.207975,1.346974
9,2367,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)N(C)...,-6.93,14.782868,14.782868,0.013476,-1.206234,0.239281,24.21875,880.1,...,0.568751,0.361967,0.501315,0.546106,0.373435,36.699473,364.122716,1105.825587,0.396077,1.420856


In [185]:
#All 2d and 3d descriptors
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 3533)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 3533)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 92371
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2635
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91905
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2639
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.214,0.3459,0.4626,0.5227,0.7288,0.7554,0.47,0.4364,0.6856,0.3498,0.6734,0.7802
DecisionTreeRegressor,0.3643,0.4618,0.6036,0.1873,0.5531,0.5546,0.4559,0.4468,0.6752,0.3693,0.6731,0.7278
RandomForestRegressor,0.2142,0.3553,0.4628,0.5222,0.7427,0.7616,0.4783,0.4633,0.6916,0.3383,0.6983,0.7873
GradientBoostingRegressor,0.203,0.3334,0.4505,0.5472,0.7478,0.7555,0.4004,0.4062,0.6328,0.4461,0.7319,0.8238
AdaBoostRegressor,0.2316,0.3694,0.4812,0.4834,0.7062,0.7235,0.4998,0.4597,0.707,0.3086,0.6539,0.7483
XGBRegressor,0.2749,0.3982,0.5243,0.3868,0.6301,0.6533,0.4721,0.4513,0.6871,0.3469,0.6819,0.7798
ExtraTreesRegressor,0.1927,0.3255,0.439,0.5701,0.7648,0.7729,0.4226,0.4243,0.6501,0.4153,0.7254,0.8063
LinearRegression,0.3899,0.4606,0.6244,0.1302,0.5958,0.6226,0.3053,0.442,0.5525,0.5777,0.7709,0.7606
KNeighborsRegressor,0.2813,0.3789,0.5304,0.3725,0.6569,0.6846,0.5597,0.4739,0.7481,0.2257,0.64,0.7178
SVR,0.1945,0.3138,0.441,0.5661,0.761,0.7801,0.4426,0.4319,0.6653,0.3877,0.6998,0.789


In [186]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.984106631148668, -6.049591747297483, -5.09...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.999833488241906, -6.066504928906188, -6.1...","[-6.083826894566467, -6.174136344684146, -5.92...","[0.1662487613321342, 0.1717963136001346, 0.121..."
1,DecisionTreeRegressor,"[-5.75, -5.85, -5.4, -4.72, -5.57, -6.2, -5.57...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.39, -5.95, -6.1, -6.46, -5.9, -5.4, -6.92...","[-5.626, -6.076, -6.234, -6.386, -5.996, -5.09...","[0.5298528097500286, 0.29363923443572726, 0.18..."
2,RandomForestRegressor,"[-6.008649999999997, -5.996800000000004, -5.23...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.935649999999997, -6.1665, -6.183800000000...","[-6.09363, -6.164949999999999, -5.984350000000...","[0.08891484465487312, 0.08251992486666469, 0.1..."
3,GradientBoostingRegressor,"[-5.867823263063595, -6.240501741620101, -5.07...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.734208848252741, -6.141899408379327, -6.4...","[-6.111998427502735, -6.1561787968217265, -6.1...","[0.21285050130844616, 0.13913919704847905, 0.2..."
4,AdaBoostRegressor,"[-5.944583333333334, -6.0225, -5.2032142857142...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.013684210526315, -5.970000000000001, -6.1...","[-6.159993893387314, -6.134768324573399, -5.86...","[0.10383507672559192, 0.09255855822491685, 0.2..."
5,XGBRegressor,"[-6.092227, -6.308349, -5.0307407, -5.6635227,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8222427, -6.1322875, -6.058396, -6.032231...","[-6.0275545, -6.133803, -6.0266595, -5.911379,...","[0.2094793, 0.14878775, 0.27972436, 0.24473776..."
6,ExtraTreesRegressor,"[-6.0848, -5.812500000000003, -5.0382000000000...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.085099999999998, -6.061399999999997, -6.3...","[-6.193319999999998, -6.1379, -6.2131700000000...","[0.0667054540498756, 0.07734799286342219, 0.09..."
7,LinearRegression,"[-6.133203104058406, -5.8957312508055395, -5.2...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.727972461709701, -5.792749041251861, -8.3...","[-5.754914869117431, -6.133103207550368, -7.43...","[0.41467613130413705, 0.29070018349713234, 1.0..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.039999999999999, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.924666666666667, -5.78...","[0.11400974617208051, 0.05153639490690065, 0.1..."
9,SVR,"[-6.040947125737306, -5.78495406479584, -5.280...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.013858871931068, -6.091248206643578, -5.8...","[-6.0727349438524145, -6.094438060936506, -5.9...","[0.03426107184379328, 0.019100298923346772, 0...."


In [187]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_RRCK.csv')

In [188]:
#All 2d and 3d descriptors const rem
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 2820)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 2820)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012918 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 92371
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2635
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91905
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 2639
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.214,0.3459,0.4626,0.5227,0.7288,0.7554,0.47,0.4364,0.6856,0.3498,0.6734,0.7802
DecisionTreeRegressor,0.3911,0.4744,0.6254,0.1275,0.5308,0.5241,0.3988,0.4295,0.6315,0.4483,0.7281,0.783
RandomForestRegressor,0.2146,0.354,0.4632,0.5214,0.7433,0.7656,0.4828,0.465,0.6949,0.332,0.6941,0.786
GradientBoostingRegressor,0.197,0.3322,0.4438,0.5606,0.7573,0.7657,0.4005,0.4083,0.6329,0.4459,0.7292,0.8267
AdaBoostRegressor,0.2056,0.3492,0.4534,0.5415,0.7594,0.7684,0.4949,0.473,0.7035,0.3154,0.6777,0.7559
XGBRegressor,0.2749,0.3982,0.5243,0.3868,0.6301,0.6533,0.4721,0.4513,0.6871,0.3469,0.6819,0.7798
ExtraTreesRegressor,0.1823,0.3157,0.427,0.5933,0.7823,0.7989,0.4075,0.4164,0.6383,0.4363,0.7442,0.8235
LinearRegression,0.3899,0.4606,0.6244,0.1302,0.5958,0.6226,0.3053,0.442,0.5525,0.5777,0.7709,0.7606
KNeighborsRegressor,0.2813,0.3789,0.5304,0.3725,0.6569,0.6846,0.5597,0.4739,0.7481,0.2257,0.64,0.7178
SVR,0.1945,0.3138,0.441,0.5661,0.761,0.7796,0.4426,0.4319,0.6653,0.3877,0.6998,0.789


In [189]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.984106631148668, -6.049591747297483, -5.09...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.999833488241906, -6.066504928906188, -6.1...","[-6.083826894566467, -6.174136344684146, -5.92...","[0.1662487613321342, 0.1717963136001346, 0.121..."
1,DecisionTreeRegressor,"[-5.75, -5.85, -5.4, -4.72, -5.32, -6.22, -5.4...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.4, -5.95, -7.48, -6.96, -7.48, -5.4, -6.6...","[-5.602000000000001, -6.078, -6.56000000000000...","[0.3522726217008637, 0.2928071037389633, 0.507..."
2,RandomForestRegressor,"[-5.956599999999997, -5.986850000000004, -5.23...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.846899999999998, -6.0909, -6.150950000000...","[-6.066979999999999, -6.136979999999999, -5.95...","[0.12919629483851347, 0.04392663884250585, 0.1..."
3,GradientBoostingRegressor,"[-5.872298013178286, -6.239447449488504, -5.00...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.772621537130071, -6.03986029011038, -6.43...","[-6.176301787343389, -6.144716988512229, -6.16...","[0.232911494167955, 0.16602910080857522, 0.264..."
4,AdaBoostRegressor,"[-5.99, -5.957222222222223, -5.179130434782609...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.0528571428571425, -6.0625, -6.09363636363...","[-6.153549014778325, -6.1557279720279725, -5.8...","[0.10862921185807903, 0.12425187388809239, 0.2..."
5,XGBRegressor,"[-6.092227, -6.308349, -5.0307407, -5.6635227,...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.8222427, -6.1322875, -6.058396, -6.032231...","[-6.0275545, -6.133803, -6.0266595, -5.911379,...","[0.2094793, 0.14878775, 0.27972436, 0.24473776..."
6,ExtraTreesRegressor,"[-5.911149999999999, -5.9760000000000035, -5.0...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.054949999999998, -6.0489, -6.357050000000...","[-6.158029999999999, -6.10274, -6.233410000000...","[0.07806846098137228, 0.0533872868761837, 0.15..."
7,LinearRegression,"[-6.133203104058401, -5.895731250805537, -5.27...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.727972461709702, -5.792749041251861, -8.3...","[-5.754914869117427, -6.133103207550368, -7.43...","[0.414676131304136, 0.2907001834971347, 1.0965..."
8,KNeighborsRegressor,"[-5.983333333333333, -6.039999999999999, -4.96...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-5.983333333333333, -5.983333333333333, -5.7...","[-6.118666666666667, -5.924666666666667, -5.78...","[0.11400974617208051, 0.05153639490690065, 0.1..."
9,SVR,"[-6.040947126819007, -5.784954065316029, -5.28...",0 -6.340 1 -5.760 2 -8.000 3 -6.46...,"[[-6.013858855774663, -6.091248188563721, -5.8...","[-6.0726701748208765, -6.094373307163134, -5.9...","[0.034205313524075166, 0.019132039806664853, 0..."


In [190]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_const_rem_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_const_rem_RRCK.csv')

In [191]:
#All 2d and 3d descriptors LVR
df_train = pd.read_csv('features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (140, 2096)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 2096)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66123
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1928
[LightGBM] [Info] Start training from score -5.527679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65863
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1932
[LightGBM] [Info] Start training from score -5.528750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2085,0.3391,0.4566,0.5349,0.7424,0.7626,0.4545,0.4461,0.6742,0.3712,0.6895,0.7825
DecisionTreeRegressor,0.4112,0.4845,0.6413,0.0827,0.4838,0.4751,0.4039,0.4369,0.6356,0.4412,0.7335,0.7766
RandomForestRegressor,0.222,0.3566,0.4711,0.5048,0.7276,0.7505,0.4849,0.4709,0.6963,0.3292,0.6862,0.7735
GradientBoostingRegressor,0.2203,0.3451,0.4693,0.5087,0.7184,0.7442,0.4049,0.4187,0.6363,0.4399,0.7305,0.8016
AdaBoostRegressor,0.2209,0.3603,0.47,0.5072,0.727,0.7468,0.5265,0.4857,0.7256,0.2717,0.6206,0.7034
XGBRegressor,0.2783,0.3937,0.5275,0.3793,0.6229,0.6331,0.4232,0.4475,0.6505,0.4145,0.7304,0.7908
ExtraTreesRegressor,0.1826,0.3207,0.4273,0.5927,0.778,0.7885,0.4369,0.4285,0.661,0.3956,0.712,0.7847
LinearRegression,0.3643,0.4392,0.6036,0.1873,0.6238,0.6322,0.3141,0.4596,0.5604,0.5655,0.7608,0.739
KNeighborsRegressor,0.2981,0.392,0.546,0.3351,0.6204,0.6571,0.5567,0.4809,0.7461,0.2298,0.6416,0.7263
SVR,0.2049,0.3241,0.4527,0.5428,0.7455,0.7676,0.4708,0.4492,0.6862,0.3486,0.6744,0.7711


In [192]:
result_df.to_csv('results/Descriptors/Results_2D_3D_All_desc_LVR_RRCK.csv')
prediction_df.to_csv('results/Descriptors/Prediction_data_2D_3D_All_desc_LVR_RRCK.csv')

In [54]:
#Stacked architecture model
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [55]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    # Identify columns with variance below the threshold
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [56]:
from tqdm import tqdm
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Train/All_fingerprints_train_RRCK.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Test/All_fingerprints_test_RRCK.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Train_all_atomic_desc_RRCK.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Test_all_atomic_desc_RRCK.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
target_column = 'Permeability'
def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),   
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(140, 249)
(36, 249)
(140, 414)
(36, 414)
(140, 690)
(36, 690)
(140, 13)
(36, 13)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       ID                                             SMILES  Permeability  \
109    24  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...        -6.300   
81     26  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H...        -5.390   
73     27  CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N(C)[C@...        -5.460   
74     28  CC(C)C[C@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)N2CCC[...        -5.210   
100    29  CC(C)C[C@@H]1NC(=O)[C@H](CC(

In [57]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 5-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=5, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=5, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (5 fold CV)': mse_train,
        'Train MAE (5 fold CV)': mae_train,
        'Train RMSE (5 fold CV)': rmse_train,
        'Train R2 (5 fold CV)': r2_train,
        'Train PCC (5 fold CV)': pearson_train,
        'Train SCC (5 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8251
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 230
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8187
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 230
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8184
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 230
[LightGBM] [Info] Start trai


Training models: 1it [00:00,  2.77it/s][A




Training models: 2it [00:01,  1.06s/it][A
Training models: 3it [00:05,  2.03s/it][A
Training models: 4it [00:06,  1.74s/it][A
Training models: 5it [00:07,  1.58s/it][A
Training models: 6it [00:08,  1.48s/it][A
Training models: 10it [00:09,  1.02it/s][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:09<00:29,  9.81s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.242557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 966
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 185
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 967
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 187
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,


Training models: 1it [00:00,  1.47it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 877
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 169
[LightGBM] [Info] Start training from score -5.529777
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 921
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 181
[LightGBM] [Info] Start training from score -5.489554



Training models: 2it [00:02,  1.10s/it][A
Training models: 3it [00:02,  1.02it/s][A
Training models: 4it [00:03,  1.26it/s][A
Training models: 5it [00:04,  1.07s/it][A
Training models: 6it [00:06,  1.15s/it][A
Training models: 8it [00:06,  1.63it/s][A
Training models: 10it [00:07,  1.40it/s][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:16<00:16,  8.24s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26793
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 687
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26793
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 687
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26793
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 687
[LightGBM] [Info] Start t


Training models: 1it [00:00,  1.67it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26793
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 687
[LightGBM] [Info] Start training from score -5.489554



Training models: 2it [00:02,  1.30s/it][A
Training models: 3it [00:12,  5.22s/it][A
Training models: 4it [00:15,  4.54s/it][A
Training models: 5it [00:19,  4.10s/it][A
Training models: 6it [00:20,  3.18s/it][A
Training models: 8it [00:20,  1.65s/it][A
Training models: 9it [00:21,  1.51s/it][A
Training models: 10it [00:21,  2.20s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [00:38<00:14, 14.52s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 5
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 5
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 112, num


Training models: 1it [00:00,  3.22it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 4
[LightGBM] [Info] Start training from score -5.489554



Training models: 2it [00:01,  1.04it/s][A
Training models: 3it [00:01,  1.63it/s][A
Training models: 4it [00:02,  2.30it/s][A
Training models: 5it [00:02,  1.82it/s][A

Training models: 10it [00:04,  2.08it/s][A
Processing dataframe pairs: 100%|██████████| 4/4 [00:43<00:00, 10.94s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (140, 40)
Dimensions of meta_features_test: (36, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1478
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 40
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1471
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 40
[L



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1467
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 40
[LightGBM] [Info] Start training from score -5.489554
LGBMRegressor Evaluation completed: Test R2 score: 0.4672921674769943
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.6112248283501416




RandomForestRegressor Evaluation completed: Test R2 score: 0.5150733666948241
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.5587865874187217
AdaBoostRegressor Evaluation completed: Test R2 score: 0.5205314370200718
XGBRegressor Evaluation completed: Test R2 score: 0.5356517720219225
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.5284821365892622
LinearRegression Evaluation completed: Test R2 score: 0.6180852048242691
KNeighborsRegressor Evaluation completed: Test R2 score: 0.5044395504523412
SVR Evaluation completed: Test R2 score: 0.487105124778043
MLPRegressor Evaluation completed: Test R2 score: 0.33524534639171133


Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.206208,0.343818,0.454101,0.540005,0.738968,0.759734,0.385063,0.419419,0.620534,0.467292,0.742521,0.781926
DecisionTreeRegressor,0.404919,0.500036,0.636333,0.096732,0.555216,0.590503,0.281023,0.362278,0.530116,0.611225,0.818057,0.842597
RandomForestRegressor,0.186954,0.32519,0.432382,0.582954,0.76448,0.778277,0.350525,0.391315,0.592051,0.515073,0.769174,0.825899
GradientBoostingRegressor,0.206881,0.34311,0.454842,0.538503,0.739429,0.749829,0.318927,0.370692,0.564736,0.558787,0.79227,0.849199
AdaBoostRegressor,0.202354,0.332856,0.449838,0.548602,0.743299,0.761503,0.346579,0.397186,0.58871,0.520531,0.778476,0.819978
XGBRegressor,0.225789,0.361261,0.475173,0.496324,0.719204,0.732382,0.33565,0.398483,0.579353,0.535652,0.765805,0.804145
ExtraTreesRegressor,0.187491,0.324966,0.433002,0.581757,0.763644,0.780085,0.340832,0.390796,0.583808,0.528482,0.768366,0.824741
LinearRegression,0.221186,0.361166,0.470304,0.506592,0.730359,0.744672,0.276064,0.37611,0.525418,0.618085,0.804229,0.84199
KNeighborsRegressor,0.204898,0.345179,0.452657,0.542926,0.740622,0.756201,0.358211,0.429322,0.598508,0.50444,0.756691,0.786767
SVR,0.206167,0.348228,0.454056,0.540097,0.736176,0.744609,0.370741,0.412637,0.608885,0.487105,0.716748,0.77029


In [58]:
results_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Results_5_folds_stacked_archi_RRCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Prediction_data_5_folds_stacked_archi_RRCK.csv')

In [59]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 10-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=10, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=10, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10,-4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (10 fold CV)': mse_train,
        'Train MAE (10 fold CV)': mae_train,
        'Train RMSE (10 fold CV)': rmse_train,
        'Train R2 (10 fold CV)': r2_train,
        'Train PCC (10 fold CV)': pearson_train,
        'Train SCC (10 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9192
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 230
[LightGBM] [Info] Start training from score -5.536667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9144
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 232
[LightGBM] [Info] Start training from score -5.541706
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9162
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 232
[LightGBM] [Info] Start trai


Training models: 1it [00:00,  1.25it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9132
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 232
[LightGBM] [Info] Start training from score -5.513651
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9138
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 231
[LightGBM] [Info] Start training from score -5.519127



Training models: 2it [00:03,  2.16s/it][A
Training models: 3it [00:11,  4.43s/it][A
Training models: 4it [00:13,  3.81s/it][A
Training models: 5it [00:17,  3.60s/it][A
Training models: 6it [00:19,  3.25s/it][A
Training models: 7it [00:19,  2.22s/it][A
Training models: 9it [00:21,  1.53s/it][A
Training models: 10it [00:21,  2.16s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:21<01:04, 21.57s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1051
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 205
[LightGBM] [Info] Start training from score -5.536667
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 198
[LightGBM] [Info] Start training from score -5.541706
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training models: 1it [00:00,  1.02it/s][A




Training models: 2it [00:04,  2.21s/it][A
Training models: 3it [00:05,  2.02s/it][A
Training models: 4it [00:06,  1.66s/it][A
Training models: 5it [00:10,  2.22s/it][A
Training models: 6it [00:12,  2.32s/it][A
Training models: 7it [00:12,  1.64s/it][A
Training models: 9it [00:13,  1.05s/it][A
Training models: 10it [00:13,  1.38s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:35<00:34, 17.02s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29978
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 687
[LightGBM] [Info] Start training from score -5.536667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29993
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 687
[LightGBM] [Info] Start training from score -5.541706
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30020
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 687
[LightGBM] [Info] Start t


Training models: 1it [00:01,  1.14s/it][A
Training models: 2it [00:04,  2.49s/it][A
Training models: 3it [00:27, 11.66s/it][A
Training models: 4it [00:35, 10.21s/it][A
Training models: 5it [00:42,  9.06s/it][A
Training models: 6it [00:44,  6.90s/it][A
Training models: 7it [00:44,  4.68s/it][A
Training models: 8it [00:45,  3.23s/it][A
Training models: 9it [00:47,  2.88s/it][A
Training models: 10it [00:47,  4.77s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [01:23<00:31, 31.05s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 6
[LightGBM] [Info] Start training from score -5.536667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 5
[LightGBM] [Info] Start training from score -5.541706
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 6
[LightGBM] [Info] Start training from sc


Training models: 1it [00:00,  1.23it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 6
[LightGBM] [Info] Start training from score -5.519127



Training models: 2it [00:03,  2.19s/it][A
Training models: 3it [00:04,  1.37s/it][A
Training models: 4it [00:04,  1.03it/s][A
Training models: 5it [00:06,  1.13s/it][A

Training models: 10it [00:10,  1.01s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [01:33<00:00, 23.32s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (140, 40)
Dimensions of meta_features_test: (36, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1655
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start training from score -5.536667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1658
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[L



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start training from score -5.511389
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1660
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start training from score -5.549444
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1655
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1655
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start training from score -5.537976
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1652
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start training from score -5.513651
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1660
[LightGBM] [Info] Number of data points in the train set: 126, number of used features: 40
[LightGBM] [Info] Start trainin



DecisionTreeRegressor Evaluation completed: Test R2 score: 0.5720730922049218
RandomForestRegressor Evaluation completed: Test R2 score: 0.5585556617570095
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.6082021409051817
AdaBoostRegressor Evaluation completed: Test R2 score: 0.5504271149460027
XGBRegressor Evaluation completed: Test R2 score: 0.5559889309757324
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.5706443014523859
LinearRegression Evaluation completed: Test R2 score: 0.6631332983780276
KNeighborsRegressor Evaluation completed: Test R2 score: 0.49920786441804765
SVR Evaluation completed: Test R2 score: 0.4694247606169878
MLPRegressor Evaluation completed: Test R2 score: 0.3621980808184395


Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.183984,0.322355,0.428933,0.589581,0.770244,0.792961,0.330079,0.389186,0.574525,0.543359,0.776668,0.832904
DecisionTreeRegressor,0.347696,0.443893,0.589657,0.224383,0.627999,0.674069,0.309323,0.411,0.556168,0.572073,0.799296,0.838386
RandomForestRegressor,0.166358,0.31079,0.407871,0.628898,0.794205,0.812332,0.319094,0.374553,0.564884,0.558556,0.790244,0.85306
GradientBoostingRegressor,0.191393,0.334935,0.437485,0.573053,0.766002,0.794026,0.283207,0.355972,0.532172,0.608202,0.807587,0.865161
AdaBoostRegressor,0.187649,0.329013,0.433184,0.581406,0.763524,0.78978,0.32497,0.393565,0.570061,0.550427,0.786411,0.853447
XGBRegressor,0.227238,0.359067,0.476695,0.493092,0.730847,0.772287,0.320949,0.383935,0.566524,0.555989,0.786177,0.813799
ExtraTreesRegressor,0.15948,0.295175,0.39935,0.644242,0.803146,0.819374,0.310356,0.364499,0.557096,0.570644,0.797405,0.852288
LinearRegression,0.203906,0.341503,0.451559,0.54514,0.763591,0.787805,0.243501,0.370217,0.493458,0.663133,0.832587,0.845723
KNeighborsRegressor,0.178218,0.314579,0.422159,0.602442,0.778278,0.80693,0.361993,0.415067,0.601659,0.499208,0.764703,0.819645
SVR,0.164789,0.307484,0.405942,0.6324,0.795294,0.813211,0.383521,0.403981,0.619291,0.469425,0.719433,0.785866


In [60]:
results_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Results_10_folds_stacked_archi_RRCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Prediction_data_10_folds_stacked_archi_RRCK.csv')

In [61]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 15-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=15, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=15, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (15 fold CV)': mse_train,
        'Train MAE (15 fold CV)': mae_train,
        'Train RMSE (15 fold CV)': rmse_train,
        'Train R2 (15 fold CV)': r2_train,
        'Train PCC (15 fold CV)': pearson_train,
        'Train SCC (15 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9474
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 230
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9419
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9399
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start trai


Training models: 1it [00:01,  1.30s/it][A




Training models: 2it [00:06,  3.33s/it][A
Training models: 3it [00:17,  6.89s/it][A
Training models: 4it [00:21,  5.91s/it][A
Training models: 5it [00:26,  5.40s/it][A
Training models: 6it [00:29,  4.89s/it][A
Training models: 7it [00:30,  3.33s/it][A
Training models: 9it [00:32,  2.31s/it][A
Training models: 10it [00:32,  3.28s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:32<01:38, 32.84s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 207
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 206
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training models: 1it [00:01,  1.45s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 206
[LightGBM] [Info] Start training from score -5.542977



Training models: 2it [00:06,  3.30s/it][A
Training models: 3it [00:08,  3.03s/it][A
Training models: 4it [00:10,  2.45s/it][A
Training models: 5it [00:15,  3.29s/it][A
Training models: 6it [00:18,  3.48s/it][A
Training models: 7it [00:19,  2.39s/it][A
Training models: 8it [00:19,  1.67s/it][A
Training models: 9it [00:20,  1.51s/it][A
Training models: 10it [00:20,  2.05s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:53<00:51, 25.61s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005935 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start t


Training models: 1it [00:01,  1.45s/it][A




Training models: 2it [00:06,  3.83s/it][A
Training models: 3it [00:41, 17.89s/it][A
Training models: 4it [00:53, 15.65s/it][A
Training models: 5it [01:04, 13.93s/it][A
Training models: 6it [01:08, 10.60s/it][A
Training models: 7it [01:09,  7.22s/it][A
Training models: 8it [01:09,  4.98s/it][A
Training models: 9it [01:12,  4.48s/it][A
Training models: 10it [01:13,  7.35s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [02:06<00:47, 47.46s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 5
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from sc


Training models: 1it [00:01,  1.20s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 6
[LightGBM] [Info] Start training from score -5.542977



Training models: 2it [00:06,  3.36s/it][A
Training models: 3it [00:06,  2.10s/it][A
Training models: 4it [00:07,  1.46s/it][A
Training models: 5it [00:09,  1.82s/it][A

Training models: 10it [00:15,  1.58s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [02:22<00:00, 35.66s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (140, 40)
Dimensions of meta_features_test: (36, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1740
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1734
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[L



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1728
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[LightGBM] [Info] Start training from score -5.567269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.546947
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1745
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.539008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1740
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.536031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1738
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1735
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.533664
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.518206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1740
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.504046
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1738
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.542977
LGBMRegressor Evaluation completed: Test R2 score: 0.5328022638661125




DecisionTreeRegressor Evaluation completed: Test R2 score: 0.592578721638174
RandomForestRegressor Evaluation completed: Test R2 score: 0.5415306908331479
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.5560266141582711
AdaBoostRegressor Evaluation completed: Test R2 score: 0.5300723890140349
XGBRegressor Evaluation completed: Test R2 score: 0.48781853605857983
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.5519657259810089
LinearRegression Evaluation completed: Test R2 score: 0.6887289113620519
KNeighborsRegressor Evaluation completed: Test R2 score: 0.45883294899104454
SVR Evaluation completed: Test R2 score: 0.5003184083178264
MLPRegressor Evaluation completed: Test R2 score: 0.29353300717380026


Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.175434,0.312442,0.418849,0.608653,0.781454,0.802038,0.33771,0.401694,0.581128,0.532802,0.783677,0.842559
DecisionTreeRegressor,0.299181,0.397714,0.546974,0.332606,0.659535,0.697074,0.294501,0.415648,0.542679,0.592579,0.804971,0.821008
RandomForestRegressor,0.152558,0.287556,0.390587,0.659683,0.812714,0.836112,0.3314,0.386203,0.575674,0.541531,0.785297,0.842505
GradientBoostingRegressor,0.17063,0.306606,0.413074,0.61937,0.78879,0.804991,0.320922,0.397015,0.5665,0.556027,0.785123,0.833494
AdaBoostRegressor,0.161734,0.308477,0.402162,0.639214,0.800043,0.824175,0.339683,0.3938,0.582823,0.530072,0.782384,0.824097
XGBRegressor,0.170421,0.311415,0.412821,0.619835,0.790873,0.797853,0.370226,0.425766,0.608462,0.487819,0.741152,0.778657
ExtraTreesRegressor,0.145774,0.281962,0.381804,0.674817,0.82203,0.840455,0.323857,0.373949,0.569085,0.551966,0.801053,0.861685
LinearRegression,0.152293,0.309072,0.390247,0.660275,0.822091,0.81778,0.224999,0.333679,0.474341,0.688729,0.843344,0.839029
KNeighborsRegressor,0.184127,0.318579,0.4291,0.589262,0.770947,0.793388,0.391178,0.421054,0.625442,0.458833,0.729692,0.773043
SVR,0.161307,0.307333,0.40163,0.640167,0.800346,0.815803,0.36119,0.400532,0.600991,0.500318,0.729782,0.794491


In [62]:
results_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Results_15_folds_stacked_archi_RRCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Prediction_data_15_folds_stacked_archi_RRCK.csv')

In [63]:
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
target_column = 'Permeability'


meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 20-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=20, shuffle=True, random_state=101)

    # Storing predictions for the current dataframe
    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models"):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)

            # Predictions for validation set
            fold_predictions[val_index] =  np.clip( model.predict(X_val), -10, -4.0)

            # Predictions for test set
            test_predictions_fold =  np.clip( model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        # Store predictions for the meta-learner
        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

# Convert lists to arrays for the meta-learner
meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=20, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        # Predictions for test set
        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test}')

    predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_eval,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

    results[model_name] = {
        'Train MSE (20 fold CV)': mse_train,
        'Train MAE (20 fold CV)': mae_train,
        'Train RMSE (20 fold CV)': rmse_train,
        'Train R2 (20 fold CV)': r2_train,
        'Train PCC (20 fold CV)': pearson_train,
        'Train SCC (20 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
prediction_df = pd.DataFrame(predictions)
results_df

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9668
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 231
[LightGBM] [Info] Start training from score -5.528421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9639
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 232
[LightGBM] [Info] Start training from score -5.546165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9619
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 232
[LightGBM] [Info] Start trai


Training models: 1it [00:02,  2.03s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9586
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 232
[LightGBM] [Info] Start training from score -5.510075
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9627
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 232
[LightGBM] [Info] Start training from score -5.547895



Training models: 2it [00:08,  4.57s/it][A
Training models: 3it [00:23,  9.40s/it][A
Training models: 4it [00:29,  8.07s/it][A
Training models: 5it [00:35,  7.23s/it][A
Training models: 6it [00:40,  6.52s/it][A
Training models: 7it [00:40,  4.44s/it][A
Training models: 8it [00:40,  3.06s/it][A
Training models: 9it [00:43,  3.11s/it][A
Training models: 10it [00:44,  4.43s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:44<02:12, 44.33s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.088559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 208
[LightGBM] [Info] Start training from score -5.528421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 208
[LightGBM] [Info] Start training from score -5.546165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi


Training models: 1it [00:02,  2.10s/it][A

[LightGBM] [Info] Start training from score -5.547895



Training models: 2it [00:08,  4.54s/it][A
Training models: 3it [00:11,  4.13s/it][A
Training models: 4it [00:14,  3.35s/it][A
Training models: 5it [00:20,  4.42s/it][A
Training models: 6it [00:25,  4.68s/it][A
Training models: 7it [00:25,  3.20s/it][A
Training models: 8it [00:25,  2.24s/it][A
Training models: 9it [00:27,  2.05s/it][A
Training models: 10it [00:27,  2.78s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [01:12<01:09, 34.62s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31602
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 687
[LightGBM] [Info] Start training from score -5.528421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31601
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 687
[LightGBM] [Info] Start training from score -5.546165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31602
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 687
[LightGBM] [Info] Start t


Training models: 1it [00:02,  2.15s/it][A




Training models: 2it [00:09,  5.11s/it][A
Training models: 3it [00:56, 24.30s/it][A
Training models: 4it [01:12, 21.23s/it][A
Training models: 5it [01:27, 18.81s/it][A
Training models: 6it [01:33, 14.33s/it][A
Training models: 7it [01:33,  9.72s/it][A
Training models: 8it [01:33,  6.70s/it][A
Training models: 9it [01:38,  6.07s/it][A
Training models: 10it [01:39,  9.94s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [02:51<01:04, 64.22s/it]
Training models: 0it [00:00, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 6
[LightGBM] [Info] Start training from score -5.528421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 6
[LightGBM] [Info] Start training from score -5.546165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 5
[LightGBM] [Info] Start training from sc


Training models: 1it [00:01,  1.52s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 6
[LightGBM] [Info] Start training from score -5.510075
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 6
[LightGBM] [Info] Start training from score -5.547895



Training models: 2it [00:08,  4.51s/it][A
Training models: 3it [00:08,  2.81s/it][A
Training models: 4it [00:09,  1.95s/it][A
Training models: 5it [00:12,  2.31s/it][A
Training models: 6it [00:17,  3.30s/it][A

Training models: 10it [00:20,  2.08s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [03:12<00:00, 48.10s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (140, 40)
Dimensions of meta_features_test: (36, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1776
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.528421
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1762
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[L



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1761
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.544211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1767
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.551128
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1763
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.553534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1767
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.528008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000510 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1767
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.551579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1762
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.535113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1770
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000511 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1768
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.534549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.533158
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1768
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.510075
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1760
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 40
[LightGBM] [Info] Start training from score -5.547895
LGBMRegressor Evaluation completed: Test R2 score: 0.5059776674807872




DecisionTreeRegressor Evaluation completed: Test R2 score: 0.4585833383570751
RandomForestRegressor Evaluation completed: Test R2 score: 0.5424520471896306
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.522715909619691
AdaBoostRegressor Evaluation completed: Test R2 score: 0.5069488635926406
XGBRegressor Evaluation completed: Test R2 score: 0.47685867790335845
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.5534546811389669
LinearRegression Evaluation completed: Test R2 score: 0.6474369048547464
KNeighborsRegressor Evaluation completed: Test R2 score: 0.46842331186053787
SVR Evaluation completed: Test R2 score: 0.46820873582691225
MLPRegressor Evaluation completed: Test R2 score: 0.3367957226881543


Unnamed: 0,Train MSE (20 fold CV),Train MAE (20 fold CV),Train RMSE (20 fold CV),Train R2 (20 fold CV),Train PCC (20 fold CV),Train SCC (20 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.177633,0.319899,0.421466,0.603747,0.777813,0.795938,0.357099,0.394008,0.597578,0.505978,0.783057,0.838054
DecisionTreeRegressor,0.266378,0.400071,0.516118,0.405781,0.709529,0.745512,0.391358,0.433569,0.625586,0.458583,0.749937,0.794362
RandomForestRegressor,0.148073,0.28029,0.384802,0.669689,0.818553,0.840842,0.330734,0.375377,0.575095,0.542452,0.787657,0.833494
GradientBoostingRegressor,0.156332,0.299882,0.395389,0.651264,0.807406,0.824629,0.345,0.378753,0.587367,0.522716,0.784255,0.84405
AdaBoostRegressor,0.141836,0.281366,0.376611,0.683601,0.827346,0.841396,0.356397,0.406901,0.59699,0.506949,0.771546,0.818691
XGBRegressor,0.170391,0.309751,0.412784,0.619903,0.791247,0.805136,0.378148,0.429787,0.614937,0.476859,0.769067,0.795263
ExtraTreesRegressor,0.138622,0.272151,0.37232,0.69077,0.831446,0.843562,0.322781,0.370661,0.568138,0.553455,0.787639,0.852546
LinearRegression,0.178856,0.332845,0.422914,0.60102,0.790049,0.803132,0.254847,0.375217,0.504824,0.647437,0.817274,0.828603
KNeighborsRegressor,0.182065,0.318679,0.426691,0.59386,0.77373,0.795832,0.384245,0.428244,0.619875,0.468423,0.73818,0.787719
SVR,0.169784,0.319812,0.412049,0.621256,0.788524,0.80074,0.3844,0.402258,0.62,0.468209,0.719454,0.790243


In [64]:
results_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Results_20_folds_stacked_archi_RRCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/RRCK/results/Stacked/Prediction_data_20_folds_stacked_archi_RRCK.csv')

In [None]:
from tqdm import tqdm
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Train/All_fingerprints_train_RRCK.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Test/All_fingerprints_test_RRCK.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Train_all_atomic_desc_RRCK.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Test_all_atomic_desc_RRCK.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
target_column = 'Permeability'
def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),   
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]


In [65]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import joblib

# Ensure the models directory exists
os.makedirs('/home/users/akshay/PCPpred/RRCK/models_rrck/', exist_ok=True)

# Assuming remove_low_variance_columns and features functions are defined elsewhere
# 2D and 3D descriptors dataframes
df_desc_train =pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_desc = features(train, "Permeability")
joblib.dump(selected_features_desc, '/home/users/akshay/PCPpred/RRCK/models_rrck/selected_features_descriptors.joblib')
df_desc_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_desc]], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test = df_desc_test.dropna()
df_desc_test = df_desc_test[df_desc_train.columns]

# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Train/All_fingerprints_train_RRCK.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_fp = features(train, "Permeability")
joblib.dump(selected_features_fp, '/home/users/akshay/PCPpred/RRCK/models_rrck/selected_features_fingerprints.joblib')
df_fp_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_fp]], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Test/All_fingerprints_test_RRCK.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test = df_fp_test[df_fp_train.columns]

# Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_emb = features(train, "Permeability")
joblib.dump(selected_features_emb, '/home/users/akshay/PCPpred/RRCK/models_rrck/selected_features_embeddings.joblib')
df_emb_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_emb]], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test = df_emb_test[df_emb_train.columns]

# Atomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Train_all_atomic_desc_RRCK.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_atomic = features(train, "Permeability")
joblib.dump(selected_features_atomic, '/home/users/akshay/PCPpred/RRCK/models_rrck/selected_features_atomic.joblib')
df_atomic_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_atomic]], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Test_all_atomic_desc_RRCK.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test = df_atomic_test[df_atomic_train.columns]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Filter dataframes to have consistent IDs
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]
df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]
df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

target_column = 'Permeability'

def scale_features(df_train, df_test, feature_type):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    # Save the scaler
    joblib.dump(scaler, f'/home/users/akshay/PCPpred/RRCK/models_rrck/scaler_{feature_type}.joblib')
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test, 'Descriptor')
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test, 'Fingerprints')
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test, 'Embeddings')
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test , 'Atomic')

models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]

dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
data_names = ['descriptors', 'fingerprints', 'embeddings', 'atomic']

meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 15-fold cross-validation
for df_idx, (df_train, df_test) in enumerate(tqdm(dataframes, desc="Processing dataframe pairs")):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=15, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for fold_idx, (train_index, val_index) in enumerate(kf.split(X_weak)):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)
            
            model_name = model.__class__.__name__
            joblib.dump(model, f'/home/users/akshay/PCPpred/RRCK/models_rrck/weak_{data_names[df_idx]}_{model_name}_fold_{fold_idx}.joblib')

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -4.0)

            test_predictions_fold = np.clip(model.predict(X_eval), -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)
    
    joblib.dump(fold_meta_features_train, f'/home/users/akshay/PCPpred/RRCK/models_rrck/meta_features_train_{data_names[df_idx]}.joblib')
    joblib.dump(fold_meta_features_test, f'/home/users/akshay/PCPpred/RRCK/models_rrck/meta_features_test_{data_names[df_idx]}.joblib')

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

joblib.dump(meta_features_train, '/home/users/akshay/PCPpred/RRCK/models_rrck/meta_features_train_combined.joblib')
joblib.dump(meta_features_test, '/home/users/akshay/PCPpred/RRCK/models_rrck/meta_features_test_combined.joblib')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=15, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for fold_idx, (train_index, val_index) in enumerate(kf.split(meta_features_train)):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        
        joblib.dump(model, f'/home/users/akshay/PCPpred/RRCK/models_rrck/meta_{model_name}_fold_{fold_idx}.joblib')

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -4.0)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Test actual': y_eval,
        'Test prediction folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_mean,
    })

    results[model_name] = {
        'Train MSE (15 fold CV)': mse_train,
        'Train MAE (15 fold CV)': mae_train,
        'Train RMSE (15 fold CV)': rmse_train,
        'Train R2 (15 fold CV)': r2_train,
        'Train PCC (15 fold CV)': pearson_train,
        'Train SCC (15 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(140, 249)
(36, 249)
(140, 414)
(36, 414)
(140, 690)
(36, 690)
(140, 13)
(36, 13)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       ID                                             SMILES  Permeability  \
109    24  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...        -6.300   
81     26  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H...        -5.390   
73     27  CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N(C)[C@...        -5.460   
74     28  CC(C)C[C@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)N2CCC[...        -5.210   
100    29  CC(C)C[C@@H]1NC(=O)[C@H](CC(

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9474
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 230
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9419
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9399
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start trai


Training models:  10%|█         | 1/10 [00:01<00:15,  1.72s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9485
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 232
[LightGBM] [Info] Start training from score -5.542977



Training models:  20%|██        | 2/10 [00:06<00:29,  3.70s/it][A
Training models:  30%|███       | 3/10 [00:18<00:50,  7.20s/it][A
Training models:  40%|████      | 4/10 [00:22<00:36,  6.15s/it][A
Training models:  50%|█████     | 5/10 [00:27<00:27,  5.51s/it][A
Training models:  60%|██████    | 6/10 [00:31<00:20,  5.03s/it][A
Training models:  70%|███████   | 7/10 [00:31<00:10,  3.42s/it][A
Training models:  80%|████████  | 8/10 [00:31<00:04,  2.43s/it][A
Training models:  90%|█████████ | 9/10 [00:34<00:02,  2.43s/it][A
Training models: 100%|██████████| 10/10 [00:34<00:00,  3.44s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [00:34<01:43, 34.40s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004099 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 207
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 206
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003274 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training models:  10%|█         | 1/10 [00:01<00:12,  1.39s/it][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 206
[LightGBM] [Info] Start training from score -5.542977



Training models:  20%|██        | 2/10 [00:06<00:27,  3.47s/it][A
Training models:  30%|███       | 3/10 [00:09<00:22,  3.15s/it][A
Training models:  40%|████      | 4/10 [00:10<00:15,  2.62s/it][A
Training models:  50%|█████     | 5/10 [00:15<00:16,  3.35s/it][A
Training models:  60%|██████    | 6/10 [00:19<00:14,  3.64s/it][A
Training models:  70%|███████   | 7/10 [00:20<00:07,  2.63s/it][A
Training models:  80%|████████  | 8/10 [00:20<00:04,  2.00s/it][A
Training models:  90%|█████████ | 9/10 [00:22<00:01,  1.85s/it][A
Training models: 100%|██████████| 10/10 [00:22<00:00,  2.27s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [00:57<00:55, 27.57s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train 


Training models:  10%|█         | 1/10 [00:01<00:15,  1.77s/it][A




Training models:  20%|██        | 2/10 [00:07<00:31,  3.99s/it][A
Training models:  30%|███       | 3/10 [00:41<02:05, 18.00s/it][A
Training models:  40%|████      | 4/10 [00:54<01:34, 15.75s/it][A
Training models:  50%|█████     | 5/10 [01:05<01:09, 13.96s/it][A
Training models:  60%|██████    | 6/10 [01:09<00:42, 10.71s/it][A
Training models:  70%|███████   | 7/10 [01:10<00:22,  7.49s/it][A
Training models:  80%|████████  | 8/10 [01:10<00:10,  5.30s/it][A
Training models:  90%|█████████ | 9/10 [01:14<00:04,  4.84s/it][A
Training models: 100%|██████████| 10/10 [01:15<00:00,  7.57s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [02:12<00:49, 49.54s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 5
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from sc


Training models:  10%|█         | 1/10 [00:01<00:12,  1.40s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 6
[LightGBM] [Info] Start training from score -5.542977



Training models:  20%|██        | 2/10 [00:06<00:28,  3.57s/it][A
Training models:  30%|███       | 3/10 [00:07<00:15,  2.23s/it][A
Training models:  40%|████      | 4/10 [00:07<00:09,  1.62s/it][A
Training models:  50%|█████     | 5/10 [00:10<00:09,  1.90s/it][A
Training models:  60%|██████    | 6/10 [00:14<00:10,  2.68s/it][A
Training models:  70%|███████   | 7/10 [00:14<00:05,  1.85s/it][A

Training models: 100%|██████████| 10/10 [00:17<00:00,  1.72s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [02:30<00:00, 37.51s/it]


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (140, 40)
Dimensions of meta_features_test: (36, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1740
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1734
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[L



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1728
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[LightGBM] [Info] Start training from score -5.541346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1732
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1728
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1738
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.545496
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1735
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.533664
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.533588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1740
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start training from score -5.504046
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1738
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 40
[LightGBM] [Info] Start trainin



In [66]:
#Ablation study
import os
import joblib
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [67]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [68]:
from tqdm import tqdm
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Train/All_fingerprints_train_RRCK.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Test/All_fingerprints_test_RRCK.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Train_all_atomic_desc_RRCK.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/RRCK/features/Atomic/Test_all_atomic_desc_RRCK.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
target_column = 'Permeability'
def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),   
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(140, 249)
(36, 249)
(140, 414)
(36, 414)
(140, 690)
(36, 690)
(140, 13)
(36, 13)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       ID                                             SMILES  Permeability  \
109    24  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](C...        -6.300   
81     26  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H...        -5.390   
73     27  CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N(C)[C@...        -5.460   
74     28  CC(C)C[C@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)N2CCC[...        -5.210   
100    29  CC(C)C[C@@H]1NC(=O)[C@H](CC(

In [69]:
ablation_results = {}

for ablation_idx in range(len(dataframes)):
    print(f"========== Ablation: Excluding feature at index {ablation_idx} ==========")
    feature_names = ['Descriptor', 'Fingerprints', 'Embeddings', 'Atomic']
    print(f"========== Ablation: Excluding feature :-- {feature_names[ablation_idx]} ==========")

    ablated_dataframes = [pair for i, pair in enumerate(dataframes) if i != ablation_idx]

    meta_features_train = []
    meta_features_test = []

    # Stage 1
    for df_train, df_test in tqdm(ablated_dataframes, desc="Processing ablated dataframes"):
        X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
        y_weak = df_train[target_column]
        X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
        y_eval = df_test[target_column]

        kf = KFold(n_splits=15, shuffle=True, random_state=101)

        fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
        fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

        for i, model in tqdm(enumerate(models_weak), desc="Training weak models", total=len(models_weak)):
            fold_predictions = np.zeros(X_weak.shape[0])
            test_predictions_folds = []

            for train_index, val_index in kf.split(X_weak):
                X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
                y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

                model.fit(X_train, y_train)

                fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -4.0)
                test_predictions_fold = np.clip(model.predict(X_eval), -10, -4.0)
                test_predictions_folds.append(test_predictions_fold)

            fold_meta_features_train[:, i] = fold_predictions
            fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
            print(f'Model training done {i}: {model.__class__.__name__}')

        meta_features_train.append(fold_meta_features_train)
        meta_features_test.append(fold_meta_features_test)
        print('Dataframe training completed')

    # Stack all meta-features
    meta_features_train = np.hstack(meta_features_train)
    meta_features_test = np.hstack(meta_features_test)

    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
    print('Stage 1 completed (Weak Learners)')
    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

    # Stage 2
    results = {}
    kf = KFold(n_splits=15, shuffle=True, random_state=101)

    for model in models_meta:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        for train_index, val_index in kf.split(meta_features_train):
            X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
            y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_fold_train, y_fold_train)
            y_pred_fold = np.clip(model.predict(X_fold_val), -10, -4.0)

            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_fold_val)

            test_predictions_fold = model.predict(meta_features_test)
            test_predictions_fold = np.clip(test_predictions_fold, -10, -4.0)
            test_predictions_folds.append(test_predictions_fold)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        mse_test = mean_squared_error(y_eval, predictions_test_mean)
        mae_test = mean_absolute_error(y_eval, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_eval, predictions_test_mean)
        pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
        spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

        results[model_name] = {
            'Train MSE (15 fold CV)': mse_train,
            'Train MAE (15 fold CV)': mae_train,
            'Train RMSE (15 fold CV)': rmse_train,
            'Train R2 (15 fold CV)': r2_train,
            'Train PCC (15 fold CV)': pearson_train,
            'Train SCC (15 fold CV)': spearman_train,
            'Test MSE': mse_test,
            'Test MAE': mae_test,
            'Test RMSE': rmse_test,
            'Test R2': r2_test,
            'Test PCC': pearson_test,
            'Test SCC': spearman_test,
        }

    ablation_results[f"Ablation_{feature_names[ablation_idx]}"] = pd.DataFrame(results).T

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Ablation Study Completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# To view the results
ablation_results_df = {key: value for key, value in ablation_results.items()}




Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 207
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 206
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi


Training weak models:  10%|█         | 1/10 [00:01<00:13,  1.50s/it][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 206
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:27,  3.38s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:08<00:21,  3.08s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:10<00:14,  2.49s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:15<00:16,  3.31s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:19<00:14,  3.51s/it][A
Training weak models:  70%|███████   | 7/10 [00:19<00:07,  2.41s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:19<00:03,  1.68s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:20<00:01,  1.55s/it][A
Training weak models: 100%|██████████| 10/10 [00:20<00:00,  2.08s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:20<00:41, 20.85s/it]

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start t


Training weak models:  10%|█         | 1/10 [00:01<00:14,  1.63s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31149
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 687
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:30,  3.77s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:41<02:04, 17.85s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:53<01:33, 15.61s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [01:04<01:09, 13.90s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [01:08<00:42, 10.60s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [01:09<00:21,  7.25s/it][A
Training weak models:  80%|████████  | 8/10 [01:09<00:09,  5.00s/it][A

Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [01:12<00:04,  4.57s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [01:13<00:00,  7.37s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [01:34<00:51, 51.96s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 5
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from sc


Training weak models:  10%|█         | 1/10 [00:01<00:09,  1.11s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 6
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:26,  3.29s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:06<00:14,  2.06s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:07<00:08,  1.44s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:09<00:08,  1.80s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:13<00:09,  2.47s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:15<00:00,  1.55s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [01:50<00:00, 36.70s/it]


Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.529000




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1299
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1294
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.541346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000447 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1294
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.567269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1301
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.546947
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.536031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.545496
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1301
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1311
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.518206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1309
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.533588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000411 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1306
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1304
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.542977


Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9474
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 230
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9419
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9399
[LightGBM] [Info] Number of data points in the train set


Training weak models:  10%|█         | 1/10 [00:01<00:13,  1.49s/it][A

[LightGBM] [Info] Total Bins 9485
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 232
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:26,  3.37s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:17<00:48,  6.91s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:21<00:35,  5.93s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:25<00:26,  5.32s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:29<00:19,  4.86s/it][A
Training weak models:  70%|███████   | 7/10 [00:30<00:09,  3.32s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:32<00:02,  2.31s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:32<00:00,  3.29s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:32<01:05, 32.86s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005942 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start t


Training weak models:  10%|█         | 1/10 [00:01<00:14,  1.59s/it][A

[LightGBM] [Info] Total Bins 31149
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 687
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:07<00:31,  3.88s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:42<02:07, 18.24s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:54<01:35, 15.89s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [01:05<01:09, 13.96s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [01:09<00:42, 10.58s/it][A
Training weak models:  70%|███████   | 7/10 [01:09<00:21,  7.18s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [01:09<00:09,  4.95s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [01:13<00:04,  4.55s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [01:14<00:00,  7.41s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [01:46<00:57, 57.11s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 5
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from sc


Training weak models:  10%|█         | 1/10 [00:01<00:10,  1.16s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:26,  3.27s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:06<00:14,  2.05s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:06<00:08,  1.44s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:09<00:08,  1.78s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:13<00:09,  2.49s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [02:02<00:00, 40.85s/it]


Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.55130



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1299
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1298
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.567269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1299
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.528053
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.539008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1306
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.533664
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1306
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.518206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1306
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin









Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9474
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 230
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9419
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9399
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start trai


Training weak models:  10%|█         | 1/10 [00:01<00:15,  1.74s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:27,  3.46s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:17<00:49,  7.07s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:22<00:36,  6.04s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:26<00:27,  5.43s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:30<00:19,  4.85s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:30<00:10,  3.34s/it][A

Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:32<00:02,  2.31s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:33<00:00,  3.32s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:33<01:06, 33.24s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 207
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 206
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training weak models:  10%|█         | 1/10 [00:01<00:13,  1.50s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:27,  3.41s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:08<00:21,  3.09s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:10<00:14,  2.49s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:15<00:16,  3.25s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:18<00:13,  3.44s/it][A
Training weak models:  70%|███████   | 7/10 [00:19<00:07,  2.36s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:19<00:03,  1.65s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:20<00:01,  1.51s/it][A
Training weak models: 100%|██████████| 10/10 [00:20<00:00,  2.06s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [00:53<00:25, 25.81s/it]

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 5
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 6
[LightGBM] [Info] Start training from sc


Training weak models:  10%|█         | 1/10 [00:01<00:09,  1.02s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 6
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:05<00:25,  3.22s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:06<00:14,  2.02s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:06<00:08,  1.42s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:09<00:08,  1.78s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:13<00:09,  2.50s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:15<00:00,  1.55s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [01:09<00:00, 23.12s/it]


Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1297
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.55130



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1292
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.567269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1295
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.546947
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1302
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.539008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.536031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1301
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1299
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.533664
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1304
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.518206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin





Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9474
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 230
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9419
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9399
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 232
[LightGBM] [Info] Start trai


Training weak models:  10%|█         | 1/10 [00:01<00:16,  1.84s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:28,  3.55s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:17<00:49,  7.12s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:22<00:36,  6.06s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:26<00:27,  5.45s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:30<00:19,  4.88s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:30<00:10,  3.36s/it][A

Model training done 6: KNeighborsRegressor
Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:33<00:02,  2.31s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [00:33<01:06, 33.44s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 207
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 206
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug


Training weak models:  10%|█         | 1/10 [00:01<00:15,  1.70s/it][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 206
[LightGBM] [Info] Start training from score -5.542977
Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:27,  3.40s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:09<00:21,  3.09s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:10<00:14,  2.49s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:15<00:16,  3.22s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:18<00:13,  3.41s/it][A
Training weak models:  70%|███████   | 7/10 [00:19<00:07,  2.34s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:19<00:03,  1.63s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [00:20<00:01,  1.50s/it][A
Training weak models: 100%|██████████| 10/10 [00:20<00:00,  2.05s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [00:53<00:25, 25.84s/it]

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005866 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30915
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 687
[LightGBM] [Info] Start t


Training weak models:  10%|█         | 1/10 [00:01<00:13,  1.52s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:06<00:30,  3.79s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:42<02:08, 18.29s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:54<01:35, 15.92s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [01:05<01:10, 14.08s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [01:09<00:42, 10.68s/it][A
Training weak models:  70%|███████   | 7/10 [01:09<00:21,  7.25s/it][A

Model training done 5: ExtraTreesRegressor
Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [01:09<00:09,  5.00s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [01:13<00:04,  4.57s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [01:14<00:00,  7.45s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [02:08<00:00, 42.81s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.529000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing co




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000394 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1303
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.551308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 30
[LightGBM] [Info] Start training from score -5.567269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1304
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.528053
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1311
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.539008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.533664
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1311
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.518206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000375 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1312
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.504046
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 131, number of used features: 30
[LightGBM] [Info] Start training from score -5.542977




XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Ablation Study Completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [70]:
import os
import pickle

ablation_result_dir = '/home/users/akshay/PCPpred/RRCK/results/Ablation/'
os.makedirs(ablation_result_dir, exist_ok=True)

pickle_path = os.path.join(ablation_result_dir, 'ablation_results.pkl')
with open(pickle_path, 'wb') as f:
    pickle.dump(ablation_results, f)


with open(pickle_path, 'rb') as f:
    ablation_results = pickle.load(f)


ablation_results

{'Ablation_Descriptor':                            Train MSE (15 fold CV)  Train MAE (15 fold CV)  \
 LGBMRegressor                            0.177555                0.325194   
 DecisionTreeRegressor                    0.252005                0.361286   
 RandomForestRegressor                    0.153435                0.296401   
 GradientBoostingRegressor                0.165649                0.321392   
 AdaBoostRegressor                        0.156371                0.297685   
 XGBRegressor                             0.186129                0.332056   
 ExtraTreesRegressor                      0.144659                0.285279   
 LinearRegression                         0.179209                0.325241   
 KNeighborsRegressor                      0.177764                0.319321   
 SVR                                      0.157785                0.305937   
 MLPRegressor                             0.584957                0.663292   
 
                            Train RMSE 

In [71]:
ablation_result_dir = '/home/users/akshay/PCPpred/RRCK/results/Ablation/'
os.makedirs(ablation_result_dir, exist_ok=True)

for ablation_label, df in ablation_results.items():
    print(f"Results for {ablation_label}: \n")
    safe_label = ablation_label.replace(" ", "_").replace("/", "_")
    file_path = os.path.join(ablation_result_dir, f"{safe_label}.csv")
    df.to_csv(file_path)

Results for Ablation_Descriptor: 

Results for Ablation_Fingerprints: 

Results for Ablation_Embeddings: 

Results for Ablation_Atomic: 



In [72]:
from IPython.display import display
for ablation_label, df in ablation_results.items():
    print(f"Results for {ablation_label}: \n")
    display(df)

Results for Ablation_Descriptor: 



Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.177555,0.325194,0.421373,0.603922,0.778704,0.787176,0.340832,0.407963,0.583808,0.528483,0.781227,0.816555
DecisionTreeRegressor,0.252005,0.361286,0.502001,0.437843,0.711194,0.728035,0.272607,0.387796,0.522118,0.622868,0.839152,0.829246
RandomForestRegressor,0.153435,0.296401,0.391708,0.657727,0.811034,0.825276,0.319258,0.381229,0.56503,0.558328,0.805328,0.854219
GradientBoostingRegressor,0.165649,0.321392,0.407,0.630481,0.794789,0.800246,0.298955,0.378084,0.546768,0.586417,0.813451,0.84508
AdaBoostRegressor,0.156371,0.297685,0.395438,0.651178,0.807154,0.815814,0.337086,0.395453,0.580591,0.533664,0.783665,0.833752
XGBRegressor,0.186129,0.332056,0.431426,0.584796,0.770673,0.778952,0.364011,0.414451,0.603333,0.496417,0.745624,0.780073
ExtraTreesRegressor,0.144659,0.285279,0.38034,0.677305,0.82341,0.833818,0.319114,0.38594,0.564902,0.558528,0.811754,0.858338
LinearRegression,0.179209,0.325241,0.423331,0.600232,0.785857,0.796723,0.266955,0.35555,0.516676,0.630687,0.812813,0.826286
KNeighborsRegressor,0.177764,0.319321,0.421621,0.603455,0.779103,0.796713,0.346291,0.42068,0.588465,0.52093,0.779428,0.796215
SVR,0.157785,0.305937,0.397222,0.648023,0.805065,0.815532,0.327635,0.384596,0.572394,0.54674,0.76905,0.82178


Results for Ablation_Fingerprints: 



Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.171488,0.306495,0.414111,0.617456,0.786506,0.813842,0.389371,0.417462,0.623996,0.461333,0.755346,0.795958
DecisionTreeRegressor,0.403447,0.455107,0.635174,0.100017,0.57426,0.606364,0.419488,0.456972,0.647679,0.419667,0.684561,0.749566
RandomForestRegressor,0.155203,0.291572,0.393958,0.653783,0.808605,0.831773,0.377988,0.408184,0.614807,0.47708,0.737881,0.78651
GradientBoostingRegressor,0.18242,0.307575,0.427106,0.593069,0.775061,0.799274,0.383365,0.413941,0.619165,0.469641,0.72475,0.789213
AdaBoostRegressor,0.176404,0.320119,0.420004,0.60649,0.779686,0.81035,0.385661,0.413045,0.621016,0.466464,0.741163,0.770934
XGBRegressor,0.218363,0.341825,0.467293,0.51289,0.728282,0.745881,0.408204,0.439949,0.638909,0.435278,0.699889,0.756388
ExtraTreesRegressor,0.151951,0.29022,0.389809,0.661038,0.813048,0.829656,0.372685,0.400857,0.610479,0.484416,0.751967,0.799897
LinearRegression,0.172826,0.311664,0.415724,0.614471,0.791188,0.808403,0.314775,0.40481,0.561048,0.56453,0.777835,0.811868
KNeighborsRegressor,0.17935,0.309179,0.423497,0.599918,0.77747,0.794679,0.410354,0.446193,0.640589,0.432303,0.70069,0.756903
SVR,0.172829,0.30823,0.415727,0.614464,0.784286,0.812094,0.394841,0.4143,0.628364,0.453765,0.708944,0.7784


Results for Ablation_Embeddings: 



Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.208008,0.332408,0.456079,0.535989,0.737299,0.771105,0.340419,0.4155,0.583454,0.529054,0.754442,0.8181
DecisionTreeRegressor,0.378676,0.436429,0.615367,0.155273,0.562144,0.610714,0.329598,0.445593,0.574106,0.544024,0.76335,0.735921
RandomForestRegressor,0.16381,0.294886,0.404735,0.634583,0.796698,0.813266,0.343567,0.415119,0.586146,0.524699,0.76128,0.815988
GradientBoostingRegressor,0.173962,0.306538,0.417088,0.611936,0.78332,0.808516,0.33871,0.42626,0.581988,0.531418,0.753442,0.798867
AdaBoostRegressor,0.185062,0.333243,0.430189,0.587175,0.766893,0.772671,0.358491,0.410554,0.598741,0.504053,0.743312,0.810195
XGBRegressor,0.182749,0.320723,0.427491,0.592336,0.772197,0.795039,0.40863,0.470754,0.639242,0.434688,0.70161,0.723048
ExtraTreesRegressor,0.148365,0.281638,0.385181,0.669038,0.818549,0.831334,0.328053,0.40046,0.572759,0.546161,0.78066,0.83195
LinearRegression,0.161255,0.31334,0.401566,0.640283,0.80625,0.80781,0.224517,0.348633,0.473832,0.689397,0.833244,0.851773
KNeighborsRegressor,0.191123,0.3406,0.437177,0.573654,0.760264,0.77335,0.387659,0.424915,0.622623,0.4637,0.706848,0.764111
SVR,0.204803,0.340128,0.452552,0.543137,0.738203,0.754344,0.405218,0.449143,0.636567,0.439409,0.674509,0.763468


Results for Ablation_Atomic: 



Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.174317,0.313443,0.417513,0.611145,0.782865,0.808727,0.318093,0.400106,0.563998,0.55994,0.784054,0.842078
DecisionTreeRegressor,0.259637,0.371893,0.509546,0.420818,0.703619,0.729215,0.304532,0.42362,0.551845,0.578701,0.783035,0.793332
RandomForestRegressor,0.148969,0.282158,0.385965,0.667689,0.817955,0.840973,0.316603,0.386802,0.562675,0.562002,0.791024,0.838643
GradientBoostingRegressor,0.153929,0.287797,0.392338,0.656625,0.811171,0.826765,0.292462,0.392466,0.540798,0.595399,0.802007,0.842891
AdaBoostRegressor,0.154864,0.301459,0.393528,0.65454,0.810471,0.826883,0.328797,0.390333,0.573408,0.545133,0.786575,0.835039
XGBRegressor,0.173921,0.304122,0.417038,0.612028,0.783956,0.804945,0.345761,0.423959,0.588014,0.521664,0.753251,0.797837
ExtraTreesRegressor,0.146689,0.281629,0.383,0.672775,0.82029,0.83712,0.307385,0.371535,0.554423,0.574754,0.804047,0.8622
LinearRegression,0.133392,0.289211,0.365229,0.702437,0.841642,0.839926,0.232231,0.334146,0.481904,0.678724,0.84202,0.849971
KNeighborsRegressor,0.184036,0.324821,0.428994,0.589465,0.76913,0.78822,0.353445,0.399267,0.594512,0.511033,0.780613,0.831306
SVR,0.168617,0.312189,0.41063,0.623859,0.790238,0.795472,0.349409,0.385071,0.591108,0.516617,0.743527,0.816374
