In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
# from lightgbm.lgb import LGBMRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

In [2]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [3]:
#2d RDKit descriptors
df_train = pd.read_csv('Descriptors/Train_2d_RDKit_des.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_RDKit_des.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2drdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2drdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 217)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 217)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19594
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 156
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19618
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 158
[LightGBM] [Info] Start training from score -5.7

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.209,0.335,0.4571,0.6645,0.8156,0.7853,0.2091,0.3297,0.4573,0.6695,0.819,0.8001
DecisionTreeRegressor,0.3054,0.3838,0.5527,0.5096,0.7346,0.7156,0.2283,0.3353,0.4778,0.6392,0.8011,0.7853
RandomForestRegressor,0.215,0.3412,0.4637,0.6548,0.8092,0.7796,0.2142,0.3321,0.4628,0.6615,0.8135,0.7947
GradientBoostingRegressor,0.2327,0.3593,0.4824,0.6264,0.7945,0.7564,0.2322,0.3549,0.4818,0.6331,0.7993,0.7737
AdaBoostRegressor,0.405,0.5148,0.6364,0.3498,0.6311,0.5775,0.3902,0.5038,0.6247,0.3834,0.6585,0.625
XGBRegressor,0.2157,0.3382,0.4644,0.6537,0.8093,0.7816,0.2078,0.3247,0.4559,0.6716,0.8197,0.8023
ExtraTreesRegressor,0.2073,0.3339,0.4553,0.6671,0.8168,0.7891,0.2142,0.3316,0.4629,0.6615,0.8135,0.7944
LinearRegression,0.3681,0.4344,0.6067,0.4089,0.6498,0.6759,0.3426,0.424,0.5853,0.4587,0.6796,0.696
KNeighborsRegressor,0.2824,0.384,0.5315,0.5465,0.7454,0.7183,0.2793,0.3723,0.5285,0.5587,0.7518,0.7498
SVR,0.2471,0.3548,0.4971,0.6033,0.7801,0.7564,0.2506,0.3544,0.5006,0.604,0.7805,0.7656


In [4]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.94244077242366, -6.786558855452917, -5.986...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.10501812111347, -6.267956683931639, -6.95...","[-7.154249826199579, -6.351077452266763, -6.93...","[0.08343609678130004, 0.08851531639579152, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.03, -4.66, -6.244999999999999,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.89, -7.0, -7.0, -5.47, -7.0, -6.85,...","[-6.992, -5.5120000000000005, -7.0, -6.698, -5...","[0.016000000000000014, 0.6131035801559147, 0.0..."
2,RandomForestRegressor,"[-6.832800000000002, -6.7573, -5.9484016666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.966800000000003, -5.975995784910001, -6.9...","[-6.977160000000002, -6.204005869893943, -6.91...","[0.014142361896090284, 0.17140115031313113, 0...."
3,GradientBoostingRegressor,"[-6.8339668716881725, -7.269297816272147, -5.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.021417222912616, -6.130864330321678, -6.5...","[-7.229568818533345, -6.3464771218913345, -6.6...","[0.10920497800057241, 0.16149665631528384, 0.0..."
4,AdaBoostRegressor,"[-5.926560473115278, -6.126094049904048, -5.52...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.126094049904048, -5.617833935018051, -5.5...","[-6.206148804747046, -5.667641587597571, -5.58...","[0.050458996435762044, 0.05877404002649797, 0...."
5,XGBRegressor,"[-7.270226, -6.324202, -6.6335635, -5.3446817,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.272304, -6.106129, -6.930661, -7.343884, ...","[-7.062197, -6.280751, -7.0063515, -7.0149984,...","[0.20343843, 0.22225112, 0.17827716, 0.2296413..."
6,ExtraTreesRegressor,"[-6.9175, -6.764500000000002, -6.2826500000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.993200000000002, -6.5210000000000035, -6....","[-6.985060000000002, -6.509300000000001, -6.96...","[0.013430651510630297, 0.15054574055748018, 0...."
7,LinearRegression,"[-9.10578582117024, -6.572551183789379, -5.470...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-10.0, -6.237643152796598, -6.36173348318139...","[-10.0, -6.056286138290185, -6.253304484152165...","[0.0, 0.1387288849373009, 0.09287845728415967,..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -4.89, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -7.0, -7.0, -5.383333333333333, -6.74...","[-6.7780000000000005, -6.949333333333334, -6.8...","[0.18126224096595522, 0.10133333333333318, 0.2..."
9,SVR,"[-6.169959897750688, -6.926011440971111, -6.08...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.169882846730166, -6.429860290786036, -6.9...","[-6.212842519786784, -6.500307442850321, -6.94...","[0.023380745132062508, 0.07357526551822417, 0...."


In [5]:
result_df.to_csv('descriptors_results/Results_2d_RDKit_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2d_RDKit_desc.csv')

In [6]:
#2d Mordred descriptors
df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df , prediction_df= train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


X_train shape:  (5568, 1426)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


X_test shape:  (1392, 1426)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067377 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265212
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1187
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265242
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1190
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2063,0.3347,0.4541,0.6688,0.8184,0.7858,0.2054,0.3276,0.4532,0.6754,0.8229,0.8032
DecisionTreeRegressor,0.3092,0.3861,0.5561,0.5035,0.7327,0.7176,0.2293,0.3433,0.4788,0.6377,0.7994,0.7771
RandomForestRegressor,0.2162,0.3438,0.465,0.6528,0.8084,0.7795,0.2139,0.3354,0.4624,0.6621,0.8143,0.7951
GradientBoostingRegressor,0.227,0.3549,0.4765,0.6354,0.8004,0.7644,0.2184,0.3425,0.4673,0.6549,0.814,0.7856
AdaBoostRegressor,0.3904,0.5032,0.6248,0.3731,0.6505,0.6149,0.3726,0.4873,0.6104,0.4113,0.6814,0.6669
XGBRegressor,0.2174,0.3414,0.4662,0.651,0.8073,0.7742,0.2127,0.3301,0.4612,0.6639,0.815,0.7943
ExtraTreesRegressor,0.2137,0.3411,0.4623,0.6568,0.8105,0.7827,0.2238,0.3399,0.4731,0.6464,0.8041,0.7873
LinearRegression,0.6512,0.4717,0.807,-0.0456,0.5858,0.69,0.3377,0.3822,0.5812,0.4663,0.7177,0.7463
KNeighborsRegressor,0.2713,0.38,0.5209,0.5643,0.7581,0.7273,0.2619,0.3686,0.5118,0.5861,0.7694,0.7593
SVR,0.2267,0.3421,0.4762,0.6359,0.8007,0.7714,0.2376,0.3499,0.4875,0.6245,0.7947,0.7756


In [7]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.803659819579023, -6.952297461961773, -6.31...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.077881107007449, -6.0194634527933095, -6....","[-7.095000868721698, -6.249811763734205, -6.66...","[0.025405667572675622, 0.18539061636084384, 0...."
1,DecisionTreeRegressor,"[-7.0, -7.0, -7.0, -5.85, -4.33, -5.03, -5.85,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -6.92, -6.51, -6.24, -7.0, -6.85...","[-7.0, -6.424000000000001, -6.720000000000001,...","[0.0, 0.6196321489399981, 0.5209222590751907, ..."
2,RandomForestRegressor,"[-6.585237681159421, -6.819599999999999, -6.06...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.774225000000001, -6.134100000000002, -6.6...","[-6.87847, -6.316214242424243, -6.738278214347...","[0.055238368458888666, 0.14462751056208653, 0...."
3,GradientBoostingRegressor,"[-7.172151031819289, -7.023936244369913, -5.93...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.075447251317804, -6.052434368338022, -6.6...","[-7.155205707164302, -6.290186002159451, -6.57...","[0.12130341208673291, 0.20656773942617584, 0.0..."
4,AdaBoostRegressor,"[-5.761421004074878, -5.794734613110019, -5.45...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.792214783892174, -5.413837832042265, -5.6...","[-6.0743247070932025, -5.627583117326498, -5.6...","[0.19644705530494003, 0.12019241477578786, 0.0..."
5,XGBRegressor,"[-6.8409624, -6.8683248, -6.0015554, -5.321909...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8227315, -6.169962, -6.799593, -6.7047834...","[-7.014267, -6.187599, -6.730255, -6.7727537, ...","[0.116441816, 0.2011095, 0.11745094, 0.1260753..."
6,ExtraTreesRegressor,"[-6.693472727272727, -6.900400000000001, -6.20...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9483999999999995, -6.634500000000002, -6....","[-6.932820000000001, -6.487070000000001, -6.84...","[0.045296021900383295, 0.12423871216331846, 0...."
7,LinearRegression,"[-7.248272842181905, -10.0, -6.148327658338530...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.567260105548485, -6.12771397029428, -8.63...","[-7.073452021109697, -6.398290686151727, -8.69...","[2.739153446973038, 0.9518729234576522, 0.0502..."
8,KNeighborsRegressor,"[-7.0, -7.0, -5.88, -4.88, -4.733333333333333,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.746666666666667, -6.746666666666667...","[-7.0, -6.898666666666666, -6.797333333333334,...","[0.0, 0.12410748030101418, 0.10133333333333318..."
9,SVR,"[-7.16943387048738, -7.019600859202305, -5.913...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.299628234862112, -6.7808780790809, -7.050...","[-7.285810463220611, -6.7403078392482385, -7.0...","[0.04481459050799663, 0.053002778477928364, 0...."


In [8]:
result_df.to_csv('descriptors_results/Results_2d_Mordred_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2d_Mordred_desc.csv')


In [2]:
#Removal of constant columns
def remove_constant_columns(df):
    # Identify columns with only one unique value
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    # Drop the constant columns from the DataFrame
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [3]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    variances = df.var()
    
    # Identify columns with variance below the threshold
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [12]:
#2d RDKit descriptors const removal
df_train = pd.read_csv('Descriptors/Train_2d_RDKit_des.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_RDKit_des.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 183)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 183)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19594
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 156
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19618
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 158
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.209,0.335,0.4571,0.6645,0.8156,0.7853,0.2091,0.3297,0.4573,0.6695,0.819,0.8001
DecisionTreeRegressor,0.301,0.3817,0.5486,0.5167,0.7382,0.7168,0.2292,0.3354,0.4787,0.6379,0.8003,0.7892
RandomForestRegressor,0.2147,0.3412,0.4634,0.6552,0.8095,0.7801,0.2148,0.3324,0.4634,0.6606,0.813,0.794
GradientBoostingRegressor,0.233,0.3595,0.4827,0.6259,0.7942,0.756,0.2325,0.355,0.4822,0.6326,0.799,0.7735
AdaBoostRegressor,0.4153,0.5203,0.6444,0.3332,0.6191,0.5789,0.4022,0.5107,0.6342,0.3645,0.6433,0.6179
XGBRegressor,0.2157,0.3382,0.4644,0.6537,0.8093,0.7816,0.2078,0.3247,0.4559,0.6716,0.8197,0.8023
ExtraTreesRegressor,0.2077,0.3342,0.4558,0.6665,0.8164,0.7883,0.2138,0.3313,0.4624,0.6622,0.8139,0.7939
LinearRegression,0.3681,0.4344,0.6067,0.4089,0.6498,0.6759,0.3426,0.424,0.5853,0.4587,0.6796,0.696
KNeighborsRegressor,0.2823,0.3833,0.5313,0.5468,0.7456,0.7196,0.2795,0.3741,0.5287,0.5583,0.7516,0.7498
SVR,0.2471,0.3548,0.4971,0.6033,0.7801,0.7564,0.2506,0.3544,0.5006,0.604,0.7805,0.7656


In [13]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.94244077242366, -6.786558855452917, -5.986...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.10501812111347, -6.267956683931639, -6.95...","[-7.154249826199579, -6.351077452266763, -6.93...","[0.08343609678130004, 0.08851531639579152, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -4.66, -5.15,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.26, -7.0, -7.0, -5.47, -5.68, -6.85...","[-7.0, -5.562, -7.0, -6.784000000000001, -5.66...","[0.0, 0.5577239460521666, 0.0, 0.4320000000000..."
2,RandomForestRegressor,"[-6.861100000000001, -6.7424, -5.9228100000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.972700000000002, -6.014529118243336, -6.9...","[-6.96952, -6.178683964340668, -6.901110718597...","[0.0030175486740072885, 0.16138823641970285, 0..."
3,GradientBoostingRegressor,"[-6.7125512595896675, -7.2692978162721475, -5....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.021417222912616, -6.130864330321679, -6.5...","[-7.229568818533345, -6.3464771218913345, -6.6...","[0.10920497800057233, 0.16149665631528373, 0.0..."
4,AdaBoostRegressor,"[-5.655739035014513, -5.775402990849095, -5.50...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.000804597701152, -5.6366873278236795, -5....","[-6.268465732418376, -5.729993528587588, -5.59...","[0.14229497772388564, 0.20160893547149286, 0.0..."
5,XGBRegressor,"[-7.270226, -6.324202, -6.6335635, -5.3446817,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.272304, -6.106129, -6.930661, -7.343884, ...","[-7.062197, -6.280751, -7.0063515, -7.0149984,...","[0.20343843, 0.22225112, 0.17827716, 0.2296413..."
6,ExtraTreesRegressor,"[-6.929200000000001, -6.719900000000003, -6.17...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9956000000000005, -6.419600000000003, -6....","[-6.99204, -6.450820000000002, -6.956679999999...","[0.0058520423785201475, 0.13161081110607847, 0..."
7,LinearRegression,"[-9.10578581867479, -6.572551183708256, -5.470...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-10.0, -6.237643152553717, -6.36173348312036...","[-10.0, -6.05628613838204, -6.2533044841677965...","[0.0, 0.13872888502966715, 0.09287845741109119..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -4.89, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -7.0, -7.0, -5.383333333333333, -6.74...","[-6.7780000000000005, -6.949333333333334, -6.8...","[0.18126224096595522, 0.10133333333333318, 0.2..."
9,SVR,"[-6.169907722708351, -6.92650978832323, -6.086...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.169830678265781, -6.429629218613785, -6.9...","[-6.212826450797931, -6.5003052976767846, -6.9...","[0.023407477163931465, 0.07364038460177252, 0...."


In [14]:
result_df.to_csv('descriptors_results/Results_2d_rdkit_const_rem.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2d_rdkit_const_rem.csv')


In [15]:
#2d Mordred descriptors const removal
df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


X_train shape:  (5568, 1227)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


X_test shape:  (1392, 1227)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265212
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1187
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265242
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1190
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2063,0.3347,0.4541,0.6688,0.8184,0.7858,0.2054,0.3276,0.4532,0.6754,0.8229,0.8032
DecisionTreeRegressor,0.3081,0.3866,0.5551,0.5053,0.7334,0.7176,0.2332,0.3453,0.4829,0.6315,0.7955,0.7797
RandomForestRegressor,0.2154,0.3436,0.4641,0.6542,0.8092,0.7796,0.2138,0.3351,0.4624,0.6621,0.8144,0.7952
GradientBoostingRegressor,0.2272,0.3549,0.4767,0.6352,0.8002,0.7646,0.2183,0.3425,0.4673,0.655,0.814,0.7856
AdaBoostRegressor,0.3832,0.4996,0.619,0.3847,0.6608,0.617,0.3681,0.4849,0.6067,0.4183,0.6886,0.6702
XGBRegressor,0.2174,0.3414,0.4662,0.651,0.8073,0.7742,0.2127,0.3301,0.4612,0.6639,0.815,0.7943
ExtraTreesRegressor,0.2145,0.3415,0.4632,0.6555,0.8097,0.7819,0.223,0.3396,0.4722,0.6476,0.8049,0.7867
LinearRegression,0.6512,0.4717,0.807,-0.0456,0.5858,0.69,0.3377,0.3822,0.5812,0.4663,0.7177,0.7463
KNeighborsRegressor,0.2717,0.3807,0.5213,0.5637,0.7575,0.7263,0.2642,0.3711,0.514,0.5824,0.7671,0.7575
SVR,0.2267,0.3421,0.4762,0.6359,0.8007,0.7714,0.2376,0.3499,0.4875,0.6245,0.7947,0.7756


In [16]:
result_df

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2063,0.3347,0.4541,0.6688,0.8184,0.7858,0.2054,0.3276,0.4532,0.6754,0.8229,0.8032
DecisionTreeRegressor,0.3081,0.3866,0.5551,0.5053,0.7334,0.7176,0.2332,0.3453,0.4829,0.6315,0.7955,0.7797
RandomForestRegressor,0.2154,0.3436,0.4641,0.6542,0.8092,0.7796,0.2138,0.3351,0.4624,0.6621,0.8144,0.7952
GradientBoostingRegressor,0.2272,0.3549,0.4767,0.6352,0.8002,0.7646,0.2183,0.3425,0.4673,0.655,0.814,0.7856
AdaBoostRegressor,0.3832,0.4996,0.619,0.3847,0.6608,0.617,0.3681,0.4849,0.6067,0.4183,0.6886,0.6702
XGBRegressor,0.2174,0.3414,0.4662,0.651,0.8073,0.7742,0.2127,0.3301,0.4612,0.6639,0.815,0.7943
ExtraTreesRegressor,0.2145,0.3415,0.4632,0.6555,0.8097,0.7819,0.223,0.3396,0.4722,0.6476,0.8049,0.7867
LinearRegression,0.6512,0.4717,0.807,-0.0456,0.5858,0.69,0.3377,0.3822,0.5812,0.4663,0.7177,0.7463
KNeighborsRegressor,0.2717,0.3807,0.5213,0.5637,0.7575,0.7263,0.2642,0.3711,0.514,0.5824,0.7671,0.7575
SVR,0.2267,0.3421,0.4762,0.6359,0.8007,0.7714,0.2376,0.3499,0.4875,0.6245,0.7947,0.7756


In [17]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.803659819579023, -6.952297461961773, -6.31...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.077881107007449, -6.0194634527933095, -6....","[-7.095000868721698, -6.249811763734205, -6.66...","[0.025405667572675622, 0.18539061636084384, 0...."
1,DecisionTreeRegressor,"[-6.82, -7.0, -6.244999999999999, -5.05, -4.26...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -6.96, -7.0, -6.24, -7.0, -6.85,...","[-6.992, -6.43, -6.720000000000001, -6.694, -6...","[0.016000000000000014, 0.6298571266565139, 0.5..."
2,RandomForestRegressor,"[-6.535537681159422, -6.766108333333335, -6.00...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.747766666666669, -6.102716666666669, -6.7...","[-6.874323333333335, -6.307488457199335, -6.75...","[0.06899517986384561, 0.14667264145966635, 0.0..."
3,GradientBoostingRegressor,"[-7.172151031819288, -7.023936244369912, -5.93...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0754472513178035, -6.052434368338021, -6....","[-7.154501095613862, -6.281628977846543, -6.57...","[0.12123129267795736, 0.21266660343343977, 0.0..."
4,AdaBoostRegressor,"[-6.092088465441948, -6.055812499999991, -5.29...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.1702678571428535, -5.566031587411215, -5....","[-6.100103518051506, -5.645258461494352, -5.66...","[0.2048426940735511, 0.07647162008521954, 0.03..."
5,XGBRegressor,"[-6.8409624, -6.8683248, -6.0015554, -5.321909...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8227315, -6.169962, -6.799593, -6.7047834...","[-7.014267, -6.187599, -6.730255, -6.7727537, ...","[0.116441816, 0.2011095, 0.11745094, 0.1260753..."
6,ExtraTreesRegressor,"[-6.576300000000001, -6.813800000000001, -6.15...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.964599999999999, -6.627100000000001, -6.8...","[-6.955299999999999, -6.516620000000001, -6.86...","[0.022404374572836998, 0.13256841856188767, 0...."
7,LinearRegression,"[-7.248273384067716, -10.0, -6.148328486140372...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.56729127677113, -6.127714388314189, -8.63...","[-7.073458255354225, -6.3982907369437925, -8.6...","[2.7391545708959977, 0.9518729724276647, 0.050..."
8,KNeighborsRegressor,"[-7.0, -7.0, -5.88, -4.88, -4.733333333333333,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.746666666666667, -6.746666666666667...","[-7.0, -6.898666666666666, -6.797333333333334,...","[0.0, 0.12410748030101418, 0.10133333333333318..."
9,SVR,"[-7.169522117681083, -7.019981504744268, -5.91...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.300096341024478, -6.780839401372151, -7.0...","[-7.285910573423253, -6.740319863124401, -7.00...","[0.04465649746211758, 0.05312871296499056, 0.0..."


In [18]:
result_df.to_csv('descriptors_results/Results_2d_Mordred_const_rem.csv')
prediction_df.to_csv('descriptors_results/Prediction_df_2d_Mordred_const_rem.csv')


In [19]:
#2d RDKit descriptors LVR
df_train = pd.read_csv('Descriptors/Train_2d_RDKit_des.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_RDKit_des.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_LVR_rdkit = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_LVR_rdkit, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 149)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 149)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16888
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 145
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16918
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 147
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2088,0.3352,0.4569,0.6648,0.8159,0.7843,0.2096,0.3297,0.4579,0.6687,0.8186,0.7996
DecisionTreeRegressor,0.3018,0.3813,0.5494,0.5153,0.7378,0.7164,0.2325,0.3398,0.4822,0.6326,0.7971,0.7793
RandomForestRegressor,0.2157,0.341,0.4644,0.6537,0.8085,0.7779,0.2129,0.3311,0.4614,0.6636,0.8148,0.7955
GradientBoostingRegressor,0.2353,0.3605,0.485,0.6223,0.792,0.7526,0.2329,0.356,0.4826,0.6319,0.7992,0.7695
AdaBoostRegressor,0.4154,0.5211,0.6445,0.333,0.618,0.5709,0.4046,0.5125,0.6361,0.3606,0.6397,0.6166
XGBRegressor,0.2142,0.3375,0.4629,0.656,0.8107,0.7821,0.2078,0.3242,0.4559,0.6716,0.8196,0.8015
ExtraTreesRegressor,0.2081,0.3346,0.4561,0.6659,0.8162,0.787,0.2133,0.3316,0.4619,0.6629,0.8144,0.7959
LinearRegression,0.3794,0.441,0.6159,0.3909,0.6358,0.6613,0.3483,0.4299,0.5902,0.4496,0.6717,0.6809
KNeighborsRegressor,0.2809,0.3844,0.53,0.5489,0.7473,0.7193,0.2828,0.3757,0.5317,0.5532,0.7487,0.7475
SVR,0.2466,0.3558,0.4966,0.604,0.7802,0.7553,0.2521,0.3563,0.5021,0.6016,0.7787,0.7636


In [20]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.772400944109638, -6.817938613575938, -5.93...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.150567006766651, -6.157162388380886, -6.8...","[-7.168570015610081, -6.252276277476904, -6.89...","[0.04050697583359361, 0.05555283288068179, 0.1..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -5.05, -5.15,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.54, -7.0, -5.49, -7.0, -7.0, -6.85,...","[-6.992, -5.796, -7.0, -6.396, -6.491999999999...","[0.016000000000000014, 0.44960427044235224, 0...."
2,RandomForestRegressor,"[-6.762436325476252, -6.905400000000001, -5.90...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.988800000000002, -5.934699823646253, -6.9...","[-6.971920000000002, -6.164982577087251, -6.95...","[0.014810995915197037, 0.15301663821012512, 0...."
3,GradientBoostingRegressor,"[-6.868165845082114, -7.001095375674351, -5.67...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.007160101737617, -6.162250044604334, -6.5...","[-7.123784788607092, -6.289702826111688, -6.57...","[0.16574993810970395, 0.1199613320876362, 0.02..."
4,AdaBoostRegressor,"[-5.696136139455443, -6.03492854456322, -5.538...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.097516339869283, -5.583550221140558, -5.5...","[-6.299654819865298, -5.704513885538296, -5.61...","[0.19479835377342583, 0.06727519179993152, 0.0..."
5,XGBRegressor,"[-6.9800005, -6.8295994, -6.5184245, -5.181675...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.077477, -6.4597197, -6.9701786, -6.369826...","[-7.0858164, -6.471612, -6.9989333, -6.7037077...","[0.026511945, 0.18293324, 0.111612126, 0.26935..."
6,ExtraTreesRegressor,"[-6.8972000000000016, -6.8797000000000015, -5....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.994800000000001, -6.470650000000002, -6.9...","[-6.989240000000001, -6.4173300000000015, -6.9...","[0.0049321800453747955, 0.17867570511963915, 0..."
7,LinearRegression,"[-9.799625421276685, -6.634844250531511, -5.36...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-10.0, -6.280339326725771, -6.21843285003556...","[-10.0, -6.045860127063733, -6.069627471516846...","[0.0, 0.14999828568636786, 0.11118803890757178..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -4.89, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -7.0, -7.0, -5.180000000000001, -6.74...","[-6.7780000000000005, -6.949333333333334, -6.8...","[0.18126224096595522, 0.10133333333333318, 0.2..."
9,SVR,"[-6.198027563455584, -6.906001968492207, -6.05...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.198017782213315, -6.551172374530488, -6.8...","[-6.237405257981954, -6.603518031187571, -6.97...","[0.020335485566097086, 0.07972281552182395, 0...."


In [21]:
result_df.to_csv('descriptors_results/Results_2d_rdkit_LVR.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2d_rdkit_LVR.csv')


In [22]:
#2d Mordred descriptors LVR
df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
results_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
results_df

  df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


X_train shape:  (5568, 821)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


X_test shape:  (1392, 821)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170429
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 808
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028377 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170473
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 811
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tota

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2086,0.3356,0.4568,0.665,0.816,0.7849,0.2074,0.3286,0.4554,0.6722,0.8211,0.8034
DecisionTreeRegressor,0.2915,0.381,0.5399,0.5319,0.745,0.7269,0.2351,0.3464,0.4848,0.6285,0.7937,0.7721
RandomForestRegressor,0.217,0.3442,0.4658,0.6516,0.8075,0.7786,0.215,0.3359,0.4637,0.6603,0.8131,0.7948
GradientBoostingRegressor,0.2301,0.3559,0.4797,0.6305,0.7971,0.7617,0.224,0.348,0.4732,0.6461,0.8081,0.7787
AdaBoostRegressor,0.3914,0.5039,0.6256,0.3716,0.6477,0.6129,0.3709,0.4856,0.609,0.4139,0.682,0.6686
XGBRegressor,0.218,0.3432,0.4669,0.65,0.8072,0.7747,0.2122,0.3298,0.4606,0.6647,0.8154,0.793
ExtraTreesRegressor,0.2112,0.3394,0.4595,0.6609,0.813,0.784,0.2214,0.3374,0.4705,0.6502,0.8065,0.7898
LinearRegression,0.4254,0.4071,0.6522,0.317,0.6722,0.7356,0.3181,0.3735,0.564,0.4973,0.7294,0.7629
KNeighborsRegressor,0.2777,0.3834,0.527,0.5541,0.7513,0.718,0.2656,0.3714,0.5154,0.5802,0.7645,0.7563
SVR,0.2365,0.3482,0.4863,0.6203,0.7907,0.7618,0.2444,0.3529,0.4944,0.6138,0.7878,0.7692


In [23]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.590723796184461, -6.864465493660207, -6.30...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1934338278999785, -6.071564767852124, -6....","[-7.104324347085287, -6.313665025245345, -6.70...","[0.11911803727514793, 0.17757965983696158, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.03, -5.2, -6.244999999999999, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -5.89, -5.949999999999999...","[-7.37, -6.49, -6.728, -6.868, -5.854000000000...","[0.7601578783384412, 0.6307455905513726, 0.524..."
2,RandomForestRegressor,"[-6.683116057605, -6.8004, -6.164108333333335,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.867600000000001, -6.257900000000002, -6.7...","[-6.884765000000002, -6.374184742615431, -6.71...","[0.0409211546757904, 0.1266578598196469, 0.134..."
3,GradientBoostingRegressor,"[-6.9538278086749825, -7.052640720542491, -5.8...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.229801769686274, -6.1517648848612785, -6....","[-7.191434474201043, -6.310766037726277, -6.70...","[0.11328046235770686, 0.19618053637325925, 0.0..."
4,AdaBoostRegressor,"[-6.308654545454544, -6.308654545454544, -5.55...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.239743589743589, -5.672502385440382, -5.7...","[-6.060696597590882, -5.708925619905679, -5.66...","[0.1503225840297493, 0.057164129004639755, 0.0..."
5,XGBRegressor,"[-7.2010727, -7.05254, -6.1245418, -4.817527, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.3523107, -6.408034, -7.0629597, -6.966821...","[-7.18943, -6.34603, -6.8971586, -6.7855177, -...","[0.1483452, 0.2780104, 0.15064599, 0.15745322,..."
6,ExtraTreesRegressor,"[-6.860600000000002, -6.872300000000001, -6.09...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9811000000000005, -6.597400000000001, -6....","[-6.970460000000001, -6.585390000000001, -6.91...","[0.006783391482141944, 0.08001171414236755, 0...."
7,LinearRegression,"[-7.539499947810782, -7.185399541640891, -6.23...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.847063878560675, -7.483975315476073, -7.5...","[-7.553554799793129, -7.109674921969928, -7.88...","[1.4875345840168233, 0.6016532257184658, 0.296..."
8,KNeighborsRegressor,"[-7.0, -7.0, -5.88, -4.853333333333333, -4.733...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.746666666666667, -6.746666666666667...","[-7.0, -6.898666666666666, -6.797333333333334,...","[0.0, 0.12410748030101418, 0.10133333333333318..."
9,SVR,"[-7.1115116900074185, -7.009329529671163, -5.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.118067335275781, -6.790359365497927, -7.0...","[-7.115816315634726, -6.767185642475207, -7.01...","[0.03800995531330861, 0.03985679228380903, 0.0..."


In [24]:
results_df.to_csv('descriptors_results/Results_2d_Mordred_LVR.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2d_Mordred_LVR.csv')


In [25]:
#2d Padel descriptors
df_train = pd.read_csv('Descriptors/Train_2d_padel.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train



Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-4.2390,17.969121,111.3714,69.282962,0,0,66,32,34,...,61.494697,1.921709,33.025324,14.973326,18.051998,2732.0,56.0,0.188,156.0,1020
1,0,-4.2390,17.969121,111.3714,69.282962,0,0,66,32,34,...,61.494634,1.921707,33.024381,14.973124,18.051258,2735.0,56.0,0.188,156.0,1021
2,0,-4.2390,17.969121,111.3714,69.282962,0,0,66,32,34,...,61.494634,1.921707,33.024381,14.973124,18.051258,2735.0,56.0,0.188,156.0,1022
3,0,-4.0344,16.276383,122.7648,75.470134,0,0,72,34,38,...,65.139553,1.915869,33.439372,14.950444,18.488928,3129.0,64.0,-0.386,168.0,1023
4,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.401998,1.970407,37.167232,17.862301,19.304931,10372.0,92.0,5.550,272.0,1028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0,-5.9605,35.527560,440.7941,272.895469,0,0,252,119,133,...,235.781272,1.981355,87.859311,40.619051,47.240260,94051.0,199.0,8.707,594.0,914
5564,0,-4.5553,20.750758,443.6195,272.993848,0,0,253,117,136,...,230.754758,1.972263,82.512962,38.029136,44.483827,89262.0,201.0,8.428,580.0,922
5565,0,-5.5847,31.188874,437.8295,270.079883,0,0,248,117,131,...,231.630909,1.979751,85.100101,38.091980,44.482545,90275.0,203.0,8.341,588.0,904
5566,0,-3.1676,10.033690,466.9254,281.726676,0,0,255,123,132,...,244.597932,1.988601,87.647449,38.186244,44.409996,106182.0,208.0,10.250,620.0,888


In [26]:
df = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
df 


  df = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,155.712529,2.416852,4.832818,...,11.593980,169.458441,1772.125588,6.420745,107741,214,630.0,725.0,55.173611,28.291667
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,152.500384,2.445965,4.858291,...,11.590155,178.997365,1742.937365,6.835048,106182,208,620.0,719.0,49.861111,27.277778
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,3,154.777488,2.415089,4.830179,...,11.523579,167.253791,1732.084392,6.511595,102214,204,616.0,705.0,50.027778,28.083333
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,149.378369,2.419912,4.839823,...,11.562315,165.286866,1724.125588,6.338697,101212,208,608.0,699.0,55.673611,27.319444
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.261697,2.429936,4.859872,...,11.591302,165.339859,1722.146324,6.285206,107844,215,608.0,706.0,56.194444,27.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,35.896500,2.304816,4.609632,...,9.859065,63.474076,402.263091,6.385128,2286,42,138.0,153.0,10.638889,6.583333
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,33.357653,2.311467,4.622934,...,9.832367,61.242657,374.231791,6.565470,1880,40,130.0,145.0,10.138889,6.083333
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,31.502840,2.295993,4.591987,...,9.633842,59.559229,370.258006,6.170967,1648,37,118.0,129.0,10.777778,6.083333
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,30.506102,2.303541,4.607082,...,9.648660,58.482085,356.242356,6.249866,1472,37,114.0,126.0,10.527778,5.861111


In [27]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1020,C[C@H]1C(=O)N[C@H](C)C(=O)N[C@H](C)C(=O)N[C@H]...,-8.20,0,-4.2390,17.969121,111.3714,69.282962,0,0,...,6.882636,61.494697,1.921709,33.025324,14.973326,18.051998,2732.0,56.0,0.188,156.0
1,1021,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.20,0,-4.2390,17.969121,111.3714,69.282962,0,0,...,6.882636,61.494634,1.921707,33.024381,14.973124,18.051258,2735.0,56.0,0.188,156.0
2,1022,C[C@H]1C(=O)N[C@H](C)C(=O)N[C@H](C)C(=O)N[C@H]...,-8.30,0,-4.2390,17.969121,111.3714,69.282962,0,0,...,6.882636,61.494634,1.921707,33.024381,14.973124,18.051258,2735.0,56.0,0.188,156.0
3,1023,C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N(C)C(...,-7.00,0,-4.0344,16.276383,122.7648,75.470134,0,0,...,6.698407,65.139553,1.915869,33.439372,14.950444,18.488928,3129.0,64.0,-0.386,168.0
4,1028,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[C@H](...,-4.30,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.401998,1.970407,37.167232,17.862301,19.304931,10372.0,92.0,5.550,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,914,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C...,-5.36,0,-5.9605,35.527560,440.7941,272.895469,0,0,...,6.571450,235.781272,1.981355,87.859311,40.619051,47.240260,94051.0,199.0,8.707,594.0
5564,922,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-6.24,0,-4.5553,20.750758,443.6195,272.993848,0,0,...,6.438858,230.754758,1.972263,82.512962,38.029136,44.483827,89262.0,201.0,8.428,580.0
5565,904,CC[C@H](C)[C@H](NC(=O)[C@H](C)N(C)C(=O)[C@@H]1...,-7.00,0,-5.5847,31.188874,437.8295,270.079883,0,0,...,6.640971,231.630909,1.979751,85.100101,38.091980,44.482545,90275.0,203.0,8.341,588.0
5566,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.00,0,-3.1676,10.033690,466.9254,281.726676,0,0,...,6.835048,244.597932,1.988601,87.647449,38.186244,44.409996,106182.0,208.0,10.250,620.0


In [28]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,0,-4.4613,19.903198,477.8936,296.686157,0,0,...,6.420745,250.045897,1.968865,90.654326,43.537099,47.117227,107741.0,214.0,11.490,630.0
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0,-3.1676,10.033690,466.9254,281.726676,0,0,...,6.835048,244.597932,1.988601,87.647449,38.186244,44.409996,106182.0,208.0,10.250,620.0
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,0,-3.0574,9.347695,476.2546,289.385813,0,0,...,6.511595,248.034411,1.984275,88.350956,35.707994,52.642962,102214.0,204.0,10.960,616.0
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,0,-4.2279,17.875138,460.2191,289.646157,0,0,...,6.338697,240.980957,1.959195,90.761838,43.488529,47.273309,101212.0,208.0,9.243,608.0
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0,-5.0166,25.166276,461.8415,291.937743,0,0,...,6.285206,240.843065,1.958074,87.967406,40.534498,47.432908,107844.0,215.0,10.212,608.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,0,-1.1989,1.437361,110.0599,68.196962,0,0,...,6.385128,57.707840,1.989926,19.684197,7.618398,12.065799,2286.0,42.0,3.395,138.0
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,0,-0.6229,0.388004,104.2367,62.009790,0,0,...,6.565470,53.707230,1.989157,19.692329,7.619967,12.072362,1880.0,40.0,2.679,130.0
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0,-3.4938,12.206638,88.5588,61.958962,0,0,...,6.170967,50.804604,1.954023,22.049967,10.061855,11.988112,1648.0,37.0,2.588,118.0
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0,-3.7494,14.058000,84.1070,58.865376,0,0,...,6.249866,48.797056,1.951882,22.005013,10.045259,11.959755,1472.0,37.0,1.808,114.0


In [29]:
df_ordered.to_csv('Descriptors/Train_2d_padel_curated.csv', index=False)

In [30]:
#2d test padel descriptors
df_test = pd.read_csv('Descriptors/Test_2d_padel.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test


Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ID
0,0,-4.2390,17.969121,111.3714,69.282962,0,0,66,32,34,...,61.494697,1.921709,33.025324,14.973326,18.051998,2732.0,56.0,0.188,156.0,1019
1,0,-4.0344,16.276383,122.7648,75.470134,0,0,72,34,38,...,65.139553,1.915869,33.439372,14.950444,18.488928,3129.0,64.0,-0.386,168.0,1024
2,0,-1.0901,1.188318,197.5996,125.288752,0,0,117,53,64,...,104.585334,1.973308,36.997868,17.883760,19.114109,9990.0,88.0,5.837,266.0,1063
3,0,-1.0901,1.188318,197.5996,125.288752,0,0,117,53,64,...,104.585334,1.973308,36.997868,17.883760,19.114109,9990.0,88.0,5.837,266.0,1057
4,0,-0.9878,0.975749,203.2963,128.382338,0,0,120,54,66,...,106.402941,1.970425,37.173566,17.864226,19.309340,10372.0,92.0,5.550,272.0,1059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,0,-5.2403,27.460744,475.3685,294.303778,0,0,272,126,146,...,247.936262,1.967748,90.383811,40.574663,47.283631,111779.0,220.0,10.264,626.0,908
1388,0,-3.5373,12.512491,439.2397,261.472332,0,0,239,115,124,...,227.956848,1.982233,82.160615,37.988792,44.171822,84553.0,198.0,7.104,576.0,514
1389,0,-2.4343,5.925816,326.1493,184.096268,0,0,161,85,76,...,172.565284,2.030180,57.110921,25.550678,31.560243,36826.0,140.0,6.007,438.0,7333
1390,0,-4.2478,18.043805,425.6011,254.229574,0,0,230,112,118,...,223.189715,1.992765,79.540550,35.551907,41.463277,85015.0,194.0,7.233,568.0,880


In [31]:
df = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
df

  df = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,155.498239,2.424408,4.847496,...,11.617204,168.482860,1776.076051,6.529691,111779,220,626.0,728.0,55.444444,28.277778
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.138684,2.427131,4.854261,...,11.589822,165.351205,1724.125588,6.338697,99910,213,610.0,706.0,56.284722,27.402778
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.676716,2.420176,4.840351,...,11.578086,164.308833,1700.068073,6.464137,103927,212,606.0,704.0,53.222222,27.333333
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,151.698924,2.415398,4.830623,...,11.527095,177.329996,1685.010893,6.633901,95572,199,610.0,701.0,47.777778,27.083333
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,145.614560,2.422204,4.844100,...,11.547955,161.156358,1668.062988,6.415627,93670,207,588.0,682.0,53.972222,26.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,37.855097,2.317338,4.629714,...,9.940542,65.841858,430.258006,6.619354,2644,45,148.0,164.0,11.750000,7.000000
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,39.745462,2.403441,4.738314,...,10.079414,79.819128,430.258006,6.619354,2650,47,154.0,176.0,10.250000,6.972222


In [32]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1019,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.30,0,-4.2390,17.969121,111.3714,69.282962,0,0,...,6.882636,61.494697,1.921709,33.025324,14.973326,18.051998,2732.0,56.0,0.188,156.0
1,1024,C[C@H]1C(=O)N(C)[C@H](C)C(=O)N[C@H](C)C(=O)N(C...,-7.10,0,-4.0344,16.276383,122.7648,75.470134,0,0,...,6.698407,65.139553,1.915869,33.439372,14.950444,18.488928,3129.0,64.0,-0.386,168.0
2,1063,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@...,-6.17,0,-1.0901,1.188318,197.5996,125.288752,0,0,...,6.328920,104.585334,1.973308,36.997868,17.883760,19.114109,9990.0,88.0,5.837,266.0
3,1057,CC(C)C[C@H]1C(=O)N(C)[C@H](CC(C)C)C(=O)N[C@H](...,-5.42,0,-1.0901,1.188318,197.5996,125.288752,0,0,...,6.328920,104.585334,1.973308,36.997868,17.883760,19.114109,9990.0,88.0,5.837,266.0
4,1059,CC(C)C[C@H]1C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2cc...,-5.73,0,-0.9878,0.975749,203.2963,128.382338,0,0,...,6.287494,106.402941,1.970425,37.173566,17.864226,19.309340,10372.0,92.0,5.550,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0,-5.2403,27.460744,475.3685,294.303778,0,0,...,6.529691,247.936262,1.967748,90.383811,40.574663,47.283631,111779.0,220.0,10.264,626.0
1388,514,CC(C)C[C@H]1C(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[...,-6.70,0,-3.5373,12.512491,439.2397,261.472332,0,0,...,6.665009,227.956848,1.982233,82.160615,37.988792,44.171822,84553.0,198.0,7.104,576.0
1389,7333,C[C@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](Cc2ccccc2...,-5.37,0,-2.4343,5.925816,326.1493,184.096268,0,0,...,7.183693,172.565284,2.030180,57.110921,25.550678,31.560243,36826.0,140.0,6.007,438.0
1390,880,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.00,0,-4.2478,18.043805,425.6011,254.229574,0,0,...,6.816787,223.189715,1.992765,79.540550,35.551907,41.463277,85015.0,194.0,7.233,568.0


In [33]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0,-5.2403,27.460744,475.3685,294.303778,0,0,...,6.529691,247.936262,1.967748,90.383811,40.574663,47.283631,111779.0,220.0,10.264,626.0
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0,-5.4756,29.982195,460.5846,289.646157,0,0,...,6.338697,240.768109,1.957464,90.840338,43.433503,47.406835,99910.0,213.0,9.667,610.0
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0,-5.2907,27.991506,457.4260,283.509813,0,0,...,6.464137,240.251303,1.969273,87.601926,40.568304,47.033622,103927.0,212.0,9.469,606.0
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0,-2.6817,7.191515,466.7971,278.764676,0,0,...,6.633901,242.969389,1.991552,85.904257,35.712474,50.191783,95572.0,199.0,10.338,610.0
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0,-5.8836,34.616749,443.4036,277.271813,0,0,...,6.415627,233.157008,1.959303,90.305554,42.950496,47.355058,93670.0,207.0,9.346,588.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0,-1.7749,3.150270,115.8831,74.384134,0,0,...,6.236151,61.708009,1.990581,19.682164,7.618006,12.064158,2750.0,44.0,4.111,146.0
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0,-1.7749,3.150270,115.8831,74.384134,0,0,...,6.236151,61.708009,1.990581,19.682164,7.618006,12.064158,2750.0,44.0,4.111,146.0
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0,-1.6788,2.818369,115.5380,70.758962,0,0,...,6.619354,61.526924,1.984739,22.541021,10.153913,12.387108,2644.0,45.0,1.941,148.0
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0,-3.7309,13.919615,109.3044,70.758962,0,0,...,6.619354,62.566820,2.018285,22.680885,10.197129,12.483757,2650.0,47.0,1.336,154.0


In [34]:
df_ordered.to_csv('Descriptors/Test_2d_padel_curated.csv', index=False)

In [35]:
#3d Train descriptors
df_train = pd.read_csv('Descriptors/Train_3d_padel.csv')
df_train['ID'] = df_train['Name'].str.extract(r'_(\d+)$')
df_train['ID'] = df_train['ID'].astype(int)
df_train = df_train.drop('Name',axis=1)
df_train = df_train.fillna(0)
df_train

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.258677,2.181907,3.032577,3.751778,4.542085,5.436053,6.061333,6.883971,7.511899,7.984996,...,0.379232,0.503675,0.466782,0.448824,18.219614,96.663741,242.823577,0.336507,1.419281,1020
1,1.258151,2.169168,3.045547,3.765723,4.458617,5.344197,6.075984,6.689540,7.211720,7.529589,...,0.390780,0.520405,0.516072,0.469627,18.059430,95.399228,238.753641,0.336836,1.506104,1023
2,1.254768,2.177139,3.018816,3.747342,4.519925,5.346287,6.105176,6.777472,7.449173,7.786049,...,0.373243,0.539885,0.531924,0.405711,18.065710,93.748644,229.141201,0.347749,1.477520,1022
3,1.257484,2.181844,3.026954,3.778220,4.553884,5.333574,6.089321,6.583092,7.134701,7.354121,...,0.328820,0.539673,0.625227,0.390931,17.337491,85.951045,215.094400,0.329874,1.555831,1021
4,1.267343,2.192585,3.021923,3.783319,4.635484,5.372101,6.064741,6.876386,7.575124,8.239529,...,0.402299,0.408087,0.395876,0.345536,28.791246,247.499680,834.325613,0.316638,1.149498,1031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,1.278092,2.215001,3.043869,3.843912,4.623583,5.443092,6.178733,6.795858,7.430749,8.081524,...,0.421838,0.408161,0.453337,0.434838,27.227967,223.611123,742.428326,0.307485,1.296335,991
5564,1.262454,2.182843,3.017805,3.747871,4.554741,5.324063,6.026313,6.880243,7.547196,8.247904,...,0.333799,0.608296,0.553480,0.410091,60.397102,988.170796,4616.586235,0.375166,1.571868,802
5565,1.254741,2.167737,2.988235,3.687151,4.536381,5.390884,6.060162,6.792495,7.674889,8.489007,...,0.345318,0.584424,0.505315,0.353970,37.409237,372.188532,1130.553893,0.398042,1.443709,979
5566,1.255777,2.167449,2.997789,3.706984,4.536124,5.305341,6.022268,6.792038,7.459594,8.196334,...,0.366150,0.507076,0.446236,0.424250,34.287158,324.379295,1005.698844,0.381478,1.377563,978


In [36]:
df = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
df 

  df = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,155.712529,2.416852,4.832818,...,11.593980,169.458441,1772.125588,6.420745,107741,214,630.0,725.0,55.173611,28.291667
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,152.500384,2.445965,4.858291,...,11.590155,178.997365,1742.937365,6.835048,106182,208,620.0,719.0,49.861111,27.277778
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,3,154.777488,2.415089,4.830179,...,11.523579,167.253791,1732.084392,6.511595,102214,204,616.0,705.0,50.027778,28.083333
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,149.378369,2.419912,4.839823,...,11.562315,165.286866,1724.125588,6.338697,101212,208,608.0,699.0,55.673611,27.319444
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.261697,2.429936,4.859872,...,11.591302,165.339859,1722.146324,6.285206,107844,215,608.0,706.0,56.194444,27.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,35.896500,2.304816,4.609632,...,9.859065,63.474076,402.263091,6.385128,2286,42,138.0,153.0,10.638889,6.583333
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,33.357653,2.311467,4.622934,...,9.832367,61.242657,374.231791,6.565470,1880,40,130.0,145.0,10.138889,6.083333
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,31.502840,2.295993,4.591987,...,9.633842,59.559229,370.258006,6.170967,1648,37,118.0,129.0,10.777778,6.083333
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,30.506102,2.303541,4.607082,...,9.648660,58.482085,356.242356,6.249866,1472,37,114.0,126.0,10.527778,5.861111


In [37]:
merged_df = df_train.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1020,C[C@H]1C(=O)N[C@H](C)C(=O)N[C@H](C)C(=O)N[C@H]...,-8.20,1.258677,2.181907,3.032577,3.751778,4.542085,5.436053,6.061333,...,0.511773,0.379232,0.503675,0.466782,0.448824,18.219614,96.663741,242.823577,0.336507,1.419281
1,1023,C[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@@H](C)N(C)C(...,-7.00,1.258151,2.169168,3.045547,3.765723,4.458617,5.344197,6.075984,...,0.500444,0.390780,0.520405,0.516072,0.469627,18.059430,95.399228,238.753641,0.336836,1.506104
2,1022,C[C@H]1C(=O)N[C@H](C)C(=O)N[C@H](C)C(=O)N[C@H]...,-8.30,1.254768,2.177139,3.018816,3.747342,4.519925,5.346287,6.105176,...,0.525256,0.373243,0.539885,0.531924,0.405711,18.065710,93.748644,229.141201,0.347749,1.477520
3,1021,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.20,1.257484,2.181844,3.026954,3.778220,4.553884,5.333574,6.089321,...,0.553249,0.328820,0.539673,0.625227,0.390931,17.337491,85.951045,215.094400,0.329874,1.555831
4,1031,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]...,-5.17,1.267343,2.192585,3.021923,3.783319,4.635484,5.372101,6.064741,...,0.475460,0.402299,0.408087,0.395876,0.345536,28.791246,247.499680,834.325613,0.316638,1.149498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,991,CC[C@H](C)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](...,-4.96,1.278092,2.215001,3.043869,3.843912,4.623583,5.443092,6.178733,...,0.449818,0.421838,0.408161,0.453337,0.434838,27.227967,223.611123,742.428326,0.307485,1.296335
5564,802,CC(C)C[C@H]1C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[...,-4.24,1.262454,2.182843,3.017805,3.747871,4.554741,5.324063,6.026313,...,0.582978,0.333799,0.608296,0.553480,0.410091,60.397102,988.170796,4616.586235,0.375166,1.571868
5565,979,C/C1=C\[C@@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C...,-5.00,1.254741,2.167737,2.988235,3.687151,4.536381,5.390884,6.060162,...,0.586710,0.345318,0.584424,0.505315,0.353970,37.409237,372.188532,1130.553893,0.398042,1.443709
5566,978,C/C1=C\[C@H](CC(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@...,-5.52,1.255777,2.167449,2.997789,3.706984,4.536124,5.305341,6.022268,...,0.554835,0.366150,0.507076,0.446236,0.424250,34.287158,324.379295,1005.698844,0.381478,1.377563


In [38]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,1.259916,2.186876,3.009692,3.725392,4.523604,5.356425,6.101257,...,0.651022,0.310326,0.471748,0.474816,0.390502,96.230112,2214.927222,9269.709607,0.476534,1.337065
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,1.260956,2.192365,3.021734,3.762950,4.572194,5.344586,6.073077,...,0.536705,0.383044,0.499076,0.511123,0.387397,78.913503,1739.872712,9926.351313,0.379623,1.397597
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,1.258339,2.178215,3.010056,3.721919,4.524116,5.356053,6.048656,...,0.649923,0.291878,0.453686,0.515599,0.399368,91.549655,2049.323105,10612.165217,0.474884,1.368653
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,1.258104,2.177912,3.009463,3.724264,4.504869,5.327087,5.994629,...,0.811324,0.126642,0.576205,0.431076,0.342110,102.180579,1680.289350,8582.496833,0.716985,1.349391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,1.258340,2.190559,2.988594,3.760183,4.605040,5.364203,5.988572,...,0.700532,0.213598,0.438801,0.369091,0.433886,20.020232,91.436181,214.560045,0.550798,1.241777
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,1.256228,2.193394,2.995311,3.754843,4.566531,5.287344,6.011638,...,0.797000,0.144638,0.463418,0.440235,0.444609,20.228575,69.658199,145.575438,0.695500,1.348262
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,1.247814,2.171273,2.946442,3.677297,4.529005,5.317514,6.031092,...,0.461348,0.346276,0.479049,0.456672,0.464743,15.466525,75.381197,204.552808,0.211436,1.400463
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,1.252958,2.173473,2.962155,3.704700,4.554279,5.463495,6.143902,...,0.627527,0.312461,0.522381,0.504997,0.427949,16.762042,70.940669,143.120522,0.441290,1.455327


In [39]:
df_ordered.to_csv('Descriptors/Train_3d_padel_curated.csv', index=False)

In [40]:
#3d test padel descriptors
df_test = pd.read_csv('Descriptors/Test_3d_padel.csv')
df_test['ID'] = df_test['Name'].str.extract(r'_(\d+)$')
df_test['ID'] = df_test['ID'].astype(int)
df_test = df_test.drop('Name',axis=1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,TDB10u,...,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds,ID
0,1.259601,2.184022,3.028525,3.776665,4.598608,5.440405,6.076351,6.876115,7.469327,7.873813,...,0.365309,0.530264,0.564295,0.400640,18.617008,96.225371,222.738812,0.375553,1.495199,1019
1,1.255364,2.168505,3.041962,3.757705,4.461371,5.347965,6.163756,6.996452,7.546754,7.944022,...,0.441030,0.503734,0.470243,0.429292,19.073646,104.504200,248.230281,0.370782,1.403268,1024
2,1.266388,2.191253,3.014903,3.772663,4.604766,5.383582,6.100319,6.930472,7.627887,8.388117,...,0.326986,0.413450,0.490170,0.351423,29.721677,260.867860,936.546482,0.297149,1.255043,1058
3,1.260487,2.179588,3.018280,3.755848,4.574111,5.305548,6.027836,6.850318,7.608406,8.198741,...,0.371449,0.402730,0.483134,0.396887,29.800393,272.085816,1018.575625,0.269815,1.282752,1059
4,1.260418,2.193904,3.045081,3.765709,4.573512,5.308550,5.957905,6.780413,7.504277,8.168649,...,0.360817,0.387847,0.445816,0.354904,29.141255,254.870804,904.571083,0.291615,1.188568,1087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,1.264008,2.187560,3.024078,3.768410,4.499937,5.320964,6.009401,6.732692,7.346768,7.968295,...,0.174328,0.539789,0.475113,0.431884,96.206319,1657.209281,7203.627712,0.671048,1.446786,908
1388,1.260560,2.189333,2.998673,3.732600,4.650473,5.415660,6.130208,6.729424,7.279925,7.945541,...,0.407448,0.414320,0.412583,0.343072,28.670026,237.886148,727.211399,0.354850,1.169974,8487
1389,1.262509,2.189570,3.004417,3.738197,4.645399,5.398300,6.100812,6.796822,7.448699,8.321299,...,0.379103,0.505450,0.381868,0.423116,30.866994,282.420116,993.780663,0.316322,1.310434,8475
1390,1.256711,2.159005,3.001484,3.714792,4.492812,5.354403,6.103030,6.757506,7.331021,8.138971,...,0.423481,0.505019,0.496731,0.437093,27.431799,212.944400,579.873260,0.383171,1.438844,947


In [41]:
df = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
df

  df = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount.19,WalkCount.20,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,155.498239,2.424408,4.847496,...,11.617204,168.482860,1776.076051,6.529691,111779,220,626.0,728.0,55.444444,28.277778
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.138684,2.427131,4.854261,...,11.589822,165.351205,1724.125588,6.338697,99910,213,610.0,706.0,56.284722,27.402778
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.676716,2.420176,4.840351,...,11.578086,164.308833,1700.068073,6.464137,103927,212,606.0,704.0,53.222222,27.333333
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,151.698924,2.415398,4.830623,...,11.527095,177.329996,1685.010893,6.633901,95572,199,610.0,701.0,47.777778,27.083333
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,145.614560,2.422204,4.844100,...,11.547955,161.156358,1668.062988,6.415627,93670,207,588.0,682.0,53.972222,26.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,37.855097,2.317338,4.629714,...,9.940542,65.841858,430.258006,6.619354,2644,45,148.0,164.0,11.750000,7.000000
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,39.745462,2.403441,4.738314,...,10.079414,79.819128,430.258006,6.619354,2650,47,154.0,176.0,10.250000,6.972222


In [42]:
merged_df = df_test.merge(df[['ID', 'SMILES', 'Permeability']], on='ID', how='left')
merged_df = merged_df[['ID', 'SMILES', 'Permeability'] + [col for col in merged_df.columns if col not in ['ID', 'SMILES', 'Permeability']]]
merged_df

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,1019,C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@@H](C)N(C)C(=O)...,-8.300,1.259601,2.184022,3.028525,3.776665,4.598608,5.440405,6.076351,...,0.551726,0.365309,0.530264,0.564295,0.400640,18.617008,96.225371,222.738812,0.375553,1.495199
1,1024,C[C@H]1C(=O)N(C)[C@H](C)C(=O)N[C@H](C)C(=O)N(C...,-7.100,1.255364,2.168505,3.041962,3.757705,4.461371,5.347965,6.163756,...,0.472824,0.441030,0.503734,0.470243,0.429292,19.073646,104.504200,248.230281,0.370782,1.403268
2,1058,CC(C)C[C@H]1C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@H]...,-5.950,1.266388,2.191253,3.014903,3.772663,4.604766,5.383582,6.100319,...,0.531433,0.326986,0.413450,0.490170,0.351423,29.721677,260.867860,936.546482,0.297149,1.255043
3,1059,CC(C)C[C@H]1C(=O)N2CCC[C@H]2C(=O)N[C@@H](Cc2cc...,-5.730,1.260487,2.179588,3.018280,3.755848,4.574111,5.305548,6.027836,...,0.475094,0.371449,0.402730,0.483134,0.396887,29.800393,272.085816,1018.575625,0.269815,1.282752
4,1087,CC(C)C[C@H]1C(=O)N[C@@H](Cc2ccc(O)cc2)C(=O)N(C...,-6.030,1.260418,2.193904,3.045081,3.765709,4.573512,5.308550,5.957905,...,0.500259,0.360817,0.387847,0.445816,0.354904,29.141255,254.870804,904.571083,0.291615,1.188568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.000,1.264008,2.187560,3.024078,3.768410,4.499937,5.320964,6.009401,...,0.780699,0.174328,0.539789,0.475113,0.431884,96.206319,1657.209281,7203.627712,0.671048,1.446786
1388,8487,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...,-6.495,1.260560,2.189333,2.998673,3.732600,4.650473,5.415660,6.130208,...,0.495785,0.407448,0.414320,0.412583,0.343072,28.670026,237.886148,727.211399,0.354850,1.169974
1389,8475,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H...,-5.960,1.262509,2.189570,3.004417,3.738197,4.645399,5.398300,6.100812,...,0.498444,0.379103,0.505450,0.381868,0.423116,30.866994,282.420116,993.780663,0.316322,1.310434
1390,947,CC[C@H](C)[C@H]1OC(=O)[C@H](C(C)C)N(C)C(=O)[C@...,-5.340,1.256711,2.159005,3.001484,3.714792,4.492812,5.354403,6.103030,...,0.498634,0.423481,0.505019,0.496731,0.437093,27.431799,212.944400,579.873260,0.383171,1.438844


In [43]:
df_ordered = merged_df.merge(df[['ID']], on='ID', how='right')
df_ordered = df_ordered.reindex(df.index)
df_ordered

Unnamed: 0,ID,SMILES,Permeability,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,1.264008,2.187560,3.024078,3.768410,4.499937,5.320964,6.009401,...,0.780699,0.174328,0.539789,0.475113,0.431884,96.206319,1657.209281,7203.627712,0.671048,1.446786
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,1.259693,2.178296,3.015183,3.739056,4.540203,5.372658,6.102178,...,0.554143,0.382135,0.519415,0.521290,0.389075,83.091448,1873.928421,9698.038245,0.404417,1.429780
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,1.263602,2.198019,3.028624,3.771362,4.587914,5.370613,6.119600,...,0.596153,0.354072,0.403854,0.454070,0.363028,79.882613,1648.772500,7084.348331,0.425338,1.220953
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,1.258249,2.182046,3.011120,3.728983,4.495079,5.348828,6.080735,...,0.540248,0.406175,0.592882,0.546456,0.401213,86.828192,2036.636544,9819.558017,0.419634,1.540551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,1.253581,2.186082,2.969442,3.701711,4.614219,5.425873,6.120615,...,0.694755,0.244015,0.423204,0.403477,0.434739,22.780437,117.807253,263.303199,0.542133,1.261420
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,1.254420,2.185529,2.967675,3.706578,4.580772,5.367258,6.140751,...,0.642128,0.311598,0.491460,0.489046,0.424396,22.210103,120.470127,244.118194,0.463192,1.404903
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,1.259455,2.190894,3.004168,3.765576,4.610101,5.281473,6.055001,...,0.622128,0.285545,0.480992,0.447816,0.381100,20.440926,109.241371,269.765567,0.433192,1.309908
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,1.264940,2.206905,2.990951,3.785669,4.572371,5.351900,6.153192,...,0.638515,0.303978,0.444298,0.461387,0.468771,20.244697,101.762869,214.619408,0.457773,1.374456


In [44]:
df_ordered.to_csv('Descriptors/Test_3d_padel_curated.csv', index=False)

In [45]:
#2d Padel descriptors
df_train = pd.read_csv('Descriptors/Train_2d_padel_curated.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_padel_curated.csv')
df_test = df_test.dropna()
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 1444)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 1444)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 229917
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1037
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 229915
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1037
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threa

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2069,0.3343,0.4549,0.6677,0.8177,0.785,0.21,0.3301,0.4583,0.6681,0.8182,0.802
DecisionTreeRegressor,0.302,0.3857,0.5495,0.5151,0.7359,0.716,0.2516,0.354,0.5015,0.6025,0.7788,0.761
RandomForestRegressor,0.2173,0.3437,0.4661,0.6511,0.807,0.7743,0.222,0.3407,0.4712,0.6492,0.806,0.7879
GradientBoostingRegressor,0.2231,0.3511,0.4723,0.6418,0.8039,0.7651,0.2233,0.3435,0.4725,0.6472,0.8081,0.7818
AdaBoostRegressor,0.3868,0.4982,0.6219,0.379,0.6507,0.6217,0.3685,0.483,0.6071,0.4176,0.6828,0.6764
XGBRegressor,0.2184,0.3439,0.4673,0.6494,0.8064,0.7744,0.2155,0.3329,0.4642,0.6594,0.8122,0.7939
ExtraTreesRegressor,0.214,0.3421,0.4626,0.6563,0.8101,0.78,0.2242,0.3405,0.4735,0.6457,0.8037,0.7854
LinearRegression,0.4615,0.4239,0.6793,0.259,0.6589,0.7225,0.3912,0.3924,0.6255,0.3818,0.6884,0.7531
KNeighborsRegressor,0.2722,0.3806,0.5217,0.563,0.7568,0.7192,0.2735,0.3779,0.5229,0.5679,0.7578,0.7414
SVR,0.226,0.3406,0.4754,0.6372,0.8014,0.7715,0.2379,0.3456,0.4877,0.6241,0.7938,0.775


In [46]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.748226705099755, -7.058676485953355, -6.12...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.026980191209685, -6.1958294473008335, -6....","[-6.968904120808022, -6.461984757077597, -6.83...","[0.09608199954542887, 0.20382150982622516, 0.1..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -5.05, -5.05,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -6.85, -7.0, -5.04, -5.09, -6.8...","[-6.702, -6.42, -6.703999999999999, -6.396, -5...","[0.2881943788487207, 0.5248999904743759, 0.347..."
2,RandomForestRegressor,"[-6.693500000000001, -6.770899999999999, -5.98...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.852100000000002, -6.397482293163333, -6.7...","[-6.84246, -6.479811458632666, -6.68976, -6.43...","[0.04291375537051084, 0.08092698230793745, 0.0..."
3,GradientBoostingRegressor,"[-6.92929454355441, -7.10869687862372, -5.9029...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.100230521734597, -5.8831878424872865, -6....","[-7.039261627846814, -6.215010213701844, -6.62...","[0.0486498107289497, 0.2589163164055372, 0.173..."
4,AdaBoostRegressor,"[-5.761552755847047, -6.289976615186668, -5.67...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.180000000000001, -5.672218629177666, -5.7...","[-5.919183039253319, -5.652822605561984, -5.72...","[0.23367593450940866, 0.026734769978079585, 0...."
5,XGBRegressor,"[-7.139188, -7.175612, -6.230652, -5.4634495, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0666866, -5.9977975, -6.6378975, -6.32408...","[-7.168223, -6.3888144, -6.8232203, -6.563575,...","[0.16538478, 0.304069, 0.1131331, 0.35762882, ..."
6,ExtraTreesRegressor,"[-6.728134245075, -6.9156, -6.0836000000000015...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.956600000000001, -6.497350000000003, -6.9...","[-6.941160000000001, -6.487427117476119, -6.87...","[0.026632806836681677, 0.06569128970568618, 0...."
7,LinearRegression,"[-7.224913629327748, -6.996034795908745, -5.86...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-8.672600117973307, -5.013391257928706, -7.3...","[-8.197405059528572, -5.860461402203563, -7.58...","[0.5076845740872549, 0.6171885678628157, 0.278..."
8,KNeighborsRegressor,"[-6.156666666666666, -6.986666666666667, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -6.503333333333333, -7.0...","[-6.989333333333333, -6.469333333333333, -6.82...","[0.005333333333333102, 0.4431373000173502, 0.2..."
9,SVR,"[-6.926809109998603, -6.890815062391336, -5.69...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.403459977495986, -6.647869120614306, -7.1...","[-7.359497234472568, -6.6962759308451725, -7.0...","[0.09656983009150148, 0.1147405316283296, 0.04..."


In [47]:
result_df.to_csv('descriptors_results/Results_2D_padel.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_padel.csv')


In [48]:
#2d padel descriptors const removal
df_train = pd.read_csv('Descriptors/Train_2d_padel_curated.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_padel_curated.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 1094)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 1094)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 229917
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1037
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 229915
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1037
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threa

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2069,0.3343,0.4549,0.6677,0.8177,0.785,0.21,0.3301,0.4583,0.6681,0.8182,0.802
DecisionTreeRegressor,0.3118,0.3883,0.5584,0.4994,0.7279,0.7159,0.2447,0.3524,0.4947,0.6133,0.7849,0.7617
RandomForestRegressor,0.2165,0.3435,0.4653,0.6523,0.8078,0.7753,0.2223,0.3407,0.4715,0.6487,0.8057,0.7878
GradientBoostingRegressor,0.223,0.351,0.4722,0.642,0.804,0.7651,0.2233,0.3436,0.4726,0.6471,0.8079,0.7814
AdaBoostRegressor,0.387,0.4977,0.6221,0.3786,0.6499,0.616,0.3689,0.4835,0.6073,0.4171,0.6819,0.6788
XGBRegressor,0.2184,0.3439,0.4673,0.6494,0.8064,0.7744,0.2155,0.3329,0.4642,0.6594,0.8122,0.7939
ExtraTreesRegressor,0.2139,0.3416,0.4625,0.6566,0.8103,0.7794,0.2231,0.3401,0.4724,0.6474,0.8048,0.7862
LinearRegression,0.4615,0.4239,0.6793,0.259,0.6589,0.7225,0.3912,0.3924,0.6255,0.3818,0.6884,0.7531
KNeighborsRegressor,0.2734,0.3813,0.5228,0.5611,0.7559,0.7182,0.2733,0.3777,0.5228,0.5681,0.7579,0.7422
SVR,0.226,0.3406,0.4754,0.6372,0.8014,0.7715,0.2379,0.3456,0.4877,0.6241,0.7938,0.775


In [49]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.748226705099755, -7.058676485953355, -6.12...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.026980191209685, -6.1958294473008335, -6....","[-6.968904120808022, -6.461984757077597, -6.83...","[0.09608199954542887, 0.20382150982622516, 0.1..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -5.09, -5.05,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -6.89, -7.0, -5.49, -5.05, -6.85...","[-6.884, -6.624, -6.720000000000001, -6.674000...","[0.13410443691392165, 0.47927445164540133, 0.3..."
2,RandomForestRegressor,"[-6.710433333333333, -6.805999999999999, -5.86...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.853900000000002, -6.353100000000001, -6.7...","[-6.819680000000001, -6.457955, -6.66580000000...","[0.055604366734996635, 0.08243495375142687, 0...."
3,GradientBoostingRegressor,"[-6.92929454355441, -7.10869687862372, -5.9029...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.100230521734597, -5.8831878424872865, -6....","[-7.039261627846814, -6.215010213701845, -6.61...","[0.0486498107289497, 0.2589163164055375, 0.163..."
4,AdaBoostRegressor,"[-5.672218629177667, -5.867452229299364, -5.63...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.992893139875726, -5.6085598031444235, -5....","[-5.923491564045244, -5.6392578386621075, -5.6...","[0.20596926720018796, 0.022431733825551074, 0...."
5,XGBRegressor,"[-7.139188, -7.175612, -6.230652, -5.4634495, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0666866, -5.9977975, -6.6378975, -6.32408...","[-7.168223, -6.3888144, -6.8232203, -6.563575,...","[0.16538478, 0.304069, 0.1131331, 0.35762882, ..."
6,ExtraTreesRegressor,"[-6.599120632720003, -6.913400000000001, -5.98...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.963599999999999, -6.479, -6.8888, -6.3617...","[-6.923080000000001, -6.45411, -6.878490000000...","[0.03290977970148008, 0.1434897431874482, 0.01..."
7,LinearRegression,"[-7.22491363056128, -6.996034798974, -5.861678...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-8.672600119557904, -5.013391259924395, -7.3...","[-8.197405060434756, -5.86046140295665, -7.588...","[0.5076845745075953, 0.6171885667096056, 0.278..."
8,KNeighborsRegressor,"[-6.156666666666666, -6.986666666666667, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -6.503333333333333, -7.0...","[-6.989333333333333, -6.469333333333333, -6.82...","[0.005333333333333102, 0.4431373000173502, 0.2..."
9,SVR,"[-6.92693162333586, -6.890609734357987, -5.699...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.40310627284776, -6.648065931433602, -7.11...","[-7.3593294920624, -6.696198636678213, -7.0385...","[0.09638599404428412, 0.11458023745920132, 0.0..."


In [50]:
result_df.to_csv('descriptors_results/Results_2D_padel_const_rem.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_padel_const_rem.csv')

In [51]:
#2d padel descriptors LVR
df_train = pd.read_csv('Descriptors/Train_2d_padel_curated.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_padel_curated.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
results_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)


X_train shape:  (5568, 726)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 726)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151175
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 715
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151157
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 715
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading

In [52]:
results_df

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2066,0.3353,0.4545,0.6683,0.8179,0.7874,0.2083,0.3296,0.4564,0.6709,0.8201,0.8038
DecisionTreeRegressor,0.3137,0.3895,0.5601,0.4963,0.726,0.7102,0.2439,0.3542,0.4938,0.6146,0.7855,0.7652
RandomForestRegressor,0.2178,0.3447,0.4667,0.6503,0.8065,0.7744,0.223,0.341,0.4722,0.6476,0.805,0.7878
GradientBoostingRegressor,0.2251,0.3523,0.4744,0.6386,0.8019,0.7636,0.2265,0.3464,0.4759,0.6421,0.805,0.7792
AdaBoostRegressor,0.392,0.5019,0.6261,0.3705,0.6458,0.6139,0.3709,0.4861,0.609,0.414,0.6834,0.6747
XGBRegressor,0.2165,0.3416,0.4653,0.6524,0.8083,0.7773,0.2169,0.3336,0.4657,0.6572,0.8109,0.7901
ExtraTreesRegressor,0.2124,0.3409,0.4609,0.6589,0.8117,0.7804,0.2211,0.3386,0.4702,0.6506,0.8067,0.7885
LinearRegression,0.3265,0.3864,0.5714,0.4757,0.7259,0.7432,0.3369,0.3789,0.5804,0.4676,0.7173,0.7675
KNeighborsRegressor,0.2724,0.3809,0.5219,0.5627,0.7573,0.7237,0.272,0.3765,0.5215,0.5702,0.7594,0.7437
SVR,0.2321,0.3444,0.4818,0.6273,0.7955,0.7665,0.2399,0.3469,0.4898,0.6209,0.7922,0.7731


In [53]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.771642724814257, -6.904749184979136, -5.92...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9520449595918725, -6.485434779673925, -6....","[-7.019630539902186, -6.42647565596941, -6.817...","[0.13248295910366936, 0.12234939605077523, 0.1..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -5.66, -5.15,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -6.85, -6.6, -5.62, -5.05, -6.8...","[-6.7299999999999995, -6.224, -6.9020000000000...","[0.2650283003756392, 0.861849174739989, 0.1342..."
2,RandomForestRegressor,"[-6.633599999999999, -6.7676000000000025, -5.8...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.940800000000001, -6.350300000000002, -6.7...","[-6.832240000000001, -6.476004999999999, -6.73...","[0.07797413930272035, 0.08975352973560334, 0.0..."
3,GradientBoostingRegressor,"[-6.943050297262008, -6.73695474371525, -5.963...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.865892759228352, -5.911146040263634, -6.8...","[-6.881720647547988, -6.1800753904055, -6.6796...","[0.13284182325766636, 0.19783969721160807, 0.1..."
4,AdaBoostRegressor,"[-5.707650726451405, -6.16879194630873, -5.643...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.936546275395076, -5.643063408313051, -5.7...","[-5.846918267248835, -5.663739785734554, -5.72...","[0.09236123899197157, 0.059106696379105525, 0...."
5,XGBRegressor,"[-6.992336, -6.6252017, -5.754438, -5.481389, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1227093, -6.1708193, -7.0632586, -6.77562...","[-6.9684496, -6.336802, -6.893136, -6.6493235,...","[0.11548046, 0.37445024, 0.21871419, 0.2313988..."
6,ExtraTreesRegressor,"[-6.774510000000003, -6.832700000000001, -6.11...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.956100000000003, -6.486200000000002, -6.9...","[-6.910920000000002, -6.517849125770001, -6.91...","[0.027672759168540353, 0.12243991928690154, 0...."
7,LinearRegression,"[-7.674373826579652, -7.610388580468792, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-8.773609819552915, -7.164248514071117, -7.5...","[-8.4917124950499, -6.620327387398552, -7.5878...","[0.25708571609118236, 0.34793739928733636, 0.1..."
8,KNeighborsRegressor,"[-6.156666666666666, -6.986666666666667, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -6.503333333333333, -7.0...","[-6.989333333333333, -6.624, -6.91200000000000...","[0.005333333333333102, 0.15548347536349721, 0...."
9,SVR,"[-6.926327666566776, -6.8805713201150445, -5.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.358942427476638, -6.672332593490298, -7.1...","[-7.293648944843165, -6.738798032096359, -7.05...","[0.10237663545738508, 0.09201900149988873, 0.0..."


In [54]:
results_df.to_csv('descriptors_results/Results_2D_padel_LVR.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_padel_const_LVR.csv')

In [55]:
#2d All descriptors
df_train_padel = pd.read_csv('Descriptors/Train_2d_padel_curated.csv')
df_train_rdkit = pd.read_csv('Descriptors/Train_2d_RDKit_des.csv')
df_train_mordred = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')

df_2d_train = df_train_rdkit.merge(df_train_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_train

  df_train_mordred = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,15.738544,15.738544,0.010382,-1.908222,0.047997,24.622047,1773.325,...,6.420745,250.045897,1.968865,90.654326,43.537099,47.117227,107741.0,214.0,11.490,630.0
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,15.975705,15.975705,0.027671,-1.943827,0.026511,21.983740,1745.057,...,6.835048,244.597932,1.988601,87.647449,38.186244,44.409996,106182.0,208.0,10.250,620.0
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,15.828473,15.828473,0.049834,-1.862294,0.021075,22.464000,1733.267,...,6.511595,248.034411,1.984275,88.350956,35.707994,52.642962,102214.0,204.0,10.960,616.0
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,15.595105,15.595105,0.004954,-1.811756,0.069603,24.479675,1725.281,...,6.338697,240.980957,1.959195,90.761838,43.488529,47.273309,101212.0,208.0,9.243,608.0
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,15.867592,15.867592,0.029192,-1.876686,0.046796,23.089431,1723.309,...,6.285206,240.843065,1.958074,87.967406,40.534498,47.432908,107844.0,215.0,10.212,608.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,12.835172,12.835172,0.168936,-0.728726,0.606745,24.965517,402.539,...,6.385128,57.707840,1.989926,19.684197,7.618398,12.065799,2286.0,42.0,3.395,138.0
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,12.673271,12.673271,0.196704,-0.722720,0.611202,25.037037,374.485,...,6.565470,53.707230,1.989157,19.692329,7.619967,12.072362,1880.0,40.0,2.679,130.0
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,12.572226,12.572226,0.181773,-1.016186,0.465702,27.576923,370.494,...,6.170967,50.804604,1.954023,22.049967,10.061855,11.988112,1648.0,37.0,2.588,118.0
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,12.364448,12.364448,0.112529,-1.084532,0.446971,28.200000,356.467,...,6.249866,48.797056,1.951882,22.005013,10.045259,11.959755,1472.0,37.0,1.808,114.0


In [56]:
df_2d_train.to_csv('Descriptors/Train_2d_all_descriptors.csv', index=False)

In [57]:
df_test_padel = pd.read_csv('Descriptors/Test_2d_padel_curated.csv')
df_test_rdkit = pd.read_csv('Descriptors/Test_2d_RDKit_des.csv')
df_test_mordred = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')

df_2d_test = df_test_rdkit.merge(df_test_mordred, on=['ID', 'SMILES', 'Permeability'], how='inner').merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_test

  df_test_mordred = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,15.806942,15.806942,0.022547,-1.946094,0.037676,23.507937,1777.744,...,6.529691,247.936262,1.967748,90.383811,40.574663,47.283631,111779.0,220.0,10.264,626.0
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,15.512854,15.512854,0.072929,-1.849230,0.082004,25.902439,1725.281,...,6.338697,240.768109,1.957464,90.840338,43.433503,47.406835,99910.0,213.0,9.667,610.0
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,16.029525,16.029525,0.042154,-1.949385,0.046040,22.827869,1701.218,...,6.464137,240.251303,1.969273,87.601926,40.568304,47.033622,103927.0,212.0,9.469,606.0
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,15.776936,15.776936,0.046352,-1.865645,0.035370,22.622951,1686.166,...,6.633901,242.969389,1.991552,85.904257,35.712474,50.191783,95572.0,199.0,10.338,610.0
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,15.432735,15.432735,0.040020,-1.845687,0.100132,26.663866,1669.173,...,6.415627,233.157008,1.959303,90.305554,42.950496,47.355058,93670.0,207.0,9.346,588.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,6.236151,61.708009,1.990581,19.682164,7.618006,12.064158,2750.0,44.0,4.111,146.0
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,6.236151,61.708009,1.990581,19.682164,7.618006,12.064158,2750.0,44.0,4.111,146.0
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,12.898424,12.898424,0.143657,-0.742398,0.662387,22.258065,430.549,...,6.619354,61.526924,1.984739,22.541021,10.153913,12.387108,2644.0,45.0,1.941,148.0
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,13.095577,13.095577,0.065481,-0.737917,0.561202,26.806452,430.549,...,6.619354,62.566820,2.018285,22.680885,10.197129,12.483757,2650.0,47.0,1.336,154.0


In [58]:
df_2d_test.to_csv('Descriptors/Test_2d_all_descriptors.csv', index=False)

In [59]:
#2d All descriptors
df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
# X_test = X_test.select_dtypes(include=['number'])
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')


X_train shape:  (5568, 3087)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')


X_test shape:  (1392, 3087)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.115761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514723
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2380
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514774
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2385
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099999 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2042,0.3325,0.4519,0.6721,0.8203,0.7881,0.2058,0.3277,0.4537,0.6748,0.8222,0.8044
DecisionTreeRegressor,0.3083,0.3879,0.5552,0.5051,0.7304,0.7092,0.2366,0.349,0.4864,0.6261,0.793,0.7722
RandomForestRegressor,0.2154,0.3432,0.4641,0.6542,0.809,0.776,0.2199,0.3399,0.469,0.6525,0.8081,0.7891
GradientBoostingRegressor,0.2208,0.3494,0.4699,0.6455,0.8059,0.767,0.2183,0.3406,0.4672,0.6551,0.8129,0.7837
AdaBoostRegressor,0.3743,0.492,0.6118,0.3991,0.6667,0.6325,0.3568,0.4753,0.5973,0.4362,0.6965,0.6861
XGBRegressor,0.2166,0.3429,0.4654,0.6522,0.808,0.7748,0.2151,0.3318,0.4638,0.6601,0.8127,0.7923
ExtraTreesRegressor,0.2088,0.338,0.457,0.6647,0.8153,0.7837,0.219,0.3374,0.468,0.6539,0.8088,0.7883
LinearRegression,1.1424,0.6212,1.0688,-0.8343,0.4362,0.5777,0.5253,0.4695,0.7248,0.1699,0.6108,0.6739
KNeighborsRegressor,0.2722,0.3808,0.5217,0.5629,0.7565,0.7221,0.271,0.3766,0.5206,0.5718,0.7598,0.7473
SVR,0.224,0.3392,0.4733,0.6403,0.8032,0.7743,0.2344,0.3451,0.4841,0.6296,0.7973,0.7769


In [60]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.66923493793545, -6.964947952869519, -6.187...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1131123303759125, -6.160156349224908, -6....","[-7.018860431317639, -6.49195061784413, -6.855...","[0.07600842296393379, 0.18118606790177316, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -6.27, -6.244...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -7.0, -6.6, -4.48, -5.74, -6.85,...","[-6.694, -7.0, -6.436, -6.618, -5.205999999999...","[0.5188294517469106, 0.0, 0.7650516322445171, ..."
2,RandomForestRegressor,"[-6.683282860196667, -6.861700000000001, -5.92...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.75595, -6.256974526863336, -6.64829999999...","[-6.791125000000001, -6.477154905372669, -6.67...","[0.04399054443854946, 0.13452735324136994, 0.0..."
3,GradientBoostingRegressor,"[-6.981264206991825, -6.939825737406757, -5.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.04721848136913, -6.0639393037152995, -6.9...","[-7.068218951225044, -6.263611790019377, -6.74...","[0.08655201939387533, 0.18907939943847402, 0.1..."
4,AdaBoostRegressor,"[-5.885562913907286, -6.204451254844452, -5.67...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.207477876106192, -5.627635834403068, -5.7...","[-6.121952989483036, -5.6523404038322385, -5.7...","[0.15640991449271419, 0.04285046501820804, 0.0..."
5,XGBRegressor,"[-7.2001286, -6.737824, -5.4927177, -4.932216,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.2036347, -6.6719604, -7.09824, -6.8739405...","[-7.0979676, -6.543934, -6.892276, -6.6681466,...","[0.14426151, 0.29652217, 0.1287557, 0.20234475..."
6,ExtraTreesRegressor,"[-6.743400000000003, -6.9152, -6.1861000000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9781, -6.486550000000001, -6.9529, -6.575...","[-6.966880000000001, -6.364003333333335, -6.92...","[0.01657460708433138, 0.20269284819921754, 0.0..."
7,LinearRegression,"[-3.9, -10.0, -6.561596547086083, -6.924810844...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -7.272955313128477, -10.0, -5.67883168...","[-5.119999999999999, -6.699749097312109, -9.12...","[2.44, 2.3505082398115613, 0.7756177413952802,..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -4.85333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -7.0, -5.90333333...","[-6.7780000000000005, -6.698, -6.9120000000000...","[0.18126224096595522, 0.09733333333333362, 0.1..."
9,SVR,"[-6.314827277960015, -6.951131927148965, -5.81...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.035044752014571, -6.667282141034452, -7.0...","[-6.094841932509903, -6.742371975393988, -7.04...","[0.0340231901308605, 0.13085399124966526, 0.05..."


In [61]:
result_df.to_csv('descriptors_results/Results_2D_All_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_All_desc.csv')

In [62]:
#2d All descriptors const rem
df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')


X_train shape:  (5568, 2504)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')


X_test shape:  (1392, 2504)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514723
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2380
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514774
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2385
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2042,0.3325,0.4519,0.6721,0.8203,0.7881,0.2058,0.3277,0.4537,0.6748,0.8222,0.8044
DecisionTreeRegressor,0.3059,0.3856,0.5531,0.5088,0.7329,0.7127,0.2314,0.3444,0.481,0.6344,0.7976,0.7745
RandomForestRegressor,0.2155,0.3434,0.4643,0.6539,0.8088,0.776,0.2196,0.3395,0.4686,0.653,0.8085,0.7895
GradientBoostingRegressor,0.2203,0.3491,0.4693,0.6463,0.8065,0.7672,0.2187,0.3408,0.4676,0.6545,0.8125,0.7834
AdaBoostRegressor,0.3778,0.4938,0.6146,0.3934,0.6626,0.6219,0.3575,0.4771,0.5979,0.4351,0.6957,0.6761
XGBRegressor,0.2166,0.3429,0.4654,0.6522,0.808,0.7748,0.2151,0.3318,0.4638,0.6601,0.8127,0.7923
ExtraTreesRegressor,0.2092,0.3381,0.4573,0.6642,0.815,0.784,0.218,0.337,0.4669,0.6555,0.8097,0.79
LinearRegression,1.1424,0.6212,1.0688,-0.8343,0.4362,0.5776,0.5253,0.4695,0.7248,0.1699,0.6108,0.6739
KNeighborsRegressor,0.2736,0.3818,0.5231,0.5607,0.755,0.719,0.2703,0.3757,0.5199,0.5728,0.7603,0.7484
SVR,0.224,0.3392,0.4733,0.6403,0.8032,0.7743,0.2344,0.3451,0.4841,0.6296,0.7973,0.7768


In [63]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.66923493793545, -6.964947952869519, -6.187...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1131123303759125, -6.160156349224908, -6....","[-7.018860431317639, -6.49195061784413, -6.855...","[0.07600842296393379, 0.18118606790177316, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -6.96, -7.0, -5.92, -6.244999999999999,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -7.0, -7.0, -4.43, -5.8, -6.85, ...","[-6.748, -6.992, -6.5760000000000005, -6.97000...","[0.5039999999999999, 0.016000000000000014, 0.5..."
2,RandomForestRegressor,"[-6.6145000000000005, -6.813439999999999, -5.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.801800000000001, -6.2448749999999995, -6....","[-6.815437, -6.483283139319335, -6.69159480598...","[0.0466001166951329, 0.13993377441419116, 0.09..."
3,GradientBoostingRegressor,"[-6.981264206991823, -6.939825737406756, -5.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.073263222609385, -6.063939303715299, -7.1...","[-7.073427899473096, -6.26005136233825, -6.775...","[0.08591276917451098, 0.19377985690008964, 0.1..."
4,AdaBoostRegressor,"[-5.885562913907283, -6.377272837070318, -5.60...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.304420600858371, -5.684994355584168, -5.8...","[-6.114998915502312, -5.697759913121283, -5.71...","[0.16125623562845817, 0.07888993482493699, 0.0..."
5,XGBRegressor,"[-7.2001286, -6.737824, -5.4927177, -4.932216,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.2036347, -6.6719604, -7.09824, -6.8739405...","[-7.0979676, -6.543934, -6.892276, -6.6681466,...","[0.14426151, 0.29652217, 0.1287557, 0.20234475..."
6,ExtraTreesRegressor,"[-6.780100000000001, -6.956799999999999, -6.22...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.956600000000002, -6.239750000000003, -6.9...","[-6.958940000000001, -6.319060000000001, -6.93...","[0.018105093206056685, 0.1205271811667388, 0.0..."
7,LinearRegression,"[-3.9, -10.0, -6.561595810264407, -6.924811289...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -7.27295671930915, -10.0, -5.678827745...","[-5.119999999999999, -6.699755758944614, -9.12...","[2.44, 2.350504710954084, 0.7756174763588719, ..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -4.85333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -7.0, -5.90333333...","[-6.7780000000000005, -6.698, -6.9120000000000...","[0.18126224096595522, 0.09733333333333362, 0.1..."
9,SVR,"[-6.314907965426872, -6.951309678541952, -5.81...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.035070885358877, -6.667518105491782, -7.0...","[-6.094828523113399, -6.742352408572694, -7.04...","[0.03399947448581303, 0.13083693271290386, 0.0..."


In [64]:
result_df.to_csv('descriptors_results/Results_2D_All_desc_const_rem.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_All_desc_const_rem.csv')

In [65]:
#2d All descriptors LVR
df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')


X_train shape:  (5568, 1696)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')


X_test shape:  (1392, 1696)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338492
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1668
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338547
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1673
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2067,0.3347,0.4547,0.6681,0.8178,0.7866,0.2076,0.3288,0.4556,0.6719,0.8204,0.8032
DecisionTreeRegressor,0.307,0.3866,0.5541,0.5071,0.7309,0.715,0.2354,0.3479,0.4852,0.628,0.7941,0.769
RandomForestRegressor,0.2159,0.3435,0.4646,0.6534,0.8084,0.7757,0.2188,0.3391,0.4678,0.6543,0.8092,0.7899
GradientBoostingRegressor,0.2226,0.3504,0.4718,0.6426,0.8041,0.7666,0.2208,0.344,0.4699,0.6511,0.8106,0.7825
AdaBoostRegressor,0.3737,0.4935,0.6113,0.4,0.6679,0.6207,0.3615,0.4806,0.6013,0.4287,0.6913,0.6732
XGBRegressor,0.2199,0.3433,0.4689,0.6469,0.8049,0.7748,0.2103,0.3272,0.4586,0.6677,0.8172,0.7992
ExtraTreesRegressor,0.209,0.3379,0.4572,0.6644,0.8151,0.7844,0.2166,0.3356,0.4655,0.6577,0.8111,0.7911
LinearRegression,0.5944,0.4568,0.771,0.0456,0.5986,0.6934,0.3824,0.3978,0.6184,0.3958,0.6862,0.7428
KNeighborsRegressor,0.275,0.3821,0.5244,0.5584,0.7538,0.7194,0.2712,0.3761,0.5208,0.5715,0.7593,0.7486
SVR,0.2304,0.3441,0.48,0.63,0.7968,0.7676,0.238,0.3465,0.4879,0.6239,0.7937,0.7737


In [66]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.775006295958385, -7.044334642785323, -5.84...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.038291096383678, -6.271709313862166, -6.7...","[-7.03025682412561, -6.449619435185689, -6.738...","[0.10026203920945181, 0.13198451709655168, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -6.96, -5.49, -5.92, -5.15, -4.57, -4.8...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -5.54, -7.0, -4.48, -6.22, -6.85...","[-6.494, -7.0, -6.424000000000001, -6.66800000...","[0.5922702086041471, 0.0, 0.6744953669225607, ..."
2,RandomForestRegressor,"[-6.648923861868334, -6.774400000000001, -5.83...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.896699999999999, -6.309600000000001, -6.6...","[-6.793405000000002, -6.5167, -6.7254513333333...","[0.08779690825991295, 0.13504658455510818, 0.0..."
3,GradientBoostingRegressor,"[-6.830447115808372, -6.853876744289797, -5.65...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.893899815154443, -5.916397077734523, -6.7...","[-7.135238853433302, -6.2329674118339256, -6.6...","[0.16272254147816945, 0.251219015862691, 0.082..."
4,AdaBoostRegressor,"[-5.941894216551937, -6.446379310344829, -5.67...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.288562091503275, -5.576861201549921, -6.0...","[-6.267533512299738, -5.674082992065467, -5.74...","[0.2723940793625328, 0.07899474097905826, 0.16..."
5,XGBRegressor,"[-6.9002123, -6.6230597, -6.2625566, -5.552017...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.2035394, -6.250859, -6.7339554, -6.79281,...","[-7.0722513, -6.454178, -6.8812876, -6.64526, ...","[0.25448653, 0.28094512, 0.1525774, 0.3016518,..."
6,ExtraTreesRegressor,"[-6.777150000000001, -6.958099999999999, -6.12...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9851, -6.347900000000002, -6.979399999999...","[-6.97114, -6.322830000000001, -6.95584, -6.48...","[0.01923045501281725, 0.11544971892559927, 0.0..."
7,LinearRegression,"[-3.9, -8.501492274088754, -6.424829769625298,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -5.700108670200656, -7.064154748317755...","[-4.367937851098814, -6.785695315010734, -7.18...","[0.935875702197629, 0.6789287272286878, 0.2096..."
8,KNeighborsRegressor,"[-6.63, -7.0, -5.88, -4.8533333333333335, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -7.0, -6.12666666...","[-6.7780000000000005, -6.696000000000001, -7.0...","[0.18126224096595522, 0.10133333333333354, 0.0..."
9,SVR,"[-6.253119177472127, -6.922623309742223, -5.80...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.095578009941959, -6.677675194339535, -7.0...","[-6.136380525718555, -6.749106046909458, -7.05...","[0.024674615565158084, 0.12866490314317372, 0...."


In [67]:
result_df.to_csv('descriptors_results/Results_2D_All_desc_LVR.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_All_desc_LVR.csv')

In [68]:
def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [69]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    # Identify columns with variance below the threshold
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [70]:
df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
X_train = df_train[selected_features] 
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test =df_test.dropna()
X_test =  df_test[X_train.columns]
y_test =  df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df



  df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')


X_train shape:  (5568, 234)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')


X_test shape:  (1392, 234)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48150
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 225
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48170
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 227
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.211,0.3387,0.4593,0.6613,0.8137,0.783,0.2066,0.3265,0.4546,0.6735,0.8218,0.8038
DecisionTreeRegressor,0.3083,0.3873,0.5553,0.505,0.7313,0.7112,0.2354,0.3461,0.4852,0.628,0.7936,0.7693
RandomForestRegressor,0.2195,0.3463,0.4685,0.6476,0.8049,0.7732,0.2235,0.3401,0.4727,0.6469,0.8046,0.7852
GradientBoostingRegressor,0.231,0.3569,0.4807,0.629,0.7958,0.7595,0.2274,0.3461,0.4769,0.6406,0.8044,0.7783
AdaBoostRegressor,0.3941,0.5047,0.6278,0.3672,0.6379,0.5985,0.3734,0.4897,0.6111,0.41,0.6744,0.6613
XGBRegressor,0.2154,0.3412,0.4642,0.6541,0.8092,0.7758,0.2143,0.331,0.4629,0.6614,0.8134,0.7966
ExtraTreesRegressor,0.2097,0.3382,0.4579,0.6633,0.8145,0.7849,0.219,0.3361,0.4679,0.654,0.8088,0.79
LinearRegression,0.2933,0.3954,0.5416,0.5291,0.7326,0.7157,0.3058,0.3954,0.553,0.5168,0.7244,0.7357
KNeighborsRegressor,0.2749,0.3811,0.5243,0.5586,0.7529,0.7222,0.2818,0.379,0.5308,0.5547,0.7488,0.7464
SVR,0.2215,0.3379,0.4706,0.6444,0.8058,0.7798,0.2269,0.3383,0.4763,0.6415,0.8049,0.7905


In [71]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.7549618282272625, -6.987711431388642, -6.2...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.074990309575013, -6.609793699397045, -6.9...","[-6.961551822349934, -6.562746102781337, -6.83...","[0.0668039417381613, 0.15079700432525892, 0.14..."
1,DecisionTreeRegressor,"[-7.0, -4.89, -5.82, -6.05, -5.15, -4.68, -4.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.03, -7.0, -6.89, -7.0, -4.85, -6.85, -6.8...","[-6.3629999999999995, -6.8180000000000005, -6....","[0.5680985829941843, 0.2827295527531568, 0.074..."
2,RandomForestRegressor,"[-6.6549912274512515, -6.503700000000002, -5.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8553999999999995, -6.464442747710001, -6....","[-6.7909266666666666, -6.5053229085163595, -6....","[0.0678815648349709, 0.03922360462864112, 0.11..."
3,GradientBoostingRegressor,"[-6.919888161671546, -7.212716040381712, -6.10...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1037652168499825, -6.070382650817791, -6....","[-7.018475949506808, -6.1673176694771525, -6.7...","[0.10446561875696024, 0.18234180444811154, 0.0..."
4,AdaBoostRegressor,"[-5.843482645489244, -6.192974084635068, -5.41...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.217024367385192, -5.64615674968213, -5.82...","[-5.841713451537541, -5.596165271798127, -5.73...","[0.2166305209436102, 0.05522669285817165, 0.07..."
5,XGBRegressor,"[-7.0389667, -6.3067427, -5.884203, -5.0844836...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0756392, -6.764063, -7.0075846, -5.954829...","[-7.0791206, -6.635727, -6.828967, -6.54276, -...","[0.38051167, 0.1532088, 0.23159656, 0.3936177,..."
6,ExtraTreesRegressor,"[-6.652149999999999, -6.89, -5.993550000000003...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.982200000000001, -6.450450000000002, -6.9...","[-6.962840000000002, -6.488070000000002, -6.90...","[0.020212233919089925, 0.056068160305114294, 0..."
7,LinearRegression,"[-9.276354930741222, -6.363338610701553, -5.80...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-10.0, -6.835488005855444, -6.90118651473656...","[-10.0, -6.4473676792321895, -6.69638288115666...","[0.0, 0.20064442363861706, 0.10963485337969928..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -5.16, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.003333333333333, -7.0, -5.86666666...","[-6.7780000000000005, -5.8693333333333335, -6....","[0.18126224096595522, 0.5429565155496138, 0.32..."
9,SVR,"[-6.006092973511009, -6.764859817635018, -5.68...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.005583822325867, -6.506704065715205, -6.8...","[-6.030455136920914, -6.571820980584316, -6.85...","[0.017773283988844334, 0.18794379192255367, 0...."


In [72]:
result_df.to_csv('descriptors_results/Results_2D_All_desc_LVR_remove_corr_features.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_All_desc_LVRremove_corr_features.csv')

In [73]:
#3d RDKit descriptors
df_train = pd.read_csv('Descriptors/Train_3d_RDKit_desc.csv')
df_train = df_train.fillna(0)
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_3d_RDKit_desc.csv')
df_test = df_test.fillna(0)
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 11)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 11)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 11
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 11
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ov



0.17162055730255132


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5321,0.5498,0.7295,0.1456,0.3837,0.3549,0.5229,0.5455,0.7231,0.1737,0.4172,0.3911
DecisionTreeRegressor,1.034,0.7508,1.0169,-0.6602,0.2001,0.2133,0.6364,0.5984,0.7977,-0.0056,0.3202,0.3102
RandomForestRegressor,0.544,0.5527,0.7376,0.1265,0.375,0.3485,0.5218,0.5428,0.7224,0.1754,0.4208,0.4034
GradientBoostingRegressor,0.532,0.5508,0.7293,0.1459,0.382,0.3477,0.5287,0.5506,0.7271,0.1646,0.4079,0.3775
AdaBoostRegressor,0.6518,0.656,0.8073,-0.0465,0.2918,0.2545,0.625,0.6469,0.7905,0.0124,0.3549,0.3127
XGBRegressor,0.6013,0.5796,0.7754,0.0346,0.3255,0.3134,0.5451,0.5506,0.7383,0.1387,0.3919,0.3743
ExtraTreesRegressor,0.5742,0.5695,0.7578,0.078,0.3384,0.315,0.538,0.5508,0.7335,0.1498,0.398,0.3826
LinearRegression,0.5934,0.5853,0.7704,0.0472,0.2175,0.2773,0.6001,0.5884,0.7747,0.0517,0.2284,0.298
KNeighborsRegressor,0.7067,0.6339,0.8407,-0.1347,0.2197,0.2001,0.6336,0.5989,0.796,-0.0012,0.2808,0.2715
SVR,0.547,0.5406,0.7396,0.1218,0.3791,0.36,0.5387,0.5358,0.734,0.1487,0.4168,0.3956


In [74]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.840990357809873, -6.018073104967825, -5.90...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.209379011163339, -5.383746226350325, -6.7...","[-6.1925158432639105, -5.5312465678818254, -6....","[0.07532828243317771, 0.16755961234189062, 0.0..."
1,DecisionTreeRegressor,"[-5.89, -6.89, -6.85, -4.55, -6.24, -4.55, -4....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.89, -5.49, -6.725, -4.43, -6.24, -6.51, -...","[-6.956, -5.634, -6.779999999999999, -5.745999...","[0.05388877434123007, 0.3188479261340739, 0.05..."
2,RandomForestRegressor,"[-6.1436999999999955, -6.5442, -5.874300000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.698799999999998, -5.6126, -6.737091747280...","[-6.574419999999998, -5.717779999999999, -6.78...","[0.10510642986991861, 0.12731819037356706, 0.0..."
3,GradientBoostingRegressor,"[-6.294512334860218, -6.3977833930946435, -5.3...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.2936647948792395, -5.429220929096311, -6....","[-6.434030478360389, -5.551352926144518, -6.81...","[0.12974709424756656, 0.14128450196098, 0.0871..."
4,AdaBoostRegressor,"[-5.92586153029426, -5.986273563540271, -5.706...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.125387271903219, -5.843205828550536, -5.9...","[-5.999338657008337, -5.730324583085172, -6.18...","[0.11226454205240709, 0.1611680908128182, 0.10..."
5,XGBRegressor,"[-5.5508122, -6.223559, -6.147515, -4.9822364,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.928677, -5.7488008, -6.745749, -4.775224,...","[-6.832751, -6.1194224, -6.779682, -5.1491566,...","[0.32754064, 0.2544124, 0.058654938, 0.2095480..."
6,ExtraTreesRegressor,"[-6.090299999999993, -6.64, -5.863700000000006...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.683599999999999, -5.910299999999996, -6.7...","[-6.7109799999999975, -5.931819999999996, -6.7...","[0.08441361027701591, 0.21792589933277703, 0.0..."
7,LinearRegression,"[-4.018517682485642, -5.234405485551888, -5.13...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-4.92955646363365, -3.9832246640142657, -6.6...","[-4.910671596402999, -3.98385874473409, -6.739...","[0.09101831979523514, 0.06625888974342976, 0.0..."
8,KNeighborsRegressor,"[-6.3066666666666675, -6.48, -5.43666666666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.963333333333334, -6.086666666666666, -6.9...","[-6.978, -5.914, -6.959999999999999, -5.451333...","[0.017962924780409735, 0.25727114965429837, 0...."
9,SVR,"[-6.147305211654844, -6.580774297310523, -5.72...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.138560494450946, -5.794382653594131, -6.8...","[-7.132451613669845, -5.823109086014815, -6.90...","[0.06635737681271917, 0.2997262652824907, 0.00..."


In [75]:
result_df.to_csv('descriptors_results/Results_3D_RDKit_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_3D_RDKit_desc.csv')

In [76]:
#3d Padel descriptors
df_train = pd.read_csv('Descriptors/Train_3d_padel_curated.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_3d_padel_curated.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 431)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 431)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109905
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 431
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010851 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109905
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 431
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2936,0.398,0.5419,0.5285,0.7284,0.7021,0.2847,0.3908,0.5336,0.5501,0.7456,0.7267
DecisionTreeRegressor,0.6009,0.5524,0.7752,0.0352,0.5163,0.5185,0.3564,0.4271,0.597,0.4369,0.6687,0.6577
RandomForestRegressor,0.3056,0.4058,0.5528,0.5093,0.7169,0.6937,0.2986,0.401,0.5465,0.5281,0.7313,0.7204
GradientBoostingRegressor,0.3191,0.4139,0.5649,0.4877,0.7017,0.6754,0.3189,0.4119,0.5647,0.4961,0.7086,0.6865
AdaBoostRegressor,0.4726,0.5532,0.6874,0.2412,0.5603,0.5377,0.4656,0.5511,0.6824,0.2642,0.583,0.5742
XGBRegressor,0.3242,0.4213,0.5694,0.4795,0.6952,0.6677,0.2901,0.3893,0.5386,0.5415,0.7364,0.7195
ExtraTreesRegressor,0.29,0.397,0.5385,0.5344,0.7367,0.7134,0.2834,0.3913,0.5323,0.5522,0.75,0.7359
LinearRegression,0.3918,0.4635,0.6259,0.371,0.6206,0.6287,0.3793,0.4547,0.6158,0.4007,0.6365,0.6547
KNeighborsRegressor,0.4537,0.4893,0.6736,0.2715,0.5472,0.5256,0.4292,0.4769,0.6551,0.3218,0.5789,0.5745
SVR,0.3763,0.4252,0.6134,0.3958,0.6371,0.6516,0.3849,0.4259,0.6204,0.3918,0.6375,0.6589


In [77]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.52443676829918, -6.714604480583924, -6.211...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.86439711736285, -6.042996605970523, -6.99...","[-6.739437824630317, -6.209154140001532, -6.95...","[0.12165931342401683, 0.10467939276475072, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -6.34, -7.0, -5.85, -5.15, -4.77, -6.64...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -6.944, -7.0, -6.24, -5.7, -7.0...","[-6.626, -6.054, -6.954266666666666, -7.0, -6....","[0.47123667089902926, 0.8075543325374461, 0.01..."
2,RandomForestRegressor,"[-6.461599999999999, -6.457763669595, -5.9325,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.616100000000001, -6.319700000000003, -6.9...","[-6.678523211521001, -6.31348, -6.948660466422...","[0.09270267867720863, 0.11674280106284991, 0.0..."
3,GradientBoostingRegressor,"[-6.71438830826164, -6.987742941314053, -5.992...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.874843750703767, -6.185688463385039, -7.0...","[-6.700682967121772, -6.1239074176024655, -7.0...","[0.22355202275293526, 0.13932672528509313, 0.0..."
4,AdaBoostRegressor,"[-5.719654692246028, -5.787213051418693, -5.60...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.667160549812053, -5.6082100469893055, -6....","[-5.667058179206006, -5.557522703846805, -6.02...","[0.025383246861456002, 0.06676266451369214, 0...."
5,XGBRegressor,"[-5.961434, -7.029575, -6.7682357, -5.217129, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.826388, -6.7332306, -6.937705, -6.854204,...","[-6.7621546, -6.6451316, -6.9526887, -6.586420...","[0.2512532, 0.24982134, 0.016342513, 0.3489102..."
6,ExtraTreesRegressor,"[-6.483400000000004, -6.612300000000001, -5.85...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.745699999999999, -6.671800000000003, -6.9...","[-6.774780000000002, -6.666960000000001, -6.95...","[0.06782938596213317, 0.08181984111448769, 0.0..."
7,LinearRegression,"[-6.787959992801994, -6.258804193149205, -5.98...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.766553490422402, -6.12365074629135, -6.94...","[-8.310734357422533, -5.7980794115694065, -6.9...","[0.4018684198617574, 0.29678968265340283, 0.01..."
8,KNeighborsRegressor,"[-6.733333333333333, -5.59, -5.949999999999999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.63, -6.973333333333334, -6.49666666...","[-6.949333333333334, -6.703999999999999, -6.96...","[0.10133333333333318, 0.14800000000000005, 0.0..."
9,SVR,"[-6.864974413534201, -6.112224532659442, -5.79...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.059509231524494, -6.599052038780751, -6.9...","[-7.06384983036281, -6.659886988027376, -6.900...","[0.025712017474068243, 0.10088117285515572, 0...."


In [78]:
result_df.to_csv('descriptors_results/Results_3D_padel_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_3D_padel_desc.csv')

In [79]:
df_train_rdkit = pd.read_csv('Descriptors/Train_3d_RDKit_desc.csv')
df_train_rdkit = df_train_rdkit.fillna(0)
df_train_padel = pd.read_csv('Descriptors/Train_3d_padel_curated.csv')

df_3d_descriptors = df_train_rdkit.merge(df_train_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,57271.392656,84706.704866,131312.284498,0.436146,0.645078,8.778150,0.000011,...,0.651022,0.310326,0.471748,0.474816,0.390502,96.230112,2214.927222,9269.709607,0.476534,1.337065
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,58101.718060,69316.298095,115939.041133,0.501140,0.597868,8.378654,0.000010,...,0.536705,0.383044,0.499076,0.511123,0.387397,78.913503,1739.872712,9926.351313,0.379623,1.397597
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,49788.948788,91676.720195,127366.763818,0.390910,0.719785,8.826650,0.000014,...,0.649923,0.291878,0.453686,0.515599,0.399368,91.549655,2049.323105,10612.165217,0.474884,1.368653
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.811324,0.126642,0.576205,0.431076,0.342110,102.180579,1680.289350,8582.496833,0.716985,1.349391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,1891.284732,5055.635080,6387.678814,0.296083,0.791467,4.069781,0.000418,...,0.700532,0.213598,0.438801,0.369091,0.433886,20.020232,91.436181,214.560045,0.550798,1.241777
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,1171.539745,4912.089011,5395.951856,0.217115,0.910329,3.914991,0.000777,...,0.797000,0.144638,0.463418,0.440235,0.444609,20.228575,69.658199,145.575438,0.695500,1.348262
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,2179.710197,3127.136956,4764.692067,0.457471,0.656315,3.686739,0.000301,...,0.461348,0.346276,0.479049,0.456672,0.464743,15.466525,75.381197,204.552808,0.211436,1.400463
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,1873.085082,2859.925158,4153.290896,0.450988,0.688593,3.530497,0.000368,...,0.627527,0.312461,0.522381,0.504997,0.427949,16.762042,70.940669,143.120522,0.441290,1.455327


In [80]:

nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [81]:
df_3d_descriptors.to_csv('Descriptors/Train_3d_all_descriptors.csv', index=False)

In [82]:
df_test_rdkit = pd.read_csv('Descriptors/Test_3d_RDKit_desc.csv')
df_test_rdkit = df_test_rdkit.fillna(0)
df_test_padel = pd.read_csv('Descriptors/Test_3d_padel_curated.csv')

df_3d_descriptors = df_test_rdkit.merge(df_test_padel, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_3d_descriptors

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,34902.495879,130685.725940,150085.857142,0.232550,0.870740,9.422580,0.000025,...,0.780699,0.174328,0.539789,0.475113,0.431884,96.206319,1657.209281,7203.627712,0.671048,1.446786
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,59935.579142,71658.902667,111362.460017,0.538203,0.643474,8.391116,0.000011,...,0.554143,0.382135,0.519415,0.521290,0.389075,83.091448,1873.928421,9698.038245,0.404417,1.429780
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,43711.954275,73710.637354,99360.782765,0.439932,0.741848,8.017663,0.000017,...,0.596153,0.354072,0.403854,0.454070,0.363028,79.882613,1648.772500,7084.348331,0.425338,1.220953
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,57017.133165,78022.452280,122140.702987,0.466815,0.638792,8.777143,0.000011,...,0.540248,0.406175,0.592882,0.546456,0.401213,86.828192,2036.636544,9819.558017,0.419634,1.540551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,2624.539814,5235.905285,7178.427044,0.365615,0.729395,4.178873,0.000278,...,0.694755,0.244015,0.423204,0.403477,0.434739,22.780437,117.807253,263.303199,0.542133,1.261420
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,3393.803325,4465.343773,6048.910610,0.561060,0.738206,4.018692,0.000218,...,0.642128,0.311598,0.491460,0.489046,0.424396,22.210103,120.470127,244.118194,0.463192,1.404903
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,2350.480932,4746.916538,6169.098467,0.381009,0.769467,3.925110,0.000327,...,0.622128,0.285545,0.480992,0.447816,0.381100,20.440926,109.241371,269.765567,0.433192,1.309908
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,2185.854125,6171.808705,7781.242611,0.280913,0.793165,4.329231,0.000363,...,0.638515,0.303978,0.444298,0.461387,0.468771,20.244697,101.762869,214.619408,0.457773,1.374456


In [83]:
nan_rows = df_3d_descriptors[df_3d_descriptors.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds


In [84]:
df_3d_descriptors.to_csv('Descriptors/Test_3d_all_descriptors.csv', index=False)

In [85]:
#3d All descriptors
df_train = pd.read_csv('Descriptors/Train_3d_all_descriptors.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_3d_all_descriptors.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models_3dall = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_3dall, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 442)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 442)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012361 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 112710
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 442
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 112710
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 442
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2935,0.3977,0.5418,0.5287,0.7287,0.7012,0.2851,0.3912,0.534,0.5495,0.7454,0.7267
DecisionTreeRegressor,0.6008,0.5535,0.7751,0.0354,0.5143,0.515,0.3494,0.4252,0.5911,0.4479,0.6762,0.6654
RandomForestRegressor,0.3046,0.4051,0.5519,0.511,0.7183,0.6954,0.2991,0.4016,0.5469,0.5273,0.7309,0.7194
GradientBoostingRegressor,0.321,0.4155,0.5666,0.4846,0.6994,0.6737,0.3205,0.4124,0.5661,0.4935,0.707,0.6856
AdaBoostRegressor,0.4713,0.5519,0.6865,0.2432,0.5614,0.5432,0.4622,0.5482,0.6799,0.2696,0.5882,0.5805
XGBRegressor,0.3258,0.4191,0.5708,0.4768,0.6933,0.6678,0.2859,0.3849,0.5347,0.5482,0.7415,0.7269
ExtraTreesRegressor,0.2921,0.3988,0.5405,0.5309,0.7347,0.7133,0.2823,0.3891,0.5313,0.5539,0.7521,0.7382
LinearRegression,0.3922,0.4648,0.6262,0.3703,0.6203,0.6268,0.3781,0.4538,0.6149,0.4025,0.6385,0.6563
KNeighborsRegressor,0.4604,0.4931,0.6786,0.2607,0.5406,0.5195,0.4256,0.4765,0.6524,0.3275,0.5825,0.5818
SVR,0.3765,0.4253,0.6136,0.3955,0.6367,0.6522,0.3854,0.4264,0.6208,0.3909,0.637,0.659


In [86]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.45855752718111, -6.825411863089695, -6.282...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.85104837944038, -5.950433080818467, -6.94...","[-6.753886113771424, -6.1638521492420395, -6.9...","[0.07463090504875482, 0.12584015624407557, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -6.11, -6.85, -4.24, -5.15, -4.92, -6.0...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.96, -6.24, -6.933333333333334, -7.0, -6.2...","[-6.806, -6.134, -6.953333333333333, -6.924000...","[0.3484594667963548, 0.7781927781726069, 0.024..."
2,RandomForestRegressor,"[-6.439099999999999, -6.42736440205, -6.016500...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.614600000000001, -6.221700000000002, -6.9...","[-6.658063211520999, -6.221480000000001, -6.93...","[0.10992087901242903, 0.12468400699367994, 0.0..."
3,GradientBoostingRegressor,"[-6.550853120433564, -7.050649626208297, -5.87...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0343111936747365, -6.16667537761464, -6.9...","[-6.686592830833765, -6.056583121379935, -6.98...","[0.24350001885856007, 0.12372159873631605, 0.0..."
4,AdaBoostRegressor,"[-5.763655352659802, -5.64659268495428, -5.608...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.645942944930148, -5.561489361702128, -6.1...","[-5.656426983272247, -5.566402319564771, -6.15...","[0.04209670415884818, 0.07108826854431505, 0.1..."
5,XGBRegressor,"[-6.1645055, -6.809942, -7.038272, -5.020259, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.122744, -6.4203334, -6.925336, -6.6688237...","[-6.88045, -6.5412607, -6.951003, -6.4983544, ...","[0.20866871, 0.16749975, 0.024016194, 0.319919..."
6,ExtraTreesRegressor,"[-6.549400000000005, -6.625699999999999, -5.94...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7298, -6.517800000000002, -6.933333333333...","[-6.77304, -6.517680000000001, -6.953333333333...","[0.03311450437497195, 0.17830430617346338, 0.0..."
7,LinearRegression,"[-6.77576546284677, -6.3800608221876445, -5.89...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.720933395771683, -5.882737907635661, -6.9...","[-8.216851714639512, -5.629258138548775, -6.93...","[0.366742851329353, 0.28704946618312804, 0.013..."
8,KNeighborsRegressor,"[-6.733333333333333, -7.0, -5.949999999999999,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.63, -6.933333333333334, -6.49666666...","[-6.866, -6.653333333333333, -6.94666666666666...","[0.11423951447142378, 0.04666666666666685, 0.0..."
9,SVR,"[-6.944922655750519, -6.123102655423644, -5.79...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.162723119046205, -6.552123207169262, -6.9...","[-7.1118802414628615, -6.605330552675314, -6.9...","[0.04178406899731635, 0.11312217531578814, 0.0..."


In [87]:
result_df.to_csv('descriptors_results/Results_3D_All_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_3D_All_desc.csv')

In [88]:
#3d All descriptors const rem
df_train = pd.read_csv('Descriptors/Train_3d_all_descriptors.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_3d_all_descriptors.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")


X_train shape:  (5568, 442)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 442)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [89]:
#3d All descriptors LVR
df_train = pd.read_csv('Descriptors/Train_3d_all_descriptors.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_3d_all_descriptors.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 391)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 391)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99705
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 391
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99705
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 391
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3016,0.4015,0.5491,0.5158,0.7198,0.6967,0.2926,0.3937,0.541,0.5376,0.7373,0.7215
DecisionTreeRegressor,0.6281,0.5631,0.7925,-0.0084,0.494,0.5004,0.3585,0.4345,0.5987,0.4335,0.6651,0.6436
RandomForestRegressor,0.3091,0.408,0.556,0.5037,0.7127,0.6898,0.3035,0.4039,0.5509,0.5204,0.7256,0.7152
GradientBoostingRegressor,0.3318,0.4215,0.576,0.4673,0.6868,0.6668,0.3297,0.4202,0.5742,0.4791,0.6975,0.6856
AdaBoostRegressor,0.4804,0.5582,0.6931,0.2287,0.5444,0.5239,0.4739,0.5563,0.6884,0.2512,0.5618,0.5586
XGBRegressor,0.3327,0.423,0.5768,0.4659,0.6855,0.6579,0.2945,0.3922,0.5427,0.5347,0.7318,0.7205
ExtraTreesRegressor,0.2996,0.4022,0.5473,0.519,0.7258,0.7064,0.2891,0.394,0.5377,0.5432,0.7449,0.7349
LinearRegression,0.3978,0.4667,0.6307,0.3613,0.611,0.6254,0.3958,0.4587,0.6291,0.3746,0.6185,0.6425
KNeighborsRegressor,0.4316,0.4731,0.657,0.3069,0.5768,0.56,0.4167,0.4682,0.6455,0.3415,0.5959,0.5938
SVR,0.3775,0.4256,0.6144,0.3939,0.635,0.6535,0.3855,0.4263,0.6209,0.3909,0.6366,0.6609


In [90]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.451545469355496, -6.825065471867783, -6.40...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.833217313554828, -5.977048683990337, -6.7...","[-6.71822649466763, -6.032911791647255, -6.672...","[0.07012062747907138, 0.059348005388028424, 0...."
1,DecisionTreeRegressor,"[-7.0, -6.34, -6.85, -6.64, -5.15, -4.72, -6.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -6.933333333333334, -7.0, -7.0,...","[-6.7780000000000005, -6.593999999999999, -6.9...","[0.4440000000000001, 0.3024962809688739, 0.024..."
2,RandomForestRegressor,"[-6.360399999999999, -6.508763669595001, -6.09...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.6788, -6.1431000000000004, -6.91100158730...","[-6.693180000000001, -6.173200000000001, -6.90...","[0.0669451835459433, 0.09708744512036556, 0.01..."
3,GradientBoostingRegressor,"[-6.263644395715716, -6.9512586571077355, -5.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.706187566307296, -6.1778349934010075, -6....","[-6.707304914321741, -6.009825271279352, -6.72...","[0.13955172638441618, 0.2136657932247692, 0.05..."
4,AdaBoostRegressor,"[-5.617673968453696, -5.608210046989292, -5.45...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.549302225854554, -5.5387093916912065, -5....","[-5.4926146883976905, -5.475617174640331, -5.9...","[0.03551251318563969, 0.039314397074336684, 0...."
5,XGBRegressor,"[-5.700738, -6.8018227, -6.3681507, -4.8876786...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.551938, -6.958453, -6.9369736, -6.1428475...","[-6.7773232, -6.566503, -6.9547462, -6.5946035...","[0.21164767, 0.41702402, 0.03044877, 0.3180512..."
6,ExtraTreesRegressor,"[-6.560800000000004, -6.582299999999999, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7672, -6.611000000000001, -6.933333333333...","[-6.766840000000002, -6.5604, -6.9533333333333...","[0.07134655142331583, 0.1346935039265073, 0.02..."
7,LinearRegression,"[-6.440543907797292, -6.581360472310361, -5.75...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-8.086652579425934, -5.561031128561217, -6.9...","[-8.62747530543513, -5.3982292308831825, -6.91...","[0.3832358740283303, 0.21632076825700564, 0.02..."
8,KNeighborsRegressor,"[-6.733333333333333, -6.746666666666667, -5.94...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.63, -6.933333333333334, -5.89666666...","[-6.967333333333333, -6.703999999999999, -6.94...","[0.0653333333333336, 0.14800000000000005, 0.02..."
9,SVR,"[-6.989557389498517, -6.151856393034773, -5.76...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.234519580229234, -6.5669164369164434, -6....","[-7.184419944123905, -6.594733265575715, -6.89...","[0.0469751052849769, 0.10459179964343146, 0.00..."


In [91]:
result_df.to_csv('descriptors_results/Results_3D_All_desc_LVR.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_3D_All_desc_LVR.csv')

In [92]:
#2d and 3d descriptors all
df_train_2d = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train_2d
df_train_3d = pd.read_csv('Descriptors/Train_3d_all_descriptors.csv')
df_train_3d

df_2d_3d_train = df_train_2d.merge(df_train_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_train.to_csv('Descriptors/Train_2d_3d_all_descriptors.csv', index=False)
df_2d_3d_train

  df_train_2d = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')


Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,15.738544,15.738544,0.010382,-1.908222,0.047997,24.622047,1773.325,...,0.651022,0.310326,0.471748,0.474816,0.390502,96.230112,2214.927222,9269.709607,0.476534,1.337065
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,15.975705,15.975705,0.027671,-1.943827,0.026511,21.983740,1745.057,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,15.828473,15.828473,0.049834,-1.862294,0.021075,22.464000,1733.267,...,0.536705,0.383044,0.499076,0.511123,0.387397,78.913503,1739.872712,9926.351313,0.379623,1.397597
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,15.595105,15.595105,0.004954,-1.811756,0.069603,24.479675,1725.281,...,0.649923,0.291878,0.453686,0.515599,0.399368,91.549655,2049.323105,10612.165217,0.474884,1.368653
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,15.867592,15.867592,0.029192,-1.876686,0.046796,23.089431,1723.309,...,0.811324,0.126642,0.576205,0.431076,0.342110,102.180579,1680.289350,8582.496833,0.716985,1.349391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,12.835172,12.835172,0.168936,-0.728726,0.606745,24.965517,402.539,...,0.700532,0.213598,0.438801,0.369091,0.433886,20.020232,91.436181,214.560045,0.550798,1.241777
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,12.673271,12.673271,0.196704,-0.722720,0.611202,25.037037,374.485,...,0.797000,0.144638,0.463418,0.440235,0.444609,20.228575,69.658199,145.575438,0.695500,1.348262
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,12.572226,12.572226,0.181773,-1.016186,0.465702,27.576923,370.494,...,0.461348,0.346276,0.479049,0.456672,0.464743,15.466525,75.381197,204.552808,0.211436,1.400463
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,12.364448,12.364448,0.112529,-1.084532,0.446971,28.200000,356.467,...,0.627527,0.312461,0.522381,0.504997,0.427949,16.762042,70.940669,143.120522,0.441290,1.455327


In [93]:
df_test_2d = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test_2d
df_test_3d = pd.read_csv('Descriptors/Test_3d_all_descriptors.csv')
df_test_3d

df_2d_3d_test = df_test_2d.merge(df_test_3d, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_2d_3d_test.to_csv('Descriptors/Test_2d_3d_all_descriptors.csv', index=False)
df_2d_3d_test

  df_test_2d = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')


Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,15.806942,15.806942,0.022547,-1.946094,0.037676,23.507937,1777.744,...,0.780699,0.174328,0.539789,0.475113,0.431884,96.206319,1657.209281,7203.627712,0.671048,1.446786
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,15.512854,15.512854,0.072929,-1.849230,0.082004,25.902439,1725.281,...,0.554143,0.382135,0.519415,0.521290,0.389075,83.091448,1873.928421,9698.038245,0.404417,1.429780
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,16.029525,16.029525,0.042154,-1.949385,0.046040,22.827869,1701.218,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,15.776936,15.776936,0.046352,-1.865645,0.035370,22.622951,1686.166,...,0.596153,0.354072,0.403854,0.454070,0.363028,79.882613,1648.772500,7084.348331,0.425338,1.220953
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,15.432735,15.432735,0.040020,-1.845687,0.100132,26.663866,1669.173,...,0.540248,0.406175,0.592882,0.546456,0.401213,86.828192,2036.636544,9819.558017,0.419634,1.540551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,0.694755,0.244015,0.423204,0.403477,0.434739,22.780437,117.807253,263.303199,0.542133,1.261420
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,0.642128,0.311598,0.491460,0.489046,0.424396,22.210103,120.470127,244.118194,0.463192,1.404903
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,12.898424,12.898424,0.143657,-0.742398,0.662387,22.258065,430.549,...,0.622128,0.285545,0.480992,0.447816,0.381100,20.440926,109.241371,269.765567,0.433192,1.309908
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,13.095577,13.095577,0.065481,-0.737917,0.561202,26.806452,430.549,...,0.638515,0.303978,0.444298,0.461387,0.468771,20.244697,101.762869,214.619408,0.457773,1.374456


In [94]:
#All 2d and 3d descriptors
df_train = pd.read_csv('Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_3d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_3d_all_descriptors.csv')


X_train shape:  (5568, 3529)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_3d_all_descriptors.csv')


X_test shape:  (1392, 3529)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.163986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 627433
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2822
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.143171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 627484
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2827
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2131,0.3419,0.4616,0.6579,0.8123,0.7798,0.2083,0.3305,0.4564,0.6709,0.8209,0.7995
DecisionTreeRegressor,0.4571,0.4815,0.6761,0.266,0.6371,0.6157,0.27,0.3741,0.5197,0.5733,0.7618,0.7366
RandomForestRegressor,0.2188,0.3463,0.4678,0.6486,0.8066,0.7717,0.2189,0.3398,0.4679,0.6541,0.8101,0.7924
GradientBoostingRegressor,0.2236,0.3519,0.4728,0.6411,0.8032,0.7653,0.2189,0.3414,0.4679,0.6541,0.8125,0.782
AdaBoostRegressor,0.3726,0.4944,0.6104,0.4017,0.672,0.6126,0.3538,0.477,0.5948,0.441,0.7073,0.6668
XGBRegressor,0.2372,0.3599,0.487,0.6192,0.7877,0.7514,0.2133,0.3328,0.4618,0.663,0.8146,0.7959
ExtraTreesRegressor,0.2133,0.3419,0.4619,0.6574,0.8109,0.7766,0.2174,0.337,0.4662,0.6565,0.8104,0.7945
LinearRegression,1.3033,0.6919,1.1416,-1.0926,0.3979,0.5148,0.6131,0.5063,0.783,0.0311,0.5693,0.6313
KNeighborsRegressor,0.2709,0.3803,0.5205,0.565,0.7576,0.7213,0.2731,0.3728,0.5226,0.5684,0.7589,0.753
SVR,0.2286,0.3423,0.4781,0.6329,0.7985,0.7689,0.2376,0.3481,0.4875,0.6245,0.7942,0.7743


In [95]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.610122497910056, -6.7118472430508325, -6.0...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.14049674901004, -6.094330502073826, -7.10...","[-6.9703559176681535, -6.470520780499749, -6.9...","[0.1015016457934427, 0.19312349788482314, 0.11..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -5.66, -5.92, -4.39, -4.77, -4.77...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -5.28, -7.0, -4.43, -5.89, -6.8...","[-6.558, -6.848000000000001, -6.376, -6.698, -...","[0.5581541005851341, 0.30399999999999994, 0.74..."
2,RandomForestRegressor,"[-6.561226888989999, -6.7975, -5.9785999999999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.822, -6.179899999999999, -6.7286999999999...","[-6.77008, -6.4154, -6.74289, -6.5973109037820...","[0.05654624302285809, 0.14985108941879668, 0.0..."
3,GradientBoostingRegressor,"[-6.783136812243261, -6.874669344282058, -5.58...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9862925310853745, -6.082039191116321, -7....","[-7.120897150622197, -6.335687075010827, -6.85...","[0.15557498130277855, 0.17614260400399936, 0.1..."
4,AdaBoostRegressor,"[-6.282484157160957, -6.253652856832848, -5.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.309117091161143, -5.665665745042047, -5.6...","[-6.085562325157911, -5.621651741326975, -5.80...","[0.15422593054434483, 0.040000278280094226, 0...."
5,XGBRegressor,"[-7.1511126, -7.412746, -5.908067, -5.183018, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.5201283, -6.4032817, -6.939549, -6.940841...","[-7.095515, -6.5521035, -6.8503914, -6.6145144...","[0.25033554, 0.32991844, 0.12700522, 0.2324599..."
6,ExtraTreesRegressor,"[-6.713600000000002, -6.946299999999999, -6.23...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.945899999999999, -6.350599999999999, -6.9...","[-6.95694, -6.35688, -6.9199399999999995, -6.5...","[0.01752650564145648, 0.07340428870304594, 0.0..."
7,LinearRegression,"[-3.9, -10.0, -6.724692568168393, -7.063368301...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -8.042590699882567, -10.0, -10.0, -4.4...","[-5.119999999999999, -6.9322721312943525, -6.3...","[2.44, 1.8195771404003285, 2.9883774861954775,..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -5.18666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -6.90666666666666...","[-6.703999999999999, -6.533333333333333, -6.92...","[0.14800000000000005, 0.31922823601095635, 0.0..."
9,SVR,"[-6.366628770783175, -6.857201278285261, -5.76...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.068261024569896, -6.67758376235078, -6.78...","[-6.122143162374437, -6.752492948332635, -6.74...","[0.03162662512466103, 0.13453657570073044, 0.0..."


In [96]:
result_df.to_csv('descriptors_results/Results_2D_3D_All_desc.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_3D_All_desc.csv')

In [97]:
#All 2d and 3d descriptors const rem
df_train = pd.read_csv('Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_3d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_3d_all_descriptors.csv')


X_train shape:  (5568, 2946)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_3d_all_descriptors.csv')


X_test shape:  (1392, 2946)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.165646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 627433
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2822
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.154059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 627484
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2827
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.165569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2131,0.3419,0.4616,0.6579,0.8123,0.7798,0.2083,0.3305,0.4564,0.6709,0.8209,0.7995
DecisionTreeRegressor,0.4538,0.4819,0.6736,0.2714,0.64,0.6124,0.2672,0.3702,0.5169,0.5778,0.7641,0.7358
RandomForestRegressor,0.2184,0.3465,0.4673,0.6494,0.8071,0.7718,0.2192,0.3398,0.4682,0.6536,0.8098,0.7923
GradientBoostingRegressor,0.2234,0.3517,0.4727,0.6412,0.8033,0.7657,0.2185,0.3412,0.4675,0.6547,0.813,0.7824
AdaBoostRegressor,0.378,0.4954,0.6148,0.3931,0.6622,0.6089,0.3585,0.4802,0.5988,0.4334,0.7001,0.6608
XGBRegressor,0.2372,0.3599,0.487,0.6192,0.7877,0.7514,0.2133,0.3328,0.4618,0.663,0.8146,0.7959
ExtraTreesRegressor,0.2142,0.3421,0.4628,0.6561,0.8101,0.7758,0.2173,0.3375,0.4662,0.6566,0.8104,0.793
LinearRegression,1.2982,0.6911,1.1394,-1.0844,0.3995,0.5157,0.6131,0.5063,0.783,0.0311,0.5693,0.6313
KNeighborsRegressor,0.2709,0.3803,0.5205,0.565,0.7576,0.7213,0.2731,0.3728,0.5226,0.5684,0.7589,0.753
SVR,0.2286,0.3423,0.4782,0.6329,0.7985,0.7689,0.2376,0.3481,0.4875,0.6245,0.7942,0.7743


In [98]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.610122497910056, -6.7118472430508325, -6.0...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.14049674901004, -6.094330502073826, -7.10...","[-6.9703559176681535, -6.470520780499749, -6.9...","[0.1015016457934427, 0.19312349788482314, 0.11..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -5.54, -6.27, -5.54, -4.54, -4.25...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -5.28, -7.0, -4.43, -6.24, -6.85...","[-6.692, -7.0, -6.348000000000001, -6.66200000...","[0.5768327313875314, 0.0, 0.7708281261085378, ..."
2,RandomForestRegressor,"[-6.533699999999999, -6.8033, -5.9515000000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7604999999999995, -6.241407960160001, -6....","[-6.755920000000001, -6.419191592032, -6.73167...","[0.024598569064073504, 0.13015447097737215, 0...."
3,GradientBoostingRegressor,"[-6.783136812243263, -6.8273490614601755, -5.5...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9862925310853745, -6.08203919111632, -7.0...","[-7.1208971506221985, -6.335687075010828, -6.8...","[0.1555749813027786, 0.17614260400399936, 0.13..."
4,AdaBoostRegressor,"[-5.885562913907284, -6.035257669059976, -5.58...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.885562913907284, -5.570662020905933, -5.7...","[-6.002449642522718, -5.5792795629244605, -5.7...","[0.12220815307772036, 0.02282846522456092, 0.1..."
5,XGBRegressor,"[-7.1511126, -7.412746, -5.908067, -5.183018, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.5201283, -6.4032817, -6.939549, -6.940841...","[-7.095515, -6.5521035, -6.8503914, -6.6145144...","[0.25033554, 0.32991844, 0.12700522, 0.2324599..."
6,ExtraTreesRegressor,"[-6.6552000000000024, -6.9008, -6.144100000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.964800000000001, -6.4173, -6.9376, -6.364...","[-6.956500000000001, -6.356239999999999, -6.92...","[0.0125969837659654, 0.18172592110098001, 0.03..."
7,LinearRegression,"[-3.9, -10.0, -6.724692221745499, -7.063368336...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -8.042612203429599, -10.0, -10.0, -4.4...","[-5.119999999999999, -6.932278428267504, -6.33...","[2.44, 1.8195795223100828, 2.9883774861954775,..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -5.18666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -6.90666666666666...","[-6.703999999999999, -6.533333333333333, -6.92...","[0.14800000000000005, 0.31922823601095635, 0.0..."
9,SVR,"[-6.366534348472898, -6.857206656274956, -5.76...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.06830167990944, -6.67726492792067, -6.785...","[-6.122145827158326, -6.752342848220148, -6.74...","[0.03159489990353279, 0.13452450681694603, 0.0..."


In [99]:
result_df.to_csv('descriptors_results/Results_2D_3D_All_desc_const_rem.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_3D_All_desc_const_rem.csv')

In [100]:
#All 2d and 3d descriptors LVR
df_train = pd.read_csv('Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train,  const_col =  remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_3d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_3d_all_descriptors.csv')


X_train shape:  (5568, 2087)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_3d_all_descriptors.csv')


X_test shape:  (1392, 2087)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 438197
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2059
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140957 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 438252
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2064
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.291732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2166,0.3434,0.4654,0.6522,0.8086,0.7742,0.2082,0.3306,0.4563,0.671,0.821,0.8011
DecisionTreeRegressor,0.4448,0.4792,0.6669,0.2859,0.6417,0.6147,0.262,0.3697,0.5119,0.5859,0.7698,0.7434
RandomForestRegressor,0.2188,0.3475,0.4677,0.6487,0.8066,0.7727,0.2166,0.3379,0.4654,0.6577,0.8124,0.7959
GradientBoostingRegressor,0.2244,0.353,0.4737,0.6396,0.8022,0.7646,0.2227,0.3453,0.4719,0.6481,0.8087,0.779
AdaBoostRegressor,0.3749,0.4956,0.6123,0.398,0.6671,0.6113,0.3566,0.4794,0.5972,0.4365,0.7021,0.6654
XGBRegressor,0.2376,0.3621,0.4875,0.6184,0.7872,0.7528,0.2163,0.3329,0.4651,0.6582,0.8115,0.7924
ExtraTreesRegressor,0.2116,0.3413,0.46,0.6603,0.8127,0.7793,0.2136,0.3352,0.4622,0.6625,0.8141,0.7974
LinearRegression,0.6802,0.4999,0.8248,-0.0922,0.5673,0.6552,0.4036,0.4249,0.6353,0.3622,0.6701,0.7123
KNeighborsRegressor,0.276,0.3834,0.5253,0.5569,0.7527,0.7172,0.2816,0.3778,0.5307,0.555,0.7503,0.7455
SVR,0.2363,0.3484,0.4861,0.6206,0.791,0.7617,0.2411,0.3496,0.491,0.619,0.7912,0.7713


In [101]:
result_df.to_csv('descriptors_results/Results_2D_3D_All_desc_LVR.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_3D_All_desc_LVR.csv')

In [102]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.629480537704184, -6.961267320200346, -5.76...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9674119364205795, -6.190069411098911, -7....","[-6.93010182094523, -6.429264398078691, -6.902...","[0.09170616688910763, 0.12865579920597386, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -5.54, -6.27, -5.15, -4.77, -4.77...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -5.68, -7.0, -4.48, -5.89, -6.85...","[-6.962000000000001, -7.0, -6.69, -6.475999999...","[0.05810335618533594, 0.0, 0.5080944794031914,..."
2,RandomForestRegressor,"[-6.6983, -6.747723237890001, -5.8301999999999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8627, -6.361048136360001, -6.641300000000...","[-6.80548, -6.517941847106, -6.756021493722001...","[0.037886007971281346, 0.13671302743145247, 0...."
3,GradientBoostingRegressor,"[-6.864185662388544, -6.689106046701863, -5.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.083488080190811, -5.893560909285513, -6.9...","[-7.052104194091752, -6.2183964226639725, -6.7...","[0.08861714546468681, 0.21801220270569444, 0.1..."
4,AdaBoostRegressor,"[-6.1657142857142855, -6.129000865051916, -5.5...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.0925402535986075, -5.50679830913561, -5.6...","[-6.12968386202099, -5.6493277412906755, -5.82...","[0.09701040675296335, 0.09504155979950392, 0.2..."
5,XGBRegressor,"[-6.759692, -6.8572116, -6.0722513, -5.1813464...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0449986, -5.7103806, -6.522188, -6.741412...","[-6.9990067, -6.534608, -6.834769, -6.5628395,...","[0.21869677, 0.53029037, 0.32683057, 0.3930120..."
6,ExtraTreesRegressor,"[-6.78118246139, -6.951499999999999, -6.081900...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.974399999999999, -6.320500000000001, -6.9...","[-6.95752, -6.308600000000001, -6.930840000000...","[0.015383289635184639, 0.16620123946589652, 0...."
7,LinearRegression,"[-3.9, -10.0, -6.627017502112523, -5.459072605...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -5.0845943155719056, -6.13521364123313...","[-3.9, -6.522392287596178, -6.96973367946692, ...","[0.0, 0.7211040033559892, 2.005531085191765, 1..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -5.12333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -6.90666666666666...","[-6.703999999999999, -6.6, -6.925333333333333,...","[0.14800000000000005, 0.1958343971602312, 0.02..."
9,SVR,"[-6.332478186723671, -6.862516413193279, -5.73...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.137916292912655, -6.690950124307616, -6.7...","[-6.168355683625504, -6.75364252214009, -6.717...","[0.022796506250359692, 0.11905193583480221, 0...."


In [4]:
from sklearn.model_selection import GridSearchCV
import os
import joblib
def train_and_test_predict_with_tuning(models, param_grids, X_train, y_train, X_test, y_test, save_dir):
   
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        best_params = None

        # hyperparameter tuning 
        if model_name in param_grids and param_grids[model_name]:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            grid_search = GridSearchCV(
                estimator=model, 
                param_grid=param_grids[model_name], 
                cv=kf,
                scoring='neg_mean_squared_error', 
                n_jobs=-1)
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(model_name)
            print(": best params",best_params)
        else:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            best_params = {}
            print(model_name, ':Used Default params')

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)  
            test_predictions_folds.append(predictions_test_fold)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
            'Best Parameters': best_params
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }
        # Save the model
        model_path = os.path.join(save_dir, f"{model_name}.joblib")
        joblib.dump(model, model_path)
        print(f"Saved {model_name} model to {model_path}")

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df


In [None]:
param_grids = {
        'ExtraTreesRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None,1,5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'LGBMRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'num_leaves': [31, 50, 100]
        },
        'DecisionTreeRegressor': {
            'max_depth': [None, 10, 20, 50, 100],
            'min_samples_split': [2, 5, 10]
        },
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None, 1, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'GradientBoostingRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7, 10]
        },
        'AdaBoostRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.1, 1.0]
        },
        'SVR': {
            'C': [0.001, 0.1, 1, 10],
            'epsilon': [0.1, 0.2, 0.5],
            'gamma': [0.001, 0.1, 1, 10]
        },
        'KNeighborsRegressor': {
            'n_neighbors': [3, 5, 10],
            'weights': ['uniform', 'distance']
        },
        'MLPRegressor': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'learning_rate': ['constant', 'adaptive'],
            'max_iter': [100,200, 400, 500]
}
    }


In [7]:
#2d Mordred descriptors const removal
df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
save_dir = 'descriptors_results/Models_2d_Mordred_const_rem_with_HPT/'
os.makedirs(save_dir, exist_ok=True)
result_df, prediction_df = train_and_test_predict_with_tuning(models,param_grids, X_train,y_train, X_test,  y_test, save_dir)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


X_train shape:  (5568, 1227)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


X_test shape:  (1392, 1227)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
LGBMRegressor : Default params {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': 'regression', 'random_state': 101, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'metric': 'rmse'}




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057735 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265910
[LightGBM] [Info] Number of data points in the train set: 5568, number of used features: 1196
[LightGBM] [Info] Start training from score -5.742906
LGBMRegressor
: best params {'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 31}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265212
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1187
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265242
[LightGBM] [Info] N

KeyboardInterrupt: 

In [None]:
result_df.to_csv('descriptors_results/Results_2d_Mordred_const_rem_with_HPT.csv')
prediction_df.to_csv('descriptors_results/Prediction_df_2d_Mordred_const_rem_with_HPT.csv')

In [None]:
#2d All descriptors const rem
df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
save_dir = 'descriptors_results/Models_2D_All_desc_const_rem_with_HPT/'
os.makedirs(save_dir, exist_ok=True)
result_df, prediction_df = train_and_test_predict_with_tuning(models,param_grids, X_train,y_train, X_test,  y_test, save_dir)
result_df

In [None]:
result_df.to_csv('descriptors_results/Results_2D_All_desc_const_rem_with_HPT.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2D_All_desc_const_rem_with_HPT.csv')

In [None]:
#2d RDKit descriptors const removal
df_train = pd.read_csv('Descriptors/Train_2d_RDKit_des.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_RDKit_des.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
save_dir = 'descriptors_results/Models_2d_rdkit_const_rem_with_HPT/'
os.makedirs(save_dir, exist_ok=True)
result_df, prediction_df = train_and_test_predict_with_tuning(models,param_grids, X_train,y_train, X_test,  y_test, save_dir)
result_df

In [None]:
result_df.to_csv('descriptors_results/Results_2d_rdkit_const_rem_with_HPT.csv')
prediction_df.to_csv('descriptors_results/Prediction_data_2d_rdkit_const_rem_with_HPT.csv')

In [122]:
#2d Mordred descriptors const removal
df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models_2dM = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_2dM, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


X_train shape:  (5568, 1227)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


X_test shape:  (1392, 1227)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.170237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265212
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1187
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.470950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 265242
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1190
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.203769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2063,0.3347,0.4541,0.6688,0.8184,0.7858,0.2054,0.3276,0.4532,0.6754,0.8229,0.8032
DecisionTreeRegressor,0.3081,0.3866,0.5551,0.5053,0.7334,0.7176,0.2332,0.3453,0.4829,0.6315,0.7955,0.7797
RandomForestRegressor,0.2154,0.3436,0.4641,0.6542,0.8092,0.7796,0.2138,0.3351,0.4624,0.6621,0.8144,0.7952
GradientBoostingRegressor,0.2272,0.3549,0.4767,0.6352,0.8002,0.7646,0.2183,0.3425,0.4673,0.655,0.814,0.7856
AdaBoostRegressor,0.3832,0.4996,0.619,0.3847,0.6608,0.617,0.3681,0.4849,0.6067,0.4183,0.6886,0.6702
XGBRegressor,0.2174,0.3414,0.4662,0.651,0.8073,0.7742,0.2127,0.3301,0.4612,0.6639,0.815,0.7943
ExtraTreesRegressor,0.2145,0.3415,0.4632,0.6555,0.8097,0.7819,0.223,0.3396,0.4722,0.6476,0.8049,0.7867
LinearRegression,0.6512,0.4717,0.807,-0.0456,0.5858,0.69,0.3377,0.3822,0.5812,0.4663,0.7177,0.7463
KNeighborsRegressor,0.2717,0.3807,0.5213,0.5637,0.7575,0.7263,0.2642,0.3711,0.514,0.5824,0.7671,0.7575
SVR,0.2267,0.3421,0.4762,0.6359,0.8007,0.7714,0.2376,0.3499,0.4875,0.6245,0.7947,0.7756


In [123]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.803659819579023, -6.952297461961773, -6.31...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.077881107007449, -6.0194634527933095, -6....","[-7.095000868721698, -6.249811763734205, -6.66...","[0.025405667572675622, 0.18539061636084384, 0...."
1,DecisionTreeRegressor,"[-6.82, -7.0, -6.244999999999999, -5.05, -4.26...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -6.96, -7.0, -6.24, -7.0, -6.85,...","[-6.992, -6.43, -6.720000000000001, -6.694, -6...","[0.016000000000000014, 0.6298571266565139, 0.5..."
2,RandomForestRegressor,"[-6.535537681159422, -6.766108333333335, -6.00...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.747766666666669, -6.1027166666666695, -6....","[-6.874323333333335, -6.307488457199335, -6.75...","[0.06899517986384564, 0.1466726414596661, 0.08..."
3,GradientBoostingRegressor,"[-7.172151031819288, -7.023936244369912, -5.93...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0754472513178035, -6.052434368338021, -6....","[-7.154501095613862, -6.281628977846543, -6.57...","[0.12123129267795736, 0.21266660343343977, 0.0..."
4,AdaBoostRegressor,"[-6.092088465441948, -6.055812499999991, -5.29...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.1702678571428535, -5.566031587411215, -5....","[-6.100103518051506, -5.645258461494352, -5.66...","[0.2048426940735511, 0.07647162008521954, 0.03..."
5,XGBRegressor,"[-6.8409624, -6.8683248, -6.0015554, -5.321909...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8227315, -6.169962, -6.799593, -6.7047834...","[-7.014267, -6.187599, -6.730255, -6.7727537, ...","[0.116441816, 0.2011095, 0.11745094, 0.1260753..."
6,ExtraTreesRegressor,"[-6.5763000000000025, -6.813800000000001, -6.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.964599999999999, -6.627100000000001, -6.8...","[-6.955299999999999, -6.516620000000001, -6.86...","[0.022404374572836814, 0.13256841856188764, 0...."
7,LinearRegression,"[-7.248273384067716, -10.0, -6.148328486140372...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.56729127677113, -6.127714388314189, -8.63...","[-7.073458255354225, -6.3982907369437925, -8.6...","[2.7391545708959977, 0.9518729724276647, 0.050..."
8,KNeighborsRegressor,"[-7.0, -7.0, -5.88, -4.88, -4.733333333333333,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.746666666666667, -6.746666666666667...","[-7.0, -6.898666666666666, -6.797333333333334,...","[0.0, 0.12410748030101418, 0.10133333333333318..."
9,SVR,"[-7.169522117681083, -7.019981504744268, -5.91...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.300096341024478, -6.780839401372151, -7.0...","[-7.285910573423253, -6.740319863124401, -7.00...","[0.04465649746211758, 0.05312871296499056, 0.0..."


In [124]:
result_df.to_csv('descriptors_results/Results_2d_Mordred_const_rem_scaled.csv')
prediction_df.to_csv('descriptors_results/Prediction_df_2d_Mordred_const_rem_scaled.csv')

In [125]:
#2d Mordred descriptors LVR
df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col = remove_low_variance_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = X_test.select_dtypes(include=['number'])
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
results_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
results_df

  df_train = pd.read_csv('Descriptors/Train_2d_Mordred_desc.csv')


X_train shape:  (5568, 821)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_Mordred_desc.csv')


X_test shape:  (1392, 821)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170429
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 808
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170473
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 811
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tota

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2086,0.3356,0.4568,0.665,0.816,0.7849,0.2074,0.3286,0.4554,0.6722,0.8211,0.8034
DecisionTreeRegressor,0.2915,0.381,0.5399,0.5319,0.745,0.7269,0.2351,0.3464,0.4848,0.6285,0.7937,0.7721
RandomForestRegressor,0.217,0.3442,0.4658,0.6516,0.8075,0.7786,0.215,0.3359,0.4637,0.6603,0.8131,0.7948
GradientBoostingRegressor,0.2301,0.3559,0.4797,0.6305,0.7971,0.7617,0.224,0.348,0.4732,0.6461,0.8081,0.7787
AdaBoostRegressor,0.3914,0.5039,0.6256,0.3716,0.6477,0.6129,0.3709,0.4856,0.609,0.4139,0.682,0.6686
XGBRegressor,0.218,0.3432,0.4669,0.65,0.8072,0.7747,0.2122,0.3298,0.4606,0.6647,0.8154,0.793
ExtraTreesRegressor,0.2112,0.3394,0.4595,0.6609,0.813,0.784,0.2214,0.3374,0.4705,0.6502,0.8065,0.7898
LinearRegression,0.4254,0.4071,0.6522,0.317,0.6722,0.7356,0.3181,0.3735,0.564,0.4973,0.7294,0.7629
KNeighborsRegressor,0.2777,0.3834,0.527,0.5541,0.7513,0.718,0.2656,0.3714,0.5154,0.5802,0.7645,0.7563
SVR,0.2365,0.3482,0.4863,0.6203,0.7907,0.7618,0.2444,0.3529,0.4944,0.6138,0.7878,0.7692


In [126]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.590723796184461, -6.864465493660207, -6.30...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1934338278999785, -6.071564767852124, -6....","[-7.104324347085287, -6.313665025245345, -6.70...","[0.11911803727514793, 0.17757965983696158, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.03, -5.2, -6.244999999999999, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -5.89, -5.949999999999999...","[-7.37, -6.49, -6.728, -6.868, -5.854000000000...","[0.7601578783384412, 0.6307455905513726, 0.524..."
2,RandomForestRegressor,"[-6.683116057605002, -6.8004, -6.1641083333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.867600000000001, -6.257900000000002, -6.7...","[-6.884765, -6.37418474261543, -6.71476, -6.54...","[0.040921154675790536, 0.12665785981964658, 0...."
3,GradientBoostingRegressor,"[-6.9538278086749825, -7.052640720542491, -5.8...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.229801769686274, -6.1517648848612785, -6....","[-7.191434474201043, -6.310766037726277, -6.70...","[0.11328046235770686, 0.19618053637325925, 0.0..."
4,AdaBoostRegressor,"[-6.308654545454544, -6.308654545454544, -5.55...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.239743589743589, -5.672502385440382, -5.7...","[-6.060696597590882, -5.708925619905679, -5.66...","[0.1503225840297493, 0.057164129004639755, 0.0..."
5,XGBRegressor,"[-7.2010727, -7.05254, -6.1245418, -4.817527, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.3523107, -6.408034, -7.0629597, -6.966821...","[-7.18943, -6.34603, -6.8971586, -6.7855177, -...","[0.1483452, 0.2780104, 0.15064599, 0.15745322,..."
6,ExtraTreesRegressor,"[-6.860600000000002, -6.872300000000001, -6.09...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9811000000000005, -6.597400000000001, -6....","[-6.970460000000001, -6.585390000000001, -6.91...","[0.006783391482141547, 0.08001171414236755, 0...."
7,LinearRegression,"[-7.539499947810782, -7.185399541640891, -6.23...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.847063878560675, -7.483975315476073, -7.5...","[-7.553554799793129, -7.109674921969928, -7.88...","[1.4875345840168233, 0.6016532257184658, 0.296..."
8,KNeighborsRegressor,"[-7.0, -7.0, -5.88, -4.853333333333333, -4.733...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.746666666666667, -6.746666666666667...","[-7.0, -6.898666666666666, -6.797333333333334,...","[0.0, 0.12410748030101418, 0.10133333333333318..."
9,SVR,"[-7.1115116900074185, -7.009329529671163, -5.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.118067335275781, -6.790359365497927, -7.0...","[-7.115816315634726, -6.767185642475207, -7.01...","[0.03800995531330861, 0.03985679228380903, 0.0..."


In [127]:
result_df.to_csv('descriptors_results/Results_2d_Mordred_LVR_scaled.csv')
prediction_df.to_csv('descriptors_results/Prediction_df_2d_Mordred_LVR_scaled.csv')

In [6]:
#2d All descriptors const rem
df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')
df_train = df_train.dropna()
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = X_train.select_dtypes(include=['number'])
X_train, const_col =  remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')
df_test = df_test.dropna()
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

  df_train = pd.read_csv('Descriptors/Train_2d_all_descriptors.csv')


X_train shape:  (5568, 2504)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


  df_test = pd.read_csv('Descriptors/Test_2d_all_descriptors.csv')


X_test shape:  (1392, 2504)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514723
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2380
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514774
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2385
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2042,0.3325,0.4519,0.6721,0.8203,0.7881,0.2058,0.3277,0.4537,0.6748,0.8222,0.8044
DecisionTreeRegressor,0.3059,0.3856,0.5531,0.5088,0.7329,0.7127,0.2314,0.3444,0.481,0.6344,0.7976,0.7745
RandomForestRegressor,0.2155,0.3434,0.4643,0.6539,0.8088,0.776,0.2196,0.3395,0.4686,0.653,0.8085,0.7895
GradientBoostingRegressor,0.2203,0.3491,0.4693,0.6463,0.8065,0.7672,0.2187,0.3408,0.4676,0.6545,0.8125,0.7834
AdaBoostRegressor,0.3778,0.4938,0.6146,0.3934,0.6626,0.6219,0.3575,0.4771,0.5979,0.4351,0.6957,0.6761
XGBRegressor,0.2166,0.3429,0.4654,0.6522,0.808,0.7748,0.2151,0.3318,0.4638,0.6601,0.8127,0.7923
ExtraTreesRegressor,0.2092,0.3381,0.4573,0.6642,0.815,0.784,0.218,0.337,0.4669,0.6555,0.8097,0.79
LinearRegression,1.1424,0.6212,1.0688,-0.8343,0.4362,0.5776,0.5253,0.4695,0.7248,0.1699,0.6108,0.6739
KNeighborsRegressor,0.2736,0.3818,0.5231,0.5607,0.755,0.719,0.2703,0.3757,0.5199,0.5728,0.7603,0.7484
SVR,0.224,0.3392,0.4733,0.6403,0.8032,0.7743,0.2344,0.3451,0.4841,0.6296,0.7973,0.7768


In [7]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.66923493793545, -6.964947952869519, -6.187...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1131123303759125, -6.160156349224908, -6....","[-7.018860431317639, -6.49195061784413, -6.855...","[0.07600842296393379, 0.18118606790177316, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -6.96, -7.0, -5.92, -6.244999999999999,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -7.0, -7.0, -7.0, -4.43, -5.8, -6.85, ...","[-6.748, -6.992, -6.5760000000000005, -6.97000...","[0.5039999999999999, 0.016000000000000014, 0.5..."
2,RandomForestRegressor,"[-6.6145000000000005, -6.813439999999999, -5.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.801800000000001, -6.244875, -6.6953999999...","[-6.815437, -6.483283139319333, -6.69159480598...","[0.04660011669513284, 0.13993377441419044, 0.0..."
3,GradientBoostingRegressor,"[-6.981264206991823, -6.939825737406756, -5.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.073263222609385, -6.063939303715299, -7.1...","[-7.073427899473096, -6.26005136233825, -6.775...","[0.08591276917451098, 0.19377985690008964, 0.1..."
4,AdaBoostRegressor,"[-5.885562913907283, -6.377272837070318, -5.60...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.304420600858371, -5.684994355584168, -5.8...","[-6.114998915502312, -5.697759913121283, -5.71...","[0.16125623562845817, 0.07888993482493699, 0.0..."
5,XGBRegressor,"[-7.2001286, -6.737824, -5.4927177, -4.932216,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.2036347, -6.6719604, -7.09824, -6.8739405...","[-7.0979676, -6.543934, -6.892276, -6.6681466,...","[0.14426151, 0.29652217, 0.1287557, 0.20234475..."
6,ExtraTreesRegressor,"[-6.780100000000001, -6.956799999999999, -6.22...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.956600000000002, -6.239750000000002, -6.9...","[-6.958940000000001, -6.319060000000001, -6.93...","[0.018105093206056567, 0.12052718116673893, 0...."
7,LinearRegression,"[-3.9, -10.0, -6.561595810264407, -6.924811289...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-3.9, -7.27295671930915, -10.0, -5.678827745...","[-5.119999999999999, -6.699755758944614, -9.12...","[2.44, 2.350504710954084, 0.7756174763588719, ..."
8,KNeighborsRegressor,"[-6.63, -6.986666666666667, -5.88, -4.85333333...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.63, -6.746666666666667, -7.0, -5.90333333...","[-6.7780000000000005, -6.698, -6.9120000000000...","[0.18126224096595522, 0.09733333333333362, 0.1..."
9,SVR,"[-6.314907965426872, -6.951309678541952, -5.81...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.035070885358877, -6.667518105491782, -7.0...","[-6.094828523113399, -6.742352408572694, -7.04...","[0.03399947448581303, 0.13083693271290386, 0.0..."


In [8]:
result_df.to_csv('descriptors_results/Results_2d_all_desc_const_col_scaled.csv')
prediction_df.to_csv('descriptors_results/Prediction_df_2d_all_desc_const_col_scaled.csv')