In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [13]:
#Atomic descriptors
df_train = pd.read_csv('Atomic_features/Train_all_atomic_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Atomic_features/Test_all_atomic_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_degree = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models_degree, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 23)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 23)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 17
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 784
[LightGBM] [Info] Number of data points in the train set: 4454, number of used featu



0.454063830430938


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3003,0.4043,0.548,0.5179,0.7206,0.6835,0.3087,0.4052,0.5556,0.5122,0.7164,0.7008
DecisionTreeRegressor,0.3624,0.4142,0.602,0.4182,0.6748,0.6618,0.3083,0.3918,0.5552,0.5129,0.7221,0.7129
RandomForestRegressor,0.2833,0.3848,0.5322,0.5452,0.7389,0.7051,0.2869,0.3865,0.5356,0.5467,0.7402,0.721
GradientBoostingRegressor,0.3314,0.4333,0.5757,0.4679,0.6893,0.6365,0.3404,0.4361,0.5835,0.4621,0.684,0.6628
AdaBoostRegressor,0.5505,0.5909,0.7419,0.1162,0.4503,0.4349,0.531,0.582,0.7287,0.1609,0.488,0.4821
XGBRegressor,0.2836,0.3826,0.5325,0.5447,0.7396,0.7046,0.2851,0.3833,0.534,0.5494,0.7426,0.7237
ExtraTreesRegressor,0.2832,0.3823,0.5321,0.5454,0.7409,0.7073,0.2921,0.3876,0.5404,0.5385,0.7362,0.7184
LinearRegression,0.4846,0.526,0.6961,0.2219,0.4713,0.4781,0.4895,0.5267,0.6996,0.2266,0.4767,0.4913
KNeighborsRegressor,0.3321,0.4138,0.5763,0.4668,0.6943,0.6584,0.3343,0.4108,0.5782,0.4717,0.6967,0.6821
SVR,0.3514,0.4253,0.5928,0.4358,0.6648,0.6406,0.3723,0.4307,0.6102,0.4116,0.6501,0.6446


In [4]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.405973830435521, -7.0384272989401495, -6.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.992337917250762, -6.465291933458534, -6.5...","[-7.019794368729011, -6.453482372820912, -6.62...","[0.07466501971828703, 0.10907219133879548, 0.0..."
1,DecisionTreeRegressor,"[-5.49, -7.0, -7.0, -4.77, -4.55, -4.885, -5.0...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -5.49, -6.244999999999999, -7.0...","[-7.0, -6.518000000000001, -6.396, -6.819, -6....","[0.0, 0.39640383449204913, 0.7397459023205197,..."
2,RandomForestRegressor,"[-6.623700000000001, -6.9048, -6.4150266666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.934900000000002, -6.651100000000003, -6.7...","[-6.9674000000000005, -6.70864, -6.73914200000...","[0.025088244259014517, 0.10905856408370683, 0...."
3,GradientBoostingRegressor,"[-6.414804492400767, -6.745252825886334, -5.87...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.019412028211235, -6.3399219342912145, -6....","[-7.15415627455958, -6.4807759111983305, -6.45...","[0.07836106393656384, 0.1595291429758868, 0.15..."
4,AdaBoostRegressor,"[-5.786376090159455, -5.869923779070385, -5.78...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.869923779070385, -5.786376090159455, -5.8...","[-5.785333304981799, -5.703138760558792, -5.73...","[0.05472609266014173, 0.06611179707686692, 0.0..."
5,XGBRegressor,"[-6.369783, -7.138793, -6.2215147, -5.1384716,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.616264, -6.427373, -6.568331, -5.539065, ...","[-6.8326797, -6.6089973, -6.7012877, -6.077459...","[0.1327724, 0.19248006, 0.099578865, 0.3078653..."
6,ExtraTreesRegressor,"[-6.8585, -6.9889, -5.928466666666663, -4.7699...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.964700000000002, -6.944400000000001, -6.2...","[-6.975240000000002, -6.849799999999999, -6.42...","[0.010616515435866846, 0.2304392240917349, 0.2..."
7,LinearRegression,"[-4.127934954999589, -6.787257203614937, -5.56...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.599264686444286, -3.9, -4.887417778847503...","[-6.57357645783934, -3.930882232119134, -4.836...","[0.06058044276649575, 0.039914685448258484, 0...."
8,KNeighborsRegressor,"[-5.603333333333333, -6.986666666666667, -5.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.603333333333333, -6.4...","[-6.989333333333333, -6.294, -6.43333333333333...","[0.005333333333333102, 0.4281142111373762, 0.3..."
9,SVR,"[-6.030529280036772, -6.962966906331343, -5.55...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.893252488494217, -5.7866606060715275, -6....","[-6.969591059520324, -6.048453782919074, -5.99...","[0.07993592437697429, 0.18249963727699886, 0.1..."


In [14]:
result_df.to_csv('atomic_results/Results_all_atomic_desc.csv')
prediction_df.to_csv('atomic_results/Prediction_data_all_atomic_desc.csv')


In [3]:
#Atomic + monomeric_composition based features
df1 = pd.read_csv('Monomer_features/Train_mon_comp.csv')
df2 = pd.read_csv('Atomic_features/Train_all_atomic_desc.csv')
df_train = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_train

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,0.066667,0.0,0.066667,0.0,0.0,0.0,0.0,...,2,98,15,0,18,0,0,163,1,1
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0.000000,0.0,0.071429,0.0,0.0,0.0,0.0,...,1,91,14,0,24,0,0,167,1,1
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,0.071429,0.0,0.000000,0.0,0.0,0.0,0.0,...,1,91,15,0,24,0,0,156,1,1
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,0.000000,0.0,0.066667,0.0,0.0,0.0,0.0,...,2,99,15,0,12,0,0,155,1,1
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0.066667,0.0,0.066667,0.0,0.0,0.0,0.0,...,1,99,15,0,12,0,0,153,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,0.333333,0.0,0.000000,0.0,0.0,0.0,0.0,...,1,21,3,0,6,0,0,35,1,1
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,0.333333,0.0,0.000000,0.0,0.0,0.0,0.0,...,1,19,3,0,6,0,0,35,1,1
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,1,23,3,0,0,0,0,33,0,1
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,1,22,3,0,0,0,0,33,0,1


In [4]:
df1 = pd.read_csv('Monomer_features/Test_mon_comp.csv')
df2 = pd.read_csv('Atomic_features/Test_all_atomic_desc.csv')
df_test = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_test

Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0.000000,0.0,0.133333,0.000000,0.0,0.0,0.0,...,1,97,15,0,18,0,0,166,1,1
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0.000000,0.0,0.133333,0.066667,0.0,0.0,0.0,...,2,99,15,0,12,0,0,157,1,1
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.000000,0.0,0.133333,0.000000,0.0,0.0,0.0,...,1,93,15,0,18,0,0,159,1,1
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0.071429,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,85,14,0,29,0,0,157,1,1
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0.000000,0.0,0.133333,0.066667,0.0,0.0,0.0,...,1,95,15,0,12,0,0,156,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.333333,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,23,3,0,6,0,0,35,1,1
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0.333333,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,23,3,0,6,0,0,35,1,1
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.250000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,22,4,0,6,0,0,40,1,1
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,24,3,0,6,0,0,39,1,1


In [5]:
import re
def clean_feature_names(df):
    def clean_name(name):
        return re.sub(r'[^a-zA-Z0-9_]', '_', name)
    df.columns = [clean_name(col) for col in df.columns]
    return df

In [6]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [7]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 408)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 408)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1350
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 78
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 82
[LightGBM] [Info] Start training from score -5.74269

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2456,0.3656,0.4955,0.6057,0.7796,0.7363,0.2505,0.3642,0.5005,0.6041,0.7787,0.7532
DecisionTreeRegressor,0.3888,0.4412,0.6236,0.3757,0.6735,0.633,0.3006,0.3893,0.5483,0.525,0.7386,0.6946
RandomForestRegressor,0.2656,0.3767,0.5154,0.5736,0.7586,0.7128,0.256,0.3627,0.506,0.5954,0.7723,0.7375
GradientBoostingRegressor,0.2882,0.4041,0.5368,0.5373,0.7403,0.694,0.2856,0.4018,0.5344,0.5487,0.7499,0.7183
AdaBoostRegressor,0.4671,0.5466,0.6835,0.25,0.5584,0.4851,0.4629,0.5435,0.6804,0.2685,0.5741,0.5379
XGBRegressor,0.2257,0.3464,0.475,0.6377,0.7987,0.7547,0.2227,0.3395,0.4719,0.6481,0.8053,0.7721
ExtraTreesRegressor,0.2778,0.3821,0.527,0.554,0.7503,0.7068,0.2587,0.3665,0.5086,0.5912,0.7722,0.7291
LinearRegression,0.3899,0.4426,0.6244,0.3739,0.6337,0.6568,0.3727,0.438,0.6105,0.4111,0.6514,0.6831
KNeighborsRegressor,0.3468,0.4336,0.5889,0.4432,0.677,0.637,0.3334,0.4209,0.5774,0.4732,0.694,0.6643
SVR,0.302,0.3921,0.5496,0.515,0.7215,0.6982,0.2987,0.3896,0.5466,0.5279,0.7299,0.7095


In [8]:
result_df.to_csv('atomic_results/Results_all_atomic_desc_and_mono_comp.csv')
prediction_df.to_csv('atomic_results/Prediction_data_all_atomic_desc_and_mono_comp.csv')

In [9]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 262)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 262)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1350
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 78
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 4454, number of used f

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2456,0.3656,0.4955,0.6057,0.7796,0.7363,0.2505,0.3642,0.5005,0.6041,0.7787,0.7532
DecisionTreeRegressor,0.3923,0.4434,0.6264,0.3701,0.6697,0.6279,0.3009,0.3918,0.5485,0.5246,0.7384,0.6909
RandomForestRegressor,0.2654,0.3765,0.5152,0.5739,0.7588,0.7128,0.2559,0.3624,0.5058,0.5957,0.7725,0.738
GradientBoostingRegressor,0.2882,0.4041,0.5368,0.5373,0.7403,0.6942,0.2852,0.4014,0.534,0.5493,0.7505,0.7188
AdaBoostRegressor,0.4798,0.5561,0.6927,0.2297,0.5549,0.4856,0.4681,0.5487,0.6842,0.2603,0.5825,0.5489
XGBRegressor,0.2257,0.3464,0.475,0.6377,0.7987,0.7547,0.2227,0.3395,0.4719,0.6481,0.8053,0.7721
ExtraTreesRegressor,0.2768,0.3816,0.5261,0.5556,0.7514,0.7075,0.2587,0.3666,0.5086,0.5912,0.7722,0.7293
LinearRegression,0.3899,0.4426,0.6244,0.3739,0.6337,0.6568,0.3727,0.438,0.6105,0.4111,0.6514,0.6831
KNeighborsRegressor,0.3464,0.4335,0.5885,0.4438,0.6772,0.6366,0.3328,0.4204,0.5769,0.4742,0.6945,0.6645
SVR,0.302,0.3921,0.5496,0.515,0.7215,0.6982,0.2987,0.3896,0.5466,0.5279,0.7299,0.7095


In [10]:
result_df.to_csv('atomic_results/Results_all_atomic_desc_and_mono_comp_const_rem.csv')
prediction_df.to_csv('atomic_results/Prediction_data_all_atomic_desc_and_mono_comp_const_rem.csv')


In [11]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.658856691033044, -6.949840742667949, -5.77...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.053776975089306, -6.472870625624551, -7.0...","[-6.991698615663445, -6.411408445763892, -6.89...","[0.14585691905540257, 0.10495600928578822, 0.1..."
1,DecisionTreeRegressor,"[-8.0, -7.0, -5.15, -4.85, -5.85, -5.09, -5.09...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -8.0, -7.0, -7.0, -6.24, -5.15, -6.85,...","[-7.0, -7.048, -6.731999999999999, -7.0, -6.05...","[0.0, 0.5596570378365665, 0.5359999999999999, ..."
2,RandomForestRegressor,"[-6.646585714285713, -6.7796999999999965, -6.0...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.904400000000001, -6.696482461389999, -6.7...","[-6.856584666666668, -6.708258492278, -6.61847...","[0.05192408182893296, 0.09590561162345373, 0.1..."
3,GradientBoostingRegressor,"[-6.42764231529753, -6.65592018174737, -5.5782...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7004713117236046, -6.320734974959236, -6....","[-6.621170816630338, -6.267014705290754, -6.74...","[0.12209059489006517, 0.10126695056081805, 0.1..."
4,AdaBoostRegressor,"[-5.798135090265214, -5.798135090265214, -5.79...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.76579539559827, -5.76579539559827, -5.765...","[-5.763516193627824, -5.763099762649212, -5.77...","[0.013481632314794649, 0.012934910665411936, 0..."
5,XGBRegressor,"[-6.742195, -6.990505, -5.996362, -5.2777576, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.596901, -6.454024, -7.3800654, -6.5661736...","[-6.708071, -6.613443, -7.3051972, -6.545186, ...","[0.14307745, 0.08720191, 0.26316768, 0.1302703..."
6,ExtraTreesRegressor,"[-6.807700000000002, -6.925599999999999, -5.73...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.965500000000002, -6.693000000000001, -6.8...","[-6.963620000000001, -6.729140000000001, -6.78...","[0.006400046874828443, 0.054015057160018184, 0..."
7,LinearRegression,"[-5.847762903472198, -6.978361273968966, -5.18...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.096561243440529, -5.438960291853395, -5.6...","[-7.162350714307841, -5.444344040572174, -5.56...","[0.0582278485483556, 0.10941785363993842, 0.04..."
8,KNeighborsRegressor,"[-4.76, -7.0, -5.853333333333333, -4.926666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.533333333333332, -6.04, -5.38333333...","[-6.9093333333333335, -5.18, -6.23199999999999...","[0.04533333333333331, 0.32550132274863536, 0.2..."
9,SVR,"[-5.2767587783303975, -6.9897419865102215, -4....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.046643613936923, -5.114429729215835, -5.7...","[-7.0170324713951615, -5.139884039360652, -5.5...","[0.025014925909915168, 0.028926368389684987, 0..."


In [12]:
const_col

['Ala_tBu_',
 'Me_Ala_indol_2_yl_',
 'Me_Abu_morpholino_',
 'meD',
 'Asp_Ph_2_NH2__',
 'Glu_3R_Me_',
 'Phe_CHF2_',
 'Me_Phe_4_Cl_',
 'Bn_4_OH__Gly',
 'Bu_Gly',
 'EtOEt_Gly',
 'PhEt_Gly',
 'isoamyl_Gly',
 '2_pyridylmethyl_Gly',
 'Me_Hph',
 'Hph_2_Cl_',
 'Hph_3_Cl_',
 'Hph_4_Cl_',
 'Hse_Et_',
 'Hyp_Et_',
 'dK',
 'meK',
 'Me_dK',
 'Lys_Cbz_',
 'Lys_iPr_',
 'Lys_Me_',
 'Me_Lys_Me_',
 'dLeu_3R_OH_',
 'dN',
 'Nle_CHF2_',
 'Nle_OH_',
 'Orn',
 '4Pal',
 'dPip',
 'Gln_Mes_',
 'Ser_Bn_',
 'Ser_EtNMe2_',
 'Ser_EtOH_',
 'Ser_isoamyl_',
 'dSer_Me_',
 'Ser_Ph_2_Cl__',
 'Ser_Ph_3_Cl__',
 'Ser_Pr_',
 'Me_Ser_isoamyl_',
 'Me_Ser_Pr_',
 'dT',
 'Me_Tza',
 '_N__O_Val',
 'meW',
 'Me_dW',
 'Trp_6_Br_',
 'Tyr_CHF2_',
 'dTyr_bR_OMe_',
 '_N__O_Tyr',
 'Mono3',
 'Mono4',
 'Mono5',
 'Mono15',
 'Mono17',
 'Mono18',
 'Mono19',
 'Mono20',
 'Mono23',
 'Mono24',
 'Mono25',
 'Mono32',
 'Mono33',
 'Mono36',
 'Mono48',
 'Mono49',
 'Mono50',
 'Mono51',
 'Mono52',
 'Mono53',
 'Mono54',
 'Mono55',
 'Mono56',
 'Mono57',
 'Mon

In [13]:
prediction_df.iloc[0]['Y Test actual']

0      -7.00
1      -7.00
2      -7.00
3      -6.74
4      -5.54
        ... 
1387   -4.50
1388   -4.80
1389   -6.38
1390   -7.80
1391   -4.90
Name: Permeability, Length: 1392, dtype: float64

In [14]:
#LGBM
prediction_df.iloc[0]['Test prediction folds']

[array([-7.05377698, -6.47287063, -7.01247687, ..., -6.51824957,
        -7.28763952, -5.20882713]),
 array([-6.99388582, -6.44469643, -6.89785042, ..., -6.63432063,
        -7.40562944, -4.88128099]),
 array([-6.90343197, -6.36440712, -6.85006668, ..., -6.44529051,
        -7.20895817, -5.16867784]),
 array([-6.7864048 , -6.2344915 , -6.60284052, ..., -6.46675722,
        -6.9922974 , -5.2819053 ]),
 array([-7.22099351, -6.54057655, -7.12462562, ..., -6.39770284,
        -6.89359372, -5.15860885])]

In [15]:
prediction_df.iloc[0]['Test Predictions Mean']

array([-6.99169862, -6.41140845, -6.89757202, ..., -6.49246415,
       -7.15762365, -5.13986002])

In [16]:
#Linear regression
prediction_df.iloc[7]['Test prediction folds']

[array([-7.09656124, -5.43896029, -5.63531842, ..., -6.74296175,
        -5.41343869, -5.2602085 ]),
 array([-7.19334573, -5.24121322, -5.5532147 , ..., -6.69315839,
        -6.61879627, -5.99895416]),
 array([-7.1106    , -5.54058297, -5.49003549, ..., -7.21      ,
        -6.12696974, -5.02479292]),
 array([-7.15443592, -5.46171731, -5.56909788, ..., -6.79989033,
        -5.87172276, -4.33066229]),
 array([-7.25681068, -5.5392464 , -5.56188773, ..., -6.66173606,
        -5.78848423, -5.55013985])]

In [17]:
prediction_df.iloc[7]['Test Predictions Mean']

array([-7.16235071, -5.44434404, -5.56191084, ..., -6.82154931,
       -5.96388234, -5.23295154])

In [20]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        mse_test_folds = []
        mae_test_folds = []
        rmse_test_folds = []
        r2_test_folds = []
        pearson_test_folds = []
        spearman_test_folds = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)
            

            mse_test_folds.append(mean_squared_error(y_test, predictions_test_fold))
            mae_test_folds.append(mean_absolute_error(y_test, predictions_test_fold))
            rmse_test_folds.append(np.sqrt(mse_test_folds[-1]))
            r2_test_folds.append(r2_score(y_test, predictions_test_fold))
            pearson_test, _ = pearsonr(y_test, predictions_test_fold)
            spearman_test, _ = spearmanr(y_test, predictions_test_fold)
            pearson_test_folds.append(pearson_test)
            spearman_test_folds.append(spearman_test)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        # Helper function to format mean ± std string
        def format_metric(metric_values):
            mean_val = np.mean(metric_values)
            std_val = np.std(metric_values)
            return f"{mean_val:.4f} ± {std_val:.4f}"

        mse_test_str = format_metric(mse_test_folds)
        mae_test_str = format_metric(mae_test_folds)
        rmse_test_str = format_metric(rmse_test_folds)
        r2_test_str = format_metric(r2_test_folds)
        pearson_test_str = format_metric(pearson_test_folds)
        spearman_test_str = format_metric(spearman_test_folds)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)
        

        predictions.append({
            'Model': model_name,
            'Y Test': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
            'Test MSE folds': mse_test_folds,
            'Test MAE folds': mae_test_folds,
            'Test RMSE folds': rmse_test_folds,
            'Test R2 folds': r2_test_folds,
            'Test PCC folds': pearson_test_folds,
            'Test SCC folds': spearman_test_folds,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': mse_test_str,
            'Test MAE': mae_test_str,
            'Test RMSE': rmse_test_str,
            'Test R2': r2_test_str,
            'Test Pearson Correlation': pearson_test_str,
            'Test Spearman Correlation': spearman_test_str,
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [21]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 262)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 262)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000672 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1350
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 78
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 4454, number of used f

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2456,0.3656,0.4955,0.6057,0.7796,0.7363,0.2558 ± 0.0026,0.3676 ± 0.0018,0.5057 ± 0.0025,0.5959 ± 0.0040,0.7728 ± 0.0028,0.7482 ± 0.0043
DecisionTreeRegressor,0.3923,0.4434,0.6264,0.3701,0.6697,0.6279,0.3998 ± 0.0120,0.4420 ± 0.0068,0.6322 ± 0.0095,0.3683 ± 0.0189,0.6738 ± 0.0091,0.6370 ± 0.0087
RandomForestRegressor,0.2654,0.3765,0.5152,0.5739,0.7588,0.7128,0.2718 ± 0.0067,0.3730 ± 0.0027,0.5213 ± 0.0064,0.5705 ± 0.0106,0.7579 ± 0.0063,0.7227 ± 0.0058
GradientBoostingRegressor,0.2882,0.4041,0.5368,0.5373,0.7403,0.6942,0.2887 ± 0.0028,0.4036 ± 0.0016,0.5373 ± 0.0026,0.5438 ± 0.0044,0.7454 ± 0.0033,0.7137 ± 0.0043
AdaBoostRegressor,0.4798,0.5561,0.6927,0.2297,0.5549,0.4856,0.4796 ± 0.0120,0.5537 ± 0.0103,0.6925 ± 0.0087,0.2421 ± 0.0190,0.5617 ± 0.0155,0.5431 ± 0.0226
XGBRegressor,0.2257,0.3464,0.475,0.6377,0.7987,0.7547,0.2375 ± 0.0044,0.3507 ± 0.0036,0.4873 ± 0.0045,0.6247 ± 0.0069,0.7906 ± 0.0043,0.7580 ± 0.0057
ExtraTreesRegressor,0.2768,0.3816,0.5261,0.5556,0.7514,0.7076,0.2854 ± 0.0087,0.3831 ± 0.0046,0.5342 ± 0.0082,0.5490 ± 0.0137,0.7503 ± 0.0071,0.7100 ± 0.0074
LinearRegression,0.4313,0.4481,0.6567,0.3075,0.6039,0.6568,0.4002 ± 0.0377,0.4450 ± 0.0081,0.6319 ± 0.0292,0.3676 ± 0.0596,0.6305 ± 0.0283,0.6779 ± 0.0031
KNeighborsRegressor,0.3464,0.4335,0.5885,0.4438,0.6772,0.6366,0.3639 ± 0.0084,0.4379 ± 0.0061,0.6032 ± 0.0070,0.4250 ± 0.0133,0.6684 ± 0.0087,0.6375 ± 0.0086
SVR,0.302,0.3921,0.5496,0.515,0.7215,0.6982,0.3034 ± 0.0035,0.3921 ± 0.0034,0.5508 ± 0.0032,0.5205 ± 0.0056,0.7245 ± 0.0041,0.7068 ± 0.0034


In [22]:
prediction_df

Unnamed: 0,Model,Y Test,Test prediction folds,Test Predictions Mean,Test Predictions Std,Test MSE folds,Test MAE folds,Test RMSE folds,Test R2 folds,Test PCC folds,Test SCC folds
0,LGBMRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.053776975089306, -6.472870625624551, -7.0...","[-6.991698615663445, -6.411408445763892, -6.89...","[0.14585691905540257, 0.10495600928578822, 0.1...","[0.25749449953304926, 0.2526025574936686, 0.25...","[0.3693024990998288, 0.3655825321746018, 0.367...","[0.5074391584545376, 0.5025958192162651, 0.506...","[0.5931053271378839, 0.6008356093745292, 0.595...","[0.7711017867505152, 0.7765789773273701, 0.772...","[0.7487237042680333, 0.7533817333028223, 0.749..."
1,DecisionTreeRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -8.0, -7.0, -7.0, -6.24, -5.15, -6.85,...","[-7.0, -7.048, -6.731999999999999, -7.0, -6.05...","[0.0, 0.5596570378365665, 0.5359999999999999, ...","[0.4022647110009843, 0.40266591148537617, 0.40...","[0.4441470510342425, 0.4427447753945707, 0.438...","[0.6342434162062578, 0.6345596201188476, 0.633...","[0.36433839059264617, 0.36370441068183335, 0.3...","[0.6706803452601046, 0.6726502650215294, 0.675...","[0.6218421858454376, 0.6378608036165951, 0.640..."
2,RandomForestRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.904400000000001, -6.696482461390001, -6.7...","[-6.856584666666667, -6.708258492278, -6.61847...","[0.05192408182893306, 0.09590561162345398, 0.1...","[0.2813030689439093, 0.26111452648631195, 0.26...","[0.37760110018524495, 0.36931799291672046, 0.3...","[0.5303801174100602, 0.5109936657986202, 0.518...","[0.5554828533401339, 0.5873849343311598, 0.574...","[0.7485183329234384, 0.7674603145351845, 0.761...","[0.7164362723306894, 0.7328499773266577, 0.724..."
3,GradientBoostingRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7004713117236046, -6.320734974959236, -6....","[-6.621170816630338, -6.267014705290754, -6.74...","[0.12209059489006517, 0.10126695056081805, 0.1...","[0.29387751905763826, 0.28628672384241655, 0.2...","[0.406071795260299, 0.40212956800493316, 0.403...","[0.5421047122628969, 0.5350576827244111, 0.536...","[0.5356126162099226, 0.547607645779339, 0.5450...","[0.7400125762979344, 0.7499783654708734, 0.744...","[0.7103144015198812, 0.7215225542774073, 0.710..."
4,AdaBoostRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.76579539559827, -5.76579539559827, -5.765...","[-5.763516193627824, -5.763099762649212, -5.77...","[0.013481632314794649, 0.012934910665411936, 0...","[0.4833331406322164, 0.47632274795055485, 0.49...","[0.5535954655636331, 0.5539336257502787, 0.567...","[0.6952216485641226, 0.6901613926833019, 0.704...","[0.2362334710154732, 0.24731134429825197, 0.21...","[0.5349037620112272, 0.5786370300195084, 0.554...","[0.5867663371393864, 0.5344030821729185, 0.521..."
5,XGBRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.596901, -6.454024, -7.3800654, -6.5661736...","[-6.708071, -6.613443, -7.3051972, -6.545186, ...","[0.14307745, 0.08720191, 0.26316768, 0.1302703...","[0.23614515352220944, 0.23215694951370533, 0.2...","[0.34766199900274636, 0.34804309811688156, 0.3...","[0.4859476859932656, 0.4818266799521435, 0.491...","[0.626841718309944, 0.6331439071655912, 0.6186...","[0.7919731399565366, 0.7958195923948379, 0.786...","[0.7622964527933916, 0.7656273736139149, 0.758..."
6,ExtraTreesRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.965500000000002, -6.693000000000001, -6.8...","[-6.963620000000001, -6.729140000000001, -6.78...","[0.006400046874828443, 0.054015057160018067, 0...","[0.29035641543528795, 0.268507211910031, 0.286...","[0.38446977201797833, 0.3752100175350928, 0.38...","[0.5388473025220484, 0.5181768152957357, 0.535...","[0.541176689652765, 0.5757029592889293, 0.5476...","[0.7457623904968043, 0.76390163757286, 0.75113...","[0.7038885470430697, 0.7184600868405062, 0.719..."
7,LinearRegression,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.096561243440529, -5.438960291853395, -5.6...","[-7.162350714307841, -5.444344040572174, -5.56...","[0.0582278485483556, 0.10941785363993842, 0.04...","[0.40800648771748604, 0.4694447631925365, 0.36...","[0.44460981659818644, 0.4605201904830304, 0.43...","[0.6387538553445185, 0.6851603923115641, 0.606...","[0.35526519344495944, 0.25817998562123023, 0.4...","[0.6227760468179098, 0.5794189308309932, 0.655...","[0.6795196496635527, 0.6736425538393991, 0.679..."
8,KNeighborsRegressor,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.533333333333332, -6.04, -5.38333333...","[-6.9093333333333335, -5.18, -6.23199999999999...","[0.04533333333333331, 0.32550132274863536, 0.2...","[0.3751935753249752, 0.3498073467579668, 0.361...","[0.4462702014954103, 0.4282700536198516, 0.434...","[0.6125304688952014, 0.5914451341907945, 0.600...","[0.4071163952291371, 0.4472318974503796, 0.429...","[0.6555706316991783, 0.6812849280004103, 0.673...","[0.6260751805702771, 0.651736728972367, 0.6411..."
9,SVR,0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.046643613936923, -5.114429729215835, -5.7...","[-7.0170324713951615, -5.139884039360652, -5.5...","[0.025014925909915168, 0.028926368389684987, 0...","[0.30326053065402364, 0.29937422825286675, 0.3...","[0.3911359910292487, 0.38730514290234747, 0.39...","[0.5506909574834361, 0.5471510104649966, 0.553...","[0.5207855133362838, 0.5269266765340594, 0.515...","[0.7252035911074748, 0.7287885485529068, 0.720...","[0.7081107084563254, 0.7120511832099063, 0.705..."


In [23]:
r2_score(y_test,prediction_df.iloc[5]['Test prediction folds'][0])

0.626841718309944

In [24]:
r2_score(y_test,prediction_df.iloc[5]['Test prediction folds'][1])

0.6331439071655912

In [25]:
r2_score(y_test,prediction_df.iloc[5]['Test prediction folds'][2])

0.618685622548827

In [26]:
r2_score(y_test,prediction_df.iloc[5]['Test prediction folds'][3])

0.6301400020155099

In [27]:
r2_score(y_test,prediction_df.iloc[5]['Test prediction folds'][4])

0.6148014793013263

In [28]:
prediction_df.iloc[5]['Test R2 folds']

[0.626841718309944,
 0.6331439071655912,
 0.618685622548827,
 0.6301400020155099,
 0.6148014793013263]

In [29]:
np.mean(prediction_df.iloc[5]['Test R2 folds'])

0.6247225458682397

In [30]:
prediction_df.iloc[5]['Test prediction folds']

[array([-6.596901 , -6.454024 , -7.3800654, ..., -6.996824 , -7.1583815,
        -4.9488564], dtype=float32),
 array([-6.8292785, -6.6557565, -7.0362616, ..., -6.8795815, -7.23661  ,
        -4.9434943], dtype=float32),
 array([-6.6868086, -6.5923004, -7.6477904, ..., -6.9417286, -7.797114 ,
        -5.1967216], dtype=float32),
 array([-6.907063 , -6.7027225, -6.9664016, ..., -6.921767 , -6.841078 ,
        -5.1967645], dtype=float32),
 array([-6.5203056, -6.6624136, -7.4954653, ..., -6.1670094, -7.6066008,
        -5.045636 ], dtype=float32)]

In [31]:
np.mean(prediction_df.iloc[5]['Test prediction folds'], axis=0)

array([-6.708071 , -6.613443 , -7.3051972, ..., -6.7813826, -7.327956 ,
       -5.0662947], dtype=float32)

In [32]:
r2_score(y_test,np.mean(prediction_df.iloc[5]['Test prediction folds'], axis=0))

0.6480607939538405

In [33]:
r2_score(y_test,prediction_df.iloc[5]['Test Predictions Mean'])

0.6480607939538405

In [34]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [35]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 262)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 262)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.743515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1350
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 78
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 82
[LightGBM] [Info] Start training from score -5.74269

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2456,0.3656,0.4955,0.6057,0.7796,0.7363,0.2505,0.3642,0.5005,0.6041,0.7787,0.7532
DecisionTreeRegressor,0.3923,0.4434,0.6264,0.3701,0.6697,0.6279,0.3009,0.3918,0.5485,0.5246,0.7384,0.6909
RandomForestRegressor,0.2654,0.3765,0.5152,0.5739,0.7588,0.7128,0.2559,0.3624,0.5058,0.5957,0.7725,0.738
GradientBoostingRegressor,0.2882,0.4041,0.5368,0.5373,0.7403,0.6942,0.2852,0.4014,0.534,0.5493,0.7505,0.7188
AdaBoostRegressor,0.4798,0.5561,0.6927,0.2297,0.5549,0.4856,0.4681,0.5487,0.6842,0.2603,0.5825,0.5489
XGBRegressor,0.2257,0.3464,0.475,0.6377,0.7987,0.7547,0.2227,0.3395,0.4719,0.6481,0.8053,0.7721
ExtraTreesRegressor,0.2768,0.3816,0.5261,0.5556,0.7514,0.7076,0.2587,0.3666,0.5086,0.5912,0.7722,0.7293
LinearRegression,0.3899,0.4426,0.6244,0.3739,0.6337,0.6568,0.3727,0.438,0.6105,0.4111,0.6514,0.6831
KNeighborsRegressor,0.3464,0.4335,0.5885,0.4438,0.6772,0.6366,0.3328,0.4204,0.5769,0.4742,0.6945,0.6645
SVR,0.302,0.3921,0.5496,0.515,0.7215,0.6982,0.2987,0.3896,0.5466,0.5279,0.7299,0.7095


In [36]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.658856691033044, -6.949840742667949, -5.77...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.053776975089306, -6.472870625624551, -7.0...","[-6.991698615663445, -6.411408445763892, -6.89...","[0.14585691905540257, 0.10495600928578822, 0.1..."
1,DecisionTreeRegressor,"[-8.0, -7.0, -5.15, -4.85, -5.85, -5.09, -5.09...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -8.0, -7.0, -7.0, -6.24, -5.15, -6.85,...","[-7.0, -7.048, -6.731999999999999, -7.0, -6.05...","[0.0, 0.5596570378365665, 0.5359999999999999, ..."
2,RandomForestRegressor,"[-6.646585714285715, -6.779699999999996, -6.07...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.904399999999999, -6.696482461390001, -6.7...","[-6.856584666666667, -6.708258492278, -6.61847...","[0.05192408182893273, 0.09590561162345379, 0.1..."
3,GradientBoostingRegressor,"[-6.42764231529753, -6.65592018174737, -5.5782...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7004713117236046, -6.320734974959236, -6....","[-6.621170816630338, -6.267014705290754, -6.74...","[0.12209059489006517, 0.10126695056081805, 0.1..."
4,AdaBoostRegressor,"[-5.798135090265214, -5.798135090265214, -5.79...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.76579539559827, -5.76579539559827, -5.765...","[-5.763516193627824, -5.763099762649212, -5.77...","[0.013481632314794649, 0.012934910665411936, 0..."
5,XGBRegressor,"[-6.742195, -6.990505, -5.996362, -5.2777576, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.596901, -6.454024, -7.3800654, -6.5661736...","[-6.708071, -6.613443, -7.3051972, -6.545186, ...","[0.14307745, 0.08720191, 0.26316768, 0.1302703..."
6,ExtraTreesRegressor,"[-6.8077000000000005, -6.925599999999999, -5.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.965500000000002, -6.693000000000001, -6.8...","[-6.963620000000001, -6.729140000000001, -6.78...","[0.00640004687482879, 0.054015057160018067, 0...."
7,LinearRegression,"[-5.847762903472198, -6.978361273968966, -5.18...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.096561243440529, -5.438960291853395, -5.6...","[-7.162350714307841, -5.444344040572174, -5.56...","[0.0582278485483556, 0.10941785363993842, 0.04..."
8,KNeighborsRegressor,"[-4.76, -7.0, -5.853333333333333, -4.926666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.533333333333332, -6.04, -5.38333333...","[-6.9093333333333335, -5.18, -6.23199999999999...","[0.04533333333333331, 0.32550132274863536, 0.2..."
9,SVR,"[-5.2767587783303975, -6.9897419865102215, -4....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.046643613936923, -5.114429729215835, -5.7...","[-7.0170324713951615, -5.139884039360652, -5.5...","[0.025014925909915168, 0.028926368389684987, 0..."


In [37]:
np.mean(prediction_df.iloc[5]['Test prediction folds'], axis=0)

array([-6.708071 , -6.613443 , -7.3051972, ..., -6.7813826, -7.327956 ,
       -5.0662947], dtype=float32)

In [38]:
r2_score(y_test,np.mean(prediction_df.iloc[5]['Test prediction folds'], axis=0))

0.6480607939538405

In [39]:
r2_score(y_test,prediction_df.iloc[5]['Test Predictions Mean'])

0.6480607939538405

In [38]:
#Atomic + monomeric_composition based features
df1 = pd.read_csv('Monomer_features/Train_mon_comp.csv')
df2 = pd.read_csv('Atomic_features/Train_all_atomic_desc.csv')
df_train = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_train
df1 = pd.read_csv('Monomer_features/Test_mon_comp.csv')
df2 = pd.read_csv('Atomic_features/Test_all_atomic_desc.csv')
df_test = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_test


Unnamed: 0,ID,SMILES,Permeability,A,dA,meA,Me_dA,Ala(tBu),Ala(indol-2-yl),dAla(indol-2-yl),...,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0.000000,0.0,0.133333,0.000000,0.0,0.0,0.0,...,1,97,15,0,18,0,0,166,1,1
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0.000000,0.0,0.133333,0.066667,0.0,0.0,0.0,...,2,99,15,0,12,0,0,157,1,1
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.000000,0.0,0.133333,0.000000,0.0,0.0,0.0,...,1,93,15,0,18,0,0,159,1,1
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0.071429,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,85,14,0,29,0,0,157,1,1
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0.000000,0.0,0.133333,0.066667,0.0,0.0,0.0,...,1,95,15,0,12,0,0,156,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.333333,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,23,3,0,6,0,0,35,1,1
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0.333333,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,23,3,0,6,0,0,35,1,1
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.250000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,22,4,0,6,0,0,40,1,1
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,1,24,3,0,6,0,0,39,1,1


In [26]:
from sklearn.model_selection import GridSearchCV
def train_and_test_predict_with_tuning(models, param_grids, X_train, y_train, X_test, y_test):
   
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        best_params = None

        # hyperparameter tuning 
        if model_name in param_grids and param_grids[model_name]:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            grid_search = GridSearchCV(
                estimator=model, 
                param_grid=param_grids[model_name], 
                cv=kf,
                scoring='neg_mean_squared_error', 
                n_jobs=-1)
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(model_name)
            print(": best params",best_params)
        else:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            best_params = {}
            print(model_name, ':Used Default params')

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)  
            test_predictions_folds.append(predictions_test_fold)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
            'Best Parameters': best_params
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df


In [36]:
param_grids = {
        'ExtraTreesRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None,1,5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'LGBMRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'num_leaves': [31, 50, 100]
        },
        'DecisionTreeRegressor': {
            'max_depth': [None, 10, 20, 50, 100],
            'min_samples_split': [2, 5, 10]
        },
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None, 1, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'GradientBoostingRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7, 10]
        },
        'AdaBoostRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.1, 1.0]
        },
        'SVR': {
            'C': [0.001, 0.1, 1, 10],
            'epsilon': [0.1, 0.2, 0.5],
            'gamma': [0.001, 0.1, 1, 10]
        },
        'KNeighborsRegressor': {
            'n_neighbors': [3, 5, 10],
            'weights': ['uniform', 'distance']
        },
        'MLPRegressor': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'learning_rate': ['constant', 'adaptive'],
            'max_iter': [100,200, 400]
}
    }


In [39]:
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train = clean_feature_names(X_train)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
X_test = clean_feature_names(X_test)
X_test = X_test.drop(const_col,axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(objective='reg:squarederror',random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]

result_df, prediction_df = train_and_test_predict_with_tuning(models, param_grids, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 262)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 262)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
LGBMRegressor : Default params {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': 'regression', 'random_state': 101, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'metric': 'rmse'}




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1465
[LightGBM] [Info] Number of data points in the train set: 5568, number of used features: 88
[LightGBM] [Info] Start training from score -5.742906
LGBMRegressor
: best params {'learning_rate': 0.05, 'n_estimators': 400, 'num_leaves': 31}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000825 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1350
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 78
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001548 se

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2232,0.3432,0.4724,0.6416,0.801,0.7597,0.2254,0.3406,0.4747,0.6439,0.8025,0.7748
DecisionTreeRegressor,0.333,0.4137,0.5771,0.4653,0.703,0.6588,0.2647,0.3705,0.5145,0.5817,0.7645,0.7218
RandomForestRegressor,0.2468,0.3646,0.4968,0.6038,0.7771,0.7338,0.2451,0.3582,0.495,0.6128,0.7829,0.7518
GradientBoostingRegressor,0.2255,0.3469,0.4748,0.638,0.7988,0.758,0.2251,0.343,0.4745,0.6443,0.8029,0.7693
AdaBoostRegressor,0.4333,0.5004,0.6583,0.3043,0.5528,0.4393,0.4337,0.5046,0.6585,0.3147,0.5624,0.5842
XGBRegressor,0.2257,0.3464,0.475,0.6377,0.7987,0.7547,0.2227,0.3395,0.4719,0.6481,0.8053,0.7721
ExtraTreesRegressor,0.2336,0.3557,0.4833,0.6249,0.7911,0.7463,0.2343,0.3531,0.484,0.6298,0.794,0.7565
LinearRegression,0.4313,0.4481,0.6567,0.3075,0.6039,0.6568,0.3727,0.438,0.6105,0.4111,0.6514,0.6831
KNeighborsRegressor,0.3311,0.4255,0.5754,0.4683,0.6869,0.6499,0.3329,0.4205,0.577,0.4739,0.6905,0.6629
SVR,0.2854,0.3848,0.5342,0.5417,0.7371,0.7031,0.2889,0.3851,0.5375,0.5434,0.738,0.7127


In [40]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test Predictions Mean,Test Predictions Std,Best Parameters
0,LGBMRegressor,"[-6.817866261888496, -7.137931914876334, -5.84...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-7.162042678459291, -6.632231262119943, -7.21...","[0.16348363939659113, 0.18819895725168487, 0.1...","{'learning_rate': 0.05, 'n_estimators': 400, '..."
1,DecisionTreeRegressor,"[-7.5, -6.91, -6.145714285714286, -4.935, -6.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.991000000000001, -7.0503333333333345, -6.7...","[0.002905932629026918, 0.3295795301086926, 0.2...","{'max_depth': 20, 'min_samples_split': 10}"
2,RandomForestRegressor,"[-6.6731120624630424, -6.77669287048659, -6.03...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.840542426420598, -6.661657907110151, -6.61...","[0.03511300318449715, 0.10450245657358637, 0.1...","{'max_depth': 20, 'min_samples_split': 5, 'n_e..."
3,GradientBoostingRegressor,"[-6.933956402733556, -7.417083917270874, -5.84...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-7.023203952777467, -6.689588218207044, -7.34...","[0.11651566568524581, 0.14537379738421946, 0.2...","{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
4,AdaBoostRegressor,"[-5.7208223238949945, -5.770533661161989, -5.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-5.728374099340698, -5.727809713594145, -5.75...","[0.0064370506631836535, 0.006295110751366476, ...","{'learning_rate': 0.01, 'n_estimators': 100}"
5,XGBRegressor,"[-6.742195, -6.990505, -5.996362, -5.2777576, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.708071, -6.613443, -7.3051972, -6.545186, ...","[0.14307745, 0.08720191, 0.26316768, 0.1302703...",{}
6,ExtraTreesRegressor,"[-6.718260356645456, -6.840379939028495, -5.73...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.9292457917709855, -6.678886810525709, -6.5...","[0.02737726458589667, 0.1086417150641323, 0.08...","{'max_depth': 20, 'min_samples_split': 10, 'n_..."
7,LinearRegression,"[-5.847762903472198, -6.978361273968966, -5.18...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-7.162350714307841, -5.444344040572174, -5.56...","[0.0582278485483556, 0.10941785363993842, 0.04...",{}
8,KNeighborsRegressor,"[-4.682, -6.7780000000000005, -5.322, -4.93999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.8995999999999995, -5.0496, -6.4824, -5.428...","[0.06479999999999962, 0.1920141661440635, 0.09...","{'n_neighbors': 5, 'weights': 'uniform'}"
9,SVR,"[-5.377493057196427, -6.97376881883095, -4.854...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-7.125511785364933, -5.144874651063345, -5.52...","[0.05562096334793436, 0.0595969220811658, 0.06...","{'C': 10, 'epsilon': 0.2, 'gamma': 0.001}"


In [41]:
result_df.to_csv('atomic_results/Results_all_atomic_desc_and_mono_comp_with_HPT.csv')
prediction_df.to_csv('atomic_results/Prediction_data_all_atomic_desc_and_mono_comp_with_HPT.csv')

In [42]:
import sklearn
print(sklearn.__version__)

1.6.0


In [43]:
#Atomic descriptors
df_train = pd.read_csv('Atomic_features/Train_all_atomic_desc.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Atomic_features/Test_all_atomic_desc.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models_degree = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict_with_tuning(models, param_grids, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 23)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 23)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
LGBMRegressor : Default params {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': 'regression', 'random_state': 101, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'metric': 'rmse'}




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 826
[LightGBM] [Info] Number of data points in the train set: 5568, number of used features: 17
[LightGBM] [Info] Start training from score -5.742906
LGBMRegressor
: best params {'learning_rate': 0.1, 'n_estimators': 400, 'num_leaves': 31}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 17
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000225 secon



MLPRegressor
: best params {'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'max_iter': 100}




0.44949495890294255




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2812,0.3844,0.5302,0.5486,0.7417,0.7018,0.2829,0.385,0.5318,0.553,0.7446,0.724
DecisionTreeRegressor,0.3442,0.4087,0.5867,0.4474,0.6846,0.6681,0.305,0.3995,0.5523,0.518,0.7228,0.7073
RandomForestRegressor,0.2821,0.385,0.5312,0.547,0.7397,0.7051,0.2857,0.3866,0.5345,0.5486,0.7411,0.7221
GradientBoostingRegressor,0.2775,0.3851,0.5268,0.5545,0.7446,0.7025,0.2838,0.3874,0.5327,0.5515,0.7429,0.7227
AdaBoostRegressor,0.4726,0.5305,0.6875,0.2411,0.4954,0.4349,0.4857,0.5395,0.697,0.2324,0.4855,0.5137
XGBRegressor,0.2836,0.3826,0.5325,0.5447,0.7396,0.7046,0.2851,0.3833,0.534,0.5494,0.7426,0.7237
ExtraTreesRegressor,0.2702,0.3792,0.5198,0.5662,0.7525,0.7164,0.2846,0.3877,0.5335,0.5502,0.742,0.7238
LinearRegression,0.4848,0.5261,0.6963,0.2216,0.471,0.4781,0.4895,0.5267,0.6996,0.2266,0.4767,0.4913
KNeighborsRegressor,0.3112,0.4016,0.5579,0.5003,0.7128,0.6767,0.3265,0.4051,0.5714,0.484,0.7018,0.6883
SVR,0.3031,0.3917,0.5506,0.5133,0.7185,0.692,0.301,0.3948,0.5486,0.5244,0.7266,0.7135


In [44]:
result_df.to_csv('atomic_results/Results_all_atomic_desc_with_HPT.csv')
prediction_df.to_csv('atomic_results/Prediction_data_all_atomic_desc_with_HPT.csv')

In [45]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test Predictions Mean,Test Predictions Std,Best Parameters
0,LGBMRegressor,"[-6.265791613932967, -7.3557504535674845, -6.5...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-7.106041947223659, -6.809055754094911, -6.69...","[0.07542260238518948, 0.09346301987587483, 0.1...","{'learning_rate': 0.1, 'n_estimators': 400, 'n..."
1,DecisionTreeRegressor,"[-5.995000000000001, -6.99, -6.279999999999999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.996, -6.526222222222222, -6.484, -6.721000...","[0.004898979485566252, 0.42420519726037764, 0....","{'max_depth': 20, 'min_samples_split': 10}"
2,RandomForestRegressor,"[-6.56264261904762, -6.952133482142858, -6.378...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.945640992243869, -6.634083020202018, -6.74...","[0.02913598232735361, 0.10005692730556243, 0.0...","{'max_depth': 20, 'min_samples_split': 5, 'n_e..."
3,GradientBoostingRegressor,"[-5.972378010025126, -7.00341994239865, -6.240...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-7.0447297099781085, -6.719699428321012, -6.5...","[0.08610692692626387, 0.16856260581893254, 0.2...","{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
4,AdaBoostRegressor,"[-5.330750747106483, -5.653247422680414, -5.23...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-5.679952412756883, -5.2983773340508105, -5.5...","[0.030452640313360643, 0.018661615664841272, 0...","{'learning_rate': 0.01, 'n_estimators': 200}"
5,XGBRegressor,"[-6.369783, -7.138793, -6.2215147, -5.1384716,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.8326797, -6.6089973, -6.7012877, -6.077459...","[0.1327724, 0.19248006, 0.099578865, 0.3078653...",{}
6,ExtraTreesRegressor,"[-6.663466646825389, -6.923384146825396, -6.10...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.969946396825394, -6.640271549603168, -6.49...","[0.006007764501206538, 0.22005739700162202, 0....","{'max_depth': 20, 'min_samples_split': 10, 'n_..."
7,LinearRegression,"[-4.127934954999589, -6.787257203614937, -5.56...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.57357645783934, -3.930882232119134, -4.836...","[0.06058044276649575, 0.039914685448258484, 0....",{}
8,KNeighborsRegressor,"[-5.317353170651615, -6.957369374354451, -5.86...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.969276168025876, -6.245583741929467, -5.85...","[0.028033002827688506, 0.4834593258446868, 0.0...","{'n_neighbors': 10, 'weights': 'distance'}"
9,SVR,"[-6.249897911401852, -6.848998070187065, -6.35...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[-6.4504672745053515, -6.7115146555090375, -6....","[0.05074646033404196, 0.16872994871226235, 0.5...","{'C': 10, 'epsilon': 0.2, 'gamma': 1}"
