In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
# from lightgbm.lgb import LGBMRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

In [3]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [6]:
#All fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/All_fingerprints_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/All_fingerprints_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 20188)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 20188)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.185951 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17668
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 4093
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.153224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17848
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 4154
[LightGBM] [Info] Start training from scor

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2166,0.3418,0.4654,0.6523,0.8087,0.7776,0.2174,0.3404,0.4662,0.6565,0.8124,0.8
DecisionTreeRegressor,0.3053,0.3848,0.5525,0.5099,0.7351,0.7215,0.2522,0.3489,0.5022,0.6014,0.7778,0.7756
RandomForestRegressor,0.221,0.3445,0.4702,0.6451,0.8032,0.7753,0.2263,0.3424,0.4757,0.6424,0.8019,0.7905
GradientBoostingRegressor,0.2549,0.3738,0.5049,0.5908,0.7747,0.7402,0.258,0.3728,0.5079,0.5924,0.7764,0.7571
AdaBoostRegressor,0.4559,0.5432,0.6752,0.2681,0.5941,0.5686,0.42,0.5214,0.6481,0.3363,0.6457,0.6197
XGBRegressor,0.2129,0.3374,0.4614,0.6582,0.8115,0.7813,0.215,0.333,0.4637,0.6602,0.8127,0.7967
ExtraTreesRegressor,0.2199,0.3434,0.4689,0.6469,0.8047,0.7752,0.2289,0.3379,0.4784,0.6383,0.7993,0.7926
LinearRegression,0.607,0.4885,0.7791,0.0254,0.5634,0.6441,0.4334,0.4419,0.6583,0.3152,0.629,0.6754
KNeighborsRegressor,0.3169,0.4093,0.5629,0.4912,0.7131,0.6872,0.2944,0.3864,0.5426,0.5348,0.7372,0.7346
SVR,0.2802,0.3776,0.5293,0.5501,0.7446,0.7279,0.2795,0.3725,0.5286,0.5584,0.7501,0.75


In [7]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.723532325095607, -6.757128308474924, -6.11...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.025019269942802, -6.132890118021946, -6.8...","[-7.02602636807954, -6.259715784974982, -6.862...","[0.030522988836645018, 0.11372157756993759, 0...."
1,DecisionTreeRegressor,"[-6.96, -7.0, -6.244999999999999, -5.05, -5.15...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -7.0, -5.82, -7.0, -6.85,...","[-7.0, -6.17, -6.736, -6.784000000000001, -5.9...","[0.0, 0.1400000000000002, 0.5280000000000001, ..."
2,RandomForestRegressor,"[-6.645700000000001, -6.8139, -6.1496416666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8477, -6.355033333333336, -6.853650000000...","[-6.885160000000001, -6.363293333333336, -6.78...","[0.06119926796947837, 0.07587743450240447, 0.1..."
3,GradientBoostingRegressor,"[-6.834707610411482, -6.780458603379971, -5.94...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.014800255424059, -6.354714096498743, -6.5...","[-7.081008707549399, -6.094698571665337, -6.62...","[0.1409827103098058, 0.2119767294302283, 0.150..."
4,AdaBoostRegressor,"[-5.703940298507472, -5.967714653831089, -5.65...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.801543859649126, -5.703940298507472, -5.8...","[-5.8356372499046785, -5.673631414828848, -5.8...","[0.04212357213313098, 0.09297574324594704, 0.1..."
5,XGBRegressor,"[-7.035523, -6.8859987, -6.4079747, -5.390928,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.879666, -6.3244767, -6.7222857, -6.899798...","[-6.944411, -6.289349, -6.9273634, -6.7366242,...","[0.11411563, 0.11128185, 0.25650784, 0.3348497..."
6,ExtraTreesRegressor,"[-6.8808000000000025, -6.8926000000000025, -6....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9411000000000005, -6.614100000000003, -6....","[-6.93304, -6.536130000000002, -6.853319999999...","[0.03675685514295252, 0.1769068331071474, 0.11..."
7,LinearRegression,"[-4.184666015309894, -3.9, -5.462597510763377,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-9.469087112897139, -6.217165338319693, -10....","[-7.529922643134244, -4.5520748361292265, -9.4...","[1.617090346638004, 0.8839149990555397, 0.6897..."
8,KNeighborsRegressor,"[-5.633333333333333, -6.113333333333333, -5.62...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.293333333333334, -5.786666666666666...","[-6.9093333333333335, -5.173333333333334, -6.0...","[0.04533333333333331, 0.29806039656418654, 0.3..."
9,SVR,"[-6.068647309352145, -6.479146229893767, -4.98...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.6575983988830085, -5.650275727322878, -5....","[-6.787339039858615, -5.647180199957969, -5.99...","[0.07004657008641471, 0.1387517159060542, 0.10..."


In [14]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_All_fingerprints_fp.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_All_fingerprints_fp.csv')

In [15]:
#Removal of constant columns
def remove_constant_columns(df):
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    
    df_cleaned = df.drop(columns=constant_columns)
    
    return df_cleaned, constant_columns

In [16]:
#Low variance column removal
def remove_low_variance_columns(df, threshold=0.005):
    df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

In [21]:
#All fingerprints constant removal
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/All_fingerprints_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/All_fingerprints_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 6820)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 6820)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 86.239655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17668
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 4093
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 92.098963 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17848
[LightGBM] [Info] Number of data points in the train set: 4454, number o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2166,0.3418,0.4654,0.6523,0.8087,0.7776,0.2174,0.3404,0.4662,0.6565,0.8124,0.8
DecisionTreeRegressor,0.3058,0.3857,0.553,0.509,0.7336,0.7183,0.2501,0.3481,0.5001,0.6049,0.78,0.7769
RandomForestRegressor,0.2211,0.3447,0.4702,0.645,0.8032,0.7752,0.2265,0.3423,0.4759,0.6421,0.8017,0.7903
GradientBoostingRegressor,0.2546,0.3737,0.5046,0.5912,0.775,0.7402,0.2582,0.3727,0.5081,0.5921,0.7762,0.7575
AdaBoostRegressor,0.4593,0.5484,0.6777,0.2625,0.5952,0.5562,0.427,0.5293,0.6535,0.3252,0.641,0.6085
XGBRegressor,0.2129,0.3374,0.4614,0.6582,0.8115,0.7813,0.215,0.333,0.4637,0.6602,0.8127,0.7967
ExtraTreesRegressor,0.2196,0.3431,0.4687,0.6473,0.8049,0.7753,0.2292,0.3384,0.4787,0.6379,0.7991,0.7913
LinearRegression,0.607,0.4885,0.7791,0.0254,0.5634,0.6441,0.4334,0.4419,0.6583,0.3152,0.629,0.6754
KNeighborsRegressor,0.3169,0.4093,0.5629,0.4912,0.7131,0.6872,0.2944,0.3864,0.5426,0.5348,0.7372,0.7346
SVR,0.2802,0.3776,0.5293,0.5501,0.7446,0.728,0.2795,0.3725,0.5286,0.5584,0.7501,0.75


In [22]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.723532325095607, -6.757128308474924, -6.11...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.025019269942802, -6.132890118021946, -6.8...","[-7.02602636807954, -6.259715784974982, -6.862...","[0.030522988836645018, 0.11372157756993759, 0...."
1,DecisionTreeRegressor,"[-7.0, -6.96, -6.244999999999999, -5.05, -5.15...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -5.89, -6.24, -7.0, -6.85...","[-6.992, -6.328, -6.728, -6.26, -5.934, -6.886...","[0.016000000000000014, 0.35812846856959024, 0...."
2,RandomForestRegressor,"[-6.669400000000003, -6.829200000000002, -6.20...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.839600000000001, -6.315100000000004, -6.8...","[-6.859360000000001, -6.341393333333335, -6.77...","[0.05744759699064883, 0.06991268014061906, 0.1..."
3,GradientBoostingRegressor,"[-6.834707610411482, -6.780458603379971, -5.94...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.014800255424059, -6.354714096498744, -6.5...","[-7.095369312619124, -6.0946985716653375, -6.6...","[0.1653608132690947, 0.21197672943022805, 0.17..."
4,AdaBoostRegressor,"[-6.09392156862745, -5.928226797350341, -5.740...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.992080200501258, -6.09392156862745, -5.93...","[-5.8763893250350465, -5.749057965919758, -5.8...","[0.06688163443042254, 0.19197065180204043, 0.1..."
5,XGBRegressor,"[-7.035523, -6.8859987, -6.4079747, -5.390928,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.879666, -6.3244767, -6.7222857, -6.899798...","[-6.944411, -6.289349, -6.9273634, -6.7366242,...","[0.11411563, 0.11128185, 0.25650784, 0.3348497..."
6,ExtraTreesRegressor,"[-6.874499999999999, -6.838150000000001, -6.09...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.981599999999999, -6.595000000000002, -6.9...","[-6.945400000000001, -6.542400000000002, -6.91...","[0.04719563539142124, 0.12452805306436107, 0.0..."
7,LinearRegression,"[-4.184666015311104, -3.9, -5.462597510763344,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-9.469087112895288, -6.217165338320154, -10....","[-7.52992264313348, -4.552074836128948, -9.441...","[1.6170903466378592, 0.8839149990556785, 0.689..."
8,KNeighborsRegressor,"[-5.633333333333333, -6.113333333333333, -5.62...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.293333333333334, -5.786666666666666...","[-6.9093333333333335, -5.173333333333334, -6.0...","[0.04533333333333331, 0.29806039656418654, 0.3..."
9,SVR,"[-6.068738157365336, -6.479147174669174, -4.98...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.657570972953301, -5.650214246750718, -5.9...","[-6.787374427454907, -5.647125611832698, -5.99...","[0.07009217631071249, 0.13875249812984033, 0.1..."


In [23]:
X_train.columns

Index(['Morgan_fp_1', 'Morgan_fp_2', 'Morgan_fp_5', 'Morgan_fp_7',
       'Morgan_fp_11', 'Morgan_fp_12', 'Morgan_fp_13', 'Morgan_fp_14',
       'Morgan_fp_18', 'Morgan_fp_19',
       ...
       'SubFPC295', 'SubFPC296', 'SubFPC297', 'SubFPC298', 'SubFPC299',
       'SubFPC300', 'SubFPC301', 'SubFPC302', 'SubFPC303', 'SubFPC307'],
      dtype='object', length=6820)

In [24]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_All_const_rem_fingerprints.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_All_const_rem_fingerprints.csv')

In [25]:
#Morgan fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/morgan_fp_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/morgan_fp_test.csv')
X_test = df_test.drop(['ID','SMILES','Permeability'],axis=1)
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_fp

X_train shape:  (5568, 2048)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 2048)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 12.134623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1314
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 438
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 12.129180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1332
[LightGBM] [Info] Number of data points in the train set: 4454, number of u

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2626,0.3764,0.5124,0.5784,0.7616,0.7391,0.2628,0.3762,0.5126,0.5848,0.7669,0.748
DecisionTreeRegressor,0.3357,0.3992,0.5794,0.4611,0.7055,0.6982,0.2745,0.3668,0.5239,0.5662,0.7566,0.7579
RandomForestRegressor,0.2484,0.3649,0.4984,0.6012,0.7755,0.7531,0.2495,0.36,0.4995,0.6057,0.7784,0.7689
GradientBoostingRegressor,0.3076,0.4107,0.5547,0.506,0.7167,0.688,0.3165,0.4154,0.5626,0.4998,0.7133,0.6994
AdaBoostRegressor,0.4891,0.5599,0.6994,0.2147,0.5377,0.5077,0.4615,0.5426,0.6793,0.2707,0.5873,0.5623
XGBRegressor,0.2449,0.3626,0.4949,0.6068,0.7793,0.7501,0.2488,0.3608,0.4988,0.6068,0.7791,0.77
ExtraTreesRegressor,0.3202,0.3947,0.5658,0.4859,0.7155,0.704,0.269,0.3643,0.5187,0.5749,0.7616,0.763
LinearRegression,0.386,0.4349,0.6213,0.3802,0.6491,0.6659,0.3492,0.4182,0.5909,0.4482,0.6781,0.6906
KNeighborsRegressor,0.3391,0.4229,0.5823,0.4555,0.6885,0.6678,0.3121,0.3951,0.5587,0.5068,0.7195,0.7233
SVR,0.3028,0.3941,0.5502,0.5139,0.7198,0.701,0.2927,0.3834,0.5411,0.5374,0.7358,0.7295


In [26]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.190319007163, -6.718037755521681, -5.03932...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.830677855994803, -6.211924608378423, -6.7...","[-6.901668170436784, -5.9541336042575566, -6.6...","[0.06515699863454225, 0.20047925485507848, 0.1..."
1,DecisionTreeRegressor,"[-4.74, -6.96, -6.244999999999999, -4.6, -5.15...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.89, -7.0, -6.1762500000000005, -5.3...","[-6.9319999999999995, -5.809, -7.0, -6.1198076...","[0.13599999999999995, 0.8360047846753033, 0.0,..."
2,RandomForestRegressor,"[-5.830900000000003, -6.801200000000002, -5.72...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.815499999999998, -6.330100000000004, -6.8...","[-6.794, -6.038314333333334, -6.73889999999999...","[0.06246160420610471, 0.20606979308423143, 0.0..."
3,GradientBoostingRegressor,"[-6.030942209162285, -6.71097942841647, -5.050...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.7944894415330035, -5.761297620685506, -6....","[-6.91414210056994, -5.649887917323126, -6.194...","[0.211799414970668, 0.11671699407950403, 0.203..."
4,AdaBoostRegressor,"[-5.420085531005007, -5.6748803819546865, -5.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.6444754852570505, -5.404039423379018, -5....","[-5.612450773743555, -5.487947238284226, -5.70...","[0.06469839638262588, 0.06108324217503584, 0.1..."
5,XGBRegressor,"[-5.6834197, -6.7127576, -5.787127, -4.926085,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0361915, -6.325289, -7.7694483, -6.632345...","[-7.047, -6.2467294, -7.2835402, -6.8017426, -...","[0.29608455, 0.26674294, 0.39136094, 0.3842641..."
6,ExtraTreesRegressor,"[-4.9710000000000045, -6.9868000000000015, -6....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.88999999999999, -7.0, -6.1142588827...","[-6.94356, -6.058909999999995, -6.979600000000...","[0.11288000000000019, 0.4816452557640336, 0.04..."
7,LinearRegression,"[-6.3965947344629575, -5.379095452248684, -5.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-8.396759804506443, -5.724038085901541, -7.1...","[-8.736919145735147, -5.011474211239254, -6.36...","[1.8329213392801977, 0.4174016309455181, 1.712..."
8,KNeighborsRegressor,"[-5.633333333333333, -5.650000000000001, -5.64...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.3999999999999995, -5.38, -6.2266666666666...","[-6.789333333333333, -4.868666666666667, -6.20...","[0.1946666666666669, 0.27924978862023225, 0.24..."
9,SVR,"[-5.863830515598096, -6.011731763145798, -4.92...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.016751774444159, -5.586055809139273, -5.9...","[-6.0838399422311324, -5.459675200464696, -5.9...","[0.05302493339319797, 0.07383541476259833, 0.0..."


In [27]:
df_morgan_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Morgan_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Morgan_fp.csv')

In [28]:
#Morgan count fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/count_morgan_fp_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/count_morgan_fp_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_morgan_count_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_morgan_count_fp

X_train shape:  (5568, 2048)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 2048)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 11.512201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2049
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 447
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 10.975774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2072
[LightGBM] [Info] Number of data points in the train set: 4454, number of u

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2349,0.356,0.4846,0.6229,0.7903,0.7605,0.2322,0.3497,0.4818,0.6331,0.7981,0.7837
DecisionTreeRegressor,0.3016,0.3854,0.5492,0.5158,0.7375,0.721,0.2522,0.3502,0.5022,0.6015,0.777,0.7758
RandomForestRegressor,0.2296,0.3519,0.4792,0.6313,0.7946,0.7663,0.2388,0.3478,0.4887,0.6226,0.7893,0.7841
GradientBoostingRegressor,0.2791,0.3916,0.5283,0.5519,0.7492,0.7135,0.2849,0.3911,0.5337,0.5499,0.75,0.7272
AdaBoostRegressor,0.4779,0.5531,0.6913,0.2326,0.5649,0.5507,0.4447,0.5331,0.6669,0.2973,0.613,0.596
XGBRegressor,0.2199,0.3426,0.469,0.6469,0.8044,0.7774,0.2234,0.3416,0.4726,0.647,0.8046,0.7915
ExtraTreesRegressor,0.2241,0.3471,0.4734,0.6402,0.8005,0.7724,0.2305,0.3416,0.4801,0.6358,0.7976,0.7894
LinearRegression,0.348,0.4224,0.59,0.4412,0.6727,0.6714,0.3385,0.4161,0.5818,0.4651,0.6855,0.7046
KNeighborsRegressor,0.3187,0.4119,0.5646,0.4882,0.7084,0.6849,0.2985,0.3899,0.5464,0.5283,0.7331,0.7331
SVR,0.2948,0.3856,0.5429,0.5267,0.7282,0.7135,0.2857,0.3764,0.5345,0.5485,0.7425,0.7398


In [29]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.449185163697374, -6.596589019935487, -5.95...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.056119628542196, -6.096394503453853, -6.7...","[-7.017115840615927, -6.052887869020509, -6.73...","[0.08968130039351668, 0.26396027609363937, 0.0..."
1,DecisionTreeRegressor,"[-5.36, -5.77, -6.244999999999999, -5.05, -4.2...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.27, -7.0, -7.0, -6.24, -7.0, -6.85,...","[-7.0, -5.952, -6.868, -6.476000000000001, -5....","[0.0, 0.5911142021640152, 0.26400000000000007,..."
2,RandomForestRegressor,"[-6.099700000000002, -6.251600000000001, -6.13...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.705799999999999, -5.8228000000000035, -6....","[-6.813979999999999, -5.816485000000002, -6.76...","[0.07220718523803626, 0.2891633515852259, 0.05..."
3,GradientBoostingRegressor,"[-6.247998976200171, -6.328466421002222, -5.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.683679257615391, -5.718991021754435, -6.4...","[-6.923902195759925, -5.7316140316949395, -6.4...","[0.12967163152247468, 0.22189762746993172, 0.0..."
4,AdaBoostRegressor,"[-5.628926396875004, -5.628926396875004, -5.64...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.670701618893616, -5.608007149539823, -5.6...","[-5.664471367414736, -5.5718437500836, -5.7671...","[0.05008250675800045, 0.0569005873715254, 0.11..."
5,XGBRegressor,"[-6.5082974, -6.6570473, -6.6209173, -5.598532...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.826899, -6.211263, -6.8936095, -7.1406193...","[-7.15606, -6.0492535, -7.0972013, -6.8656754,...","[0.20176062, 0.2768126, 0.22253177, 0.2013783,..."
6,ExtraTreesRegressor,"[-5.9293000000000005, -6.716400000000002, -6.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.902099999999999, -6.201400000000004, -6.7...","[-6.90134, -6.127810000000004, -6.782379999999...","[0.055375106320439196, 0.2751944265424015, 0.0..."
7,LinearRegression,"[-6.290107798868079, -5.088392957610974, -4.83...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.17635510576849, -5.219679763498499, -7.92...","[-7.124173486876117, -5.162183623304773, -7.61...","[0.6173754068886426, 0.18405284167909028, 0.38..."
8,KNeighborsRegressor,"[-5.633333333333333, -5.650000000000001, -5.62...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.3999999999999995, -5.38, -6.2266666666666...","[-6.789333333333333, -4.868666666666667, -6.20...","[0.1946666666666669, 0.27924978862023225, 0.24..."
9,SVR,"[-5.879302111430403, -5.967099556949184, -5.14...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.0146032130810045, -5.53883521583049, -5.9...","[-6.073979411009359, -5.4198879227094645, -5.9...","[0.04962852406456833, 0.08950116031302308, 0.0..."


In [31]:
df_morgan_count_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Count_Morgan_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Count_Morgan_fp.csv')

In [32]:
#AtomPairs2d fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/AtomPairs2D_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/AtomPairs2D_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2D_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2D_fp

X_train shape:  (5568, 780)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 780)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.735819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 94
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.968910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 4454, number of used fea

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4726,0.5227,0.6875,0.2411,0.4912,0.408,0.4689,0.5256,0.6847,0.2591,0.5097,0.4451
DecisionTreeRegressor,0.4671,0.5164,0.6835,0.25,0.5024,0.4188,0.4628,0.5179,0.6803,0.2687,0.5189,0.4717
RandomForestRegressor,0.4657,0.516,0.6825,0.2522,0.5036,0.4189,0.4609,0.5184,0.6789,0.2717,0.5216,0.4683
GradientBoostingRegressor,0.4719,0.5233,0.687,0.2422,0.4926,0.4081,0.4697,0.526,0.6854,0.2578,0.5104,0.4564
AdaBoostRegressor,0.5591,0.5956,0.7477,0.1023,0.3975,0.3135,0.543,0.5896,0.7369,0.1419,0.4403,0.378
XGBRegressor,0.4687,0.5172,0.6846,0.2475,0.4998,0.4169,0.4634,0.5194,0.6807,0.2677,0.5178,0.467
ExtraTreesRegressor,0.4686,0.5168,0.6845,0.2476,0.5003,0.4177,0.463,0.5184,0.6804,0.2684,0.5186,0.4702
LinearRegression,0.498,0.5378,0.7057,0.2005,0.4487,0.3847,0.4936,0.5374,0.7025,0.2201,0.4695,0.4263
KNeighborsRegressor,1.0221,0.761,1.011,-0.6411,0.1001,0.046,1.0013,0.7524,1.0007,-0.5823,0.1097,0.0597
SVR,0.4984,0.5149,0.706,0.1998,0.4685,0.389,0.4985,0.5165,0.706,0.2123,0.4837,0.4462


In [33]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-4.9086869084682085, -6.102272875476731, -4.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.102272875476731, -4.9086869084682085, -4....","[-6.201250970679353, -4.8980753335735185, -4.8...","[0.10658676709104507, 0.006126817587410635, 0...."
1,DecisionTreeRegressor,"[-4.9063112745098, -6.891818181818182, -4.9063...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.891818181818182, -4.9063112745098, -4.906...","[-6.905297702297702, -4.891983787248, -4.89198...","[0.029975904951973498, 0.007545921192803515, 0..."
2,RandomForestRegressor,"[-4.904347947744239, -6.897547414701313, -4.90...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.897547414701313, -4.904347947744239, -4.9...","[-6.907922054244575, -4.891921724700505, -4.89...","[0.02748085836576134, 0.00648588223071733, 0.0..."
3,GradientBoostingRegressor,"[-4.942770303007276, -6.38923752183285, -4.942...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.38923752183285, -4.942770303007276, -4.94...","[-6.443654054904175, -4.934503426654722, -4.93...","[0.10428035464208042, 0.006686660428874856, 0...."
4,AdaBoostRegressor,"[-5.189037037037035, -5.595292353800221, -5.18...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.595292353800221, -5.189037037037035, -5.1...","[-5.476668145499879, -5.217025649232757, -5.21...","[0.07943032530477756, 0.03226372975112234, 0.0..."
5,XGBRegressor,"[-4.905953, -6.8887696, -4.905953, -4.905953, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8887696, -4.905953, -4.905953, -5.7109632...","[-6.903834, -4.8910723, -4.8910723, -5.7000566...","[0.02937797, 0.007928123, 0.007928123, 0.23334..."
6,ExtraTreesRegressor,"[-4.906311274509793, -6.8918181818181745, -4.9...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8918181818181745, -4.906311274509793, -4....","[-6.905297702297702, -4.891983787248001, -4.89...","[0.029975904951973303, 0.00754592119280148, 0...."
7,LinearRegression,"[-4.981106872180091, -6.077775887965633, -4.98...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.077775887965633, -4.981106872180091, -4.9...","[-6.122504837623203, -4.9665795364759475, -4.9...","[0.10912193600715611, 0.007268269204355609, 0...."
8,KNeighborsRegressor,"[-7.0, -6.986666666666667, -7.0, -7.0, -7.0, -...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -7.0, -7.0, -6.206666666...","[-6.984, -7.0, -7.0, -5.948666666666667, -7.0,...","[0.005333333333333456, 0.0, 0.0, 0.36911666328..."
9,SVR,"[-4.7003010070712685, -6.9000107340154155, -4....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9000107340154155, -4.7003010070712685, -4...","[-6.900020644181604, -4.697967948288505, -4.69...","[8.999915137827895e-05, 0.0041328897161808644,..."


In [34]:
df_AtomPairs2D_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_AtomPairs2D_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_AtomPairs2D_fp.csv')


In [35]:
#AtomPairs2d Count fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/AtomPairs2DCount_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/AtomPairs2DCount_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_AtomPairs2DCount_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_AtomPairs2DCount_fp

X_train shape:  (5568, 780)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 780)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.973961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2865
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 133
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.629845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2882
[LightGBM] [Info] Number of data points in the train set: 4454, number of used 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2382,0.3571,0.4881,0.6175,0.7874,0.758,0.2436,0.3568,0.4935,0.6151,0.786,0.7739
DecisionTreeRegressor,0.3242,0.3893,0.5694,0.4795,0.7143,0.7161,0.2542,0.3524,0.5042,0.5983,0.7766,0.7752
RandomForestRegressor,0.2342,0.3512,0.4839,0.624,0.7899,0.7649,0.2447,0.3492,0.4947,0.6133,0.7835,0.7791
GradientBoostingRegressor,0.2798,0.3954,0.529,0.5507,0.7498,0.7107,0.291,0.3977,0.5395,0.5401,0.7425,0.731
AdaBoostRegressor,0.5206,0.5902,0.7215,0.1641,0.5374,0.4745,0.4873,0.5716,0.6981,0.2299,0.5885,0.5179
XGBRegressor,0.2346,0.3498,0.4843,0.6233,0.7902,0.7639,0.238,0.3439,0.4878,0.624,0.7906,0.7829
ExtraTreesRegressor,0.2234,0.3461,0.4727,0.6412,0.8009,0.7748,0.2318,0.3426,0.4814,0.6338,0.7966,0.7847
LinearRegression,0.4127,0.4729,0.6424,0.3373,0.5846,0.6074,0.394,0.463,0.6277,0.3773,0.6149,0.645
KNeighborsRegressor,0.2751,0.3804,0.5245,0.5583,0.7538,0.7212,0.2642,0.3683,0.514,0.5825,0.7674,0.7547
SVR,0.3351,0.4119,0.5789,0.4619,0.6875,0.6782,0.3466,0.4162,0.5888,0.4522,0.6811,0.6885


In [36]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.776166605845609, -6.961229630383343, -6.76...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.061584125883738, -6.776166605845609, -6.8...","[-7.085543113794145, -6.750700688177885, -6.83...","[0.06921389987843728, 0.03254166801992616, 0.0..."
1,DecisionTreeRegressor,"[-6.51, -7.0, -7.0, -5.05, -7.0, -5.05, -5.05,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.11, -7.0, -7.0, -6.11, -6.8, -6.85,...","[-6.962000000000001, -6.518000000000001, -7.0,...","[0.058103356185335936, 0.39640383449204913, 0...."
2,RandomForestRegressor,"[-6.8196, -6.9311, -6.837433333333334, -5.3637...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9185, -6.748300000000001, -6.939900000000...","[-6.924170000000001, -6.724100000000002, -6.92...","[0.023132868391100958, 0.07852151297574445, 0...."
3,GradientBoostingRegressor,"[-6.758107393407671, -6.848592545320579, -6.61...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.204544935589532, -6.627400163868768, -6.9...","[-7.233705996106201, -6.644991246699744, -6.83...","[0.12057279321492244, 0.08712158720539408, 0.0..."
4,AdaBoostRegressor,"[-6.020984126984135, -6.016307649232414, -6.01...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.158888888888891, -6.185599999999999, -6.1...","[-6.2060298891208845, -6.200845391967694, -6.2...","[0.22918834427835977, 0.22192059621590743, 0.2..."
5,XGBRegressor,"[-6.817171, -7.015907, -6.890958, -5.4465914, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.190226, -6.550555, -7.0217266, -6.8290143...","[-7.000787, -6.688304, -6.9831047, -6.4883804,...","[0.19644047, 0.097358644, 0.026843786, 0.20188..."
6,ExtraTreesRegressor,"[-6.734900000000001, -6.9430499999999995, -6.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9799, -6.625000000000003, -6.974499999999...","[-6.952000000000001, -6.7403400000000016, -6.9...","[0.025896949627321118, 0.1077930535795317, 0.0..."
7,LinearRegression,"[-5.730253913531768, -5.609661702595346, -5.32...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.577591587074083, -4.999496207947311, -5.7...","[-6.5755122470628695, -4.992418580737516, -5.6...","[0.06021195738358451, 0.07789941311010345, 0.0..."
8,KNeighborsRegressor,"[-6.746666666666667, -7.0, -6.05, -4.96, -4.88...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -6.746666666666667, -7.0...","[-6.989333333333333, -6.797333333333334, -7.0,...","[0.005333333333333102, 0.10133333333333318, 0...."
9,SVR,"[-7.089332828171322, -6.896461791744283, -6.00...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.028224887085463, -6.606094902467699, -6.6...","[-7.055541082649976, -6.5812216790095075, -6.5...","[0.02947272774284844, 0.051579172886891414, 0...."


In [37]:
df_AtomPairs2DCount_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_AtomPairs2D_Count_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_df_AtomPairs2D_Count_fp.csv')


In [4]:
#EState fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/EState_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/EState_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_estate_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_estate_fp

X_train shape:  (5568, 79)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 79)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.411576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 15
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.495367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 4454, number of used feature

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4991,0.5412,0.7064,0.1987,0.4459,0.3539,0.5122,0.5527,0.7157,0.1906,0.4375,0.3684
DecisionTreeRegressor,0.4801,0.5286,0.6929,0.2291,0.4819,0.3767,0.5123,0.5452,0.7157,0.1905,0.4466,0.3938
RandomForestRegressor,0.4764,0.5282,0.6902,0.2351,0.4859,0.3771,0.5073,0.5443,0.7123,0.1984,0.4522,0.3955
GradientBoostingRegressor,0.4844,0.5341,0.696,0.2222,0.4714,0.3706,0.4851,0.5405,0.6965,0.2334,0.4836,0.3987
AdaBoostRegressor,0.6364,0.6432,0.7977,-0.0218,0.3299,0.2879,0.6237,0.6423,0.7897,0.0144,0.3643,0.378
XGBRegressor,0.4811,0.529,0.6936,0.2275,0.4801,0.3781,0.5094,0.5444,0.7137,0.195,0.4502,0.3953
ExtraTreesRegressor,0.4803,0.5287,0.693,0.2288,0.4816,0.3768,0.5118,0.5448,0.7154,0.1913,0.4473,0.394
LinearRegression,0.5023,0.5424,0.7087,0.1935,0.4403,0.351,0.5078,0.5494,0.7126,0.1976,0.4452,0.3724
KNeighborsRegressor,1.0381,0.7543,1.0189,-0.6668,0.0616,-0.0051,1.0381,0.7533,1.0189,-0.6404,0.0303,-0.0355
SVR,0.5034,0.5206,0.7095,0.1916,0.4638,0.3698,0.5112,0.5297,0.7149,0.1923,0.4661,0.3884


In [5]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-4.7257515416316584, -6.166855400211246, -5.2...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.166855400211246, -4.7257515416316584, -5....","[-6.339492670798963, -4.787478843320073, -5.17...","[0.10211072329341182, 0.031286363442289376, 0...."
1,DecisionTreeRegressor,"[-4.677142857142858, -6.387333333333332, -5.13...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.387333333333332, -4.677142857142858, -5.1...","[-6.452906725146198, -4.750804141523365, -5.10...","[0.05644788211127124, 0.037324128282113095, 0...."
2,RandomForestRegressor,"[-4.682808658584028, -6.350122042329398, -5.12...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.350122042329398, -4.682808658584028, -5.1...","[-6.439855177418162, -4.752358232880523, -5.10...","[0.06421896488370266, 0.035835586931385444, 0...."
3,GradientBoostingRegressor,"[-4.734528154931525, -6.303856554812297, -5.17...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.303856554812297, -4.734528154931525, -5.1...","[-6.40606307495891, -4.812889473467336, -5.153...","[0.08041007231346635, 0.041570310672311823, 0...."
4,AdaBoostRegressor,"[-5.113022727272723, -5.9350834728218125, -5.8...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.9350834728218125, -5.113022727272723, -5....","[-6.003012729360843, -5.42484121932522, -5.845...","[0.09127246542793117, 0.28479966711645976, 0.0..."
5,XGBRegressor,"[-4.67768, -6.38528, -5.1340313, -5.1340313, -...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.38528, -4.67768, -5.1340313, -5.06476, -5...","[-6.4520493, -4.7515783, -5.10966, -5.4020844,...","[0.056535397, 0.037432015, 0.015893554, 0.2128..."
6,ExtraTreesRegressor,"[-4.677142857142868, -6.387333333333341, -5.13...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.387333333333341, -4.677142857142868, -5.1...","[-6.452906725146202, -4.750804141523371, -5.10...","[0.0564478821112736, 0.037324128282111645, 0.0..."
7,LinearRegression,"[-4.735702715986039, -5.985935610486654, -5.18...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.985935610486654, -4.735702715986039, -5.1...","[-6.154884885188663, -4.7845373577075145, -5.1...","[0.09248167881925906, 0.02851715256928644, 0.0..."
8,KNeighborsRegressor,"[-5.603333333333333, -6.986666666666667, -7.0,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.603333333333333, -7.0...","[-6.984, -6.708666666666668, -7.0, -5.17866666...","[0.005333333333333456, 0.5531549913400807, 0.0..."
9,SVR,"[-4.580073596142576, -6.900183296007555, -4.82...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.900183296007555, -4.580073596142576, -4.8...","[-6.899878411008752, -4.612073429558651, -4.80...","[0.00031373434068456956, 0.01934696595562291, ..."


In [6]:
df_estate_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_EState_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_EState_fp.csv')


In [7]:
#Extended fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/Extended_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/Extended_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_extended_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_extended_fp

X_train shape:  (5568, 1024)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 1024)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 15.013151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2274
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 758
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2271
[LightGBM] [Info] Number of data points in the train set: 4454, number of us

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3678,0.4612,0.6065,0.4094,0.6409,0.5628,0.3699,0.4602,0.6082,0.4154,0.6467,0.5904
DecisionTreeRegressor,0.4212,0.4736,0.649,0.3238,0.5983,0.5403,0.3858,0.4579,0.6212,0.3903,0.6312,0.5836
RandomForestRegressor,0.3593,0.4511,0.5994,0.4231,0.6518,0.5758,0.3631,0.4498,0.6025,0.4263,0.6534,0.5945
GradientBoostingRegressor,0.3875,0.4769,0.6225,0.3779,0.6174,0.5289,0.3957,0.4789,0.6291,0.3747,0.6146,0.5564
AdaBoostRegressor,0.6498,0.6542,0.8061,-0.0434,0.3735,0.3857,0.6339,0.6458,0.7962,-0.0017,0.3962,0.4374
XGBRegressor,0.3616,0.4525,0.6014,0.4193,0.6515,0.5702,0.3605,0.449,0.6004,0.4303,0.6571,0.5946
ExtraTreesRegressor,0.4148,0.4723,0.644,0.334,0.6026,0.542,0.382,0.4562,0.6181,0.3963,0.6346,0.5847
LinearRegression,0.515,0.509,0.7176,0.1731,0.522,0.523,0.4325,0.4818,0.6577,0.3165,0.5828,0.5509
KNeighborsRegressor,0.464,0.5059,0.6812,0.2549,0.5603,0.4793,0.4374,0.4891,0.6614,0.3088,0.5931,0.5366
SVR,0.3975,0.463,0.6304,0.3618,0.6104,0.5473,0.3938,0.4586,0.6275,0.3778,0.6244,0.5773


In [8]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.210110133447338, -6.732410735254416, -5.75...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.306895107939291, -5.066698711260405, -5.2...","[-6.393563645699494, -5.3717918456608285, -5.1...","[0.1128025441255652, 0.1879859436716718, 0.099..."
1,DecisionTreeRegressor,"[-5.62, -6.96, -6.6225, -4.7775, -6.6225, -4.2...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.96, -6.24, -5.1, -6.22, -4.8175, -6.6225,...","[-6.992, -6.25, -5.414, -5.822, -5.28499999999...","[0.016000000000000014, 0.8143218037115305, 0.8..."
2,RandomForestRegressor,"[-5.995883333333333, -6.912300000000001, -6.57...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.904207500000001, -5.366545714285714, -5.5...","[-6.889338261904763, -5.802596476190478, -5.51...","[0.043427997538214594, 0.23144892329983693, 0...."
3,GradientBoostingRegressor,"[-5.14220899759884, -6.6831453381864385, -5.11...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.65121140485979, -5.05463021317866, -4.927...","[-6.591644856785095, -5.169166436478891, -4.88...","[0.11335659133395465, 0.1803984337341485, 0.02..."
4,AdaBoostRegressor,"[-5.468127256317693, -5.634489472703555, -5.63...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.565563703024749, -5.468127256317693, -5.4...","[-5.63550683719888, -5.577656242307972, -5.568...","[0.0444539065599728, 0.0636833531468969, 0.057..."
5,XGBRegressor,"[-8.437091, -8.210553, -6.428049, -4.702557, -...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8522077, -5.8823566, -5.266061, -6.993047...","[-6.8898225, -6.0707283, -5.294844, -6.246085,...","[0.090542756, 0.21896191, 0.21398959, 0.562217..."
6,ExtraTreesRegressor,"[-5.555600000000003, -6.9600000000000035, -6.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.987200000000002, -6.240000000000006, -5.3...","[-6.99744, -6.110300000000003, -5.456930000000...","[0.005119999999999081, 0.7415917583145069, 0.8..."
7,LinearRegression,"[-5.948734010075369, -6.813726583610469, -5.27...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.689984371798825, -4.526532803486631, -4.8...","[-6.7342323006033, -4.467796207798678, -4.7829...","[0.12183121057459037, 0.3111950647582891, 0.06..."
8,KNeighborsRegressor,"[-5.603333333333333, -6.986666666666667, -6.61...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -4.513333333333333, -5.5...","[-6.952000000000001, -4.504, -5.33133333333333...","[0.05356615847093516, 0.029013406862651362, 0...."
9,SVR,"[-5.1420042443352685, -6.815193698706107, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.872507555402064, -4.889444177337033, -4.6...","[-6.86148874132725, -4.929863505344455, -4.635...","[0.039694526457737696, 0.02502980312024346, 0...."


In [9]:
df_extended_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Extended_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Extended_fp.csv')


In [10]:
#Fingerprinter fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/Fingerprinter_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/Fingerprinter_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_fingerprinter_fp , pred_df= train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_fingerprinter_fp

X_train shape:  (5568, 1024)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 1024)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2259
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 753
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2259
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 753
[LightGBM] [Info] Start training from score -5.7

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3804,0.4719,0.6168,0.3892,0.6241,0.5386,0.3833,0.4733,0.6191,0.3943,0.6286,0.5554
DecisionTreeRegressor,0.4337,0.4837,0.6586,0.3036,0.5824,0.5225,0.3913,0.4632,0.6255,0.3817,0.6244,0.5565
RandomForestRegressor,0.3757,0.4643,0.613,0.3967,0.6313,0.5505,0.38,0.4617,0.6164,0.3995,0.6332,0.5581
GradientBoostingRegressor,0.3991,0.4882,0.6318,0.3592,0.6018,0.5012,0.399,0.488,0.6317,0.3695,0.6111,0.5216
AdaBoostRegressor,0.6459,0.6522,0.8037,-0.0371,0.3715,0.3894,0.6245,0.6408,0.7903,0.0131,0.3984,0.4359
XGBRegressor,0.3729,0.4621,0.6107,0.4012,0.6371,0.5556,0.3717,0.4607,0.6097,0.4126,0.6438,0.5608
ExtraTreesRegressor,0.4243,0.4799,0.6514,0.3188,0.5884,0.5238,0.3936,0.4657,0.6274,0.378,0.6212,0.5576
LinearRegression,0.5119,0.5123,0.7155,0.1781,0.5106,0.5058,0.4358,0.4921,0.6602,0.3113,0.5763,0.5419
KNeighborsRegressor,0.4894,0.5217,0.6996,0.2142,0.5207,0.4381,0.4642,0.5031,0.6813,0.2665,0.5532,0.5009
SVR,0.4048,0.4713,0.6362,0.35,0.6004,0.5291,0.3979,0.4679,0.6308,0.3712,0.6192,0.5469


In [11]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.054572681655, -6.498864242733969, -5.10971...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.984784757982033, -4.523138670409128, -4.8...","[-6.077116474019431, -4.563319534313782, -4.85...","[0.14157114227210232, 0.03236298861371521, 0.0..."
1,DecisionTreeRegressor,"[-7.0, -6.96, -5.195000000000001, -4.7775, -5....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.48, -5.12, -5.11, -6.24, -5.1950000...","[-7.0, -4.565666666666667, -5.006, -6.822, -5....","[0.0, 0.107215255962531, 0.22800000000000012, ..."
2,RandomForestRegressor,"[-5.886728888888889, -6.684092261904763, -5.17...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9537, -4.707901666666669, -5.025090238095...","[-6.929163690476192, -4.706269619047621, -4.92...","[0.0493991578605901, 0.12587772817088766, 0.12..."
3,GradientBoostingRegressor,"[-4.795584112293573, -6.479001950247324, -4.93...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.539723264817252, -4.778989000466205, -4.9...","[-6.356125430113132, -4.690042410014629, -4.88...","[0.13700210981899594, 0.07373326102932107, 0.0..."
4,AdaBoostRegressor,"[-5.553678743404556, -5.707758770030314, -5.55...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.553678743404556, -5.553678743404556, -5.5...","[-5.578832213830941, -5.528423399553221, -5.53...","[0.16351682940867196, 0.13310204129271813, 0.1..."
5,XGBRegressor,"[-6.1767316, -6.774272, -5.1184177, -4.702109,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8874984, -4.5139947, -4.8827777, -5.59299...","[-6.832596, -4.6126914, -4.820067, -5.5934353,...","[0.04431113, 0.15006681, 0.08276358, 0.2412836..."
6,ExtraTreesRegressor,"[-7.0, -6.9808, -5.194999999999996, -4.7774999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.563699999999996, -5.120000000000004...","[-7.0, -4.545156666666668, -5.006000000000006,...","[0.0, 0.04971281703723252, 0.22799999999999834..."
7,LinearRegression,"[-5.951929047744257, -6.0598687335401795, -5.2...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.055713588220685, -6.239593214737146, -4.9...","[-6.258447300575504, -5.205028587320231, -4.94...","[0.11993299326332864, 0.8575980679739353, 0.07..."
8,KNeighborsRegressor,"[-5.603333333333333, -6.986666666666667, -6.49...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.2299999999999995, -5.52666666666666...","[-6.954666666666666, -5.053333333333333, -4.84...","[0.055521767503085344, 0.29412015685203635, 0...."
9,SVR,"[-4.960774021146256, -6.787210065738998, -4.75...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.83383133007565, -5.133982955754157, -4.68...","[-6.808664480496785, -5.141173652418784, -4.66...","[0.05392850097076415, 0.0963717167346493, 0.02..."


In [12]:
df_fingerprinter_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Fingerprinter_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Fingerprinter_fp.csv')


In [13]:
#GraphOnly fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/Graphonly_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/Graphonly_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_graph_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_graph_fp

X_train shape:  (5568, 1024)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 1024)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1272
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 424
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1266
[LightGBM] [Info] Number of data points in the train set: 4454, number of use

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4022,0.4832,0.6342,0.3541,0.5951,0.5063,0.4168,0.4912,0.6456,0.3413,0.5844,0.5085
DecisionTreeRegressor,0.4247,0.4875,0.6517,0.3181,0.5775,0.4916,0.4258,0.4865,0.6525,0.3272,0.5771,0.5123
RandomForestRegressor,0.3957,0.4765,0.6291,0.3646,0.6054,0.5093,0.4128,0.4822,0.6425,0.3478,0.591,0.5184
GradientBoostingRegressor,0.4182,0.495,0.6467,0.3286,0.5751,0.472,0.4263,0.5003,0.6529,0.3264,0.5743,0.4872
AdaBoostRegressor,0.5466,0.5924,0.7393,0.1224,0.4396,0.39,0.5216,0.5829,0.7222,0.1757,0.4931,0.4354
XGBRegressor,0.4004,0.4779,0.6328,0.3571,0.601,0.5051,0.4168,0.4845,0.6456,0.3414,0.5864,0.5197
ExtraTreesRegressor,0.4157,0.4847,0.6448,0.3325,0.5869,0.4949,0.4238,0.4862,0.651,0.3303,0.5794,0.5156
LinearRegression,0.4682,0.5086,0.6842,0.2483,0.5241,0.4685,0.4554,0.505,0.6749,0.2803,0.5401,0.4836
KNeighborsRegressor,0.5611,0.5645,0.749,0.0991,0.4306,0.3397,0.5842,0.5658,0.7643,0.0769,0.4018,0.3517
SVR,0.4335,0.4835,0.6584,0.3039,0.562,0.4759,0.4372,0.4838,0.6612,0.3091,0.5682,0.4905


In [14]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-4.953915252662118, -6.534657041656787, -5.20...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.4284853052792155, -4.8871405470605325, -5...","[-6.435023660987748, -5.132642583665913, -5.17...","[0.07102059602735827, 0.18058381583927116, 0.0..."
1,DecisionTreeRegressor,"[-4.805, -7.0, -5.248281250000002, -4.77324999...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.805, -5.248281250000002, -4.82, -5....","[-7.0, -6.561, -5.220977855716688, -6.128, -5....","[0.0, 0.8780000000000002, 0.051497376799151964..."
2,RandomForestRegressor,"[-5.124944924103675, -6.939516666666666, -5.22...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.03771311036186, -5.228100325386871,...","[-6.982614816738817, -6.010645198606949, -5.22...","[0.017367788612230053, 0.4937020010792809, 0.0..."
3,GradientBoostingRegressor,"[-5.060024743824091, -6.954386865212181, -4.96...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.488638530497739, -5.0475861307519505, -4....","[-6.373596669806568, -5.3743225058336375, -4.9...","[0.10903766030959725, 0.37135808648679475, 0.0..."
4,AdaBoostRegressor,"[-5.408786610878663, -5.78736170212766, -5.408...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.5266831683168425, -5.408786610878663, -5....","[-5.676293894926543, -5.347214243511325, -5.28...","[0.1809520666823292, 0.09497439950463075, 0.14..."
5,XGBRegressor,"[-5.5538764, -6.891533, -5.1962934, -4.789754,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.972633, -5.4913654, -5.1962934, -6.207151...","[-7.012512, -6.2977157, -5.169132, -6.043078, ...","[0.028772052, 0.40571493, 0.04417925, 0.246743..."
6,ExtraTreesRegressor,"[-4.805000000000006, -7.0, -5.2482812499999945...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.805000000000006, -5.248281249999994...","[-7.0, -6.561000000000002, -5.220977855716685,...","[0.0, 0.8779999999999976, 0.051497376799150944..."
7,LinearRegression,"[-5.411188695740503, -7.13582012514197, -5.141...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.755509851716345, -5.144603493984921, -5.1...","[-6.777083000237599, -4.960357065903923, -5.10...","[0.0525503901938067, 0.1418246031652898, 0.031..."
8,KNeighborsRegressor,"[-5.1933333333333325, -6.986666666666667, -7.0...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.420000000000001, -7.0, -5.383333333...","[-7.0, -4.689333333333334, -7.0, -5.4773333333...","[0.0, 0.3536313208853409, 0.0, 0.1582459407939..."
9,SVR,"[-4.7998455359027155, -6.993772097541642, -4.7...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.900064672563355, -4.6938019928885435, -4....","[-6.889847920561131, -4.702268844924371, -4.72...","[0.020262504649472365, 0.03774984528325181, 0...."


In [15]:
df_graph_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Graphonly_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Graphonly_fp.csv')


In [16]:
#KlekotaRoth fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/KlekotaRoth_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/KlekotaRoth_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRoth_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRoth_fp

X_train shape:  (5568, 4860)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 4860)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 840
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 280
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011677 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 888
[LightGBM] [Info] Number of data points in the train set: 4454, number of used 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2945,0.4061,0.5427,0.5272,0.7273,0.6904,0.3039,0.41,0.5512,0.5198,0.7221,0.693
DecisionTreeRegressor,0.3707,0.4267,0.6088,0.4049,0.6675,0.6482,0.3238,0.408,0.5691,0.4883,0.7065,0.6905
RandomForestRegressor,0.2844,0.3957,0.5332,0.5434,0.7373,0.7043,0.2941,0.3964,0.5423,0.5353,0.732,0.7098
GradientBoostingRegressor,0.3361,0.4394,0.5797,0.4603,0.6832,0.6383,0.3397,0.4408,0.5828,0.4633,0.686,0.6582
AdaBoostRegressor,0.5221,0.58,0.7225,0.1618,0.497,0.4473,0.4989,0.5676,0.7063,0.2117,0.5393,0.5001
XGBRegressor,0.2736,0.3893,0.5231,0.5607,0.7491,0.7086,0.2886,0.3958,0.5372,0.544,0.7379,0.7089
ExtraTreesRegressor,0.3564,0.4221,0.597,0.4277,0.6779,0.6543,0.3251,0.4086,0.5702,0.4863,0.7051,0.6911
LinearRegression,0.3829,0.4566,0.6188,0.3852,0.6295,0.6179,0.3729,0.4497,0.6107,0.4107,0.6445,0.6417
KNeighborsRegressor,0.3777,0.454,0.6145,0.3936,0.6424,0.5912,0.3853,0.4486,0.6207,0.3911,0.6399,0.6194
SVR,0.3191,0.413,0.5649,0.4877,0.7027,0.6689,0.3181,0.4081,0.564,0.4974,0.7096,0.6886


In [17]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.054157956992978, -6.191867230360358, -4.99...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.671025641441484, -4.807873362339619, -5.4...","[-6.771761707721355, -5.000511410197683, -5.44...","[0.13300900164476903, 0.1367893260281191, 0.06..."
1,DecisionTreeRegressor,"[-4.7, -7.0, -4.47, -4.39, -4.515000000000001,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.375, -4.32, -5.89, -6.24, -4.39, -5...","[-6.83, -5.279, -5.092, -5.856285714285714, -6...","[0.10751744044572485, 0.5544132033059818, 0.96..."
2,RandomForestRegressor,"[-5.25398357142857, -6.6502166666666644, -4.97...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.5613666666666655, -4.944375000000001, -5....","[-6.751495666666666, -5.051986380952381, -5.25...","[0.09923777935175039, 0.190504641207495, 0.071..."
3,GradientBoostingRegressor,"[-4.998103122591567, -6.446859134215356, -4.96...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.570759711248891, -4.891553504666785, -5.1...","[-6.6718231677083155, -4.910190983485441, -5.0...","[0.05875599376040858, 0.0510377511978016, 0.06..."
4,AdaBoostRegressor,"[-5.6070818505338025, -5.7097874720357895, -5....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.718568456096558, -5.568749109052049, -5.6...","[-5.711732031063603, -5.497269322083481, -5.62...","[0.05624384395704462, 0.0642314943462804, 0.02..."
5,XGBRegressor,"[-4.9887757, -6.7772355, -5.3717394, -4.858517...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.5603876, -4.940669, -5.3446703, -6.492645...","[-7.1524444, -5.119002, -5.297551, -5.855855, ...","[0.22800697, 0.174265, 0.1335067, 0.5189332, 0..."
6,ExtraTreesRegressor,"[-4.7447999999999935, -7.0, -4.530333333333344...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -4.394800000000003, -4.319999999999994...","[-6.830000000000004, -5.166169999999999, -5.04...","[0.10751744044572457, 0.4992485949905098, 0.98..."
7,LinearRegression,"[-5.052242846561083, -6.286541960512412, -4.72...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.710505849339044, -4.978726087452911, -4.9...","[-6.739854277513606, -5.028491848594683, -4.91...","[0.06243188235852099, 0.04704168807776333, 0.0..."
8,KNeighborsRegressor,"[-4.8999999999999995, -7.0, -5.626666666666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.1933333333333325, -5....","[-6.904000000000001, -5.116, -5.52, -5.6146666...","[0.04165466493816895, 0.3840856385987085, 0.33..."
9,SVR,"[-4.862511114721572, -6.8505754908200265, -4.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9099575148058445, -4.728377656062778, -4....","[-6.89439910462067, -4.770812628831248, -4.793...","[0.025300092130619876, 0.027800118025745644, 0..."


In [18]:
df_KlekotaRoth_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_KlekotaRoth_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_KlekotaRoth_fp.csv')


In [19]:
#KlekotaRoth Count fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/KlekotaRothCount_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/KlekotaRothCount_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_KlekotaRothCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_KlekotaRothCount_fp

X_train shape:  (5568, 4860)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 4860)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2839
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 339
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2886
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 355
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2413,0.3608,0.4912,0.6126,0.7841,0.755,0.2534,0.3676,0.5034,0.5996,0.7753,0.7558
DecisionTreeRegressor,0.3072,0.3847,0.5542,0.5068,0.7319,0.7166,0.2759,0.3663,0.5253,0.5639,0.7558,0.7532
RandomForestRegressor,0.2309,0.3529,0.4806,0.6292,0.7932,0.7677,0.2501,0.3568,0.5001,0.6048,0.7778,0.7656
GradientBoostingRegressor,0.2908,0.4018,0.5392,0.5331,0.7368,0.6996,0.2951,0.4018,0.5432,0.5337,0.7352,0.7177
AdaBoostRegressor,0.4824,0.5583,0.6946,0.2254,0.5681,0.5331,0.4544,0.5399,0.6741,0.2819,0.6097,0.5852
XGBRegressor,0.2265,0.3475,0.476,0.6363,0.7977,0.7696,0.2397,0.3507,0.4896,0.6213,0.7883,0.7694
ExtraTreesRegressor,0.2261,0.3491,0.4754,0.637,0.7985,0.7724,0.2349,0.3469,0.4846,0.6289,0.7933,0.7784
LinearRegression,0.3724,0.4351,0.6102,0.4021,0.644,0.6553,0.3547,0.432,0.5956,0.4395,0.6658,0.6876
KNeighborsRegressor,0.3143,0.4077,0.5606,0.4954,0.7155,0.6774,0.3093,0.3975,0.5562,0.5112,0.7245,0.7091
SVR,0.2734,0.3721,0.5229,0.561,0.7525,0.7299,0.2808,0.3763,0.5299,0.5563,0.7482,0.7343


In [20]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.467749321574162, -6.623705394182443, -6.03...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.997994065065435, -6.414553587269026, -6.6...","[-7.151885813305382, -6.434383414686645, -6.59...","[0.10518687791621008, 0.09254534494622917, 0.1..."
1,DecisionTreeRegressor,"[-6.24, -7.0, -7.0, -5.92, -5.15, -4.77, -4.66...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.24, -7.0, -7.0, -7.0, -6.85, -6.85,...","[-7.0, -6.544, -6.992, -6.396, -6.55, -6.72399...","[0.0, 0.37232244090304295, 0.01600000000000001..."
2,RandomForestRegressor,"[-6.7438, -6.744399999999998, -5.8691016666666...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9354, -6.635500000000003, -6.771500000000...","[-6.947020000000002, -6.666260000000001, -6.58...","[0.016007548219513126, 0.2029012134019893, 0.1..."
3,GradientBoostingRegressor,"[-6.202071049003006, -6.641640720602897, -5.92...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.041259866292835, -6.3513192068027395, -6....","[-7.193116589281702, -6.45460379217071, -6.520...","[0.17902024313381015, 0.17779769620480668, 0.1..."
4,AdaBoostRegressor,"[-5.924212299060027, -5.7825, -5.5120925553320...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.013020134228188, -5.7825, -6.313392857142...","[-6.087702445289085, -5.952852035526168, -6.03...","[0.12180299989279732, 0.1752146039696586, 0.25..."
5,XGBRegressor,"[-6.845116, -7.122616, -6.7358203, -5.44078, -...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.35293, -6.4759326, -6.990032, -6.873097, ...","[-7.1894617, -6.592904, -7.009122, -6.560726, ...","[0.15143919, 0.22919108, 0.16092433, 0.3625372..."
6,ExtraTreesRegressor,"[-6.68525, -6.868299999999998, -6.104550000000...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.9965, -6.590400000000002, -6.747699999999...","[-6.984440000000001, -6.703900000000002, -6.76...","[0.009708470528357894, 0.19671736069803109, 0...."
7,LinearRegression,"[-5.137564542377576, -6.486956505415558, -4.77...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.013549403470219, -5.309797517213946, -5.1...","[-6.148902608428957, -5.356094111390378, -5.13...","[0.10358448414094945, 0.06061241747474251, 0.0..."
8,KNeighborsRegressor,"[-4.8999999999999995, -7.0, -5.62, -5.55, -4.6...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.1933333333333325, -6....","[-6.989333333333333, -5.575999999999999, -6.40...","[0.005333333333333102, 0.21866666666666723, 0...."
9,SVR,"[-6.205489354598905, -6.808520423147991, -5.30...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.324567888746655, -6.158299009403002, -6.7...","[-7.252162203523644, -6.317877420641227, -6.69...","[0.07803546764939835, 0.0996689552576794, 0.06..."


In [21]:
df_KlekotaRothCount_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_KlekotaRoth_Count_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_KlekotaRoth_Count_fp.csv')


In [22]:
#MACCS fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/MACCS_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/MACCS_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_MACCS_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_MACCS_fp

X_train shape:  (5568, 166)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 166)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 240
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 80
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 243
[LightGBM] [Info] Number of data points in the train set: 4454, number of used fea

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.3553,0.4411,0.5961,0.4295,0.6556,0.6289,0.3533,0.4429,0.5944,0.4417,0.6659,0.6406
DecisionTreeRegressor,0.3924,0.4485,0.6264,0.37,0.6273,0.6161,0.3546,0.4322,0.5955,0.4397,0.6664,0.6539
RandomForestRegressor,0.3541,0.4352,0.5951,0.4315,0.6594,0.6355,0.3407,0.427,0.5837,0.4616,0.6801,0.6608
GradientBoostingRegressor,0.3798,0.4629,0.6163,0.3901,0.6283,0.5999,0.3767,0.4604,0.6137,0.4048,0.642,0.6189
AdaBoostRegressor,0.5675,0.609,0.7533,0.0888,0.4445,0.4238,0.5382,0.5969,0.7336,0.1495,0.4976,0.4551
XGBRegressor,0.3528,0.4338,0.594,0.4335,0.6617,0.6367,0.346,0.4321,0.5882,0.4532,0.6741,0.6652
ExtraTreesRegressor,0.3844,0.446,0.62,0.3827,0.6339,0.6198,0.3529,0.4317,0.594,0.4424,0.6681,0.6561
LinearRegression,0.4301,0.4888,0.6558,0.3095,0.5584,0.5543,0.426,0.4877,0.6527,0.3269,0.5728,0.5694
KNeighborsRegressor,0.5583,0.5356,0.7472,0.1035,0.4526,0.4219,0.557,0.528,0.7463,0.1198,0.4587,0.4266
SVR,0.3756,0.434,0.6129,0.3969,0.6373,0.6202,0.38,0.4337,0.6164,0.3996,0.6396,0.6378


In [23]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-4.828854615263665, -6.25409174394604, -4.782...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.25409174394604, -4.77692003777844, -5.174...","[-6.382234928074743, -4.907868998332539, -5.11...","[0.11991125078650615, 0.12036585461728498, 0.0..."
1,DecisionTreeRegressor,"[-4.8999999999999995, -7.0, -4.801176470588236...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.816666666666666, -5.283818181818183...","[-6.987333333333334, -5.214666666666666, -5.19...","[0.02533333333333303, 0.38978826160992697, 0.0..."
2,RandomForestRegressor,"[-4.90569964285714, -6.9707676190476215, -4.78...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.913628605824498, -5.794885079365076, -5.2...","[-6.930332289429968, -5.214396165223664, -5.19...","[0.016298820400586684, 0.3838726863050127, 0.0..."
3,GradientBoostingRegressor,"[-4.690342217944251, -6.29020172780605, -4.945...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.264818181643189, -4.695472454316142, -4.9...","[-6.4265646379471395, -4.7873401517700245, -4....","[0.12107986246483377, 0.04811141695798136, 0.0..."
4,AdaBoostRegressor,"[-5.395815384615388, -5.768305531833068, -5.29...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.664646153846155, -5.395815384615388, -5.3...","[-5.591162677700295, -5.370591589523299, -5.37...","[0.179746667553664, 0.14341113076273732, 0.143..."
5,XGBRegressor,"[-5.044717, -6.8042765, -4.7162204, -4.8664494...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.71183, -5.2756224, -5.2298694, -6.33378, ...","[-6.8375244, -5.120825, -5.1633034, -6.16767, ...","[0.08154567, 0.15820575, 0.06364988, 0.2141117..."
6,ExtraTreesRegressor,"[-4.8999999999999915, -7.0, -4.801176470588242...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.81666666666667, -5.283818181818176,...","[-6.987333333333334, -5.214666666666666, -5.19...","[0.025333333333331966, 0.3897882616099315, 0.0..."
7,LinearRegression,"[-4.783102794790144, -6.155838290302034, -5.04...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.074338701094595, -4.792171396907944, -5.0...","[-6.260375281168595, -4.832185810738035, -4.98...","[0.10079208316089719, 0.02187978401212345, 0.0..."
8,KNeighborsRegressor,"[-4.8999999999999995, -6.986666666666667, -5.8...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.816666666666666, -7.0...","[-6.989333333333333, -5.58, -6.957333333333334...","[0.005333333333333102, 0.5033929324538081, 0.0..."
9,SVR,"[-4.706629860321147, -6.899857396741986, -4.56...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.89945879937141, -4.6295363354867245, -4.7...","[-6.893104743119852, -4.688924966011017, -4.72...","[0.0134519078867844, 0.045612497700995666, 0.0..."


In [24]:
df_MACCS_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_MACCS_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_MACCS_fp.csv')


In [25]:
#PubChem fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/PubChem_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/PubChem_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_PubChem_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_PubChem_fp

X_train shape:  (5568, 881)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 881)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 786
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 262
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009510 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 801
[LightGBM] [Info] Number of data points in the train set: 4454, number of used fe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.376,0.4697,0.6132,0.3963,0.6296,0.5375,0.3716,0.4657,0.6096,0.4128,0.6432,0.57
DecisionTreeRegressor,0.4094,0.4759,0.6398,0.3427,0.6012,0.5261,0.3819,0.4573,0.618,0.3965,0.6338,0.5734
RandomForestRegressor,0.3698,0.4633,0.6081,0.4062,0.6384,0.547,0.3641,0.4543,0.6034,0.4247,0.6521,0.5788
GradientBoostingRegressor,0.3821,0.4777,0.6181,0.3865,0.6243,0.5217,0.3843,0.4765,0.6199,0.3928,0.6296,0.5532
AdaBoostRegressor,0.621,0.6411,0.788,0.0029,0.4036,0.392,0.6112,0.6363,0.7818,0.0342,0.4205,0.4286
XGBRegressor,0.373,0.4638,0.6107,0.4012,0.6364,0.541,0.3613,0.4502,0.6011,0.429,0.6557,0.5855
ExtraTreesRegressor,0.4022,0.4732,0.6342,0.3542,0.6078,0.5306,0.3819,0.4575,0.618,0.3965,0.634,0.573
LinearRegression,0.4201,0.4887,0.6481,0.3255,0.5767,0.5119,0.404,0.4861,0.6356,0.3616,0.6033,0.5448
KNeighborsRegressor,0.5654,0.5673,0.752,0.0921,0.4792,0.3955,0.5498,0.5558,0.7415,0.1312,0.5023,0.4487
SVR,0.3949,0.4669,0.6284,0.3659,0.6141,0.5267,0.3964,0.4618,0.6296,0.3737,0.6216,0.5617


In [26]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.15240091869389, -6.409222493499245, -5.545...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.730628525521913, -6.160845049049088, -6.1...","[-7.03747074163312, -5.91633115788449, -6.0649...","[0.31613404943333073, 0.32462689353377144, 0.2..."
1,DecisionTreeRegressor,"[-6.89, -7.0, -5.511875000000001, -4.979759036...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.873333333333334, -6.89, -7.0, -6.87...","[-6.948, -6.974666666666667, -6.88666666666666...","[0.049558046773455613, 0.05066666666666641, 0...."
2,RandomForestRegressor,"[-6.8417517511520645, -6.983499999999999, -5.5...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.951514285714284, -6.865683174603171, -6.8...","[-6.953734857142857, -6.86388281962482, -6.824...","[0.007862618798297322, 0.025702446569454675, 0..."
3,GradientBoostingRegressor,"[-6.759241818307694, -6.926074604175503, -5.32...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.6521262779075485, -6.570095986570666, -6....","[-7.660522247833451, -6.391796237208716, -6.32...","[0.05095329964644285, 0.21846501206762203, 0.0..."
4,AdaBoostRegressor,"[-5.870566190768713, -5.801438356164383, -5.43...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.870566190768713, -5.727960175447544, -5.8...","[-5.847272281495562, -5.73358052499466, -5.825...","[0.053836174943512054, 0.0835560159715254, 0.0..."
5,XGBRegressor,"[-8.323095, -8.49177, -5.5085397, -4.992343, -...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.355886, -7.7571683, -7.0634513, -6.523109...","[-7.1913886, -7.1380224, -6.9227333, -6.431646...","[0.23967122, 0.33813962, 0.087070495, 0.144615..."
6,ExtraTreesRegressor,"[-6.88999999999999, -7.0, -5.511874999999992, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -6.873333333333334, -6.88999999999999,...","[-6.947999999999996, -6.966306666666666, -6.89...","[0.04955804677346036, 0.04922495482758469, 0.0..."
7,LinearRegression,"[-6.45522034418687, -7.055143305233996, -5.545...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.749785115888298, -6.116769408880275, -6.7...","[-7.901167677574543, -6.153447963219231, -6.73...","[0.10380520823425542, 0.07125564115036094, 0.0..."
8,KNeighborsRegressor,"[-6.963333333333334, -7.0, -6.836666666666666,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.973333333333334, -7.0, -6.963333333333334...","[-6.978666666666667, -7.0, -6.970666666666668,...","[0.006531972647421959, 0.0, 0.0146666666666664..."
9,SVR,"[-6.347262239146147, -6.995727536382716, -5.24...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.931115307275416, -6.327606660447614, -6.3...","[-6.925149257760651, -6.501785916112955, -6.51...","[0.029477865723266827, 0.10331472781994015, 0...."


In [27]:
df_PubChem_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_PubChem_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_PubChem_fp.csv')


In [28]:
#Substructure fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/Substructure_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/Substructure_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_Substructure_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_Substructure_fp

X_train shape:  (5568, 307)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 307)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 30
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 4454, number of used featu

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4717,0.5262,0.6868,0.2427,0.4926,0.3835,0.4731,0.5264,0.6878,0.2524,0.5025,0.4199
DecisionTreeRegressor,0.4554,0.5131,0.6748,0.2688,0.5213,0.3941,0.4728,0.5169,0.6876,0.2529,0.5075,0.4394
RandomForestRegressor,0.4526,0.5125,0.6727,0.2733,0.5237,0.3968,0.4685,0.5168,0.6845,0.2597,0.5119,0.4385
GradientBoostingRegressor,0.4615,0.5206,0.6794,0.2589,0.5091,0.3928,0.4742,0.5265,0.6886,0.2506,0.5008,0.4143
AdaBoostRegressor,0.5497,0.5959,0.7414,0.1174,0.4256,0.3161,0.5456,0.5961,0.7386,0.1378,0.4424,0.3875
XGBRegressor,0.4537,0.5127,0.6736,0.2715,0.523,0.3969,0.4696,0.5165,0.6853,0.2579,0.5108,0.4362
ExtraTreesRegressor,0.4559,0.5136,0.6752,0.2679,0.5202,0.3942,0.4723,0.5166,0.6873,0.2536,0.5079,0.4399
LinearRegression,0.4785,0.5295,0.6917,0.2318,0.4826,0.3795,0.4784,0.5298,0.6917,0.244,0.4941,0.408
KNeighborsRegressor,0.7096,0.6317,0.8424,-0.1393,0.2377,0.1502,0.7094,0.6324,0.8423,-0.1211,0.2195,0.1702
SVR,0.476,0.5089,0.6899,0.2357,0.5021,0.3978,0.4869,0.5111,0.6978,0.2306,0.4988,0.4273


In [29]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-5.1765165647687255, -6.115999270764352, -5.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.115999270764352, -5.1765165647687255, -5....","[-6.223803935418417, -5.23510786812346, -5.173...","[0.07419030034403576, 0.0380275825419148, 0.00..."
1,DecisionTreeRegressor,"[-5.088333333333333, -6.900833333333334, -5.16...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.900833333333334, -5.088333333333333, -5.1...","[-6.910515384615385, -5.566761904761904, -5.13...","[0.029148345781561377, 0.29670803808448715, 0...."
2,RandomForestRegressor,"[-5.076318534492043, -6.9064871039392335, -5.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.906487103939234, -5.076318534492043, -5.1...","[-6.912931768470837, -5.5702805028802045, -5.1...","[0.02750031991738278, 0.30973715553062103, 0.0..."
3,GradientBoostingRegressor,"[-5.0477080181230125, -6.657747306540635, -5.1...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.657747306540635, -5.0477080181230125, -5....","[-6.543614244757757, -5.118199857641798, -5.13...","[0.06974657852196715, 0.043943005853952936, 0...."
4,AdaBoostRegressor,"[-5.337882797731561, -5.893963133640551, -5.33...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-5.893963133640551, -5.337882797731561, -5.3...","[-5.804903426009342, -5.368202233480746, -5.36...","[0.19793378106761128, 0.1217614130678465, 0.12..."
5,XGBRegressor,"[-5.099984, -6.8878307, -5.160158, -5.160158, ...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.8878307, -5.099984, -5.160158, -4.860455,...","[-6.899044, -5.523138, -5.1358643, -4.9446115,...","[0.031615507, 0.26666743, 0.016278781, 0.20925..."
6,ExtraTreesRegressor,"[-5.088333333333323, -6.900833333333335, -5.16...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.900833333333335, -5.088333333333323, -5.1...","[-6.9105153846153895, -5.566761904761902, -5.1...","[0.029148345781560572, 0.2967080380844896, 0.0..."
7,LinearRegression,"[-5.072410162397099, -6.010943270708908, -5.15...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.010943270708908, -5.072410162397099, -5.1...","[-6.174629451019132, -5.055658563434185, -5.13...","[0.08899424116007519, 0.03262090182441304, 0.0..."
8,KNeighborsRegressor,"[-5.593333333333334, -6.986666666666667, -7.0,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.593333333333334, -7.0...","[-6.984, -6.621333333333334, -7.0, -5.12133333...","[0.005333333333333456, 0.5474693294301212, 0.0..."
9,SVR,"[-4.676290881539564, -6.900171571476855, -4.83...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.900171571476855, -4.676290881539564, -4.8...","[-6.900172829360703, -4.794388523813799, -4.82...","[0.0001230336518123236, 0.07316673838856275, 0..."


In [30]:
df_Substructure_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Substructure_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Substructure_fp.csv')


In [31]:
#Substructure Count fingerprints
df_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/SubstructureCount_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/SubstructureCount_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
df_SubstructureCount_fp, pred_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
df_SubstructureCount_fp

X_train shape:  (5568, 307)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 307)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 40
[LightGBM] [Info] Start training from score -5.738310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 40
[LightGBM] [Info] Start training from score -5.742699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ov



0.48511283394264737


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.2624,0.378,0.5123,0.5786,0.7625,0.7254,0.2618,0.3729,0.5117,0.5862,0.7692,0.7488
DecisionTreeRegressor,0.3161,0.3899,0.5622,0.4924,0.7201,0.7034,0.2686,0.3632,0.5182,0.5756,0.7619,0.7516
RandomForestRegressor,0.2431,0.3602,0.493,0.6097,0.7809,0.7495,0.2573,0.3594,0.5072,0.5934,0.7706,0.7611
GradientBoostingRegressor,0.3179,0.4239,0.5639,0.4895,0.7103,0.6768,0.3272,0.4238,0.572,0.4829,0.7046,0.6949
AdaBoostRegressor,0.5313,0.5965,0.7289,0.1469,0.5036,0.428,0.5045,0.5797,0.7103,0.2028,0.5512,0.48
XGBRegressor,0.2417,0.3576,0.4916,0.612,0.7829,0.7483,0.2528,0.3559,0.5028,0.6005,0.7752,0.7604
ExtraTreesRegressor,0.2432,0.3592,0.4931,0.6096,0.7812,0.7485,0.2583,0.3596,0.5082,0.5919,0.7702,0.7621
LinearRegression,0.4397,0.4876,0.6631,0.2941,0.5434,0.5641,0.4438,0.4866,0.6662,0.2986,0.5468,0.5764
KNeighborsRegressor,0.315,0.4064,0.5613,0.4942,0.7102,0.6787,0.3046,0.3961,0.5519,0.5187,0.7258,0.7108
SVR,0.3542,0.4244,0.5952,0.4312,0.6679,0.6616,0.3653,0.4221,0.6044,0.4227,0.6592,0.6783


In [32]:
pred_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.984988093988094, -6.874354012838001, -5.76...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.018970450241344, -5.817410302907244, -6.8...","[-6.97681800604188, -5.890625592487536, -6.896...","[0.031844742780730836, 0.14786966484697378, 0...."
1,DecisionTreeRegressor,"[-7.0, -7.0, -6.244999999999999, -5.07, -4.59,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0, -5.2, -6.96, -7.0, -6.24, -7.0, -6.85,...","[-6.976000000000001, -5.48, -6.992, -6.754, -5...","[0.01959591794226544, 0.2836899716239544, 0.01..."
2,RandomForestRegressor,"[-6.884099999999998, -6.871400000000002, -6.08...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.959099999999998, -5.758666666666669, -6.9...","[-6.9711, -5.791460000000002, -6.94571, -6.252...","[0.014284677105206627, 0.17308964973228333, 0...."
3,GradientBoostingRegressor,"[-6.7569234264413245, -6.6529208801483195, -5....",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.0555655635533885, -5.398139696109834, -6....","[-7.122438663819739, -5.490769107178844, -6.77...","[0.06606522621599681, 0.1501292997603641, 0.05..."
4,AdaBoostRegressor,"[-6.543428571428572, -5.959210526315789, -5.70...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.543428571428572, -5.714731064764001, -6.5...","[-6.092710670620762, -5.694517651920593, -6.02...","[0.24773951815862527, 0.09382615786149985, 0.2..."
5,XGBRegressor,"[-6.9898057, -6.8005347, -5.990174, -4.797337,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.989251, -5.6280417, -6.7466702, -7.050244...","[-7.049016, -5.7448792, -6.951416, -6.670949, ...","[0.04864793, 0.17339802, 0.12257617, 0.4663881..."
6,ExtraTreesRegressor,"[-6.7646999999999995, -6.756800000000001, -6.2...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.965, -6.237200000000002, -6.9003999999999...","[-6.97304, -6.242980000000001, -6.8344, -6.478...","[0.006861661606345793, 0.1262953269127563, 0.0..."
7,LinearRegression,"[-5.280245626491479, -5.895392542571749, -5.51...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.0117887517806645, -4.6411953739140435, -5...","[-6.19918580553964, -4.698957279642839, -5.676...","[0.09731223194497061, 0.06727830500580526, 0.0..."
8,KNeighborsRegressor,"[-6.453333333333333, -7.0, -6.496666666666666,...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-6.986666666666667, -5.603333333333333, -7.0...","[-6.9893333333333345, -6.294, -6.824, -5.40799...","[0.005333333333333102, 0.4281142111373762, 0.2..."
9,SVR,"[-6.691550259534014, -6.897124498175519, -5.50...",0 -7.00 1 -7.00 2 -7.00 3 ...,"[[-7.1613209558550786, -6.033844266641924, -6....","[-7.166614671576741, -6.128104668832263, -6.75...","[0.026794922281811653, 0.08046582871499011, 0...."


In [33]:
df_SubstructureCount_fp.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Results_Substructure_Count_fp.csv')
pred_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Fingerprints/Prediction_data_Substructure_Count_fp.csv')


In [None]:
from sklearn.model_selection import GridSearchCV
import os
import joblib
def train_and_test_predict_with_tuning(models, param_grids, X_train, y_train, X_test, y_test, save_dir):
   
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        best_params = None

        # hyperparameter tuning 
        if model_name in param_grids and param_grids[model_name]:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            grid_search = GridSearchCV(
                estimator=model, 
                param_grid=param_grids[model_name], 
                cv=kf,
                scoring='neg_mean_squared_error', 
                n_jobs=-1)
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            print(model_name)
            print(": best params",best_params)
        else:
            default_params = model.get_params()
            print(model_name, ': Default params', default_params)
            best_params = {}
            print(model_name, ':Used Default params')

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)  
            test_predictions_folds.append(predictions_test_fold)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test Predictions folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
            'Best Parameters': best_params
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }
        # Save the model
        model_path = os.path.join(save_dir, f"{model_name}.joblib")
        joblib.dump(model, model_path)
        print(f"Saved {model_name} model to {model_path}")

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df


In [None]:
param_grids = {
        'ExtraTreesRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None,1,5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'LGBMRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'num_leaves': [31, 50, 100]
        },
        'DecisionTreeRegressor': {
            'max_depth': [None, 10, 20, 50, 100],
            'min_samples_split': [2, 5, 10]
        },
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'max_depth': [None, 1, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
        },
        'GradientBoostingRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7, 10]
        },
        'AdaBoostRegressor': {
            'n_estimators': [50, 100, 200, 400],
            'learning_rate': [0.001, 0.01, 0.1, 1.0]
        },
        'SVR': {
            'C': [0.001, 0.1, 1, 10],
            'epsilon': [0.1, 0.2, 0.5],
            'gamma': [0.001, 0.1, 1, 10]
        },
        'KNeighborsRegressor': {
            'n_neighbors': [3, 5, 10],
            'weights': ['uniform', 'distance']
        },
        'MLPRegressor': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'learning_rate': ['constant', 'adaptive'],
            'max_iter': [100,200, 400]
}
    }


In [None]:
#All fingerprints const rem Hyperparametric tuning
df_train = pd.read_csv('Fingerprints/Train/All_fingerprints_train.csv')
X_train = df_train.drop(['ID','SMILES','Permeability'],axis=1)
X_train, const_col = remove_constant_columns(X_train)
y_train = df_train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
df_test = pd.read_csv('Fingerprints/Test/All_fingerprints_test.csv')
X_test = df_test[X_train.columns]
y_test = df_test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
print("X_train shape: ",X_train.shape)
print("X_test shape: ",X_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
save_dir = 'fingerprints_results/Models_All_const_rem_fingerprints_HPT/'
os.makedirs(save_dir, exist_ok=True)
result_df, prediction_df = train_and_test_predict_with_tuning(models, param_grids, X_train,y_train, X_test,  y_test)
result_df

In [None]:
prediction_df

In [None]:
result_df.to_csv('fingerprints_results/Results_All_const_rem_fingerprints_HPT.csv')
prediction_df.to_csv('fingerprints_results/Prediction_data_All_const_rem_fingerprints_HPT.csv')