In [1]:
#Starts from here
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    # Identify columns with variance below the threshold
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [2]:
from tqdm import tqdm
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Descriptors/Train_2d_3d_all_descriptors_MDCK.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Descriptors/Test_2d_3d_all_descriptors_MDCK.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Train/All_fingerprints_train_MDCK.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Fingerprints/Test/All_fingerprints_test_MDCK.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_mdck.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_mdck.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Atomic/Train_all_atomic_desc_MDCK.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/MDCK/features/Atomic/Test_all_atomic_desc_MDCK.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(51, 170)
(13, 170)
(51, 90)
(13, 90)
(51, 625)
(13, 625)
(51, 8)
(13, 8)


In [3]:
merge_keys = ['ID', 'SMILES', 'Permeability']

merged_train = df_desc_train.merge(df_fp_train, on=merge_keys)
merged_train = merged_train.merge(df_emb_train, on=merge_keys)
merged_train = merged_train.merge(df_atomic_train, on=merge_keys)

merged_test = df_desc_test.merge(df_fp_test, on=merge_keys)
merged_test = merged_test.merge(df_emb_test, on=merge_keys)
merged_test = merged_test.merge(df_atomic_test, on=merge_keys)

In [4]:
X_train = merged_train.drop(columns=['ID', 'SMILES']).select_dtypes(include=['number'])
selected_final_features = features(X_train, target_column='Permeability')

train = pd.concat([merged_train[['ID', 'SMILES', 'Permeability']], X_train[selected_final_features]], axis=1)
test = merged_test[train.columns] 

print('selected_final_features', selected_final_features )
print("Final Train shape:", train.shape)
print("Final Test shape:", test.shape)

selected_final_features ['MinAbsEStateIndex', 'SPS', 'FpDensityMorgan2', 'Ipc', 'EState_VSA11', 'AdjacencyMatrix.6', 'AdjacencyMatrix.11', 'AATS.8', 'AATS.11', 'AATS.23', 'AATS.31', 'AATS.44', 'AATS.52', 'AATS.93', 'ATSC.4', 'ATSC.5', 'ATSC.7', 'ATSC.16', 'ATSC.20', 'ATSC.22', 'ATSC.23', 'ATSC.24', 'ATSC.25', 'ATSC.26', 'ATSC.30', 'ATSC.32', 'ATSC.64', 'ATSC.84', 'ATSC.93', 'ATSC.101', 'ATSC.104', 'ATSC.105', 'ATSC.106', 'AATSC.11', 'AATSC.14', 'AATSC.15', 'AATSC.16', 'AATSC.17', 'AATSC.50', 'AATSC.57', 'GATS.4', 'GATS.12', 'GATS.22', 'GATS.61', 'GATS.62', 'GATS.86', 'AtomTypeEState.173', 'AtomTypeEState.187', 'AtomTypeEState.252', 'AtomTypeEState.260', 'AtomTypeEState.271', 'RingCount.52', 'ALogP', 'AATS3v', 'AATS4v', 'AATS4i', 'AATS6i', 'AATS7i', 'ATSC8c', 'ATSC8e', 'ATSC5p', 'ATSC6p', 'ATSC3i', 'ATSC1s', 'ATSC2s', 'ATSC8s', 'AATSC3m', 'AATSC5v', 'AATSC8v', 'VR3_Dzv', 'VE3_Dzp', 'VR2_Dzs', 'BCUTp-1l', 'BCUTp-1h', 'SpMAD_Dt', 'VE1_Dt', 'VR1_Dt', 'nHBint7', 'nHBint8', 'nHBint9', 'SHBin

In [5]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [6]:
X_train = train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test[X_train.columns]
y_test = test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 870)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 870)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 176
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9719
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 646
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5526,0.6389,0.7433,0.1458,0.3998,0.4182,0.3231,0.4931,0.5684,0.4508,0.696,0.5144
DecisionTreeRegressor,1.2053,0.8487,1.0979,-0.8633,0.0399,0.1193,0.32,0.4698,0.5657,0.4559,0.6866,0.4594
RandomForestRegressor,0.5443,0.6022,0.7378,0.1586,0.4067,0.4906,0.2825,0.4658,0.5315,0.5198,0.7297,0.4484
GradientBoostingRegressor,0.8542,0.7402,0.9242,-0.3205,0.0574,0.07,0.2726,0.4546,0.5221,0.5366,0.7334,0.4759
AdaBoostRegressor,0.6229,0.648,0.7892,0.0372,0.3534,0.4479,0.2815,0.4554,0.5306,0.5214,0.7231,0.4869
XGBRegressor,0.8576,0.7419,0.9261,-0.3257,0.0576,0.1326,0.296,0.4535,0.5441,0.4967,0.7058,0.5254
ExtraTreesRegressor,0.5611,0.6166,0.7491,0.1326,0.3999,0.4525,0.2997,0.4506,0.5474,0.4906,0.7112,0.4374
LinearRegression,0.8666,0.7256,0.9309,-0.3396,0.2907,0.3326,0.4622,0.5608,0.6798,0.2143,0.6094,0.4429
KNeighborsRegressor,0.5994,0.6068,0.7742,0.0734,0.3704,0.4051,0.4011,0.5517,0.6334,0.3181,0.57,0.3719
SVR,0.6102,0.6502,0.7811,0.0568,0.3115,0.3492,0.3451,0.5191,0.5875,0.4133,0.6583,0.3879


In [7]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_features_MDCK.csv')

In [8]:
X = train.drop(columns=['ID', 'SMILES', 'Permeability'])
y = train['Permeability']

rf = RandomForestRegressor(n_estimators=100, random_state=101, n_jobs=-1)
rf.fit(X, y)

importances = rf.feature_importances_
feature_names = X.columns


In [9]:
#Top 10 features
n = 10  
top_10_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_10_features = feature_names[top_10_indices].tolist() 

# Output the list
print("Top", 10, "features:\n")
print(top_10_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_10_features]], axis=1)
test_df = test[train.columns] 

Top 10 features:

['x_fine_emb_MFXL617', 'TDB10s', 'x_fine_emb_MFXL161', 'x_fine_emb_MFXL603', 'x_fine_emb_MFXL639', 'TDB10v', 'x_fine_emb_MFXL167', 'x_fine_emb_MFXL244', 'x_fine_emb_MFXL22', 'minssCH2']


In [10]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 10)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 10)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 3
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 7
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info]



Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.478,0.5329,0.6914,0.2611,0.5148,0.5713,0.4231,0.535,0.6505,0.2807,0.5953,0.4766
DecisionTreeRegressor,0.8717,0.6916,0.9336,-0.3475,0.2908,0.4001,0.285,0.4554,0.5338,0.5155,0.7208,0.4209
RandomForestRegressor,0.4143,0.5114,0.6437,0.3595,0.606,0.6747,0.2604,0.421,0.5103,0.5573,0.748,0.4044
GradientBoostingRegressor,0.5017,0.5393,0.7083,0.2245,0.5259,0.5728,0.3122,0.405,0.5588,0.4692,0.6981,0.4099
AdaBoostRegressor,0.4786,0.5241,0.6918,0.2602,0.5355,0.6994,0.2443,0.3853,0.4943,0.5847,0.7648,0.3989
XGBRegressor,0.5769,0.5567,0.7595,0.1082,0.4566,0.5193,0.2805,0.4082,0.5297,0.5231,0.7239,0.4374
ExtraTreesRegressor,0.5292,0.5815,0.7274,0.182,0.4639,0.5474,0.249,0.3805,0.499,0.5767,0.761,0.4099
LinearRegression,0.5459,0.6196,0.7388,0.1562,0.5039,0.538,0.3545,0.4316,0.5954,0.3973,0.6681,0.5007
KNeighborsRegressor,0.5613,0.5908,0.7492,0.1324,0.4288,0.4183,0.3562,0.4865,0.5969,0.3944,0.6594,0.4649
SVR,0.6415,0.6651,0.8009,0.0083,0.3131,0.3536,0.2922,0.457,0.5405,0.5033,0.755,0.4374


In [11]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_top_10_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_top_10_features_MDCK.csv')

In [12]:
#Top 20 features
n = 20  
top_20_indices = importances.argsort()[::-1][:n]  
top_20_features = feature_names[top_20_indices].tolist()  # convert to list

# Output the list
print("Top", 20, "features:\n")
print(top_20_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_20_features]], axis=1)
test_df = test[train.columns] 

Top 20 features:

['x_fine_emb_MFXL617', 'TDB10s', 'x_fine_emb_MFXL161', 'x_fine_emb_MFXL603', 'x_fine_emb_MFXL639', 'TDB10v', 'x_fine_emb_MFXL167', 'x_fine_emb_MFXL244', 'x_fine_emb_MFXL22', 'minssCH2', 'ATSC.26', 'RDF155m', 'x_fine_emb_MFXL246', 'TDB10m', 'x_fine_emb_MFXL127', 'x_fine_emb_MFXL565', 'geomDiameter', 'x_fine_emb_MFXL339', 'x_fine_emb_MFXL10', 'x_fine_emb_MFXL450']


In [13]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 20)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 20)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 6
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 14
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testin



Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.4258,0.5183,0.6525,0.3418,0.5871,0.6176,0.4114,0.533,0.6414,0.3006,0.5705,0.5207
DecisionTreeRegressor,0.5392,0.5682,0.7343,0.1664,0.5663,0.6327,0.3612,0.5191,0.601,0.3859,0.6533,0.2558
RandomForestRegressor,0.3873,0.4903,0.6224,0.4012,0.6397,0.6811,0.2601,0.4382,0.51,0.5578,0.7493,0.4099
GradientBoostingRegressor,0.4612,0.5151,0.6791,0.287,0.5628,0.6382,0.2631,0.4337,0.513,0.5527,0.7437,0.4759
AdaBoostRegressor,0.3676,0.4787,0.6063,0.4318,0.6663,0.6899,0.2067,0.389,0.4546,0.6486,0.8058,0.4979
XGBRegressor,0.57,0.5576,0.755,0.1189,0.4957,0.5726,0.2734,0.4304,0.5229,0.5351,0.741,0.4484
ExtraTreesRegressor,0.4541,0.5547,0.6739,0.298,0.5521,0.5735,0.2268,0.3935,0.4762,0.6145,0.787,0.542
LinearRegression,0.8905,0.7452,0.9437,-0.3766,0.3748,0.3863,0.4708,0.5412,0.6861,0.1996,0.6211,0.4512
KNeighborsRegressor,0.4808,0.5548,0.6934,0.2567,0.5355,0.5297,0.3906,0.5027,0.625,0.336,0.6023,0.597
SVR,0.5416,0.5999,0.736,0.1627,0.44,0.4686,0.3402,0.5072,0.5833,0.4216,0.6587,0.5585


In [14]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_top_20_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_top_20_features_MDCK.csv')

In [15]:
#Top 50 features
n = 50  
top_50_indices = importances.argsort()[::-1][:n] 
top_50_features = feature_names[top_50_indices].tolist()  # convert to list

# Output the list
print("Top", 50, "features:\n")
print(top_50_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_50_features]], axis=1)
test_df = test[train.columns] 

Top 50 features:

['x_fine_emb_MFXL617', 'TDB10s', 'x_fine_emb_MFXL161', 'x_fine_emb_MFXL603', 'x_fine_emb_MFXL639', 'TDB10v', 'x_fine_emb_MFXL167', 'x_fine_emb_MFXL244', 'x_fine_emb_MFXL22', 'minssCH2', 'ATSC.26', 'RDF155m', 'x_fine_emb_MFXL246', 'TDB10m', 'x_fine_emb_MFXL127', 'x_fine_emb_MFXL565', 'geomDiameter', 'x_fine_emb_MFXL339', 'x_fine_emb_MFXL10', 'x_fine_emb_MFXL450', 'x_fine_emb_MFXL319', 'GATS.4', 'x_fine_emb_MFXL103', 'x_fine_emb_MFXL448', 'TDB10e', 'RDF85s', 'RDF40m', 'AATSC.17', 'x_fine_emb_MFXL74', 'x_fine_emb_MFXL640', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL704', 'LOBMIN', 'x_fine_emb_MFXL423', 'x_fine_emb_MFXL345', 'x_fine_emb_MFXL41', 'x_fine_emb_MFXL548', 'x_fine_emb_MFXL9', 'x_fine_emb_MFXL203', 'Tm', 'x_fine_emb_MFXL655', 'x_fine_emb_MFXL27', 'nHBint8', 'x_fine_emb_MFXL739', 'x_fine_emb_MFXL151', 'FNSA-2', 'x_fine_emb_MFXL685', 'x_fine_emb_MFXL380', 'x_fine_emb_MFXL703', 'x_fine_emb_MFXL90']


In [16]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 50)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 50)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 12
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 593
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 39
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test



Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5195,0.6032,0.7208,0.1969,0.4549,0.4837,0.3711,0.5332,0.6092,0.3691,0.612,0.4484
DecisionTreeRegressor,0.6523,0.5986,0.8076,-0.0083,0.4641,0.5293,0.3793,0.4808,0.6159,0.3551,0.6247,0.3439
RandomForestRegressor,0.4195,0.4971,0.6477,0.3516,0.5972,0.656,0.2847,0.4511,0.5336,0.516,0.7199,0.4814
GradientBoostingRegressor,0.4575,0.4927,0.6764,0.2927,0.5649,0.6791,0.2995,0.4519,0.5473,0.4908,0.7014,0.4539
AdaBoostRegressor,0.4214,0.508,0.6492,0.3485,0.604,0.6589,0.2886,0.4431,0.5372,0.5094,0.7143,0.4979
XGBRegressor,0.5786,0.5421,0.7606,0.1056,0.4812,0.6045,0.3303,0.4585,0.5747,0.4384,0.6743,0.3604
ExtraTreesRegressor,0.4007,0.5144,0.633,0.3807,0.6177,0.653,0.2849,0.4256,0.5338,0.5156,0.7226,0.4979
LinearRegression,1.932,1.0372,1.39,-1.9866,0.2732,0.2629,0.46,0.563,0.6782,0.218,0.6455,0.4649
KNeighborsRegressor,0.5043,0.55,0.7101,0.2205,0.5152,0.5762,0.3044,0.4413,0.5518,0.4825,0.7001,0.597
SVR,0.4899,0.5632,0.6999,0.2427,0.5058,0.5497,0.3542,0.518,0.5952,0.3978,0.6343,0.564


In [17]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_top_50_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_top_50_features_MDCK.csv')

In [18]:
#Top 100 features
n = 100  
top_100_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_100_features = feature_names[top_100_indices].tolist()  # convert to list

# Output the list
print("Top", 100, "features:\n")
print(top_100_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_100_features]], axis=1)
test_df = test[train.columns] 

Top 100 features:

['x_fine_emb_MFXL617', 'TDB10s', 'x_fine_emb_MFXL161', 'x_fine_emb_MFXL603', 'x_fine_emb_MFXL639', 'TDB10v', 'x_fine_emb_MFXL167', 'x_fine_emb_MFXL244', 'x_fine_emb_MFXL22', 'minssCH2', 'ATSC.26', 'RDF155m', 'x_fine_emb_MFXL246', 'TDB10m', 'x_fine_emb_MFXL127', 'x_fine_emb_MFXL565', 'geomDiameter', 'x_fine_emb_MFXL339', 'x_fine_emb_MFXL10', 'x_fine_emb_MFXL450', 'x_fine_emb_MFXL319', 'GATS.4', 'x_fine_emb_MFXL103', 'x_fine_emb_MFXL448', 'TDB10e', 'RDF85s', 'RDF40m', 'AATSC.17', 'x_fine_emb_MFXL74', 'x_fine_emb_MFXL640', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL704', 'LOBMIN', 'x_fine_emb_MFXL423', 'x_fine_emb_MFXL345', 'x_fine_emb_MFXL41', 'x_fine_emb_MFXL548', 'x_fine_emb_MFXL9', 'x_fine_emb_MFXL203', 'Tm', 'x_fine_emb_MFXL655', 'x_fine_emb_MFXL27', 'nHBint8', 'x_fine_emb_MFXL739', 'x_fine_emb_MFXL151', 'FNSA-2', 'x_fine_emb_MFXL685', 'x_fine_emb_MFXL380', 'x_fine_emb_MFXL703', 'x_fine_emb_MFXL90', 'x_fine_emb_MFXL278', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL122', 'x_fi

In [19]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 100)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 100)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 16
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1183
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 78
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of t

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5037,0.5915,0.7097,0.2213,0.4757,0.4979,0.3646,0.5212,0.6038,0.3802,0.6245,0.4869
DecisionTreeRegressor,1.057,0.75,1.0281,-0.634,0.108,0.2019,0.2909,0.4536,0.5393,0.5055,0.7122,0.3274
RandomForestRegressor,0.5001,0.5676,0.7072,0.2269,0.4827,0.5696,0.2785,0.4518,0.5277,0.5265,0.7299,0.4704
GradientBoostingRegressor,0.7006,0.6253,0.837,-0.083,0.2686,0.3339,0.2809,0.4504,0.53,0.5225,0.7231,0.4924
AdaBoostRegressor,0.5099,0.5556,0.7141,0.2117,0.4873,0.5906,0.2764,0.4491,0.5257,0.5302,0.7285,0.4649
XGBRegressor,0.8332,0.666,0.9128,-0.288,0.2223,0.3304,0.3229,0.4592,0.5683,0.451,0.6746,0.3879
ExtraTreesRegressor,0.4519,0.5293,0.6722,0.3014,0.5518,0.6147,0.2931,0.4485,0.5414,0.5018,0.7155,0.4099
LinearRegression,0.834,0.7108,0.9132,-0.2892,0.4588,0.5336,0.558,0.5963,0.747,0.0514,0.4402,0.1265
KNeighborsRegressor,0.616,0.6013,0.7848,0.0478,0.352,0.4227,0.3395,0.4782,0.5826,0.4229,0.6516,0.4319
SVR,0.5194,0.594,0.7207,0.1971,0.4581,0.5016,0.3591,0.518,0.5993,0.3894,0.6284,0.4759


In [20]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_top_100_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_top_100_features_MDCK.csv')

In [21]:
#Top 200 features
n = 200  
top_200_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_200_features = feature_names[top_200_indices].tolist()  # convert to list

# Output the list
print("Top", 200, "features:\n")
print(top_200_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_200_features]], axis=1)
test_df = test[train.columns]

Top 200 features:

['x_fine_emb_MFXL617', 'TDB10s', 'x_fine_emb_MFXL161', 'x_fine_emb_MFXL603', 'x_fine_emb_MFXL639', 'TDB10v', 'x_fine_emb_MFXL167', 'x_fine_emb_MFXL244', 'x_fine_emb_MFXL22', 'minssCH2', 'ATSC.26', 'RDF155m', 'x_fine_emb_MFXL246', 'TDB10m', 'x_fine_emb_MFXL127', 'x_fine_emb_MFXL565', 'geomDiameter', 'x_fine_emb_MFXL339', 'x_fine_emb_MFXL10', 'x_fine_emb_MFXL450', 'x_fine_emb_MFXL319', 'GATS.4', 'x_fine_emb_MFXL103', 'x_fine_emb_MFXL448', 'TDB10e', 'RDF85s', 'RDF40m', 'AATSC.17', 'x_fine_emb_MFXL74', 'x_fine_emb_MFXL640', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL704', 'LOBMIN', 'x_fine_emb_MFXL423', 'x_fine_emb_MFXL345', 'x_fine_emb_MFXL41', 'x_fine_emb_MFXL548', 'x_fine_emb_MFXL9', 'x_fine_emb_MFXL203', 'Tm', 'x_fine_emb_MFXL655', 'x_fine_emb_MFXL27', 'nHBint8', 'x_fine_emb_MFXL739', 'x_fine_emb_MFXL151', 'FNSA-2', 'x_fine_emb_MFXL685', 'x_fine_emb_MFXL380', 'x_fine_emb_MFXL703', 'x_fine_emb_MFXL90', 'x_fine_emb_MFXL278', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL122', 'x_fi

In [22]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 200)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 200)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 621
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 42
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 157
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5343,0.6108,0.731,0.174,0.4316,0.4124,0.3341,0.5151,0.578,0.432,0.6717,0.5309
DecisionTreeRegressor,0.9811,0.7506,0.9905,-0.5167,0.066,0.1567,0.3797,0.5113,0.6162,0.3546,0.6236,0.3604
RandomForestRegressor,0.5141,0.5784,0.717,0.2052,0.4577,0.5535,0.2564,0.4393,0.5064,0.5641,0.7657,0.4759
GradientBoostingRegressor,0.7813,0.6788,0.8839,-0.2078,0.1591,0.2341,0.2918,0.4717,0.5402,0.5039,0.7107,0.4319
AdaBoostRegressor,0.4874,0.5529,0.6981,0.2465,0.5159,0.5641,0.2372,0.4097,0.487,0.5968,0.7733,0.5585
XGBRegressor,0.7918,0.692,0.8898,-0.2241,0.2173,0.2622,0.367,0.4801,0.6058,0.3761,0.6277,0.3659
ExtraTreesRegressor,0.4825,0.5505,0.6946,0.2542,0.5098,0.5914,0.271,0.4449,0.5206,0.5392,0.7368,0.4264
LinearRegression,0.8101,0.7095,0.9001,-0.2523,0.344,0.4077,0.4937,0.5304,0.7026,0.1608,0.5924,0.4732
KNeighborsRegressor,0.6334,0.6093,0.7958,0.0209,0.3372,0.4474,0.3795,0.5175,0.6161,0.3548,0.6001,0.4319
SVR,0.5729,0.6234,0.7569,0.1144,0.372,0.4202,0.365,0.5226,0.6042,0.3794,0.6278,0.4759


In [23]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_top_200_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_top_200_features_MDCK.csv')

In [24]:
#Top 500 features
n = 500  
top_500_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_500_features = feature_names[top_500_indices].tolist()  # convert to list

# Output the list
print("Top", 500, "features:\n")
print(top_500_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_500_features]], axis=1)
test_df = test[train.columns]

Top 500 features:

['x_fine_emb_MFXL617', 'TDB10s', 'x_fine_emb_MFXL161', 'x_fine_emb_MFXL603', 'x_fine_emb_MFXL639', 'TDB10v', 'x_fine_emb_MFXL167', 'x_fine_emb_MFXL244', 'x_fine_emb_MFXL22', 'minssCH2', 'ATSC.26', 'RDF155m', 'x_fine_emb_MFXL246', 'TDB10m', 'x_fine_emb_MFXL127', 'x_fine_emb_MFXL565', 'geomDiameter', 'x_fine_emb_MFXL339', 'x_fine_emb_MFXL10', 'x_fine_emb_MFXL450', 'x_fine_emb_MFXL319', 'GATS.4', 'x_fine_emb_MFXL103', 'x_fine_emb_MFXL448', 'TDB10e', 'RDF85s', 'RDF40m', 'AATSC.17', 'x_fine_emb_MFXL74', 'x_fine_emb_MFXL640', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL704', 'LOBMIN', 'x_fine_emb_MFXL423', 'x_fine_emb_MFXL345', 'x_fine_emb_MFXL41', 'x_fine_emb_MFXL548', 'x_fine_emb_MFXL9', 'x_fine_emb_MFXL203', 'Tm', 'x_fine_emb_MFXL655', 'x_fine_emb_MFXL27', 'nHBint8', 'x_fine_emb_MFXL739', 'x_fine_emb_MFXL151', 'FNSA-2', 'x_fine_emb_MFXL685', 'x_fine_emb_MFXL380', 'x_fine_emb_MFXL703', 'x_fine_emb_MFXL90', 'x_fine_emb_MFXL278', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL122', 'x_fi

In [25]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (51, 500)
y_train shape:  (51,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (13, 500)
y_test shape:  (13,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1432
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 97
[LightGBM] [Info] Start training from score -5.726629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6048
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 400
[LightGBM] [Info] Start training from score -5.661081
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.5467,0.6326,0.7394,0.1549,0.411,0.417,0.303,0.4725,0.5505,0.4848,0.7313,0.5254
DecisionTreeRegressor,1.1332,0.8342,1.0645,-0.7517,0.0223,0.1008,0.2883,0.4593,0.5369,0.5099,0.717,0.4264
RandomForestRegressor,0.5389,0.5983,0.7341,0.1669,0.4168,0.5228,0.2782,0.4595,0.5275,0.527,0.7404,0.4484
GradientBoostingRegressor,0.7941,0.7079,0.8911,-0.2276,0.1264,0.1462,0.2936,0.4758,0.5419,0.5008,0.7087,0.3879
AdaBoostRegressor,0.5266,0.5902,0.7257,0.1859,0.4607,0.5345,0.2788,0.4428,0.528,0.526,0.7313,0.4869
XGBRegressor,0.852,0.725,0.923,-0.3171,0.1364,0.1714,0.3094,0.4633,0.5562,0.474,0.6894,0.4319
ExtraTreesRegressor,0.5333,0.5887,0.7303,0.1756,0.4402,0.5412,0.2655,0.4392,0.5152,0.5487,0.7451,0.4704
LinearRegression,0.7817,0.6953,0.8841,-0.2083,0.3237,0.3296,0.4515,0.5417,0.6719,0.2325,0.5939,0.4814
KNeighborsRegressor,0.6107,0.6135,0.7815,0.056,0.3596,0.4092,0.436,0.5842,0.6603,0.2588,0.5359,0.3714
SVR,0.5939,0.6415,0.7707,0.0819,0.336,0.3877,0.3526,0.5124,0.5938,0.4006,0.648,0.4649


In [26]:
result_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/combined_top_500_features_MDCK.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/MDCK/results/combined_features/prediction_data_combined_top_500_features_MDCK.csv')