In [1]:
#Starts from here
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def remove_low_variance_columns(df, threshold=0.005):
    """Remove columns whose variance is strictly below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose per-column variance is computed with ``DataFrame.var()``.
    threshold : float, default 0.005
        A column is removed when ``variance < threshold``.

    Returns
    -------
    tuple
        ``(reduced_frame, dropped_column_names)``. ``df`` itself is left
        untouched because ``DataFrame.drop`` returns a new frame.
    """
    column_variance = df.var()

    # Boolean mask over the variance index: True where the column is near-constant.
    below_cutoff = (column_variance < threshold).to_numpy()
    low_variance_columns = column_variance.index[below_cutoff].tolist()

    return df.drop(columns=low_variance_columns), low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    """Select features by pruning one member of every highly correlated pair.

    For every pair of non-target columns whose absolute Pearson correlation
    exceeds ``threshold``, the column with the weaker absolute correlation to
    ``target_column`` is marked for removal.

    Each unordered pair is examined exactly once. When both columns are
    equally correlated with the target (for example two identical columns),
    the column that appears first is kept and the later one is dropped, so a
    tie never removes both members of the pair. A NaN target correlation is
    treated as 0 (weakest).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the candidate features and ``target_column``.
    target_column : str, default 'Permeability'
        Name of the target column; it is never returned as a feature.
    threshold : float, default 0.9
        Absolute feature-feature correlation above which a pair is
        considered redundant.

    Returns
    -------
    list
        Names of the retained feature columns, in ``df`` column order.
    """
    correlation_matrix = df.corr()

    candidates = [col for col in correlation_matrix.columns if col != target_column]
    if not candidates:
        # Nothing to compare; every non-target column is kept.
        return [col for col in df.columns if col != target_column]

    # |corr(feature, target)| for each candidate; NaN counts as "no signal".
    target_strength = (
        correlation_matrix[target_column].abs().fillna(0.0).loc[candidates].to_numpy()
    )
    pairwise = correlation_matrix.loc[candidates, candidates].abs().to_numpy()

    # Upper triangle (k=1) visits each unordered pair once and skips the diagonal.
    first_idx, second_idx = np.where(np.triu(pairwise > threshold, k=1))

    features_to_drop = set()
    for i, j in zip(first_idx, second_idx):
        if target_strength[j] > target_strength[i]:
            features_to_drop.add(candidates[i])
        else:
            # Strictly weaker, or a tie: keep the earlier column.
            features_to_drop.add(candidates[j])

    selected_features = [
        col for col in df.columns
        if col not in features_to_drop and col != target_column
    ]

    return selected_features

In [2]:
def _load_feature_block(train_csv, test_csv):
    """Load one feature family and reduce it to the columns selected on train.

    Train: read, sort by ID, drop rows containing NaN, keep numeric columns
    (excluding ID/SMILES), remove low-variance columns, then prune highly
    correlated features with ``features``. Test: read, sort by ID, drop rows
    containing NaN, and restrict to the columns kept for train.

    Returns ``(train_block, test_block)``; both start with the columns
    ID, SMILES, Permeability followed by the selected features.
    """
    train_sorted = pd.read_csv(train_csv).sort_values(by='ID').dropna()
    numeric = train_sorted.drop(['ID', 'SMILES'], axis=1).select_dtypes(include=['number'])
    numeric, _ = remove_low_variance_columns(numeric)
    kept = features(numeric, "Permeability")
    train_block = pd.concat(
        [train_sorted[['ID', 'SMILES', 'Permeability']], train_sorted[kept]], axis=1
    )

    # NOTE: dropna() runs on ALL test columns before the column subset is taken,
    # so a NaN in a column that is later discarded still removes the row.
    test_block = pd.read_csv(test_csv).sort_values(by='ID').dropna()
    test_block = test_block[train_block.columns]
    return train_block, test_block


# 2D/3D descriptors
df_desc_train, df_desc_test = _load_feature_block(
    '/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv',
    '/home/users/akshay/PCPpred/Caco2/features/Descriptors/Test_2d_3d_all_descriptors_Caco2.csv',
)

# Fingerprints
df_fp_train, df_fp_test = _load_feature_block(
    '/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Train/All_fingerprints_train_Caco2.csv',
    '/home/users/akshay/PCPpred/Caco2/features/Fingerprints/Test/All_fingerprints_test_Caco2.csv',
)

# SMILES embeddings
df_emb_train, df_emb_test = _load_feature_block(
    '/home/users/akshay/PCPpred/Caco2/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv',
    '/home/users/akshay/PCPpred/Caco2/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_caco2.csv',
)

# Atomic features
df_atomic_train, df_atomic_test = _load_feature_block(
    '/home/users/akshay/PCPpred/Caco2/features/Atomic/Train_all_atomic_desc_Caco2.csv',
    '/home/users/akshay/PCPpred/Caco2/features/Atomic/Test_all_atomic_desc_Caco2.csv',
)


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Restrict every other feature family to the IDs that survived in the descriptor set.
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
for frame in (
    df_desc_train, df_desc_test,
    df_fp_train, df_fp_test,
    df_emb_train, df_emb_test,
    df_atomic_train, df_atomic_test,
):
    print(frame.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/Caco2/features/Descriptors/Train_2d_3d_all_descriptors_Caco2.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(1008, 262)
(252, 262)
(1008, 916)
(252, 916)
(1008, 763)
(252, 763)
(1008, 12)
(252, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [3]:
# Columns shared by every feature family; used as the join key.
merge_keys = ['ID', 'SMILES', 'Permeability']

# Join fingerprints, embeddings and atomic features onto the descriptor table,
# in that order, using pandas' default (inner) merge.
merged_train = df_desc_train
for extra_train in (df_fp_train, df_emb_train, df_atomic_train):
    merged_train = merged_train.merge(extra_train, on=merge_keys)

merged_test = df_desc_test
for extra_test in (df_fp_test, df_emb_test, df_atomic_test):
    merged_test = merged_test.merge(extra_test, on=merge_keys)

In [4]:
# Numeric view of the merged training table (ID/SMILES removed, target kept so
# that `features` can rank columns against it).
without_ids = merged_train.drop(columns=['ID', 'SMILES'])
X_train = without_ids.select_dtypes(include=['number'])

# Second correlation-pruning pass, this time across all feature families at once.
selected_final_features = features(X_train, target_column='Permeability')

train = pd.concat(
    [
        merged_train[['ID', 'SMILES', 'Permeability']],
        X_train[selected_final_features],
    ],
    axis=1,
)
# The test table is reduced to exactly the columns chosen on the training data.
test = merged_test[train.columns]

print('selected_final_features', selected_final_features )
print("Final Train shape:", train.shape)
print("Final Test shape:", test.shape)

selected_final_features ['qed', 'SPS', 'FpDensityMorgan1', 'AvgIpc', 'Ipc', 'PEOE_VSA14', 'EState_VSA11', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumUnspecifiedAtomStereoCenters', 'fr_Al_OH_noTert', 'fr_Ar_N', 'fr_aryl_methyl', 'fr_bicyclic', 'fr_methoxy', 'fr_morpholine', 'fr_para_hydroxylation', 'fr_piperdine', 'fr_unbrch_alkane', 'BasicGroupCount', 'AdjacencyMatrix.6', 'AdjacencyMatrix.9', 'AATS', 'AATS.5', 'AATS.19', 'AATS.90', 'AATS.95', 'AATS.98', 'ATSC.4', 'ATSC.5', 'ATSC.7', 'ATSC.11', 'ATSC.16', 'ATSC.20', 'ATSC.21', 'ATSC.22', 'ATSC.23', 'ATSC.24', 'ATSC.25', 'ATSC.26', 'ATSC.32', 'ATSC.33', 'ATSC.37', 'ATSC.41', 'ATSC.43', 'ATSC.44', 'ATSC.62', 'ATSC.64', 'ATSC.68', 'ATSC.75', 'ATSC.76', 'ATSC.82', 'ATSC.87', 'ATSC.88', 'ATSC.101', 'ATSC.106', 'AATSC.9', 'AATSC.11', 'AATSC.12', 'AATSC.13', 'AATSC.14', 'AATSC.15', 'AATSC.16', 'AATSC.17', 'AATSC.38', 'AATSC.39', 'AATSC.40', 'AATSC.42', 'AATSC.52', 'AATSC.59', 'AATSC.61', 'AATSC.62', 'AATSC.100', 'GATS.3', 'GATS.13', 

In [5]:
def _regression_metrics(y_true, y_pred):
    """Return (mse, mae, rmse, r2, pearson_r, spearman_rho) for one prediction vector."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pearson, _ = pearsonr(y_true, y_pred)
    spearman, _ = spearmanr(y_true, y_pred)
    return mse, mae, np.sqrt(mse), r2, pearson, spearman


def train_and_test_predict(models, X_train, y_train, X_test, y_test, clip_range=(-10, -3.9)):
    """Evaluate each model with 5-fold CV on train and a fold-averaged test prediction.

    For every model, a shuffled 5-fold split (``random_state=101``) is run on
    the training data. In each fold the model is refit on the fold's training
    part, then predicts the held-out part and the full test set. Held-out
    predictions are pooled across folds for the "Train ... (5 fold cv)"
    metrics; the five test predictions are averaged for the test metrics.

    Parameters
    ----------
    models : iterable
        Estimator instances exposing ``fit`` and ``predict``. Each instance is
        refit in place; after the call it holds the fit from the last fold.
    X_train, y_train : pandas objects
        Training features and target; both are indexed with ``.iloc``.
    X_test, y_test : array-like
        Test features and target.
    clip_range : tuple(float, float) or None, default (-10, -3.9)
        Lower/upper bounds applied to every prediction with ``np.clip``.
        Pass ``None`` to disable clipping.

    Returns
    -------
    (results_df, predictions_df)
        ``results_df`` has one row per model class name, with metrics stored
        as strings formatted to four decimals. Because rows are keyed by class
        name, two models of the same class overwrite each other.
        ``predictions_df`` has one row per model with the pooled out-of-fold
        predictions (in fold order, not original row order), the test target,
        the per-fold test predictions and their mean/std.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []

    def _clip(values):
        # Bound predictions to the configured range, if any.
        if clip_range is None:
            return values
        return np.clip(values, clip_range[0], clip_range[1])

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            # Out-of-fold predictions, stored next to their true values so
            # both lists share the same (fold) ordering.
            predictions_train.extend(_clip(model.predict(X_val_fold)))
            actual_y_train.extend(y_val_fold)

            test_predictions_folds.append(_clip(model.predict(X_test)))

        (mse_train, mae_train, rmse_train,
         r2_train, pearson_train, spearman_train) = _regression_metrics(
            actual_y_train, predictions_train
        )

        # Ensemble the five fold models by averaging their test predictions.
        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        (mse_test, mae_test, rmse_test,
         r2_test, pearson_test, spearman_test) = _regression_metrics(
            y_test, predictions_test_mean
        )
        print(f"{model_name}: Test R2 = {r2_test:.4f}")

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [7]:
# Benchmark every model on the full selected feature set.
banner = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

X_train = train.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print(banner)

X_test = test[X_train.columns]
y_test = test['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print(banner)

# Standardise with statistics learned on the training rows only, then wrap the
# arrays back into DataFrames so column names and row index are preserved.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 1917)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 1917)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249118
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1653
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 249128
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 1670
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1739,0.3129,0.417,0.7132,0.8445,0.8281,0.1932,0.342,0.4395,0.6662,0.8183,0.8005
DecisionTreeRegressor,0.3415,0.4364,0.5843,0.4369,0.7184,0.7009,0.2365,0.3644,0.4863,0.5914,0.7757,0.7399
RandomForestRegressor,0.1755,0.3124,0.4189,0.7106,0.8436,0.8263,0.2004,0.3505,0.4476,0.6538,0.8116,0.7881
GradientBoostingRegressor,0.1749,0.3135,0.4182,0.7115,0.8436,0.8244,0.1913,0.3409,0.4374,0.6694,0.8209,0.8024
AdaBoostRegressor,0.1932,0.3345,0.4396,0.6813,0.8294,0.8082,0.2164,0.3697,0.4652,0.6261,0.8024,0.7795
XGBRegressor,0.2029,0.3333,0.4505,0.6653,0.8174,0.7998,0.2032,0.3475,0.4508,0.6489,0.8079,0.7877
ExtraTreesRegressor,0.1651,0.3047,0.4063,0.7277,0.8537,0.84,0.1895,0.3358,0.4353,0.6725,0.8225,0.8096
LinearRegression,1.0941,0.8077,1.046,-0.8045,0.5198,0.5232,0.5765,0.5721,0.7593,0.0039,0.582,0.5864
KNeighborsRegressor,0.1877,0.319,0.4332,0.6905,0.834,0.8071,0.1743,0.3161,0.4175,0.6987,0.8364,0.818
SVR,0.1597,0.2977,0.3996,0.7366,0.8591,0.8536,0.177,0.3236,0.4207,0.6941,0.8355,0.8275


In [8]:
# Persist the metric table and the raw predictions for the full feature set.
# NOTE(review): prediction_df cells hold whole sequences (lists, arrays and the
# y_test Series); to_csv presumably writes their string form, which pandas may
# abbreviate with "..." for long Series. Verify the CSV round-trips before relying on it.
result_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_features_caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_features_caco2.csv')

In [9]:
# Rank the selected features with a random forest fitted on the whole
# (unscaled) training table.
y = train['Permeability']
X = train.drop(columns=['ID', 'SMILES', 'Permeability'])

rf = RandomForestRegressor(random_state=101, n_jobs=-1, n_estimators=100)
rf.fit(X, y)

# `importances[i]` belongs to `feature_names[i]`.
feature_names = X.columns
importances = rf.feature_importances_


In [10]:
# Top 10 features by random-forest importance.
# NOTE: `importances` comes from a forest fitted on the full training table, so
# cross-validated scores computed on these columns have already seen every fold.
n = 10
top_10_indices = importances.argsort()[::-1][:n]  # indices of the n largest importances
top_10_features = feature_names[top_10_indices].tolist()

# Output the list
print("Top", n, "features:\n")
print(top_10_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_10_features]], axis=1)
# Build the test table from train_df (not train) so both hold the same reduced column set.
test_df = test[train_df.columns]

Top 10 features:

['x_fine_emb_MFXL339', 'x_fine_emb_MFXL263', 'x_fine_emb_MFXL682', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL411', 'x_fine_emb_MFXL355', 'x_fine_emb_MFXL618', 'x_fine_emb_MFXL287', 'x_fine_emb_MFXL709', 'x_fine_emb_MFXL451']


In [11]:
# Benchmark every model on the top-10 feature subset.
banner = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print(banner)

X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print(banner)

# Standardise with statistics learned on the training rows only, then wrap the
# arrays back into DataFrames so column names and row index are preserved.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 10)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 10)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 10
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 10
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



0.43822206303697686




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1937,0.3306,0.4402,0.6805,0.8257,0.8029,0.2265,0.3757,0.4759,0.6086,0.7826,0.7491
DecisionTreeRegressor,0.3532,0.4518,0.5943,0.4174,0.7164,0.6926,0.256,0.399,0.506,0.5576,0.7573,0.7289
RandomForestRegressor,0.1862,0.3246,0.4315,0.6929,0.8324,0.81,0.2198,0.3718,0.4688,0.6202,0.7906,0.7602
GradientBoostingRegressor,0.1907,0.3273,0.4367,0.6855,0.8284,0.805,0.2366,0.3871,0.4864,0.5911,0.7724,0.7378
AdaBoostRegressor,0.2178,0.362,0.4666,0.6409,0.8044,0.7795,0.2436,0.4006,0.4936,0.579,0.7732,0.7447
XGBRegressor,0.2146,0.3489,0.4633,0.646,0.8076,0.7857,0.2308,0.3708,0.4805,0.6011,0.778,0.7427
ExtraTreesRegressor,0.1867,0.326,0.4321,0.6921,0.832,0.807,0.2149,0.3677,0.4636,0.6286,0.796,0.7705
LinearRegression,0.1983,0.3465,0.4453,0.673,0.8204,0.8047,0.2443,0.4013,0.4942,0.5779,0.7648,0.7464
KNeighborsRegressor,0.2132,0.3496,0.4617,0.6485,0.81,0.7735,0.2493,0.3879,0.4993,0.5692,0.7643,0.7216
SVR,0.1937,0.3262,0.4401,0.6805,0.8254,0.8017,0.2183,0.3654,0.4672,0.6228,0.7906,0.762


In [12]:
# Persist the metric table and the raw predictions for the top-10 feature run.
# NOTE(review): prediction_df cells hold whole sequences; confirm that to_csv
# does not abbreviate them (e.g. the y_test Series) before relying on the file.
result_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_top_10_features_caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_top_10_features_caco2.csv')

In [13]:
# Top 20 features by random-forest importance.
# NOTE: `importances` comes from a forest fitted on the full training table, so
# cross-validated scores computed on these columns have already seen every fold.
n = 20
top_20_indices = importances.argsort()[::-1][:n]  # indices of the n largest importances
top_20_features = feature_names[top_20_indices].tolist()  # convert to list

# Output the list
print("Top", n, "features:\n")
print(top_20_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_20_features]], axis=1)
# Build the test table from train_df (not train) so both hold the same reduced column set.
test_df = test[train_df.columns]

Top 20 features:

['x_fine_emb_MFXL339', 'x_fine_emb_MFXL263', 'x_fine_emb_MFXL682', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL411', 'x_fine_emb_MFXL355', 'x_fine_emb_MFXL618', 'x_fine_emb_MFXL287', 'x_fine_emb_MFXL709', 'x_fine_emb_MFXL451', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL22', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL399', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL388', 'x_fine_emb_MFXL420', 'x_fine_emb_MFXL209', 'x_fine_emb_MFXL753', 'x_fine_emb_MFXL208']


In [14]:
# Benchmark every model on the top-20 feature subset.
banner = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print(banner)

X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print(banner)

# Standardise with statistics learned on the training rows only, then wrap the
# arrays back into DataFrames so column names and row index are preserved.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 20)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 20)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 20
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000487 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 20
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



0.21894296441390593




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1925,0.3335,0.4388,0.6825,0.8267,0.8027,0.2225,0.3685,0.4717,0.6156,0.7879,0.7558
DecisionTreeRegressor,0.3345,0.4282,0.5784,0.4483,0.7306,0.7104,0.2545,0.3816,0.5044,0.5603,0.7647,0.7308
RandomForestRegressor,0.1824,0.3256,0.4271,0.6991,0.8362,0.8129,0.2084,0.3601,0.4565,0.6399,0.8026,0.7775
GradientBoostingRegressor,0.1918,0.3298,0.438,0.6836,0.8272,0.8034,0.2198,0.367,0.4688,0.6203,0.7907,0.76
AdaBoostRegressor,0.2088,0.3541,0.457,0.6556,0.8141,0.7895,0.2335,0.3933,0.4832,0.5965,0.7869,0.7655
XGBRegressor,0.2097,0.3468,0.4579,0.6542,0.8112,0.7853,0.211,0.3642,0.4594,0.6354,0.7998,0.7722
ExtraTreesRegressor,0.1852,0.3257,0.4303,0.6946,0.8335,0.8086,0.2077,0.3588,0.4558,0.6411,0.8036,0.7827
LinearRegression,0.195,0.3454,0.4416,0.6784,0.8237,0.8093,0.2384,0.3964,0.4882,0.5881,0.7707,0.7549
KNeighborsRegressor,0.201,0.3387,0.4483,0.6685,0.8209,0.7891,0.2151,0.3569,0.4638,0.6283,0.7964,0.764
SVR,0.1911,0.3247,0.4371,0.6849,0.8282,0.8076,0.2046,0.3503,0.4523,0.6465,0.8048,0.7767


In [15]:
# Persist the metric table and the raw predictions for the top-20 feature run.
# NOTE(review): prediction_df cells hold whole sequences; confirm that to_csv
# does not abbreviate them (e.g. the y_test Series) before relying on the file.
result_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_top_20_features_caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_top_20_features_caco2.csv')

In [16]:
# Top 50 features by random-forest importance.
# NOTE: `importances` comes from a forest fitted on the full training table, so
# cross-validated scores computed on these columns have already seen every fold.
n = 50
top_50_indices = importances.argsort()[::-1][:n]  # indices of the n largest importances
top_50_features = feature_names[top_50_indices].tolist()  # convert to list

# Output the list
print("Top", n, "features:\n")
print(top_50_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_50_features]], axis=1)
# Build the test table from train_df (not train) so both hold the same reduced column set.
test_df = test[train_df.columns]

Top 50 features:

['x_fine_emb_MFXL339', 'x_fine_emb_MFXL263', 'x_fine_emb_MFXL682', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL411', 'x_fine_emb_MFXL355', 'x_fine_emb_MFXL618', 'x_fine_emb_MFXL287', 'x_fine_emb_MFXL709', 'x_fine_emb_MFXL451', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL22', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL399', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL388', 'x_fine_emb_MFXL420', 'x_fine_emb_MFXL209', 'x_fine_emb_MFXL753', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL571', 'x_fine_emb_MFXL333', 'x_fine_emb_MFXL530', 'x_fine_emb_MFXL359', 'x_fine_emb_MFXL659', 'x_fine_emb_MFXL678', 'x_fine_emb_MFXL721', 'x_fine_emb_MFXL675', 'x_fine_emb_MFXL76', 'ATSC.7', 'x_fine_emb_MFXL632', 'x_fine_emb_MFXL33', 'x_fine_emb_MFXL517', 'ATSC8s', 'x_fine_emb_MFXL184', 'x_fine_emb_MFXL100', 'x_fine_emb_MFXL275', 'x_fine_emb_MFXL315', 'x_fine_emb_MFXL667', 'x_fine_emb_MFXL88', 'maxHBint6', 'x_fine_emb_MFXL660', 'x_fine_emb_MFXL62', 'x_fine_emb_MFXL169', 'x_fine_emb_MFXL604', 'x_fine_emb_MFXL136', 'x_fine

In [17]:
# Benchmark every model on the top-50 feature subset.
banner = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print(banner)

X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print(banner)

# Standardise with statistics learned on the training rows only, then wrap the
# arrays back into DataFrames so column names and row index are preserved.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 50)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 50)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 50
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 50
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over



0.12784119242402636




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1739,0.3137,0.417,0.7132,0.8447,0.8271,0.2066,0.3479,0.4545,0.643,0.8042,0.7809
DecisionTreeRegressor,0.3211,0.42,0.5667,0.4704,0.7392,0.7129,0.2505,0.3878,0.5005,0.5672,0.7635,0.7325
RandomForestRegressor,0.1716,0.3099,0.4143,0.7169,0.8468,0.8272,0.2004,0.3476,0.4476,0.6537,0.8104,0.7842
GradientBoostingRegressor,0.1773,0.3177,0.4211,0.7075,0.8412,0.8197,0.2057,0.3515,0.4535,0.6446,0.8057,0.7826
AdaBoostRegressor,0.191,0.3373,0.4371,0.685,0.8324,0.8125,0.2176,0.3737,0.4664,0.6241,0.8018,0.7806
XGBRegressor,0.1953,0.3314,0.4419,0.678,0.8251,0.8032,0.2111,0.3564,0.4595,0.6352,0.799,0.7743
ExtraTreesRegressor,0.1712,0.3103,0.4138,0.7176,0.8472,0.8293,0.2031,0.3501,0.4507,0.6491,0.8077,0.7922
LinearRegression,0.183,0.33,0.4278,0.6982,0.8359,0.8208,0.2188,0.3793,0.4678,0.6218,0.7914,0.7719
KNeighborsRegressor,0.1941,0.3252,0.4405,0.6799,0.8282,0.8054,0.2161,0.3583,0.4649,0.6266,0.7967,0.7751
SVR,0.1736,0.3095,0.4166,0.7137,0.8451,0.8293,0.2124,0.3505,0.4609,0.633,0.7969,0.771


In [18]:
# Persist the metric table and the raw predictions for the top-50 feature run.
# NOTE(review): prediction_df cells hold whole sequences; confirm that to_csv
# does not abbreviate them (e.g. the y_test Series) before relying on the file.
result_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_top_50_features_caco2.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_top_50_features_caco2.csv')

In [19]:
# Top 100 features
n = 100
# argsort orders ascending, so reverse it and keep the first n positions:
# these are the positions of the n largest values in `importances`.
top_100_indices = importances.argsort()[::-1][:n]
top_100_features = feature_names[top_100_indices].tolist()  # plain list of column names

# Output the list
print("Top", n, "features:\n")
print(top_100_features)

# Training frame: ID / SMILES / Permeability columns from `train`, followed by
# the selected feature columns taken from `X`.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_100_features]], axis=1)
# Give the test frame exactly the same columns (and order) as train_df, so the
# two frames share one schema instead of test_df carrying every column of `train`.
test_df = test[train_df.columns]

Top 100 features:

['x_fine_emb_MFXL339', 'x_fine_emb_MFXL263', 'x_fine_emb_MFXL682', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL411', 'x_fine_emb_MFXL355', 'x_fine_emb_MFXL618', 'x_fine_emb_MFXL287', 'x_fine_emb_MFXL709', 'x_fine_emb_MFXL451', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL22', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL399', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL388', 'x_fine_emb_MFXL420', 'x_fine_emb_MFXL209', 'x_fine_emb_MFXL753', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL571', 'x_fine_emb_MFXL333', 'x_fine_emb_MFXL530', 'x_fine_emb_MFXL359', 'x_fine_emb_MFXL659', 'x_fine_emb_MFXL678', 'x_fine_emb_MFXL721', 'x_fine_emb_MFXL675', 'x_fine_emb_MFXL76', 'ATSC.7', 'x_fine_emb_MFXL632', 'x_fine_emb_MFXL33', 'x_fine_emb_MFXL517', 'ATSC8s', 'x_fine_emb_MFXL184', 'x_fine_emb_MFXL100', 'x_fine_emb_MFXL275', 'x_fine_emb_MFXL315', 'x_fine_emb_MFXL667', 'x_fine_emb_MFXL88', 'maxHBint6', 'x_fine_emb_MFXL660', 'x_fine_emb_MFXL62', 'x_fine_emb_MFXL169', 'x_fine_emb_MFXL604', 'x_fine_emb_MFXL136', 'x_fin

In [20]:
# Split train_df into the feature matrix and the Permeability target.
X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# The test matrix takes the same columns, in the same order, as X_train.
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise the features: the scaler is fitted on the training matrix only
# and the same fitted scaler is then applied to both matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the scaled values in DataFrames that keep the original column labels and row index.
X_train = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=X_test.columns)

# Regressors handed to train_and_test_predict; seeded ones use random_state=101.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 100)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 100)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25498
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 100
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25498
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 100
[LightGBM] [Info] Start training from score -6.29540



0.12093306381170499


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1682,0.3077,0.4101,0.7226,0.8502,0.8311,0.1962,0.3437,0.4429,0.661,0.8147,0.7986
DecisionTreeRegressor,0.3072,0.4107,0.5543,0.4933,0.7473,0.7112,0.2289,0.3613,0.4784,0.6045,0.7853,0.7534
RandomForestRegressor,0.1681,0.3054,0.41,0.7227,0.8504,0.8311,0.1981,0.3462,0.445,0.6578,0.8132,0.7926
GradientBoostingRegressor,0.1747,0.3146,0.4179,0.7119,0.8438,0.8258,0.204,0.3501,0.4516,0.6475,0.8064,0.7885
AdaBoostRegressor,0.1879,0.3348,0.4335,0.6901,0.8347,0.8176,0.2167,0.3721,0.4655,0.6256,0.8012,0.7888
XGBRegressor,0.194,0.3328,0.4405,0.68,0.8257,0.8051,0.2,0.3458,0.4472,0.6545,0.811,0.7957
ExtraTreesRegressor,0.1658,0.3032,0.4072,0.7265,0.8526,0.836,0.2003,0.3489,0.4475,0.6539,0.8106,0.7945
LinearRegression,0.1777,0.3226,0.4216,0.7069,0.8416,0.8331,0.2215,0.3751,0.4706,0.6173,0.7909,0.7741
KNeighborsRegressor,0.1835,0.3207,0.4284,0.6973,0.839,0.8098,0.194,0.3359,0.4405,0.6647,0.8175,0.8034
SVR,0.1664,0.3027,0.4079,0.7256,0.852,0.8404,0.1988,0.3389,0.4458,0.6565,0.8109,0.7964


In [21]:
# Write the current result_df and prediction_df to their top-100 CSV files
# (result_df first, then prediction_df).
for _frame, _csv_path in (
    (result_df, '/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_top_100_features_caco2.csv'),
    (prediction_df, '/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_top_100_features_caco2.csv'),
):
    _frame.to_csv(_csv_path)

In [22]:
#Top 200 features
n = 200
# argsort orders ascending, so reverse it and keep the first n positions:
# these are the positions of the n largest values in `importances`.
top_200_indices = importances.argsort()[::-1][:n]
top_200_features = feature_names[top_200_indices].tolist()  # plain list of column names

# Output the list
print("Top", n, "features:\n")
print(top_200_features)

# Training frame: ID / SMILES / Permeability columns from `train`, followed by
# the selected feature columns taken from `X`.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_200_features]], axis=1)
# Give the test frame exactly the same columns (and order) as train_df, so the
# two frames share one schema instead of test_df carrying every column of `train`.
test_df = test[train_df.columns]

Top 200 features:

['x_fine_emb_MFXL339', 'x_fine_emb_MFXL263', 'x_fine_emb_MFXL682', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL411', 'x_fine_emb_MFXL355', 'x_fine_emb_MFXL618', 'x_fine_emb_MFXL287', 'x_fine_emb_MFXL709', 'x_fine_emb_MFXL451', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL22', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL399', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL388', 'x_fine_emb_MFXL420', 'x_fine_emb_MFXL209', 'x_fine_emb_MFXL753', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL571', 'x_fine_emb_MFXL333', 'x_fine_emb_MFXL530', 'x_fine_emb_MFXL359', 'x_fine_emb_MFXL659', 'x_fine_emb_MFXL678', 'x_fine_emb_MFXL721', 'x_fine_emb_MFXL675', 'x_fine_emb_MFXL76', 'ATSC.7', 'x_fine_emb_MFXL632', 'x_fine_emb_MFXL33', 'x_fine_emb_MFXL517', 'ATSC8s', 'x_fine_emb_MFXL184', 'x_fine_emb_MFXL100', 'x_fine_emb_MFXL275', 'x_fine_emb_MFXL315', 'x_fine_emb_MFXL667', 'x_fine_emb_MFXL88', 'maxHBint6', 'x_fine_emb_MFXL660', 'x_fine_emb_MFXL62', 'x_fine_emb_MFXL169', 'x_fine_emb_MFXL604', 'x_fine_emb_MFXL136', 'x_fin

In [23]:
# Split train_df into the feature matrix and the Permeability target.
X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# The test matrix takes the same columns, in the same order, as X_train.
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise the features: the scaler is fitted on the training matrix only
# and the same fitted scaler is then applied to both matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the scaled values in DataFrames that keep the original column labels and row index.
X_train = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=X_test.columns)

# Regressors handed to train_and_test_predict; seeded ones use random_state=101.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 200)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 200)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50496
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 200
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50494
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 200
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 



0.3223417413454741




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1675,0.3092,0.4092,0.7238,0.8508,0.8332,0.1968,0.3406,0.4437,0.6599,0.814,0.8003
DecisionTreeRegressor,0.3278,0.4253,0.5725,0.4594,0.7293,0.7033,0.2353,0.3645,0.485,0.5935,0.7792,0.753
RandomForestRegressor,0.1681,0.3051,0.41,0.7227,0.8506,0.8325,0.2023,0.348,0.4498,0.6504,0.8086,0.787
GradientBoostingRegressor,0.1715,0.308,0.4141,0.7172,0.8469,0.8296,0.201,0.3479,0.4484,0.6526,0.81,0.7928
AdaBoostRegressor,0.1912,0.3321,0.4373,0.6846,0.8305,0.8111,0.2085,0.3672,0.4566,0.6398,0.8105,0.793
XGBRegressor,0.1879,0.3274,0.4335,0.6901,0.8315,0.8152,0.2151,0.3626,0.4637,0.6284,0.7952,0.7807
ExtraTreesRegressor,0.1621,0.301,0.4026,0.7326,0.8563,0.8398,0.1945,0.3411,0.4411,0.6638,0.8166,0.8011
LinearRegression,0.2021,0.3432,0.4495,0.6667,0.8216,0.816,0.2299,0.3738,0.4795,0.6027,0.7835,0.7779
KNeighborsRegressor,0.1831,0.3192,0.4279,0.6981,0.8392,0.8096,0.1749,0.3138,0.4182,0.6978,0.8359,0.8244
SVR,0.1646,0.3013,0.4058,0.7285,0.8537,0.8426,0.1922,0.3365,0.4384,0.6679,0.8178,0.8098


In [24]:
# Write the current result_df and prediction_df to their top-200 CSV files
# (result_df first, then prediction_df).
for _frame, _csv_path in (
    (result_df, '/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_top_200_features_caco2.csv'),
    (prediction_df, '/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_top_200_features_caco2.csv'),
):
    _frame.to_csv(_csv_path)

In [25]:
#Top 500 features
n = 500
# argsort orders ascending, so reverse it and keep the first n positions:
# these are the positions of the n largest values in `importances`.
top_500_indices = importances.argsort()[::-1][:n]
top_500_features = feature_names[top_500_indices].tolist()  # plain list of column names

# Output the list
print("Top", n, "features:\n")
print(top_500_features)

# Training frame: ID / SMILES / Permeability columns from `train`, followed by
# the selected feature columns taken from `X`.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_500_features]], axis=1)
# Give the test frame exactly the same columns (and order) as train_df, so the
# two frames share one schema instead of test_df carrying every column of `train`.
test_df = test[train_df.columns]

Top 500 features:

['x_fine_emb_MFXL339', 'x_fine_emb_MFXL263', 'x_fine_emb_MFXL682', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL411', 'x_fine_emb_MFXL355', 'x_fine_emb_MFXL618', 'x_fine_emb_MFXL287', 'x_fine_emb_MFXL709', 'x_fine_emb_MFXL451', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL22', 'x_fine_emb_MFXL250', 'x_fine_emb_MFXL399', 'x_fine_emb_MFXL466', 'x_fine_emb_MFXL388', 'x_fine_emb_MFXL420', 'x_fine_emb_MFXL209', 'x_fine_emb_MFXL753', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL571', 'x_fine_emb_MFXL333', 'x_fine_emb_MFXL530', 'x_fine_emb_MFXL359', 'x_fine_emb_MFXL659', 'x_fine_emb_MFXL678', 'x_fine_emb_MFXL721', 'x_fine_emb_MFXL675', 'x_fine_emb_MFXL76', 'ATSC.7', 'x_fine_emb_MFXL632', 'x_fine_emb_MFXL33', 'x_fine_emb_MFXL517', 'ATSC8s', 'x_fine_emb_MFXL184', 'x_fine_emb_MFXL100', 'x_fine_emb_MFXL275', 'x_fine_emb_MFXL315', 'x_fine_emb_MFXL667', 'x_fine_emb_MFXL88', 'maxHBint6', 'x_fine_emb_MFXL660', 'x_fine_emb_MFXL62', 'x_fine_emb_MFXL169', 'x_fine_emb_MFXL604', 'x_fine_emb_MFXL136', 'x_fin

In [26]:
# Split train_df into the feature matrix and the Permeability target.
X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# The test matrix takes the same columns, in the same order, as X_train.
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise the features: the scaler is fitted on the training matrix only
# and the same fitted scaler is then applied to both matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the scaled values in DataFrames that keep the original column labels and row index.
X_train = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=X_test.columns)

# Regressors handed to train_and_test_predict; seeded ones use random_state=101.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (1008, 500)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 500)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 126012
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 500
[LightGBM] [Info] Start training from score -6.294177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 126007
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 500
[LightGBM] [Info] Start training from score -6.295407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, th

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.171,0.3102,0.4135,0.718,0.8473,0.8308,0.1972,0.3468,0.444,0.6593,0.8137,0.7938
DecisionTreeRegressor,0.335,0.4266,0.5788,0.4475,0.7267,0.7118,0.2456,0.3753,0.4956,0.5756,0.7653,0.7364
RandomForestRegressor,0.1692,0.3064,0.4113,0.721,0.8497,0.832,0.1998,0.3489,0.447,0.6548,0.812,0.7893
GradientBoostingRegressor,0.1743,0.3126,0.4175,0.7125,0.8442,0.8245,0.197,0.3466,0.4438,0.6597,0.8145,0.7944
AdaBoostRegressor,0.1907,0.3318,0.4367,0.6855,0.8311,0.8132,0.2135,0.3702,0.462,0.6311,0.8054,0.7848
XGBRegressor,0.1998,0.3349,0.447,0.6705,0.8199,0.8043,0.2047,0.3535,0.4524,0.6464,0.8059,0.7933
ExtraTreesRegressor,0.1614,0.2984,0.4017,0.7338,0.8573,0.8414,0.1963,0.3454,0.4431,0.6608,0.8149,0.798
LinearRegression,0.3937,0.4735,0.6275,0.3506,0.7131,0.711,0.3663,0.4454,0.6052,0.367,0.6926,0.7115
KNeighborsRegressor,0.1915,0.3237,0.4376,0.6842,0.8317,0.8056,0.1817,0.3229,0.4262,0.6861,0.8295,0.8164
SVR,0.1698,0.3023,0.412,0.72,0.8489,0.8432,0.1943,0.3383,0.4408,0.6642,0.8168,0.8151


In [27]:
# Write the current result_df and prediction_df to their top-500 CSV files
# (result_df first, then prediction_df).
for _frame, _csv_path in (
    (result_df, '/home/users/akshay/PCPpred/Caco2/results/combined_features/combined_top_500_features_caco2.csv'),
    (prediction_df, '/home/users/akshay/PCPpred/Caco2/results/combined_features/prediction_combined_top_500_features_caco2.csv'),
):
    _frame.to_csv(_csv_path)