In [1]:
#Starts from here
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    # Identify columns with variance below the threshold
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [2]:
#Combined features and then selection

df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/All_fingerprints_train.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/All_fingerprints_test.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Train_all_atomic_desc.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Test_all_atomic_desc.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)


  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
  df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(5568, 271)
(1392, 271)
(5568, 1128)
(1392, 1128)
(5568, 758)
(1392, 758)
(5568, 12)
(1392, 12)


In [3]:
merge_keys = ['ID', 'SMILES', 'Permeability']

merged_train = df_desc_train.merge(df_fp_train, on=merge_keys)
merged_train = merged_train.merge(df_emb_train, on=merge_keys)
merged_train = merged_train.merge(df_atomic_train, on=merge_keys)

merged_test = df_desc_test.merge(df_fp_test, on=merge_keys)
merged_test = merged_test.merge(df_emb_test, on=merge_keys)
merged_test = merged_test.merge(df_atomic_test, on=merge_keys)

In [4]:
X_train = merged_train.drop(columns=['ID', 'SMILES']).select_dtypes(include=['number'])
selected_final_features = features(X_train, target_column='Permeability')

train = pd.concat([merged_train[['ID', 'SMILES', 'Permeability']], X_train[selected_final_features]], axis=1)
test = merged_test[train.columns] 

print('selected_final_features', selected_final_features )
print("Final Train shape:", train.shape)
print("Final Test shape:", test.shape)

selected_final_features ['MinEStateIndex', 'qed', 'SPS', 'FpDensityMorgan1', 'BCUT2D_MWHI', 'AvgIpc', 'BalabanJ_x', 'Ipc', 'EState_VSA11', 'VSA_EState10', 'fr_Ar_N', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_aryl_methyl', 'fr_methoxy', 'fr_morpholine', 'fr_para_hydroxylation', 'fr_piperdine', 'fr_priamide', 'AdjacencyMatrix.6', 'AdjacencyMatrix.9', 'AdjacencyMatrix.11', 'AATS', 'AATS.3', 'AATS.4', 'AATS.10', 'AATS.23', 'AATS.24', 'AATS.26', 'AATS.27', 'AATS.28', 'AATS.32', 'AATS.33', 'AATS.34', 'AATS.40', 'AATS.48', 'AATS.96', 'AATS.97', 'ATSC.2', 'ATSC.5', 'ATSC.7', 'ATSC.8', 'ATSC.12', 'ATSC.13', 'ATSC.14', 'ATSC.15', 'ATSC.16', 'ATSC.17', 'ATSC.20', 'ATSC.21', 'ATSC.22', 'ATSC.23', 'ATSC.24', 'ATSC.25', 'ATSC.26', 'ATSC.29', 'ATSC.30', 'ATSC.32', 'ATSC.37', 'ATSC.44', 'ATSC.50', 'ATSC.73', 'ATSC.76', 'ATSC.80', 'ATSC.84', 'ATSC.92', 'ATSC.96', 'ATSC.105', 'ATSC.106', 'AATSC.9', 'AATSC.11', 'AATSC.15', 'AATSC.16', 'AATSC.36', 'AATSC.43', 'AATSC.57', 'AATSC.60', 'GATS.1', 'GATS.3', 'G

In [5]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [6]:
X_train = train.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test[X_train.columns]
y_test = test['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 2138)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 2138)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 252943
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2104
[LightGBM] [Info] Start training from score -5.747051
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.159816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 252956
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 2106
[LightGBM] [Info] Start training from score -5.747251
[LightGBM] [Info] Auto-choosing col-wise multi-threa

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1288,0.2609,0.3589,0.7932,0.8906,0.8738,0.1985,0.3147,0.4455,0.6863,0.8286,0.8124
DecisionTreeRegressor,0.272,0.3786,0.5215,0.5633,0.7832,0.762,0.2193,0.3358,0.4683,0.6535,0.8106,0.7844
RandomForestRegressor,0.1326,0.2642,0.3642,0.787,0.8872,0.8693,0.2011,0.3169,0.4485,0.6821,0.826,0.8087
GradientBoostingRegressor,0.1299,0.2644,0.3605,0.7914,0.8896,0.8689,0.2027,0.3194,0.4503,0.6796,0.8247,0.8051
AdaBoostRegressor,0.1697,0.3229,0.412,0.7275,0.8612,0.8329,0.2273,0.365,0.4767,0.6409,0.8074,0.7814
XGBRegressor,0.1492,0.2813,0.3863,0.7604,0.8732,0.8549,0.198,0.3115,0.445,0.6871,0.8293,0.8152
ExtraTreesRegressor,0.1352,0.2646,0.3677,0.783,0.8849,0.8686,0.1976,0.3149,0.4445,0.6878,0.8294,0.8127
LinearRegression,0.6014,0.4391,0.7755,0.0343,0.6303,0.7483,0.3642,0.3899,0.6035,0.4245,0.7083,0.7477
KNeighborsRegressor,0.1908,0.3176,0.4369,0.6936,0.836,0.8131,0.2284,0.3371,0.4779,0.6391,0.8026,0.7938
SVR,0.149,0.2769,0.386,0.7608,0.8728,0.8588,0.207,0.3213,0.455,0.6728,0.8208,0.8087


In [7]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_features_pampa.csv')

In [8]:
X = train.drop(columns=['ID', 'SMILES', 'Permeability'])
y = train['Permeability']

rf = RandomForestRegressor(n_estimators=100, random_state=101, n_jobs=-1)
rf.fit(X, y)

importances = rf.feature_importances_
feature_names = X.columns


In [9]:
#Top 10 features
n = 10  
top_10_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_10_features = feature_names[top_10_indices].tolist()  # convert to list

# Output the list
print("Top", 10, "features:\n")
print(top_10_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_10_features]], axis=1)
test_df = test[train.columns] 

Top 10 features:

['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556', 'x_fine_emb_MFXL149']


In [10]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('Features: ', X_train.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 10)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 10)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Features:  Index(['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686',
       'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448',
       'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556',
       'x_fine_emb_MFXL149'],
      dtype='object')
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 10
[LightGBM] [Info] Start training from score -5.747051
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000579 seconds.
You can set `force_col_



0.6459508851818214


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1395,0.2774,0.3735,0.7761,0.8809,0.8577,0.2178,0.3359,0.4667,0.6558,0.8101,0.7894
DecisionTreeRegressor,0.2859,0.3908,0.5347,0.541,0.7743,0.7447,0.2405,0.3541,0.4904,0.62,0.7916,0.7633
RandomForestRegressor,0.1421,0.2781,0.377,0.7718,0.8786,0.8572,0.213,0.3316,0.4616,0.6633,0.8147,0.7929
GradientBoostingRegressor,0.14,0.2796,0.3741,0.7753,0.8805,0.8548,0.2213,0.3383,0.4705,0.6503,0.8069,0.7839
AdaBoostRegressor,0.2016,0.3528,0.449,0.6763,0.8363,0.8073,0.2633,0.3976,0.5131,0.584,0.7762,0.7588
XGBRegressor,0.1569,0.2931,0.3961,0.748,0.8662,0.8453,0.219,0.3354,0.468,0.6539,0.8096,0.7876
ExtraTreesRegressor,0.1422,0.2758,0.3771,0.7717,0.8786,0.8597,0.2107,0.329,0.459,0.6671,0.8169,0.7967
LinearRegression,0.1415,0.2815,0.3761,0.7729,0.8791,0.8538,0.2278,0.3481,0.4773,0.64,0.8008,0.7757
KNeighborsRegressor,0.167,0.2989,0.4087,0.7318,0.8576,0.8332,0.2248,0.3361,0.4742,0.6447,0.8057,0.7855
SVR,0.1371,0.2722,0.3702,0.7799,0.8835,0.8595,0.2242,0.3385,0.4735,0.6458,0.805,0.784


In [11]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_top_10_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_top_10_features_pampa.csv')

In [12]:
#Top 20 features
n = 20  
top_20_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_20_features = feature_names[top_20_indices].tolist()  # convert to list

# Output the list
print("Top", 20, "features:\n")
print(top_20_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_20_features]], axis=1)
test_df = test[train.columns] 

Top 20 features:

['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556', 'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166', 'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519', 'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14']


In [13]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('Features: ', X_train.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 20)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 20)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Features:  Index(['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686',
       'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448',
       'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556',
       'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148',
       'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166',
       'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519',
       'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14'],
      dtype='object')
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001370 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data po



0.6468227331924721


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1343,0.2704,0.3664,0.7844,0.8857,0.8638,0.2128,0.3299,0.4613,0.6637,0.815,0.7953
DecisionTreeRegressor,0.273,0.3813,0.5225,0.5617,0.7823,0.7567,0.2369,0.3482,0.4867,0.6257,0.7944,0.7716
RandomForestRegressor,0.1371,0.2717,0.3703,0.7799,0.8832,0.8632,0.2103,0.3273,0.4586,0.6677,0.8174,0.7972
GradientBoostingRegressor,0.1373,0.2753,0.3705,0.7796,0.8829,0.858,0.2198,0.3344,0.4688,0.6527,0.8085,0.7848
AdaBoostRegressor,0.1947,0.3469,0.4413,0.6873,0.8454,0.8143,0.257,0.3936,0.5069,0.5939,0.7841,0.7686
XGBRegressor,0.1538,0.2888,0.3922,0.7531,0.869,0.8492,0.213,0.3304,0.4616,0.6634,0.8152,0.7946
ExtraTreesRegressor,0.1398,0.2704,0.3739,0.7755,0.8808,0.8643,0.208,0.3253,0.4561,0.6713,0.8196,0.8006
LinearRegression,0.1384,0.2781,0.372,0.7778,0.8819,0.8584,0.2265,0.346,0.4759,0.6421,0.8022,0.779
KNeighborsRegressor,0.1619,0.2929,0.4024,0.74,0.8634,0.842,0.2159,0.3269,0.4647,0.6588,0.8141,0.7959
SVR,0.132,0.2663,0.3634,0.788,0.888,0.8659,0.2125,0.3283,0.461,0.6642,0.8161,0.7974


In [14]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_top_20_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_top_20_features_pampa.csv')

In [15]:
#Top 50 features
n = 50  
top_50_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_50_features = feature_names[top_50_indices].tolist()  # convert to list

# Output the list
print("Top", 50, "features:\n")
print(top_50_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_50_features]], axis=1)
test_df = test[train.columns] 

Top 50 features:

['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556', 'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166', 'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519', 'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14', 'x_fine_emb_MFXL719', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL700', 'x_fine_emb_MFXL706', 'x_fine_emb_MFXL219', 'x_fine_emb_MFXL407', 'x_fine_emb_MFXL1', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL340', 'x_fine_emb_MFXL644', 'x_fine_emb_MFXL193', 'x_fine_emb_MFXL404', 'x_fine_emb_MFXL301', 'x_fine_emb_MFXL96', 'x_fine_emb_MFXL330', 'x_fine_emb_MFXL374', 'x_fine_emb_MFXL329', 'x_fine_emb_MFXL192', 'x_fine_emb_MFXL78', 'x_fine_emb_MFXL511', 'x_fine_emb_MFXL121', 'x_fine_emb_MFXL742', 'x_fine_emb_MFXL85', 'x_fine_emb_MFXL251', 'x_fine_emb_MFXL714', 'x

In [16]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('Features: ', X_train.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 50)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 50)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Features:  Index(['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686',
       'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448',
       'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556',
       'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148',
       'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166',
       'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519',
       'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14', 'x_fine_emb_MFXL719',
       'x_fine_emb_MFXL635', 'x_fine_emb_MFXL700', 'x_fine_emb_MFXL706',
       'x_fine_emb_MFXL219', 'x_fine_emb_MFXL407', 'x_fine_emb_MFXL1',
       'x_fine_emb_MFXL607', 'x_fine_emb_MFXL340', 'x_fine_emb_MFXL644',
       'x_fine_



0.6595427355322909


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1289,0.2628,0.359,0.7931,0.8906,0.8705,0.2025,0.3193,0.45,0.68,0.8248,0.8054
DecisionTreeRegressor,0.266,0.3757,0.5158,0.5729,0.7855,0.7609,0.2333,0.3431,0.483,0.6314,0.798,0.7836
RandomForestRegressor,0.1331,0.2649,0.3648,0.7864,0.8868,0.8685,0.2032,0.3203,0.4508,0.6789,0.8241,0.8055
GradientBoostingRegressor,0.1309,0.2674,0.3619,0.7898,0.8887,0.8668,0.2073,0.3236,0.4553,0.6724,0.8204,0.7968
AdaBoostRegressor,0.176,0.3302,0.4196,0.7173,0.8599,0.8303,0.2357,0.3743,0.4855,0.6275,0.8029,0.7783
XGBRegressor,0.1455,0.2771,0.3814,0.7664,0.8764,0.8573,0.2061,0.3186,0.454,0.6743,0.8219,0.8035
ExtraTreesRegressor,0.1353,0.2655,0.3678,0.7828,0.8849,0.8682,0.2036,0.3212,0.4512,0.6782,0.8238,0.8043
LinearRegression,0.1313,0.2695,0.3623,0.7892,0.8884,0.8658,0.2168,0.3358,0.4657,0.6573,0.8116,0.7876
KNeighborsRegressor,0.1566,0.2879,0.3957,0.7485,0.8678,0.8452,0.2062,0.3181,0.4541,0.6742,0.8235,0.8048
SVR,0.1269,0.2577,0.3563,0.7962,0.8925,0.8739,0.2011,0.3162,0.4484,0.6823,0.827,0.8104


In [17]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_top_50_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_top_50_features_pampa.csv')

In [18]:
#Top 100 features
n = 100  
top_100_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_100_features = feature_names[top_100_indices].tolist()  # convert to list

# Output the list
print("Top", 100, "features:\n")
print(top_100_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_100_features]], axis=1)
test_df = test[train.columns] 

Top 100 features:

['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556', 'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166', 'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519', 'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14', 'x_fine_emb_MFXL719', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL700', 'x_fine_emb_MFXL706', 'x_fine_emb_MFXL219', 'x_fine_emb_MFXL407', 'x_fine_emb_MFXL1', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL340', 'x_fine_emb_MFXL644', 'x_fine_emb_MFXL193', 'x_fine_emb_MFXL404', 'x_fine_emb_MFXL301', 'x_fine_emb_MFXL96', 'x_fine_emb_MFXL330', 'x_fine_emb_MFXL374', 'x_fine_emb_MFXL329', 'x_fine_emb_MFXL192', 'x_fine_emb_MFXL78', 'x_fine_emb_MFXL511', 'x_fine_emb_MFXL121', 'x_fine_emb_MFXL742', 'x_fine_emb_MFXL85', 'x_fine_emb_MFXL251', 'x_fine_emb_MFXL714', '

In [19]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('Features: ', X_train.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 100)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 100)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Features:  Index(['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686',
       'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448',
       'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556',
       'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148',
       'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166',
       'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519',
       'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14', 'x_fine_emb_MFXL719',
       'x_fine_emb_MFXL635', 'x_fine_emb_MFXL700', 'x_fine_emb_MFXL706',
       'x_fine_emb_MFXL219', 'x_fine_emb_MFXL407', 'x_fine_emb_MFXL1',
       'x_fine_emb_MFXL607', 'x_fine_emb_MFXL340', 'x_fine_emb_MFXL644',
       'x_fin



0.6227301560965753




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1284,0.2616,0.3583,0.7939,0.891,0.8718,0.2022,0.3182,0.4497,0.6804,0.8251,0.8063
DecisionTreeRegressor,0.2663,0.3785,0.516,0.5725,0.7855,0.7614,0.2292,0.3426,0.4788,0.6377,0.8011,0.7799
RandomForestRegressor,0.1319,0.2643,0.3632,0.7882,0.8878,0.8699,0.2028,0.3197,0.4503,0.6795,0.8244,0.8068
GradientBoostingRegressor,0.1292,0.2649,0.3594,0.7926,0.8903,0.8692,0.204,0.3216,0.4516,0.6777,0.8236,0.8004
AdaBoostRegressor,0.1718,0.325,0.4144,0.7242,0.8605,0.8324,0.2344,0.3696,0.4841,0.6296,0.8021,0.7759
XGBRegressor,0.1462,0.281,0.3824,0.7652,0.8758,0.8556,0.2036,0.3163,0.4512,0.6783,0.8242,0.8082
ExtraTreesRegressor,0.1329,0.2637,0.3645,0.7866,0.887,0.8704,0.2014,0.318,0.4488,0.6817,0.8258,0.8086
LinearRegression,0.1269,0.2644,0.3562,0.7962,0.8923,0.8711,0.2137,0.332,0.4623,0.6622,0.8146,0.7924
KNeighborsRegressor,0.1626,0.292,0.4032,0.7389,0.8623,0.8389,0.2164,0.3299,0.4652,0.658,0.8137,0.7902
SVR,0.1289,0.2593,0.359,0.7931,0.8908,0.8736,0.2045,0.3183,0.4522,0.6769,0.8235,0.8078


In [20]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_top_100_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_top_100_features_pampa.csv')

In [21]:
#Top 200 features
n = 200  
top_200_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_200_features = feature_names[top_200_indices].tolist()  # convert to list

# Output the list
print("Top", 200, "features:\n")
print(top_200_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_200_features]], axis=1)
test_df = test[train.columns]

Top 200 features:

['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556', 'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166', 'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519', 'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14', 'x_fine_emb_MFXL719', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL700', 'x_fine_emb_MFXL706', 'x_fine_emb_MFXL219', 'x_fine_emb_MFXL407', 'x_fine_emb_MFXL1', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL340', 'x_fine_emb_MFXL644', 'x_fine_emb_MFXL193', 'x_fine_emb_MFXL404', 'x_fine_emb_MFXL301', 'x_fine_emb_MFXL96', 'x_fine_emb_MFXL330', 'x_fine_emb_MFXL374', 'x_fine_emb_MFXL329', 'x_fine_emb_MFXL192', 'x_fine_emb_MFXL78', 'x_fine_emb_MFXL511', 'x_fine_emb_MFXL121', 'x_fine_emb_MFXL742', 'x_fine_emb_MFXL85', 'x_fine_emb_MFXL251', 'x_fine_emb_MFXL714', '

In [22]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('Features: ', X_train.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 200)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 200)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Features:  Index(['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686',
       'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448',
       'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556',
       'x_fine_emb_MFXL149',
       ...
       'x_fine_emb_MFXL529', 'x_fine_emb_MFXL159', 'x_fine_emb_MFXL605',
       'x_fine_emb_MFXL48', 'x_fine_emb_MFXL759', 'x_fine_emb_MFXL383',
       'x_fine_emb_MFXL181', 'x_fine_emb_MFXL525', 'x_fine_emb_MFXL59',
       'x_fine_emb_MFXL45'],
      dtype='object', length=200)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[L

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1294,0.2622,0.3597,0.7922,0.8901,0.8722,0.2013,0.3175,0.4487,0.6819,0.826,0.8089
DecisionTreeRegressor,0.2726,0.3816,0.5221,0.5624,0.783,0.7588,0.2326,0.3445,0.4823,0.6324,0.7984,0.7792
RandomForestRegressor,0.1315,0.2638,0.3626,0.7889,0.8882,0.8708,0.2023,0.3187,0.4498,0.6803,0.8249,0.8081
GradientBoostingRegressor,0.1296,0.265,0.36,0.7919,0.8899,0.8692,0.2033,0.3208,0.4509,0.6787,0.8242,0.802
AdaBoostRegressor,0.1687,0.323,0.4108,0.7291,0.8633,0.8327,0.2324,0.3689,0.4821,0.6328,0.8035,0.7744
XGBRegressor,0.1463,0.2796,0.3825,0.7651,0.8757,0.8568,0.2008,0.3132,0.4481,0.6827,0.8268,0.8115
ExtraTreesRegressor,0.1319,0.2628,0.3632,0.7882,0.8878,0.8706,0.1995,0.317,0.4466,0.6848,0.8276,0.8106
LinearRegression,0.1269,0.264,0.3562,0.7963,0.8925,0.872,0.2104,0.3289,0.4587,0.6676,0.8177,0.7973
KNeighborsRegressor,0.1633,0.2935,0.4041,0.7378,0.8614,0.837,0.2123,0.3271,0.4608,0.6644,0.8175,0.7961
SVR,0.1307,0.2608,0.3616,0.7901,0.8891,0.873,0.203,0.3172,0.4506,0.6792,0.8247,0.8098


In [23]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_top_200_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_top_200_features_pampa.csv')

In [24]:
#Top 500 features
n = 500  
top_500_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_500_features = feature_names[top_500_indices].tolist()  # convert to list

# Output the list
print("Top", 500, "features:\n")
print(top_500_features)

train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_500_features]], axis=1)
test_df = test[train.columns]

Top 500 features:

['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686', 'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448', 'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556', 'x_fine_emb_MFXL149', 'x_fine_emb_MFXL361', 'x_fine_emb_MFXL148', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL4', 'x_fine_emb_MFXL166', 'x_fine_emb_MFXL395', 'x_fine_emb_MFXL128', 'x_fine_emb_MFXL519', 'x_fine_emb_MFXL499', 'x_fine_emb_MFXL14', 'x_fine_emb_MFXL719', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL700', 'x_fine_emb_MFXL706', 'x_fine_emb_MFXL219', 'x_fine_emb_MFXL407', 'x_fine_emb_MFXL1', 'x_fine_emb_MFXL607', 'x_fine_emb_MFXL340', 'x_fine_emb_MFXL644', 'x_fine_emb_MFXL193', 'x_fine_emb_MFXL404', 'x_fine_emb_MFXL301', 'x_fine_emb_MFXL96', 'x_fine_emb_MFXL330', 'x_fine_emb_MFXL374', 'x_fine_emb_MFXL329', 'x_fine_emb_MFXL192', 'x_fine_emb_MFXL78', 'x_fine_emb_MFXL511', 'x_fine_emb_MFXL121', 'x_fine_emb_MFXL742', 'x_fine_emb_MFXL85', 'x_fine_emb_MFXL251', 'x_fine_emb_MFXL714', '

In [25]:
X_train = train_df.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('Features: ', X_train.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=101)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (5568, 500)
y_train shape:  (5568,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (1392, 500)
y_test shape:  (1392,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Features:  Index(['x_fine_emb_MFXL143', 'x_fine_emb_MFXL23', 'x_fine_emb_MFXL686',
       'x_fine_emb_MFXL139', 'x_fine_emb_MFXL0', 'x_fine_emb_MFXL448',
       'x_fine_emb_MFXL208', 'x_fine_emb_MFXL730', 'x_fine_emb_MFXL556',
       'x_fine_emb_MFXL149',
       ...
       'x_fine_emb_MFXL502', 'x_fine_emb_MFXL325', 'x_fine_emb_MFXL352',
       'x_fine_emb_MFXL84', 'x_fine_emb_MFXL339', 'x_fine_emb_MFXL68',
       'x_fine_emb_MFXL642', 'x_fine_emb_MFXL572', 'x_fine_emb_MFXL629',
       'x_fine_emb_MFXL234'],
      dtype='object', length=500)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500


Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1279,0.26,0.3577,0.7946,0.8914,0.8732,0.1988,0.3154,0.4459,0.6858,0.8283,0.8123
DecisionTreeRegressor,0.2666,0.3749,0.5163,0.572,0.786,0.7634,0.2284,0.3426,0.4779,0.6391,0.8022,0.7848
RandomForestRegressor,0.1323,0.2638,0.3637,0.7876,0.8875,0.8699,0.2023,0.3181,0.4498,0.6803,0.8249,0.808
GradientBoostingRegressor,0.1299,0.2653,0.3604,0.7914,0.8896,0.869,0.2032,0.3202,0.4508,0.6788,0.8242,0.8045
AdaBoostRegressor,0.1682,0.3204,0.4102,0.7299,0.8617,0.8315,0.2263,0.3642,0.4757,0.6424,0.808,0.7796
XGBRegressor,0.1475,0.2795,0.384,0.7632,0.8748,0.8597,0.1984,0.3122,0.4455,0.6864,0.8289,0.8139
ExtraTreesRegressor,0.1333,0.2635,0.3651,0.786,0.8866,0.8701,0.2001,0.3171,0.4474,0.6837,0.827,0.8109
LinearRegression,0.1312,0.2683,0.3622,0.7893,0.8889,0.8691,0.2141,0.326,0.4627,0.6617,0.815,0.7988
KNeighborsRegressor,0.1617,0.2929,0.4021,0.7404,0.8628,0.8403,0.2119,0.327,0.4603,0.6652,0.8181,0.799
SVR,0.131,0.2598,0.3619,0.7897,0.8888,0.8732,0.1998,0.3134,0.4469,0.6843,0.8277,0.8143


In [26]:
result_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/combined_top_500_features_pampa.csv')
prediction_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/combined_features/prediction_data_combined_top_500_features_pampa.csv')