In [1]:
#Starts from here
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def remove_low_variance_columns(df, threshold=0.005):
    """Drop the columns of ``df`` whose variance is strictly below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose per-column variance is taken with ``DataFrame.var()``.
    threshold : float, default 0.005
        Columns with variance ``< threshold`` are removed.

    Returns
    -------
    tuple
        ``(df_cleaned, low_variance_columns)``: the frame without the
        removed columns, and the list of removed column labels.
    """
    column_variances = df.var()
    # Boolean mask over the variance index; comparisons against NaN are False,
    # so a column with undefined variance is kept.
    below_threshold = (column_variances < threshold).to_numpy()
    low_variance_columns = column_variances.index[below_threshold].tolist()
    return df.drop(columns=low_variance_columns), low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    """Select features after removing one member of every highly correlated pair.

    The pairwise correlation matrix of ``df`` is computed with
    ``DataFrame.corr()``. For every pair of non-target columns whose absolute
    correlation is strictly greater than ``threshold``, the column with the
    smaller absolute correlation to ``target_column`` is marked for removal.
    When both columns are equally correlated with the target (for example
    exact duplicates), the column that appears first is kept and the later
    one is removed, so a redundant pair never loses both members.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the candidate features and ``target_column``.
    target_column : str, default 'Permeability'
        Column used to decide which member of a correlated pair to keep.
    threshold : float, default 0.9
        Absolute feature-feature correlation above which a pair is
        considered redundant.

    Returns
    -------
    list
        Labels of the retained feature columns, in the order they appear in
        ``df`` (``target_column`` is never included).
    """
    correlation_matrix = df.corr()

    candidates = [col for col in correlation_matrix.columns if col != target_column]
    if not candidates:
        return []

    # Pull the needed correlations out once as plain arrays; per-cell pandas
    # lookups inside a double loop are very slow for wide frames.
    abs_target_corr = correlation_matrix.loc[candidates, target_column].abs().to_numpy()
    abs_pair_corr = correlation_matrix.loc[candidates, candidates].abs().to_numpy()

    # Strict upper triangle: each unordered pair is examined exactly once.
    # NaN correlations compare False and therefore never count as redundant.
    redundant_pairs = np.triu(abs_pair_corr > threshold, k=1)

    features_to_drop = set()
    for first, second in zip(*np.nonzero(redundant_pairs)):
        if abs_target_corr[second] > abs_target_corr[first]:
            features_to_drop.add(candidates[first])
        else:
            # Covers both "first is better" and ties: keep the earlier column.
            features_to_drop.add(candidates[second])

    selected_features = [
        col for col in df.columns
        if col not in features_to_drop and col != target_column
    ]

    return selected_features

In [2]:
def load_feature_set(train_csv, test_csv):
    """Load one feature family and reduce it to its selected columns.

    Training file: read, sort by ``ID``, drop rows containing any NaN, keep
    numeric columns (minus ``ID``/``SMILES``), remove low-variance columns
    with ``remove_low_variance_columns`` and then correlated columns with
    ``features``. Test file: read, sort by ``ID``, drop rows containing any
    NaN, and restrict to the columns kept for training.

    Returns ``(train_frame, test_frame)``; both have the columns
    ``ID``, ``SMILES``, ``Permeability`` followed by the selected features.
    """
    raw_train = pd.read_csv(train_csv).sort_values(by='ID').dropna()
    numeric_train = raw_train.drop(['ID', 'SMILES'], axis=1).select_dtypes(include=['number'])
    numeric_train, _ = remove_low_variance_columns(numeric_train)
    selected = features(numeric_train, "Permeability")
    train_frame = pd.concat(
        [raw_train[['ID', 'SMILES', 'Permeability']], raw_train[selected]], axis=1
    )

    # NOTE(review): rows are dropped for NaNs in ANY test column, including
    # columns that are discarded on the next line - confirm this is intended.
    test_frame = pd.read_csv(test_csv).sort_values(by='ID').dropna()
    test_frame = test_frame[train_frame.columns]
    return train_frame, test_frame


def restrict_to_ids(frame, reference):
    """Keep only the rows of ``frame`` whose ``ID`` also occurs in ``reference``."""
    return frame[frame['ID'].isin(reference['ID'])]


banner = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

# Descriptors
df_desc_train, df_desc_test = load_feature_set(
    '/home/users/akshay/PCPpred/RRCK/features/Descriptors/Train_2d_3d_all_descriptors_RRCK.csv',
    '/home/users/akshay/PCPpred/RRCK/features/Descriptors/Test_2d_3d_all_descriptors_RRCK.csv',
)

# Fingerprints
df_fp_train, df_fp_test = load_feature_set(
    '/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Train/All_fingerprints_train_RRCK.csv',
    '/home/users/akshay/PCPpred/RRCK/features/Fingerprints/Test/All_fingerprints_test_RRCK.csv',
)

# SMILES embeddings
df_emb_train, df_emb_test = load_feature_set(
    '/home/users/akshay/PCPpred/RRCK/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv',
    '/home/users/akshay/PCPpred/RRCK/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings_rrck.csv',
)

# Atomic features
df_atomic_train, df_atomic_test = load_feature_set(
    '/home/users/akshay/PCPpred/RRCK/features/Atomic/Train_all_atomic_desc_RRCK.csv',
    '/home/users/akshay/PCPpred/RRCK/features/Atomic/Test_all_atomic_desc_RRCK.csv',
)

print(banner)
print('Data Loading completed')
print(banner)

# Align every other feature family to the IDs present in the descriptor set.
df_fp_test = restrict_to_ids(df_fp_test, df_desc_test)
df_fp_train = restrict_to_ids(df_fp_train, df_desc_train)

df_emb_test = restrict_to_ids(df_emb_test, df_desc_test)
df_emb_train = restrict_to_ids(df_emb_train, df_desc_train)

df_atomic_test = restrict_to_ids(df_atomic_test, df_desc_test)
df_atomic_train = restrict_to_ids(df_atomic_train, df_desc_train)

print(banner)
print('Data Processing completed')
print(banner)
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(140, 249)
(36, 249)
(140, 414)
(36, 414)
(140, 690)
(36, 690)
(140, 13)
(36, 13)


In [3]:
# Columns shared by every feature table; the joins below match rows on all three.
merge_keys = ['ID', 'SMILES', 'Permeability']

# Join the four training tables one after another (default inner join),
# in the order descriptors -> fingerprints -> embeddings -> atomic.
merged_train = df_desc_train
for feature_table in (df_fp_train, df_emb_train, df_atomic_train):
    merged_train = merged_train.merge(feature_table, on=merge_keys)

# Same sequence of joins for the test tables.
merged_test = df_desc_test
for feature_table in (df_fp_test, df_emb_test, df_atomic_test):
    merged_test = merged_test.merge(feature_table, on=merge_keys)

In [4]:
# Numeric part of the merged training table: identifiers removed, target kept
# because features() needs it to rank correlated columns.
without_identifiers = merged_train.drop(columns=['ID', 'SMILES'])
X_train = without_identifiers.select_dtypes(include=['number'])

# Second correlation filter, this time across all feature families together.
selected_final_features = features(X_train, target_column='Permeability')

# Final tables: identifier/target columns first, then the retained features.
train = pd.concat(
    [
        merged_train.loc[:, ['ID', 'SMILES', 'Permeability']],
        X_train.loc[:, selected_final_features],
    ],
    axis=1,
)
test = merged_test.loc[:, train.columns]

print('selected_final_features', selected_final_features)
print("Final Train shape:", train.shape)
print("Final Test shape:", test.shape)

selected_final_features ['qed', 'SPS', 'MaxAbsPartialCharge', 'FpDensityMorgan1', 'AvgIpc', 'BalabanJ_x', 'Ipc', 'PEOE_VSA14', 'EState_VSA11', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'fr_allylic_oxid', 'AdjacencyMatrix.6', 'ATS.8', 'AATS.2', 'AATS.3', 'AATS.12', 'AATS.13', 'AATS.95', 'AATS.96', 'ATSC.1', 'ATSC.6', 'ATSC.11', 'ATSC.16', 'ATSC.17', 'ATSC.20', 'ATSC.22', 'ATSC.23', 'ATSC.24', 'ATSC.25', 'ATSC.26', 'ATSC.28', 'ATSC.32', 'ATSC.35', 'ATSC.61', 'ATSC.67', 'ATSC.84', 'ATSC.87', 'ATSC.95', 'ATSC.97', 'ATSC.105', 'ATSC.106', 'AATSC.11', 'AATSC.12', 'AATSC.14', 'AATSC.15', 'AATSC.16', 'AATSC.37', 'AATSC.40', 'AATSC.42', 'AATSC.52', 'AATSC.99', 'GATS.1', 'GATS.3', 'GATS.4', 'GATS.5', 'GATS.12', 'GATS.13', 'GATS.14', 'GATS.19', 'GATS.20', 'GATS.22', 'GATS.27', 'GATS.37', 'GATS.82', 'BCUT.3', 'Chi.8', 'AtomTypeEState.16', 'AtomTypeEState.96', 'AtomTypeEState.170', 'AtomTypeEState.173', 'AtomTypeEState.252', 'AtomTypeEState.271', 'EtaVEMCount.6', 'InformationContent.4', 'Log

In [5]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test,
                           n_splits=5, random_state=101, clip_range=(-10, -3.9)):
    """Cross-validate each model on the training data and score it on the test data.

    For every model, a shuffled K-fold split of ``X_train`` is run. In each
    fold the model is refitted on the fold's training part, predicts the
    held-out part (collected as out-of-fold predictions) and also predicts
    the full test set. Test metrics are computed on the mean of the
    per-fold test predictions.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each one is
        refitted in place once per fold.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training features and target; indexed positionally with ``.iloc``.
    X_test, y_test
        Test features and target.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 101
        Seed for the shuffled ``KFold`` split.
    clip_range : tuple of (low, high) or None, default (-10, -3.9)
        Bounds applied to every prediction with ``np.clip``; ``None``
        disables clipping. NOTE(review): presumably the plausible range of
        the target - confirm.

    Returns
    -------
    tuple
        ``(results_df, predictions_df)``. ``results_df`` has one row per
        model with metrics formatted as 4-decimal strings.
        ``predictions_df`` has one row per model holding the out-of-fold
        training predictions (in fold order, not original row order), the
        test targets, the per-fold test predictions and their mean/std.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_label = f"({n_splits} fold cv)"
    results = {}
    predictions = []

    def _clip(values):
        # Bound predictions to clip_range; pass through when clipping is off.
        if clip_range is None:
            return values
        lower, upper = clip_range
        return np.clip(values, lower, upper)

    for model in models:
        model_name = model.__class__.__name__
        # Two estimators of the same class would otherwise overwrite each
        # other's row in `results`; later ones get a numeric suffix.
        if model_name in results:
            suffix = 2
            while f"{model_name}_{suffix}" in results:
                suffix += 1
            model_name = f"{model_name}_{suffix}"

        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            # Out-of-fold predictions and their matching targets, in fold order.
            predictions_train.extend(_clip(model.predict(X_val_fold)))
            actual_y_train.extend(y_val_fold)

            # Each fold's model also predicts the whole test set.
            test_predictions_folds.append(_clip(model.predict(X_test)))

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        # Average the fold models' test predictions; std shows their spread.
        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(f"{model_name} test R2: {r2_test}")
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,
        })

        results[model_name] = {
            f'Train MSE {cv_label}': f"{mse_train:.4f}",
            f'Train MAE {cv_label}': f"{mae_train:.4f}",
            f'Train RMSE {cv_label}': f"{rmse_train:.4f}",
            f'Train R2 {cv_label}': f"{r2_train:.4f}",
            f'Train PCC {cv_label}': f"{pearson_train:.4f}",
            f'Train SCC {cv_label}': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [6]:
# Split the final training table into target and predictors.
y_train = train['Permeability']
X_train = train.drop(columns=['ID', 'SMILES', 'Permeability'])
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Test predictors use exactly the training columns, in the same order.
y_test = test['Permeability']
X_test = test.loc[:, X_train.columns]
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise with statistics learned from the training set only.
# NOTE(review): the scaler sees the full training set before the K-fold split
# inside train_and_test_predict, so validation folds are not fully held out.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Put the scaled arrays back into frames with the original labels.
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

# Regressors to compare, all with their seed fixed where one is accepted.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result_df

X_train shape:  (140, 1336)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 1336)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35951
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1098
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35891
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 1100
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1566,0.2998,0.3957,0.6507,0.8074,0.8155,0.4299,0.4537,0.6557,0.4053,0.7086,0.7711
DecisionTreeRegressor,0.2711,0.3895,0.5206,0.3953,0.6973,0.7088,0.4609,0.4716,0.6789,0.3623,0.6769,0.7515
RandomForestRegressor,0.1683,0.3138,0.4102,0.6246,0.7925,0.8005,0.4879,0.4737,0.6985,0.325,0.6756,0.7342
GradientBoostingRegressor,0.1701,0.3203,0.4124,0.6207,0.7896,0.8023,0.4241,0.4519,0.6512,0.4133,0.7252,0.7825
AdaBoostRegressor,0.1851,0.3265,0.4303,0.587,0.7666,0.7753,0.4469,0.4545,0.6685,0.3818,0.7091,0.7572
XGBRegressor,0.182,0.316,0.4267,0.5939,0.7742,0.7748,0.3954,0.4454,0.6288,0.4531,0.7517,0.7859
ExtraTreesRegressor,0.1521,0.3029,0.39,0.6606,0.8139,0.8311,0.4616,0.4639,0.6794,0.3614,0.6926,0.7678
LinearRegression,0.2053,0.3342,0.453,0.5421,0.7622,0.7881,0.4414,0.4905,0.6644,0.3893,0.6709,0.7158
KNeighborsRegressor,0.2318,0.3439,0.4815,0.4829,0.7231,0.7363,0.5246,0.442,0.7243,0.2742,0.6461,0.7637
SVR,0.1694,0.3085,0.4116,0.6221,0.7949,0.8066,0.4811,0.4413,0.6936,0.3344,0.6577,0.7489


In [7]:
# Persist the metric table and the per-model predictions of the all-features run.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_features_rrck.csv'
)
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/prediction_data_combined_features_rrck.csv'
)

In [8]:
# Unscaled predictors and target taken from the final training table.
y = train['Permeability']
X = train.drop(columns=['ID', 'SMILES', 'Permeability'])

# A seeded random forest fitted on the whole training set; its impurity-based
# importances are used by the following cells to rank features.
rf = RandomForestRegressor(n_estimators=100, random_state=101, n_jobs=-1).fit(X, y)

feature_names = X.columns
importances = rf.feature_importances_


In [9]:
# Top 10 features by random-forest importance
n = 10
top_10_indices = importances.argsort()[::-1][:n]  # positions of the n largest importances, descending
top_10_features = feature_names[top_10_indices].tolist()

# Output the list
print("Top", n, "features:\n")
print(top_10_features)

# Identifier/target columns followed by the selected features only.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_10_features]], axis=1)
# Select by train_df's columns (not train's) so the test table holds the same
# reduced feature set rather than every feature.
test_df = test[train_df.columns]

Top 10 features:

['x_fine_emb_MFXL330', 'x_fine_emb_MFXL552', 'x_fine_emb_MFXL501', 'x_fine_emb_MFXL701', 'x_fine_emb_MFXL268', 'x_fine_emb_MFXL206', 'x_fine_emb_MFXL66', 'x_fine_emb_MFXL356', 'x_fine_emb_MFXL281', 'x_fine_emb_MFXL137']


In [10]:
# Split the reduced training table into target and predictors.
y_train = train_df['Permeability']
X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Test predictors use exactly the training columns, in the same order.
y_test = test_df['Permeability']
X_test = test_df.loc[:, X_train.columns]
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise with statistics learned from the training set only.
# NOTE(review): the scaler sees the full training set before the K-fold split
# inside train_and_test_predict, so validation folds are not fully held out.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Put the scaled arrays back into frames with the original labels.
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

# Same regressor line-up as the all-features run.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result_df

X_train shape:  (140, 10)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 10)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 390
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 10
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 390
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 10
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 



-2.7652016512770268




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1805,0.3199,0.4248,0.5974,0.7744,0.7822,0.4759,0.4944,0.6898,0.3417,0.6752,0.7266
DecisionTreeRegressor,0.3262,0.4548,0.5711,0.2723,0.6505,0.6541,0.546,0.5171,0.7389,0.2447,0.5911,0.6843
RandomForestRegressor,0.1675,0.3201,0.4093,0.6263,0.7918,0.7902,0.4877,0.4929,0.6983,0.3253,0.6703,0.7335
GradientBoostingRegressor,0.1865,0.3431,0.4319,0.5839,0.7725,0.7782,0.4542,0.4853,0.674,0.3716,0.6978,0.7403
AdaBoostRegressor,0.1766,0.3242,0.4203,0.6059,0.7805,0.7785,0.4516,0.4909,0.672,0.3753,0.6919,0.7454
XGBRegressor,0.185,0.3426,0.4301,0.5873,0.7752,0.7828,0.4728,0.4812,0.6876,0.3459,0.6789,0.7584
ExtraTreesRegressor,0.1573,0.3101,0.3966,0.649,0.806,0.8029,0.5011,0.4987,0.7079,0.3068,0.6522,0.7407
LinearRegression,0.1849,0.3207,0.4301,0.5874,0.7681,0.7866,0.5875,0.5242,0.7665,0.1873,0.5581,0.6205
KNeighborsRegressor,0.1594,0.3024,0.3993,0.6444,0.8079,0.8055,0.4811,0.4819,0.6936,0.3344,0.6584,0.7206
SVR,0.1831,0.3133,0.4279,0.5915,0.7713,0.7787,0.5394,0.4966,0.7345,0.2537,0.6265,0.6938


In [11]:
# Persist the metric table and the per-model predictions of the top-10 run.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_top_10_features_rrck.csv'
)
# NOTE(review): this file name starts with an underscore, unlike the other
# prediction files written by this notebook - confirm whether that is intended.
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/_prediction_data_combined_top_10_features_rrck.csv'
)

In [12]:
# Top 20 features by random-forest importance
n = 20
top_20_indices = importances.argsort()[::-1][:n]  # positions of the n largest importances, descending
top_20_features = feature_names[top_20_indices].tolist()  # convert to list

# Output the list
print("Top", n, "features:\n")
print(top_20_features)

# Identifier/target columns followed by the selected features only.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_20_features]], axis=1)
# Select by train_df's columns (not train's) so the test table holds the same
# reduced feature set rather than every feature.
test_df = test[train_df.columns]

Top 20 features:

['x_fine_emb_MFXL330', 'x_fine_emb_MFXL552', 'x_fine_emb_MFXL501', 'x_fine_emb_MFXL701', 'x_fine_emb_MFXL268', 'x_fine_emb_MFXL206', 'x_fine_emb_MFXL66', 'x_fine_emb_MFXL356', 'x_fine_emb_MFXL281', 'x_fine_emb_MFXL137', 'x_fine_emb_MFXL369', 'x_fine_emb_MFXL145', 'x_fine_emb_MFXL168', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL59', 'x_fine_emb_MFXL225', 'x_fine_emb_MFXL702', 'x_fine_emb_MFXL229', 'x_fine_emb_MFXL482']


In [13]:
# Split the reduced training table into target and predictors.
y_train = train_df['Permeability']
X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Test predictors use exactly the training columns, in the same order.
y_test = test_df['Permeability']
X_test = test_df.loc[:, X_train.columns]
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise with statistics learned from the training set only.
# NOTE(review): the scaler sees the full training set before the K-fold split
# inside train_and_test_predict, so validation folds are not fully held out.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Put the scaled arrays back into frames with the original labels.
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

# Same regressor line-up as the all-features run.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result_df

X_train shape:  (140, 20)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 20)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 20
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 20
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 



-2.5203870952945326




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1796,0.3213,0.4238,0.5993,0.776,0.7832,0.4574,0.481,0.6763,0.3672,0.6815,0.7139
DecisionTreeRegressor,0.3336,0.4477,0.5776,0.2558,0.61,0.6223,0.4535,0.4789,0.6735,0.3726,0.6589,0.6804
RandomForestRegressor,0.1695,0.3156,0.4117,0.6219,0.7888,0.7891,0.4537,0.4672,0.6736,0.3724,0.6982,0.7632
GradientBoostingRegressor,0.1925,0.3387,0.4387,0.5706,0.7619,0.7696,0.4351,0.4676,0.6596,0.3981,0.7037,0.7579
AdaBoostRegressor,0.1782,0.3278,0.4222,0.6024,0.7786,0.7821,0.4149,0.4638,0.6441,0.426,0.7293,0.7474
XGBRegressor,0.2167,0.3611,0.4655,0.5166,0.7327,0.7518,0.4347,0.4651,0.6593,0.3987,0.6942,0.7481
ExtraTreesRegressor,0.1613,0.3015,0.4016,0.6402,0.8006,0.8075,0.4661,0.4727,0.6827,0.3551,0.6749,0.7489
LinearRegression,0.2164,0.3469,0.4652,0.5172,0.7291,0.7479,0.5144,0.5163,0.7172,0.2883,0.6338,0.6794
KNeighborsRegressor,0.1795,0.3145,0.4237,0.5996,0.7786,0.7848,0.532,0.4635,0.7294,0.264,0.598,0.708
SVR,0.1858,0.3139,0.431,0.5855,0.7678,0.7759,0.5484,0.4944,0.7405,0.2414,0.6186,0.708


In [14]:
# Persist the metric table and the per-model predictions of the top-20 run.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_top_20_features_rrck.csv'
)
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/prediction_data_combined_top_20_features_rrck.csv'
)

In [15]:
# Top 50 features by random-forest importance
n = 50
top_50_indices = importances.argsort()[::-1][:n]  # positions of the n largest importances, descending
top_50_features = feature_names[top_50_indices].tolist()  # convert to list

# Output the list
print("Top", n, "features:\n")
print(top_50_features)

# Identifier/target columns followed by the selected features only.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_50_features]], axis=1)
# Select by train_df's columns (not train's) so the test table holds the same
# reduced feature set rather than every feature.
test_df = test[train_df.columns]

Top 50 features:

['x_fine_emb_MFXL330', 'x_fine_emb_MFXL552', 'x_fine_emb_MFXL501', 'x_fine_emb_MFXL701', 'x_fine_emb_MFXL268', 'x_fine_emb_MFXL206', 'x_fine_emb_MFXL66', 'x_fine_emb_MFXL356', 'x_fine_emb_MFXL281', 'x_fine_emb_MFXL137', 'x_fine_emb_MFXL369', 'x_fine_emb_MFXL145', 'x_fine_emb_MFXL168', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL59', 'x_fine_emb_MFXL225', 'x_fine_emb_MFXL702', 'x_fine_emb_MFXL229', 'x_fine_emb_MFXL482', 'x_fine_emb_MFXL600', 'x_fine_emb_MFXL252', 'x_fine_emb_MFXL7', 'x_fine_emb_MFXL663', 'x_fine_emb_MFXL234', 'x_fine_emb_MFXL79', 'x_fine_emb_MFXL539', 'GATS.4', 'x_fine_emb_MFXL490', 'x_fine_emb_MFXL227', 'x_fine_emb_MFXL146', 'x_fine_emb_MFXL124', 'x_fine_emb_MFXL557', 'x_fine_emb_MFXL683', 'x_fine_emb_MFXL494', 'x_fine_emb_MFXL687', 'x_fine_emb_MFXL299', 'x_fine_emb_MFXL327', 'x_fine_emb_MFXL44', 'x_fine_emb_MFXL34', 'GATS5e', 'VR2_D', 'x_fine_emb_MFXL50', 'x_fine_emb_MFXL485', 'x_fine_emb_MFXL551', 'x_fine_emb_MFXL288', 'TDB5i', 'x_fi

In [16]:
# Split the reduced training table into target and predictors.
y_train = train_df['Permeability']
X_train = train_df.drop(columns=['ID', 'SMILES', 'Permeability'])
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Test predictors use exactly the training columns, in the same order.
y_test = test_df['Permeability']
X_test = test_df.loc[:, X_train.columns]
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Standardise with statistics learned from the training set only.
# NOTE(review): the scaler sees the full training set before the K-fold split
# inside train_and_test_predict, so validation folds are not fully held out.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Put the scaled arrays back into frames with the original labels.
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

# Same regressor line-up as the all-features run.
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

result_df, prediction_df = train_and_test_predict(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
result_df

X_train shape:  (140, 50)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 50)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1948
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 50
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000550 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1948
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 50
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o



-0.5001159719003461




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1419,0.2856,0.3767,0.6834,0.8275,0.8422,0.4405,0.4612,0.6637,0.3906,0.7068,0.7649
DecisionTreeRegressor,0.2415,0.3744,0.4914,0.4612,0.7335,0.7466,0.4131,0.4619,0.6427,0.4285,0.7327,0.7566
RandomForestRegressor,0.1497,0.2986,0.3869,0.666,0.8168,0.8228,0.4599,0.4596,0.6782,0.3637,0.7086,0.7727
GradientBoostingRegressor,0.135,0.2908,0.3674,0.6989,0.8364,0.8387,0.4306,0.4568,0.6562,0.4043,0.7309,0.7868
AdaBoostRegressor,0.1523,0.2889,0.3902,0.6603,0.8128,0.8101,0.4688,0.4723,0.6847,0.3515,0.6859,0.7433
XGBRegressor,0.1697,0.3191,0.412,0.6213,0.7922,0.7982,0.4129,0.4507,0.6426,0.4288,0.7441,0.7938
ExtraTreesRegressor,0.1407,0.2884,0.3751,0.6861,0.8286,0.8384,0.45,0.4609,0.6708,0.3775,0.7021,0.7816
LinearRegression,0.3324,0.4277,0.5765,0.2585,0.6555,0.6808,0.5828,0.547,0.7634,0.1937,0.5837,0.6828
KNeighborsRegressor,0.1857,0.3138,0.4309,0.5858,0.7676,0.7889,0.4908,0.4205,0.7005,0.3211,0.6512,0.7748
SVR,0.162,0.3009,0.4025,0.6385,0.7994,0.8185,0.5118,0.4575,0.7154,0.2919,0.6474,0.7452


In [17]:
# Persist the metric table and the per-model predictions of the top-50 run.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_top_50_features_rrck.csv'
)
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/prediction_data_combined_top_50_features_rrck.csv'
)

In [18]:
# Top 100 features
# Pick the n features ranked first by `importances` and rebuild the
# train/test frames so that they contain only those features plus the
# ID / SMILES / Permeability columns.
n = 100
# Reverse the argsort order and keep the first n positions
# (assumes argsort is ascending, as for NumPy arrays — descending importance after the reversal).
top_100_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_100_features = feature_names[top_100_indices].tolist()  # convert to list

# Output the list (report n itself instead of repeating the literal)
print("Top", n, "features:\n")
print(top_100_features)

# Identifier/target columns are taken from `train`, the selected features from `X`.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_100_features]], axis=1)
# Index the test frame by the columns of the frame just built (train_df),
# not by the full `train` frame, so test_df has exactly the same columns
# in the same order as train_df.
test_df = test[train_df.columns]

Top 100 features:

['x_fine_emb_MFXL330', 'x_fine_emb_MFXL552', 'x_fine_emb_MFXL501', 'x_fine_emb_MFXL701', 'x_fine_emb_MFXL268', 'x_fine_emb_MFXL206', 'x_fine_emb_MFXL66', 'x_fine_emb_MFXL356', 'x_fine_emb_MFXL281', 'x_fine_emb_MFXL137', 'x_fine_emb_MFXL369', 'x_fine_emb_MFXL145', 'x_fine_emb_MFXL168', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL59', 'x_fine_emb_MFXL225', 'x_fine_emb_MFXL702', 'x_fine_emb_MFXL229', 'x_fine_emb_MFXL482', 'x_fine_emb_MFXL600', 'x_fine_emb_MFXL252', 'x_fine_emb_MFXL7', 'x_fine_emb_MFXL663', 'x_fine_emb_MFXL234', 'x_fine_emb_MFXL79', 'x_fine_emb_MFXL539', 'GATS.4', 'x_fine_emb_MFXL490', 'x_fine_emb_MFXL227', 'x_fine_emb_MFXL146', 'x_fine_emb_MFXL124', 'x_fine_emb_MFXL557', 'x_fine_emb_MFXL683', 'x_fine_emb_MFXL494', 'x_fine_emb_MFXL687', 'x_fine_emb_MFXL299', 'x_fine_emb_MFXL327', 'x_fine_emb_MFXL44', 'x_fine_emb_MFXL34', 'GATS5e', 'VR2_D', 'x_fine_emb_MFXL50', 'x_fine_emb_MFXL485', 'x_fine_emb_MFXL551', 'x_fine_emb_MFXL288', 'TDB5i', 'x_f

In [19]:
# --- Feature matrix / target split for the training frame ---
X_train = train_df.drop(columns=['ID','SMILES','Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# --- Same split for the test frame, selecting the training feature columns by name ---
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# --- Standardization: the scaler is fitted on the training matrix only,
#     then the same fitted scaler transforms both matrices ---
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the original labels.
X_train = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=X_test.columns,
)

# --- Regressors to benchmark; the list order is the order handed to the helper ---
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

# train_and_test_predict is defined elsewhere in the notebook; it returns the
# two objects unpacked here. result_df is left as the last expression so the
# notebook renders it as the cell output.
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (140, 100)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 100)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3886
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 100
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3883
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 100
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe



-0.6452716153839602




Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1539,0.2923,0.3923,0.6567,0.8109,0.8267,0.4411,0.4522,0.6641,0.3898,0.7079,0.7797
DecisionTreeRegressor,0.2831,0.3963,0.5321,0.3685,0.695,0.7143,0.45,0.4536,0.6708,0.3775,0.7079,0.7639
RandomForestRegressor,0.1467,0.2895,0.3829,0.6729,0.8219,0.8308,0.4706,0.4681,0.686,0.349,0.6916,0.735
GradientBoostingRegressor,0.1379,0.2842,0.3713,0.6925,0.8326,0.8396,0.418,0.4497,0.6466,0.4217,0.729,0.7771
AdaBoostRegressor,0.1481,0.2894,0.3848,0.6697,0.8198,0.822,0.4508,0.4698,0.6714,0.3764,0.7026,0.744
XGBRegressor,0.1589,0.2973,0.3987,0.6454,0.8054,0.8179,0.4533,0.4658,0.6733,0.3729,0.6885,0.7426
ExtraTreesRegressor,0.1354,0.277,0.368,0.6979,0.8366,0.8506,0.4504,0.4622,0.6711,0.3769,0.6969,0.7733
LinearRegression,1.1985,0.8453,1.0947,-1.6735,0.4444,0.4755,1.1769,0.7328,1.0849,-0.6282,0.3065,0.41
KNeighborsRegressor,0.1917,0.3165,0.4378,0.5724,0.7657,0.7989,0.5416,0.4492,0.7359,0.2508,0.6176,0.7405
SVR,0.1533,0.292,0.3916,0.658,0.8129,0.841,0.484,0.4312,0.6957,0.3304,0.6598,0.7593


In [20]:
# Persist the top-100 run: metrics table first, then the prediction frame.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_top_100_features_rrck.csv'
)
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/prediction_data_combined_top_100_features_rrck.csv'
)

In [21]:
# Top 200 features
# Pick the n features ranked first by `importances` and rebuild the
# train/test frames so that they contain only those features plus the
# ID / SMILES / Permeability columns.
n = 200
# Reverse the argsort order and keep the first n positions
# (assumes argsort is ascending, as for NumPy arrays — descending importance after the reversal).
top_200_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_200_features = feature_names[top_200_indices].tolist()  # convert to list

# Output the list (report n itself instead of repeating the literal)
print("Top", n, "features:\n")
print(top_200_features)

# Identifier/target columns are taken from `train`, the selected features from `X`.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_200_features]], axis=1)
# Index the test frame by the columns of the frame just built (train_df),
# not by the full `train` frame, so test_df has exactly the same columns
# in the same order as train_df.
test_df = test[train_df.columns]

Top 200 features:

['x_fine_emb_MFXL330', 'x_fine_emb_MFXL552', 'x_fine_emb_MFXL501', 'x_fine_emb_MFXL701', 'x_fine_emb_MFXL268', 'x_fine_emb_MFXL206', 'x_fine_emb_MFXL66', 'x_fine_emb_MFXL356', 'x_fine_emb_MFXL281', 'x_fine_emb_MFXL137', 'x_fine_emb_MFXL369', 'x_fine_emb_MFXL145', 'x_fine_emb_MFXL168', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL59', 'x_fine_emb_MFXL225', 'x_fine_emb_MFXL702', 'x_fine_emb_MFXL229', 'x_fine_emb_MFXL482', 'x_fine_emb_MFXL600', 'x_fine_emb_MFXL252', 'x_fine_emb_MFXL7', 'x_fine_emb_MFXL663', 'x_fine_emb_MFXL234', 'x_fine_emb_MFXL79', 'x_fine_emb_MFXL539', 'GATS.4', 'x_fine_emb_MFXL490', 'x_fine_emb_MFXL227', 'x_fine_emb_MFXL146', 'x_fine_emb_MFXL124', 'x_fine_emb_MFXL557', 'x_fine_emb_MFXL683', 'x_fine_emb_MFXL494', 'x_fine_emb_MFXL687', 'x_fine_emb_MFXL299', 'x_fine_emb_MFXL327', 'x_fine_emb_MFXL44', 'x_fine_emb_MFXL34', 'GATS5e', 'VR2_D', 'x_fine_emb_MFXL50', 'x_fine_emb_MFXL485', 'x_fine_emb_MFXL551', 'x_fine_emb_MFXL288', 'TDB5i', 'x_f

In [22]:
# --- Feature matrix / target split for the training frame ---
X_train = train_df.drop(columns=['ID','SMILES','Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# --- Same split for the test frame, selecting the training feature columns by name ---
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# --- Standardization: the scaler is fitted on the training matrix only,
#     then the same fitted scaler transforms both matrices ---
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the original labels.
X_train = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=X_test.columns,
)

# --- Regressors to benchmark; the list order is the order handed to the helper ---
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

# train_and_test_predict is defined elsewhere in the notebook; it returns the
# two objects unpacked here. result_df is left as the last expression so the
# notebook renders it as the cell output.
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (140, 200)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 200)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7678
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 200
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7672
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 200
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhe

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.1484,0.2856,0.3852,0.669,0.8183,0.8318,0.4387,0.4606,0.6623,0.3931,0.6958,0.7575
DecisionTreeRegressor,0.2738,0.3815,0.5232,0.3893,0.7077,0.6992,0.3842,0.4459,0.6199,0.4684,0.7663,0.7627
RandomForestRegressor,0.1477,0.293,0.3843,0.6706,0.8212,0.828,0.473,0.4666,0.6878,0.3456,0.6865,0.7424
GradientBoostingRegressor,0.1396,0.2862,0.3736,0.6886,0.8306,0.835,0.4252,0.4573,0.6521,0.4117,0.717,0.779
AdaBoostRegressor,0.1473,0.3003,0.3839,0.6713,0.8209,0.8223,0.43,0.454,0.6557,0.4051,0.7186,0.7712
XGBRegressor,0.1685,0.3033,0.4105,0.624,0.7915,0.804,0.4257,0.4564,0.6525,0.411,0.702,0.7403
ExtraTreesRegressor,0.1314,0.2858,0.3625,0.7068,0.8425,0.849,0.4492,0.4518,0.6703,0.3785,0.6976,0.7649
LinearRegression,0.3581,0.453,0.5984,0.2012,0.6521,0.6693,0.6363,0.5647,0.7977,0.1197,0.5111,0.5515
KNeighborsRegressor,0.2078,0.3233,0.4559,0.5364,0.7498,0.7798,0.5642,0.4875,0.7511,0.2195,0.5994,0.6855
SVR,0.1475,0.2831,0.3841,0.671,0.8227,0.8438,0.4811,0.429,0.6936,0.3344,0.6573,0.7572


In [23]:
# Persist the top-200 run: metrics table first, then the prediction frame.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_top_200_features_rrck.csv'
)
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/prediction_data_combined_top_200_features_rrck.csv'
)

In [24]:
# Top 500 features
# Pick the n features ranked first by `importances` and rebuild the
# train/test frames so that they contain only those features plus the
# ID / SMILES / Permeability columns.
n = 500
# Reverse the argsort order and keep the first n positions
# (assumes argsort is ascending, as for NumPy arrays — descending importance after the reversal).
top_500_indices = importances.argsort()[::-1][:n]  # indices of top n features
top_500_features = feature_names[top_500_indices].tolist()  # convert to list

# Output the list (report n itself instead of repeating the literal)
print("Top", n, "features:\n")
print(top_500_features)

# Identifier/target columns are taken from `train`, the selected features from `X`.
train_df = pd.concat([train[['ID', 'SMILES', 'Permeability']], X[top_500_features]], axis=1)
# Index the test frame by the columns of the frame just built (train_df),
# not by the full `train` frame, so test_df has exactly the same columns
# in the same order as train_df.
test_df = test[train_df.columns]

Top 500 features:

['x_fine_emb_MFXL330', 'x_fine_emb_MFXL552', 'x_fine_emb_MFXL501', 'x_fine_emb_MFXL701', 'x_fine_emb_MFXL268', 'x_fine_emb_MFXL206', 'x_fine_emb_MFXL66', 'x_fine_emb_MFXL356', 'x_fine_emb_MFXL281', 'x_fine_emb_MFXL137', 'x_fine_emb_MFXL369', 'x_fine_emb_MFXL145', 'x_fine_emb_MFXL168', 'x_fine_emb_MFXL635', 'x_fine_emb_MFXL543', 'x_fine_emb_MFXL59', 'x_fine_emb_MFXL225', 'x_fine_emb_MFXL702', 'x_fine_emb_MFXL229', 'x_fine_emb_MFXL482', 'x_fine_emb_MFXL600', 'x_fine_emb_MFXL252', 'x_fine_emb_MFXL7', 'x_fine_emb_MFXL663', 'x_fine_emb_MFXL234', 'x_fine_emb_MFXL79', 'x_fine_emb_MFXL539', 'GATS.4', 'x_fine_emb_MFXL490', 'x_fine_emb_MFXL227', 'x_fine_emb_MFXL146', 'x_fine_emb_MFXL124', 'x_fine_emb_MFXL557', 'x_fine_emb_MFXL683', 'x_fine_emb_MFXL494', 'x_fine_emb_MFXL687', 'x_fine_emb_MFXL299', 'x_fine_emb_MFXL327', 'x_fine_emb_MFXL44', 'x_fine_emb_MFXL34', 'GATS5e', 'VR2_D', 'x_fine_emb_MFXL50', 'x_fine_emb_MFXL485', 'x_fine_emb_MFXL551', 'x_fine_emb_MFXL288', 'TDB5i', 'x_f

In [25]:
# --- Feature matrix / target split for the training frame ---
X_train = train_df.drop(columns=['ID','SMILES','Permeability'])
y_train = train_df['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# --- Same split for the test frame, selecting the training feature columns by name ---
X_test = test_df[X_train.columns]
y_test = test_df['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# --- Standardization: the scaler is fitted on the training matrix only,
#     then the same fitted scaler transforms both matrices ---
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the original labels.
X_train = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=X_test.columns,
)

# --- Regressors to benchmark; the list order is the order handed to the helper ---
models = [
    lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        random_state=101,
    ),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    SVR(),
    MLPRegressor(random_state=101),
]

# train_and_test_predict is defined elsewhere in the notebook; it returns the
# two objects unpacked here. result_df is left as the last expression so the
# notebook renders it as the cell output.
result_df, prediction_df = train_and_test_predict(models, X_train, y_train, X_test, y_test)
result_df

X_train shape:  (140, 500)
y_train shape:  (140,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (36, 500)
y_test shape:  (36,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18959
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 497
[LightGBM] [Info] Start training from score -5.540848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18943
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 497
[LightGBM] [Info] Start training from score -5.607991
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.159,0.298,0.3988,0.6453,0.8038,0.8155,0.4393,0.4647,0.6628,0.3922,0.6985,0.7565
DecisionTreeRegressor,0.3063,0.4014,0.5534,0.3168,0.6607,0.6713,0.4065,0.4687,0.6376,0.4376,0.7273,0.7625
RandomForestRegressor,0.1588,0.3029,0.3985,0.6457,0.8066,0.814,0.4906,0.4779,0.7004,0.3214,0.6736,0.7358
GradientBoostingRegressor,0.1607,0.3124,0.4008,0.6416,0.8024,0.8045,0.4114,0.445,0.6414,0.4308,0.7315,0.7891
AdaBoostRegressor,0.1584,0.3007,0.398,0.6466,0.8053,0.8178,0.4512,0.4668,0.6717,0.3758,0.7038,0.7613
XGBRegressor,0.1714,0.3144,0.414,0.6176,0.7878,0.7904,0.4325,0.4648,0.6577,0.4017,0.7014,0.7539
ExtraTreesRegressor,0.1457,0.2951,0.3816,0.6751,0.8228,0.8279,0.4595,0.4598,0.6779,0.3643,0.6874,0.7628
LinearRegression,0.2972,0.4073,0.5451,0.3371,0.6794,0.6885,0.5595,0.5345,0.748,0.2259,0.5691,0.6663
KNeighborsRegressor,0.1892,0.3136,0.4349,0.578,0.7746,0.8061,0.5419,0.4651,0.7362,0.2503,0.6311,0.7251
SVR,0.1552,0.2927,0.394,0.6538,0.8128,0.832,0.4881,0.4372,0.6987,0.3247,0.6521,0.7443


In [26]:
# Persist the top-500 run: metrics table first, then the prediction frame.
result_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/combined_top_500_features_rrck.csv'
)
prediction_df.to_csv(
    path_or_buf='/home/users/akshay/PCPpred/RRCK/results/combined_features/prediction_data_combined_top_500_features_rrck.csv'
)