In [None]:
#Starts from here
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    # Columns with variance below the threshold
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features

In [4]:
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]

# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/All_fingerprints_train.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/All_fingerprints_test.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Train_all_atomic_desc.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Test_all_atomic_desc.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)

def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

target_column = 'Permeability'
df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
  df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(5568, 271)
(1392, 271)
(5568, 1128)
(1392, 1128)
(5568, 758)
(1392, 758)
(5568, 12)
(1392, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(5568, 271)
(1392, 271)
(5568, 1128)
(1392, 1128)
(5568, 758)
(1392, 758)
(5568, 12)
(1392, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
        ID                                             SMILES  Permeability  \
4765     3  CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...     -7.000000   
4772     4  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...     -7.100000   
4766     6  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...     -7.300000   
476

In [11]:
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),

]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500)
]

dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

meta_features_train = []
meta_features_test = []

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1: Training weak learners with 15-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 1: Train weak learners with 15-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_eval = df_test[target_column]

    kf = KFold(n_splits=15, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_train, y_train)

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -3.9)
            test_predictions_fold = np.clip(model.predict(X_eval), -10, -3.9)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
        print(f'Model training done {i}')

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 2: Training meta-learners')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


results = {}
predictions = []

kf = KFold(n_splits=15, shuffle=True, random_state=101)

for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

        model.fit(X_fold_train, y_fold_train)
        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -3.9)

        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -3.9)
        test_predictions_folds.append(test_predictions_fold)

    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test:.4f}, Test RMSE: {rmse_test:.4f}')

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Train actual': actual_y_train,
        'Y Test actual': y_eval,
        'Test Predictions folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_std,
    })

    results[model_name] = {
        'Train MSE (15 fold CV)': mse_train,
        'Train MAE (15 fold CV)': mae_train,
        'Train RMSE (15 fold CV)': rmse_train,
        'Train R2 (15 fold CV)': r2_train,
        'Train PCC (15 fold CV)': pearson_train,
        'Train SCC (15 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
predictions_df = pd.DataFrame(predictions)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stacked Ensemble Training and Evaluation complete')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

results_df

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1: Training weak learners with 15-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


Processing dataframe pairs:   0%|                                                                 | 0/4 [00:00<?, ?it/s]
Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56930
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 262
[LightGBM] [Info] Start training from score -5.747798
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56947
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 263
[LightGBM] [Info] Start training from score -5.741033
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56944
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 263
[LightGBM] [Info] Star


Training models:  10%|██████▋                                                            | 1/10 [00:11<01:39, 11.11s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:33<02:20, 17.55s/it][A

Model training done 1



Training models:  30%|███████████████████▊                                              | 3/10 [06:59<21:42, 186.00s/it][A

Model training done 2



Training models:  40%|██████████████████████████▍                                       | 4/10 [08:55<15:50, 158.34s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████                                 | 5/10 [09:06<08:46, 105.25s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [09:14<04:48, 72.09s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [09:16<02:27, 49.10s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [10:01<01:35, 47.96s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [10:45<00:46, 46.57s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [11:02<00:00, 66.25s/it][A
Processing dataframe pairs:  25%|██████████████                                          | 1/4 [11:02<33:07, 662.47s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4201
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 1106
[LightGBM] [Info] Start training from score -5.747798
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4212
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 1107
[LightGBM] [Info] Start training from score -5.741033
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4209
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 1106
[LightGBM] [Info] Star


Training models:  10%|██████▋                                                            | 1/10 [00:16<02:27, 16.44s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:25<01:38, 12.29s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [02:14<06:35, 56.49s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [03:42<06:53, 68.91s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [03:54<04:01, 48.25s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [04:03<02:19, 34.86s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [04:05<01:12, 24.13s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [06:37<02:10, 65.00s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [07:42<01:04, 64.92s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [07:47<00:00, 46.74s/it][A
Processing dataframe pairs:  50%|████████████████████████████                            | 2/4 [18:49<18:15, 547.72s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 755
[LightGBM] [Info] Start training from score -5.747798
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 755
[LightGBM] [Info] Start training from score -5.741033
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 755
[LightGBM] [Info] S


Training models:  10%|██████▋                                                            | 1/10 [00:11<01:43, 11.49s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [01:41<07:42, 57.75s/it][A

Model training done 1



Training models:  30%|██████████████████▉                                            | 3/10 [38:11<2:00:17, 1031.12s/it][A

Model training done 2



Training models:  40%|█████████████████████████▌                                      | 4/10 [46:17<1:21:36, 816.14s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████                                 | 5/10 [46:53<44:33, 534.70s/it][A

Model training done 4



Training models:  60%|███████████████████████████████████████▌                          | 6/10 [47:13<23:58, 359.67s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▏                   | 7/10 [47:15<12:08, 242.70s/it][A

Model training done 6



Training models:  80%|████████████████████████████████████████████████████▊             | 8/10 [48:51<06:32, 196.11s/it][A

Model training done 7



Training models:  90%|███████████████████████████████████████████████████████████▍      | 9/10 [50:13<02:40, 160.35s/it][A

Model training done 8



Training models: 100%|█████████████████████████████████████████████████████████████████| 10/10 [51:38<00:00, 309.86s/it][A
Processing dataframe pairs:  75%|███████████████████████████████████████▊             | 3/4 [1:10:28<28:32, 1712.53s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 9
[LightGBM] [Info] Start training from score -5.747798
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 9
[LightGBM] [Info] Start training from score -5.741033
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training models:  10%|██████▋                                                            | 1/10 [00:07<01:08,  7.57s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:13<00:51,  6.49s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [00:15<00:31,  4.47s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [00:16<00:18,  3.14s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [00:19<00:14,  2.97s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [00:23<00:13,  3.47s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [00:24<00:07,  2.53s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [00:36<00:11,  5.58s/it][A

Model training done 7



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [00:59<00:00,  5.99s/it][A
Processing dataframe pairs: 100%|█████████████████████████████████████████████████████| 4/4 [1:11:28<00:00, 1072.12s/it]

Model training done 8
Model training done 9
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (5568, 40)
Dimensions of meta_features_test: (1392, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 2: Training meta-learners
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10137
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 40
[LightGBM] [Info] Start training from score -5.747798




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10135
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 40
[LightGBM] [Info] Start training from score -5.741033




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10136
[LightGBM] [Info] Number of data points in the train set: 5196, number of used features: 40
[LightGBM] [Info] Start training from score -5.743723




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10137
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.742838




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10140
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.744857




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10138
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.744324




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10137
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.738795




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10136
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.742697




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10139
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.746284




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10143
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.747981




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10134
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.744388




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10139
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.744043




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10142
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.738129




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10132
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.742162




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10139
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 40
[LightGBM] [Info] Start training from score -5.734536




LGBMRegressor Evaluation completed: Test R2 score: 0.6957, Test RMSE: 0.4388
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.6716, Test RMSE: 0.4558
RandomForestRegressor Evaluation completed: Test R2 score: 0.6930, Test RMSE: 0.4408
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.6963, Test RMSE: 0.4384
AdaBoostRegressor Evaluation completed: Test R2 score: 0.6515, Test RMSE: 0.4696
XGBRegressor Evaluation completed: Test R2 score: 0.6944, Test RMSE: 0.4398
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.6930, Test RMSE: 0.4408
LinearRegression Evaluation completed: Test R2 score: 0.6942, Test RMSE: 0.4399
KNeighborsRegressor Evaluation completed: Test R2 score: 0.6799, Test RMSE: 0.4501
SVR Evaluation completed: Test R2 score: 0.6939, Test RMSE: 0.4401
MLPRegressor Evaluation completed: Test R2 score: 0.6964, Test RMSE: 0.4383
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stacked Ensemble Training and 

Unnamed: 0,Train MSE (15 fold CV),Train MAE (15 fold CV),Train RMSE (15 fold CV),Train R2 (15 fold CV),Train PCC (15 fold CV),Train SCC (15 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.124803,0.256939,0.353275,0.799612,0.894216,0.877238,0.19257,0.307265,0.438829,0.695699,0.834195,0.819372
DecisionTreeRegressor,0.267856,0.376856,0.517548,0.569923,0.789705,0.766357,0.207799,0.321427,0.455849,0.671635,0.820121,0.803596
RandomForestRegressor,0.126177,0.258095,0.355214,0.797407,0.893001,0.875599,0.194302,0.308937,0.440797,0.692963,0.832664,0.815785
GradientBoostingRegressor,0.124109,0.255552,0.352291,0.800726,0.89484,0.877708,0.192199,0.306568,0.438405,0.696286,0.834582,0.819575
AdaBoostRegressor,0.173815,0.32205,0.416911,0.720918,0.861063,0.855866,0.22051,0.352285,0.469585,0.651549,0.816796,0.812278
XGBRegressor,0.140338,0.272763,0.374618,0.774668,0.880829,0.863864,0.193417,0.309127,0.439793,0.69436,0.83355,0.817321
ExtraTreesRegressor,0.124208,0.255444,0.352431,0.800568,0.894773,0.876582,0.194272,0.309753,0.440763,0.69301,0.832764,0.815438
LinearRegression,0.124418,0.25488,0.35273,0.800231,0.894559,0.877908,0.193488,0.30776,0.439873,0.694248,0.833375,0.818256
KNeighborsRegressor,0.147932,0.281681,0.384619,0.762476,0.873291,0.853887,0.202575,0.321204,0.450083,0.67989,0.825404,0.807349
SVR,0.132723,0.260151,0.364312,0.786895,0.887303,0.875771,0.193713,0.305304,0.440128,0.693893,0.834128,0.817935


In [12]:
results_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Results_15_fold_stacked_prediction_LVR_scaled.csv')
predictions_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Prediction_data_Meta_learners_15_fold_stacked_prediction_LVR_scaled.csv')

In [7]:
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101)
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500)
]
target_column = 'Permeability'
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

meta_features_train = []
meta_features_test = []

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1: Training weak learners with 5-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 1: Train weak learners with 5-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_eval = df_test[target_column]
    
    kf = KFold(n_splits=5, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_train, y_train)

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -3.9)
            test_predictions_fold = np.clip(model.predict(X_eval), -10, -3.9)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
        print(f'Model training done {i}')

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 2: Training meta-learners with 5-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


results = {}
predictions = []

kf = KFold(n_splits=5, shuffle=True, random_state=101)

for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

        model.fit(X_fold_train, y_fold_train)
        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -3.9)

        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -3.9)
        test_predictions_folds.append(test_predictions_fold)

    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    
    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test:.4f}, Test RMSE: {rmse_test:.4f}')

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Train actual': actual_y_train,
        'Y Test actual': y_eval,
        'Test Predictions folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_std,
    })

    results[model_name] = {
        'Train MSE (5 fold CV)': mse_train,
        'Train MAE (5 fold CV)': mae_train,
        'Train RMSE (5 fold CV)': rmse_train,
        'Train R2 (5 fold CV)': r2_train,
        'Train PCC (5 fold CV)': pearson_train,
        'Train SCC (5 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
predictions_df = pd.DataFrame(predictions)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stacked Ensemble Training and Evaluation complete')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

results_df

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1: Training weak learners with 5-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


Processing dataframe pairs:   0%|                                                                 | 0/4 [00:00<?, ?it/s]
Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56831
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 260
[LightGBM] [Info] Start training from score -5.747051
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56849
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 260
[LightGBM] [Info] Start training from score -5.747251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56836
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 261
[LightGBM] [Info] Star


Training models:  10%|██████▋                                                            | 1/10 [00:03<00:33,  3.67s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:10<00:42,  5.36s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [02:02<06:20, 54.32s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [02:36<04:36, 46.01s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [02:39<02:33, 30.78s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [02:42<01:24, 21.18s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [02:43<00:43, 14.51s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [02:56<00:28, 14.08s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [03:07<00:13, 13.07s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [03:11<00:00, 19.19s/it][A
Processing dataframe pairs:  25%|██████████████                                          | 1/4 [03:11<09:35, 191.93s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4161
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1098
[LightGBM] [Info] Start training from score -5.747051
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4165
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1100
[LightGBM] [Info] Start training from score -5.747251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4165
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 1101
[LightGBM] [Info] Star


Training models:  10%|██████▋                                                            | 1/10 [00:05<00:47,  5.25s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:08<00:31,  3.92s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [00:39<01:54, 16.32s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [01:04<01:58, 19.73s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [01:08<01:10, 14.02s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [01:10<00:40, 10.14s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [01:11<00:21,  7.12s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [02:00<00:40, 20.36s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [02:23<00:21, 21.13s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [02:24<00:00, 14.46s/it][A
Processing dataframe pairs:  50%|████████████████████████████                            | 2/4 [05:36<05:28, 164.08s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 755
[LightGBM] [Info] Start training from score -5.747051
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 755
[LightGBM] [Info] Start training from score -5.747251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 755
[LightGBM] [Info] S


Training models:  10%|██████▋                                                            | 1/10 [00:03<00:34,  3.86s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:28<02:10, 16.28s/it][A

Model training done 1



Training models:  30%|███████████████████▊                                              | 3/10 [10:44<33:50, 290.12s/it][A

Model training done 2



Training models:  40%|██████████████████████████▍                                       | 4/10 [13:03<23:02, 230.43s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████                                 | 5/10 [13:15<12:38, 151.62s/it][A

Model training done 4



Training models:  60%|███████████████████████████████████████▌                          | 6/10 [13:21<06:48, 102.08s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [13:22<03:26, 68.98s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [13:51<01:52, 56.39s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [14:12<00:45, 45.36s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [14:36<00:00, 87.65s/it][A
Processing dataframe pairs:  75%|██████████████████████████████████████████              | 3/4 [20:13<08:09, 489.39s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 9
[LightGBM] [Info] Start training from score -5.747051
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 148
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 9
[LightGBM] [Info] Start training from score -5.747251
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 445


Training models:  10%|██████▋                                                            | 1/10 [00:02<00:22,  2.51s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:04<00:17,  2.13s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [00:05<00:10,  1.45s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [00:05<00:05,  1.01it/s][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [00:06<00:04,  1.08it/s][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [00:07<00:04,  1.09s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [00:07<00:02,  1.23it/s][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [00:11<00:03,  1.67s/it][A

Model training done 7



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.80s/it][A
Processing dataframe pairs: 100%|████████████████████████████████████████████████████████| 4/4 [20:31<00:00, 307.76s/it]

Model training done 8
Model training done 9
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (5568, 40)
Dimensions of meta_features_test: (1392, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 2: Training meta-learners with 5-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9967
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 40
[LightGBM] [Info] Start training from score -5.747051





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9966
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 40
[LightGBM] [Info] Start training from score -5.747251




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001479 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9966
[LightGBM] [Info] Number of data points in the train set: 4454, number of used features: 40
[LightGBM] [Info] Start training from score -5.741644




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9979
[LightGBM] [Info] Number of data points in the train set: 4455, number of used features: 40
[LightGBM] [Info] Start training from score -5.751882




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9961
[LightGBM] [Info] Number of data points in the train set: 4455, number of used features: 40
[LightGBM] [Info] Start training from score -5.726702




LGBMRegressor Evaluation completed: Test R2 score: 0.6926, Test RMSE: 0.4410
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.6544, Test RMSE: 0.4676
RandomForestRegressor Evaluation completed: Test R2 score: 0.6888, Test RMSE: 0.4438
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.6947, Test RMSE: 0.4395
AdaBoostRegressor Evaluation completed: Test R2 score: 0.6464, Test RMSE: 0.4730
XGBRegressor Evaluation completed: Test R2 score: 0.6804, Test RMSE: 0.4497
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.6909, Test RMSE: 0.4423
LinearRegression Evaluation completed: Test R2 score: 0.6935, Test RMSE: 0.4404
KNeighborsRegressor Evaluation completed: Test R2 score: 0.6793, Test RMSE: 0.4505
SVR Evaluation completed: Test R2 score: 0.6933, Test RMSE: 0.4406
MLPRegressor Evaluation completed: Test R2 score: 0.6953, Test RMSE: 0.4391
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stacked Ensemble Training and 

Unnamed: 0,Train MSE (5 fold CV),Train MAE (5 fold CV),Train RMSE (5 fold CV),Train R2 (5 fold CV),Train PCC (5 fold CV),Train SCC (5 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.127861,0.25989,0.357577,0.794702,0.891473,0.876314,0.194517,0.309135,0.441041,0.692623,0.832415,0.81828
DecisionTreeRegressor,0.264635,0.373133,0.514427,0.575094,0.790962,0.77351,0.218677,0.331339,0.467629,0.654446,0.810925,0.793323
RandomForestRegressor,0.129418,0.261589,0.359747,0.792203,0.890144,0.873851,0.196953,0.309978,0.443793,0.688774,0.830284,0.814919
GradientBoostingRegressor,0.128346,0.260018,0.358254,0.793924,0.891025,0.875486,0.193185,0.308821,0.439529,0.694727,0.833655,0.818067
AdaBoostRegressor,0.17325,0.32101,0.416233,0.721825,0.859417,0.847435,0.223738,0.356501,0.473009,0.646448,0.81112,0.8051
XGBRegressor,0.146996,0.279729,0.3834,0.76398,0.875091,0.858649,0.202248,0.314479,0.44972,0.680406,0.825609,0.810231
ExtraTreesRegressor,0.128417,0.260089,0.358353,0.79381,0.891065,0.875282,0.195636,0.310127,0.442308,0.690854,0.831489,0.816471
LinearRegression,0.12545,0.256851,0.354189,0.798574,0.893633,0.876812,0.19397,0.309375,0.44042,0.693488,0.832908,0.817046
KNeighborsRegressor,0.1533,0.285699,0.391536,0.753856,0.868534,0.850334,0.202936,0.321619,0.450485,0.679318,0.824906,0.810051
SVR,0.135357,0.263448,0.367909,0.782667,0.884809,0.873899,0.194086,0.306082,0.440552,0.693303,0.833898,0.819824


In [8]:
results_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Results_5_fold_stacked_prediction_LVR_scaled.csv')
predictions_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Prediction_data_Meta_learners_5_fold_stacked_prediction_LVR_scaled.csv')

In [9]:
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101)
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500)
]
target_column = 'Permeability'
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

meta_features_train = []
meta_features_test = []

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1: Training weak learners with 10-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 1: Train weak learners with 10-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_eval = df_test[target_column]
    kf = KFold(n_splits=10, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_train, y_train)

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -3.9)
            test_predictions_fold = np.clip(model.predict(X_eval), -10, -3.9)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
        print(f'Model training done {i}')

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 2: Training meta-learners with 10-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


results = {}
predictions = []

kf = KFold(n_splits=10, shuffle=True, random_state=101)

for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

        model.fit(X_fold_train, y_fold_train)
        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -3.9)

        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -3.9)
        test_predictions_folds.append(test_predictions_fold)

    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    
    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test:.4f}, Test RMSE: {rmse_test:.4f}')

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Train actual': actual_y_train,
        'Y Test actual': y_eval,
        'Test Predictions folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_std,
    })

    results[model_name] = {
        'Train MSE (10 fold CV)': mse_train,
        'Train MAE (10 fold CV)': mae_train,
        'Train RMSE (10 fold CV)': rmse_train,
        'Train R2 (10 fold CV)': r2_train,
        'Train PCC (10 fold CV)': pearson_train,
        'Train SCC (10 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
predictions_df = pd.DataFrame(predictions)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stacked Ensemble Training and Evaluation complete')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

results_df

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1: Training weak learners with 10-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


Processing dataframe pairs:   0%|                                                                 | 0/4 [00:00<?, ?it/s]
Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56912
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 262
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56937
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56933
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Star


Training models:  10%|██████▋                                                            | 1/10 [00:07<01:09,  7.68s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:21<01:31, 11.42s/it][A

Model training done 1



Training models:  30%|███████████████████▊                                              | 3/10 [04:30<13:58, 119.83s/it][A

Model training done 2



Training models:  40%|██████████████████████████▍                                       | 4/10 [05:45<10:12, 102.08s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [05:52<05:39, 68.00s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [05:58<03:06, 46.65s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [05:59<01:35, 31.86s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [06:29<01:02, 31.14s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [06:54<00:29, 29.20s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [07:05<00:00, 42.51s/it][A
Processing dataframe pairs:  25%|██████████████                                          | 1/4 [07:05<21:15, 425.13s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4196
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4200
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1105
[LightGBM] [Info] Star


Training models:  10%|██████▋                                                            | 1/10 [00:10<01:35, 10.60s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:17<01:05,  8.18s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [01:26<04:14, 36.37s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [02:23<04:26, 44.34s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [02:31<02:35, 31.12s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [02:36<01:29, 22.48s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [02:38<00:46, 15.64s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [04:19<01:25, 42.86s/it][A

Model training done 7



Training models:  90%|████████████████████████████████████████████████████████████▎      | 9/10 [05:11<00:45, 45.73s/it][A

Model training done 8



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [05:14<00:00, 31.47s/it][A
Processing dataframe pairs:  50%|████████████████████████████                            | 2/4 [12:19<12:00, 360.19s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] S


Training models:  10%|██████▋                                                            | 1/10 [00:08<01:13,  8.12s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [01:02<04:41, 35.25s/it][A

Model training done 1



Training models:  30%|███████████████████▏                                            | 3/10 [24:22<1:16:47, 658.28s/it][A

Model training done 2



Training models:  40%|██████████████████████████▍                                       | 4/10 [29:34<52:10, 521.77s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████                                 | 5/10 [29:58<28:31, 342.20s/it][A

Model training done 4



Training models:  60%|███████████████████████████████████████▌                          | 6/10 [30:11<15:21, 230.25s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▏                   | 7/10 [30:12<07:46, 155.49s/it][A

Model training done 6



Training models:  80%|████████████████████████████████████████████████████▊             | 8/10 [31:14<04:11, 125.68s/it][A

Model training done 7



Training models:  90%|███████████████████████████████████████████████████████████▍      | 9/10 [32:00<01:40, 100.84s/it][A

Model training done 8



Training models: 100%|█████████████████████████████████████████████████████████████████| 10/10 [32:55<00:00, 197.55s/it][A
Processing dataframe pairs:  75%|█████████████████████████████████████████▎             | 3/4 [45:15<18:17, 1097.80s/it]

Model training done 9



Training models:   0%|                                                                           | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training models:  10%|██████▋                                                            | 1/10 [00:05<00:45,  5.02s/it][A

Model training done 0



Training models:  20%|█████████████▍                                                     | 2/10 [00:08<00:34,  4.36s/it][A

Model training done 1



Training models:  30%|████████████████████                                               | 3/10 [00:10<00:20,  2.99s/it][A

Model training done 2



Training models:  40%|██████████████████████████▊                                        | 4/10 [00:10<00:12,  2.04s/it][A

Model training done 3



Training models:  50%|█████████████████████████████████▌                                 | 5/10 [00:12<00:09,  1.93s/it][A

Model training done 4



Training models:  60%|████████████████████████████████████████▏                          | 6/10 [00:15<00:09,  2.32s/it][A

Model training done 5



Training models:  70%|██████████████████████████████████████████████▉                    | 7/10 [00:16<00:05,  1.70s/it][A

Model training done 6



Training models:  80%|█████████████████████████████████████████████████████▌             | 8/10 [00:23<00:07,  3.64s/it][A

Model training done 7



Training models: 100%|██████████████████████████████████████████████████████████████████| 10/10 [00:39<00:00,  3.95s/it][A
Processing dataframe pairs: 100%|████████████████████████████████████████████████████████| 4/4 [45:54<00:00, 688.72s/it]

Model training done 8
Model training done 9
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (5568, 40)
Dimensions of meta_features_test: (1392, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 2: Training meta-learners with 10-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10102
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.747671




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001634 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10101
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.741826




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10095
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.740654




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.749020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10093
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.737478




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10092
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.747212




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10094
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.750687




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.743084




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10091
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 40
[LightGBM] [Info] Start training from score -5.737254




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 40
[LightGBM] [Info] Start training from score -5.734177




LGBMRegressor Evaluation completed: Test R2 score: 0.6957, Test RMSE: 0.4388
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.6612, Test RMSE: 0.4631
RandomForestRegressor Evaluation completed: Test R2 score: 0.6907, Test RMSE: 0.4425
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.6965, Test RMSE: 0.4382
AdaBoostRegressor Evaluation completed: Test R2 score: 0.6657, Test RMSE: 0.4600
XGBRegressor Evaluation completed: Test R2 score: 0.6870, Test RMSE: 0.4451
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.6928, Test RMSE: 0.4409
LinearRegression Evaluation completed: Test R2 score: 0.6941, Test RMSE: 0.4400
KNeighborsRegressor Evaluation completed: Test R2 score: 0.6817, Test RMSE: 0.4488
SVR Evaluation completed: Test R2 score: 0.6933, Test RMSE: 0.4406
MLPRegressor Evaluation completed: Test R2 score: 0.6903, Test RMSE: 0.4427
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stacked Ensemble Training and 

Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.124649,0.257625,0.353057,0.79986,0.894369,0.877403,0.192554,0.307291,0.43881,0.695725,0.834227,0.820651
DecisionTreeRegressor,0.265595,0.376011,0.515359,0.573552,0.789093,0.764616,0.214418,0.323827,0.463053,0.661175,0.815084,0.801343
RandomForestRegressor,0.126932,0.258329,0.356275,0.796194,0.892322,0.875365,0.195763,0.309286,0.442452,0.690653,0.831321,0.815909
GradientBoostingRegressor,0.12479,0.256373,0.353256,0.799633,0.894225,0.876869,0.192059,0.308014,0.438245,0.696507,0.834725,0.820596
AdaBoostRegressor,0.168182,0.315297,0.410101,0.729961,0.864573,0.852041,0.211558,0.345272,0.459955,0.665694,0.823699,0.8129
XGBRegressor,0.140547,0.273822,0.374896,0.774333,0.880612,0.862136,0.19807,0.311171,0.445051,0.687008,0.829228,0.815823
ExtraTreesRegressor,0.125389,0.256109,0.354103,0.798672,0.893708,0.876617,0.194397,0.30913,0.440905,0.692812,0.832603,0.816472
LinearRegression,0.124789,0.255568,0.353255,0.799634,0.894226,0.87755,0.193594,0.308208,0.439993,0.694081,0.833284,0.817977
KNeighborsRegressor,0.151814,0.28539,0.389633,0.756243,0.869643,0.85088,0.201411,0.321141,0.448789,0.681728,0.826085,0.804292
SVR,0.133759,0.261469,0.36573,0.785233,0.886403,0.875806,0.194117,0.305681,0.440587,0.693255,0.833838,0.819023


In [10]:
results_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Results_10_fold_stacked_prediction_LVR_scaled.csv')
predictions_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Prediction_data_Meta_learners_10_fold_stacked_prediction_LVR_scaled.csv')

In [5]:
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101)
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500)
]
target_column = 'Permeability'
dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

meta_features_train = []
meta_features_test = []

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1: Training weak learners with 20-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 1: Train weak learners with 20-fold cross-validation
for df_train, df_test in tqdm(dataframes, desc="Processing dataframe pairs"):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_eval = df_test[target_column]
    kf = KFold(n_splits=20, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for train_index, val_index in kf.split(X_weak):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_train, y_train)

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -3.9)
            test_predictions_fold = np.clip(model.predict(X_eval), -10, -3.9)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
        print(f'Model training done {i}')

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 2: Training meta-learners with 20-fold CV')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


results = {}
predictions = []

kf = KFold(n_splits=20, shuffle=True, random_state=101)

for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    test_predictions_folds = []

    for train_index, val_index in kf.split(meta_features_train):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

        model.fit(X_fold_train, y_fold_train)
        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -3.9)

        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -3.9)
        test_predictions_folds.append(test_predictions_fold)

    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    
    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

    print(f'{model_name} Evaluation completed: Test R2 score: {r2_test:.4f}, Test RMSE: {rmse_test:.4f}')

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Train actual': actual_y_train,
        'Y Test actual': y_eval,
        'Test Predictions folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_std,

    })

    results[model_name] = {
        'Train MSE (20 fold CV)': mse_train,
        'Train MAE (20 fold CV)': mae_train,
        'Train RMSE (20 fold CV)': rmse_train,
        'Train R2 (20 fold CV)': r2_train,
        'Train PCC (20 fold CV)': pearson_train,
        'Train SCC (20 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T
predictions_df = pd.DataFrame(predictions)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stacked Ensemble Training and Evaluation complete')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

results_df

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1: Training weak learners with 20-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 56958
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 263
[LightGBM] [Info] Start training from score -5.747281
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56944
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 263
[LightGBM] [Info] Start training from score -5.743197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56956
[LightGBM] [Info] Number of data points in the trai


Training models:  10%|█         | 1/10 [00:18<02:45, 18.44s/it][A

Model training done 0



Training models:  20%|██        | 2/10 [00:47<03:17, 24.69s/it][A

Model training done 1



Training models:  30%|███       | 3/10 [09:32<29:30, 252.97s/it][A

Model training done 2



Training models:  40%|████      | 4/10 [12:09<21:31, 215.24s/it][A

Model training done 3



Training models:  50%|█████     | 5/10 [12:26<11:58, 143.75s/it][A

Model training done 4



Training models:  60%|██████    | 6/10 [12:37<06:34, 98.61s/it] [A

Model training done 5



Training models:  70%|███████   | 7/10 [12:40<03:21, 67.27s/it][A

Model training done 6



Training models:  80%|████████  | 8/10 [13:38<02:09, 64.54s/it][A

Model training done 7



Training models:  90%|█████████ | 9/10 [14:37<01:02, 62.56s/it][A

Model training done 8



Training models: 100%|██████████| 10/10 [14:59<00:00, 89.99s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [14:59<44:59, 899.88s/it]

Model training done 9



Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4209
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 1107
[LightGBM] [Info] Start training from score -5.747281
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4204
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 1106
[LightGBM] [Info] Start training from score -5.743197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4211
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 1106
[LightGBM] [Info] Star


Training models:  10%|█         | 1/10 [00:22<03:20, 22.28s/it][A

Model training done 0



Training models:  20%|██        | 2/10 [00:34<02:11, 16.50s/it][A

Model training done 1



Training models:  30%|███       | 3/10 [03:03<08:58, 76.88s/it][A

Model training done 2



Training models:  40%|████      | 4/10 [04:59<09:13, 92.24s/it][A

Model training done 3



Training models:  50%|█████     | 5/10 [05:14<05:22, 64.55s/it][A

Model training done 4



Training models:  60%|██████    | 6/10 [05:26<03:07, 46.75s/it][A

Model training done 5



Training models:  70%|███████   | 7/10 [05:29<01:37, 32.45s/it][A

Model training done 6



Training models:  80%|████████  | 8/10 [08:55<02:55, 87.66s/it][A

Model training done 7



Training models:  90%|█████████ | 9/10 [10:30<01:29, 89.83s/it][A

Model training done 8



Training models: 100%|██████████| 10/10 [10:36<00:00, 63.67s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [25:36<24:50, 745.10s/it]

Model training done 9



Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 755
[LightGBM] [Info] Start training from score -5.747281
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 755
[LightGBM] [Info] Start training from score -5.743197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 755
[LightGBM] [Info] S


Training models:  10%|█         | 1/10 [00:15<02:16, 15.20s/it][A

Model training done 0



Training models:  20%|██        | 2/10 [02:16<10:20, 77.62s/it][A

Model training done 1



Training models:  30%|███       | 3/10 [51:55<2:43:36, 1402.42s/it][A

Model training done 2



Training models:  40%|████      | 4/10 [1:02:57<1:51:00, 1110.06s/it][A

Model training done 3



Training models:  50%|█████     | 5/10 [1:03:46<1:00:37, 727.43s/it] [A

Model training done 4



Training models:  60%|██████    | 6/10 [1:04:14<32:38, 489.64s/it]  [A

Model training done 5



Training models:  70%|███████   | 7/10 [1:04:16<16:31, 330.39s/it][A

Model training done 6



Training models:  80%|████████  | 8/10 [1:06:24<08:51, 265.94s/it][A

Model training done 7



Training models:  90%|█████████ | 9/10 [1:08:04<03:33, 213.89s/it][A

Model training done 8



Training models: 100%|██████████| 10/10 [1:10:00<00:00, 420.09s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [1:35:37<38:43, 2323.10s/it]

Model training done 9



Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.139206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 9
[LightGBM] [Info] Start training from score -5.747281
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 9
[LightGBM] [Info] Start training from score -5.743197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000755 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training models:  10%|█         | 1/10 [00:10<01:38, 10.91s/it][A

Model training done 0



Training models:  20%|██        | 2/10 [00:18<01:12,  9.12s/it][A

Model training done 1



Training models:  30%|███       | 3/10 [00:21<00:43,  6.23s/it][A

Model training done 2



Training models:  40%|████      | 4/10 [00:23<00:26,  4.35s/it][A

Model training done 3



Training models:  50%|█████     | 5/10 [00:26<00:20,  4.01s/it][A

Model training done 4



Training models:  60%|██████    | 6/10 [00:32<00:18,  4.61s/it][A

Model training done 5



Training models:  70%|███████   | 7/10 [00:33<00:10,  3.36s/it][A

Model training done 6



Training models:  80%|████████  | 8/10 [00:49<00:14,  7.48s/it][A

Model training done 7



Training models:  90%|█████████ | 9/10 [01:19<00:14, 14.67s/it][A
Training models: 100%|██████████| 10/10 [01:19<00:00,  7.99s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [1:36:57<00:00, 1454.37s/it]

Model training done 8
Model training done 9
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (5568, 40)
Dimensions of meta_features_test: (1392, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 2: Training meta-learners with 20-fold CV
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10160
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.747281





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10163
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.743197




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10160
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.740467




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10161
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.744446




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10159
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.740965




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10160
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.742240




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10161
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.748642




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10160
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 40
[LightGBM] [Info] Start training from score -5.743284




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001635 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10161
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.739611




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10160
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.740964




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10158
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.744793




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10159
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.745041




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10163
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.747061




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001609 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10160
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.746083




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10163
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.740864




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10158
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.745182




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10161
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.739569




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10162
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.740887




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001608 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10163
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.741020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10164
[LightGBM] [Info] Number of data points in the train set: 5290, number of used features: 40
[LightGBM] [Info] Start training from score -5.736521




LGBMRegressor Evaluation completed: Test R2 score: 0.6972, Test RMSE: 0.4377
DecisionTreeRegressor Evaluation completed: Test R2 score: 0.6674, Test RMSE: 0.4588
RandomForestRegressor Evaluation completed: Test R2 score: 0.6926, Test RMSE: 0.4411
GradientBoostingRegressor Evaluation completed: Test R2 score: 0.6952, Test RMSE: 0.4392
AdaBoostRegressor Evaluation completed: Test R2 score: 0.6557, Test RMSE: 0.4668
XGBRegressor Evaluation completed: Test R2 score: 0.6931, Test RMSE: 0.4407
ExtraTreesRegressor Evaluation completed: Test R2 score: 0.6942, Test RMSE: 0.4399
LinearRegression Evaluation completed: Test R2 score: 0.6962, Test RMSE: 0.4385
KNeighborsRegressor Evaluation completed: Test R2 score: 0.6772, Test RMSE: 0.4520
SVR Evaluation completed: Test R2 score: 0.6925, Test RMSE: 0.4411
MLPRegressor Evaluation completed: Test R2 score: 0.6987, Test RMSE: 0.4367
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stacked Ensemble Training and 

Unnamed: 0,Train MSE (20 fold CV),Train MAE (20 fold CV),Train RMSE (20 fold CV),Train R2 (20 fold CV),Train PCC (20 fold CV),Train SCC (20 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.123625,0.255786,0.351603,0.801504,0.895277,0.878916,0.19161,0.30749,0.437732,0.697217,0.835119,0.820597
DecisionTreeRegressor,0.255956,0.370629,0.505921,0.58903,0.798817,0.770858,0.210502,0.323504,0.458805,0.667363,0.818505,0.798929
RandomForestRegressor,0.125116,0.256122,0.353718,0.799109,0.893964,0.87742,0.194562,0.308341,0.441091,0.692552,0.832448,0.815556
GradientBoostingRegressor,0.124204,0.256301,0.352426,0.800574,0.894748,0.877182,0.192875,0.307301,0.439175,0.695218,0.833927,0.819573
AdaBoostRegressor,0.165833,0.311031,0.407225,0.733734,0.866184,0.858277,0.217909,0.34754,0.466807,0.655659,0.816905,0.811944
XGBRegressor,0.142237,0.274186,0.377143,0.77162,0.878973,0.862483,0.19422,0.307843,0.440704,0.693093,0.832948,0.818835
ExtraTreesRegressor,0.123368,0.254954,0.351238,0.801916,0.895523,0.877273,0.193505,0.30835,0.439892,0.694222,0.833482,0.816034
LinearRegression,0.123873,0.254337,0.351956,0.801106,0.895048,0.878397,0.192259,0.307641,0.438474,0.69619,0.834527,0.818674
KNeighborsRegressor,0.148924,0.280575,0.385906,0.760884,0.872412,0.852664,0.204262,0.31727,0.451954,0.677223,0.824058,0.809096
SVR,0.131766,0.259583,0.362996,0.788433,0.888171,0.877363,0.194604,0.305052,0.441139,0.692485,0.833372,0.818657


In [6]:
results_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Results_20_fold_stacked_prediction_LVR_scaled.csv')
predictions_df.to_csv('/home/users/akshay/PCPpred/PAMPA/Results/Stacked_architecture/Prediction_data_Meta_learners_20_fold_stacked_prediction_LVR_scaled.csv')

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import joblib


os.makedirs('/home/users/akshay/PCPpred/PAMPA/pampa_models/', exist_ok=True)

# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_desc = features(train, "Permeability")
joblib.dump(selected_features_desc, '/home/users/akshay/PCPpred/PAMPA/pampa_models/selected_features_descriptors.joblib')
df_desc_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_desc]], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test = df_desc_test.dropna()
df_desc_test = df_desc_test[df_desc_train.columns]

# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/All_fingerprints_train.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_fp = features(train, "Permeability")
joblib.dump(selected_features_fp, '/home/users/akshay/PCPpred/PAMPA/pampa_models/selected_features_fingerprints.joblib')
df_fp_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_fp]], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/All_fingerprints_test.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test = df_fp_test[df_fp_train.columns]

# Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_emb = features(train, "Permeability")
joblib.dump(selected_features_emb, '/home/users/akshay/PCPpred/PAMPA/pampa_models/selected_features_embeddings.joblib')
df_emb_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_emb]], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test = df_emb_test[df_emb_train.columns]

# Atomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Train_all_atomic_desc.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'], axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features_atomic = features(train, "Permeability")
joblib.dump(selected_features_atomic, '/home/users/akshay/PCPpred/PAMPA/pampa_models/selected_features_atomic.joblib')
df_atomic_train = pd.concat([df_train[['ID','SMILES','Permeability']], df_train[selected_features_atomic]], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Test_all_atomic_desc.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test = df_atomic_test[df_atomic_train.columns]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Filter dataframes to have consistent IDs
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]
df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]
df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

target_column = 'Permeability'

def scale_features(df_train, df_test, feature_type):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    # Save the scaler
    joblib.dump(scaler, f'/home/users/akshay/PCPpred/PAMPA/pampa_models/scaler_{feature_type}.joblib')
    return df_train_scaled, df_test_scaled

df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test, 'Descriptor')
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test, 'Fingerprints')
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test, 'Embeddings')
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test , 'Atomic')

models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),
]

models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101)
]

dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]
data_names = ['descriptors', 'fingerprints', 'embeddings', 'atomic']

meta_features_train = []
meta_features_test = []

# Stage 1: Train weak learners with 10-fold cross-validation
for df_idx, (df_train, df_test) in enumerate(tqdm(dataframes, desc="Processing dataframe pairs")):
    X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
    X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
    y_weak = df_train[target_column]
    y_eval = df_test[target_column]

    kf = KFold(n_splits=10, shuffle=True, random_state=101)

    fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
    fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

    for i, model in tqdm(enumerate(models_weak), desc="Training models", total=len(models_weak)):
        fold_predictions = np.zeros(X_weak.shape[0])
        test_predictions_folds = []

        for fold_idx, (train_index, val_index) in enumerate(kf.split(X_weak)):
            X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
            y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
            
            model.fit(X_train, y_train)
            
            model_name = model.__class__.__name__
            joblib.dump(model, f'/home/users/akshay/PCPpred/PAMPA/pampa_models/weak_{data_names[df_idx]}_{model_name}_fold_{fold_idx}.joblib')

            fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -3.9)

            test_predictions_fold = np.clip(model.predict(X_eval), -10, -3.9)
            test_predictions_folds.append(test_predictions_fold)

        fold_meta_features_train[:, i] = fold_predictions
        fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)

    meta_features_train.append(fold_meta_features_train)
    meta_features_test.append(fold_meta_features_test)
    
    joblib.dump(fold_meta_features_train, f'/home/users/akshay/PCPpred/PAMPA/pampa_models/meta_features_train_{data_names[df_idx]}.joblib')
    joblib.dump(fold_meta_features_test, f'/home/users/akshay/PCPpred/PAMPA/pampa_models/meta_features_test_{data_names[df_idx]}.joblib')

meta_features_train = np.hstack(meta_features_train)
meta_features_test = np.hstack(meta_features_test)

joblib.dump(meta_features_train, '/home/users/akshay/PCPpred/PAMPA/pampa_models/meta_features_train_combined.joblib')
joblib.dump(meta_features_test, '/home/users/akshay/PCPpred/PAMPA/pampa_models/meta_features_test_combined.joblib')

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print("Dimensions of meta_features_train:", meta_features_train.shape)
print("Dimensions of meta_features_test:", meta_features_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Stage 1 completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Stage 2: Train the meta-learner using predictions from weak learners
kf = KFold(n_splits=10, shuffle=True, random_state=101)
results = {}
predictions = []
for model in models_meta:
    model_name = model.__class__.__name__
    predictions_train = []
    actual_y_train = []
    
    test_predictions_folds = []

    for fold_idx, (train_index, val_index) in enumerate(kf.split(meta_features_train)):
        X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
        y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        
        joblib.dump(model, f'/home/users/akshay/PCPpred/PAMPA/pampa_models/meta_{model_name}_fold_{fold_idx}.joblib')

        y_pred_fold = model.predict(X_fold_val)
        y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
        predictions_train.extend(y_pred_fold)
        actual_y_train.extend(y_fold_val)

        test_predictions_fold = model.predict(meta_features_test)
        test_predictions_fold = np.clip(test_predictions_fold, -10, -3.9)
        test_predictions_folds.append(test_predictions_fold)

    # Metrics
    predictions_test_mean = np.mean(test_predictions_folds, axis=0)
    predictions_test_std = np.std(test_predictions_folds, axis=0)

    mse_train = mean_squared_error(actual_y_train, predictions_train)
    mae_train = mean_absolute_error(actual_y_train, predictions_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(actual_y_train, predictions_train)
    pearson_train, _ = pearsonr(actual_y_train, predictions_train)
    spearman_train, _ = spearmanr(actual_y_train, predictions_train)

    mse_test = mean_squared_error(y_eval, predictions_test_mean)
    mae_test = mean_absolute_error(y_eval, predictions_test_mean)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_eval, predictions_test_mean)
    pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
    spearman_test, _ = spearmanr(y_eval, predictions_test_mean)
    

    predictions.append({
        'Model': model_name,
        'Y Train pred': predictions_train,
        'Y Test actual': y_eval,
        'Test prediction folds': test_predictions_folds,
        'Test Predictions Mean': predictions_test_mean,
        'Test Predictions Std': predictions_test_mean,
    })

    results[model_name] = {
        'Train MSE (10 fold CV)': mse_train,
        'Train MAE (10 fold CV)': mae_train,
        'Train RMSE (10 fold CV)': rmse_train,
        'Train R2 (10 fold CV)': r2_train,
        'Train PCC (10 fold CV)': pearson_train,
        'Train SCC (10 fold CV)': spearman_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test RMSE': rmse_test,
        'Test R2': r2_test,
        'Test PCC': pearson_test,
        'Test SCC': spearman_test,
    }

results_df = pd.DataFrame(results).T

  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
  df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(5568, 271)
(1392, 271)
(5568, 1128)
(1392, 1128)
(5568, 758)
(1392, 758)
(5568, 12)
(1392, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
        ID                                             SMILES  Permeability  \
4765     3  CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...     -7.000000   
4772     4  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...     -7.100000   
4766     6  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...     -7.300000   
4767     7  CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...     -7.300000   
4585    11  COc1ccc(

Processing dataframe pairs:   0%|          | 0/4 [00:00<?, ?it/s]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56912
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 262
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56937
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56933
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Star


Training models:  10%|█         | 1/10 [00:07<01:09,  7.69s/it][A
Training models:  20%|██        | 2/10 [00:49<03:41, 27.70s/it][A
Training models:  30%|███       | 3/10 [04:57<14:58, 128.40s/it][A
Training models:  40%|████      | 4/10 [06:12<10:42, 107.10s/it][A
Training models:  50%|█████     | 5/10 [06:19<05:56, 71.30s/it] [A
Training models:  60%|██████    | 6/10 [06:33<03:26, 51.66s/it][A
Training models:  70%|███████   | 7/10 [06:36<01:47, 35.79s/it][A
Training models:  80%|████████  | 8/10 [07:04<01:06, 33.25s/it][A
Training models:  90%|█████████ | 9/10 [07:28<00:30, 30.38s/it][A
Training models: 100%|██████████| 10/10 [07:39<00:00, 45.95s/it][A
Processing dataframe pairs:  25%|██▌       | 1/4 [07:39<22:58, 459.46s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4196
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4200
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1105
[LightGBM] [Info] Star


Training models:  10%|█         | 1/10 [00:10<01:35, 10.59s/it][A
Training models:  20%|██        | 2/10 [00:25<01:43, 12.91s/it][A
Training models:  30%|███       | 3/10 [01:36<04:36, 39.44s/it][A
Training models:  40%|████      | 4/10 [02:47<05:11, 51.89s/it][A
Training models:  50%|█████     | 5/10 [02:55<03:00, 36.12s/it][A
Training models:  60%|██████    | 6/10 [03:19<02:08, 32.02s/it][A
Training models:  70%|███████   | 7/10 [03:54<01:39, 33.08s/it][A
Training models:  80%|████████  | 8/10 [05:38<01:51, 55.62s/it][A
Training models:  90%|█████████ | 9/10 [06:31<00:54, 54.69s/it][A
Training models: 100%|██████████| 10/10 [06:34<00:00, 39.43s/it][A
Processing dataframe pairs:  50%|█████     | 2/4 [14:13<14:02, 421.12s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] S


Training models:  10%|█         | 1/10 [00:07<01:11,  8.00s/it][A
Training models:  20%|██        | 2/10 [01:07<05:07, 38.49s/it][A
Training models:  30%|███       | 3/10 [24:32<1:17:17, 662.46s/it][A
Training models:  40%|████      | 4/10 [29:45<52:27, 524.55s/it]  [A
Training models:  50%|█████     | 5/10 [30:10<28:41, 344.33s/it][A
Training models:  60%|██████    | 6/10 [30:53<16:07, 241.82s/it][A
Training models:  70%|███████   | 7/10 [31:21<08:36, 172.04s/it][A
Training models:  80%|████████  | 8/10 [32:26<04:35, 137.94s/it][A
Training models:  90%|█████████ | 9/10 [33:13<01:49, 109.49s/it][A
Training models: 100%|██████████| 10/10 [34:08<00:00, 204.82s/it][A
Processing dataframe pairs:  75%|███████▌  | 3/4 [48:21<19:24, 1164.09s/it]
Training models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training models:  10%|█         | 1/10 [00:05<00:47,  5.29s/it][A
Training models:  20%|██        | 2/10 [00:09<00:37,  4.64s/it][A
Training models:  30%|███       | 3/10 [00:11<00:22,  3.23s/it][A
Training models:  40%|████      | 4/10 [00:11<00:13,  2.21s/it][A
Training models:  50%|█████     | 5/10 [00:13<00:10,  2.04s/it][A
Training models:  60%|██████    | 6/10 [00:16<00:09,  2.48s/it][A
Training models:  70%|███████   | 7/10 [00:17<00:05,  1.84s/it][A
Training models:  80%|████████  | 8/10 [00:25<00:07,  3.75s/it][A
Training models: 100%|██████████| 10/10 [00:40<00:00,  4.09s/it][A
Processing dataframe pairs: 100%|██████████| 4/4 [49:02<00:00, 735.73s/it] 


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Dimensions of meta_features_train: (5568, 40)
Dimensions of meta_features_test: (1392, 40)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10102
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.747671




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001654 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10101
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.741826




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10095
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.740654




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001641 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.749020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10093
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.737478




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10092
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.747212




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10094
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.750687




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 40
[LightGBM] [Info] Start training from score -5.743084




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10091
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 40
[LightGBM] [Info] Start training from score -5.737254




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 40
[LightGBM] [Info] Start training from score -5.734177




In [None]:
#Ablations study

In [1]:
import os
import joblib
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def remove_low_variance_columns(df, threshold=0.005):
    # df = df.drop(['ID','SMILES','Permeability'],axis=1)
    variances = df.var()
    
    low_variance_columns = variances[variances < threshold].index.tolist()
    
    df_cleaned = df.drop(columns=low_variance_columns)
    
    return df_cleaned, low_variance_columns

def features(df, target_column='Permeability', threshold=0.9):
    correlation_matrix = df.corr()
    
    features_to_drop = set()
    
    for feature in correlation_matrix.columns:
        if feature == target_column:
            continue 
        target_corr = correlation_matrix[target_column][feature]
        
        for other_feature in correlation_matrix.columns:
            if other_feature == feature or other_feature == target_column:
                continue
            
            if abs(correlation_matrix[feature][other_feature]) > threshold:
                other_target_corr = correlation_matrix[target_column][other_feature]

                if abs(other_target_corr) < abs(target_corr):
                    features_to_drop.add(other_feature)
                else:
                    features_to_drop.add(feature)
    selected_features = [col for col in df.columns if col not in features_to_drop and col != target_column]
    
    return selected_features


In [3]:
# 2D and 3D descriptors dataframes
df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
df_train = df_desc_train.sort_values(by='ID')
df_train =df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_desc_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')
df_desc_test = df_desc_test.sort_values(by='ID')
df_desc_test =df_desc_test.dropna()
df_desc_test =  df_desc_test[df_desc_train.columns]


# Fingerprints
df_fp_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Train/All_fingerprints_train.csv')
df_train = df_fp_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_fp_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_fp_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Fingerprints/Test/All_fingerprints_test.csv')
df_fp_test = df_fp_test.sort_values(by='ID')
df_fp_test = df_fp_test.dropna()
df_fp_test =  df_fp_test[df_fp_train.columns]


#Smiles Embeddings
df_emb_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Train_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_train = df_emb_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_emb_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
df_emb_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Embeddings/Test_MoLFormer-XL-both-10pct_model_1_fine_tuned_embeddings.csv')
df_emb_test = df_emb_test.sort_values(by='ID')
df_emb_test = df_emb_test.dropna()
df_emb_test =  df_emb_test[df_emb_train.columns]

#ATomic features
df_atomic_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Train_all_atomic_desc.csv')
df_train = df_atomic_train.sort_values(by='ID')
df_train = df_train.dropna()
train = df_train.drop(['ID','SMILES'],axis=1)
train = train.select_dtypes(include=['number'])
train, _ = remove_low_variance_columns(train)
selected_features = features(train, "Permeability")
df_atomic_train = pd.concat( [df_train[['ID','SMILES','Permeability']],df_train[selected_features] ], axis=1)
# df_atomic_train =pd.concat( [df_train['SMILES'], df_train.select_dtypes(include=['number'])], axis=1)
df_atomic_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Atomic/Test_all_atomic_desc.csv')
df_atomic_test = df_atomic_test.sort_values(by='ID')
df_atomic_test = df_atomic_test.dropna()
df_atomic_test =  df_atomic_test[df_atomic_train.columns]


print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Loading completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
df_fp_test = df_fp_test[df_fp_test['ID'].isin(df_desc_test['ID'])]
df_fp_train = df_fp_train[df_fp_train['ID'].isin(df_desc_train['ID'])]

df_emb_test = df_emb_test[df_emb_test['ID'].isin(df_desc_test['ID'])]
df_emb_train = df_emb_train[df_emb_train['ID'].isin(df_desc_train['ID'])]

df_atomic_test = df_atomic_test[df_atomic_test['ID'].isin(df_desc_test['ID'])]
df_atomic_train = df_atomic_train[df_atomic_train['ID'].isin(df_desc_train['ID'])]

print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)

def scale_features(df_train, df_test):
    scaler = StandardScaler()
    train_features = df_train.drop(columns=['ID', 'SMILES', target_column])
    test_features = df_test.drop(columns=['ID', 'SMILES', target_column])
    scaler.fit(train_features)
    train_scaled = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns, index=df_train.index)
    test_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns, index=df_test.index)
    df_train_scaled = pd.concat([df_train[['ID', 'SMILES', target_column]], train_scaled], axis=1)
    df_test_scaled = pd.concat([df_test[['ID', 'SMILES', target_column]], test_scaled], axis=1)
    return df_train_scaled, df_test_scaled

target_column = 'Permeability'
df_desc_train, df_desc_test = scale_features(df_desc_train, df_desc_test)
df_fp_train, df_fp_test = scale_features(df_fp_train, df_fp_test)
df_emb_train, df_emb_test = scale_features(df_emb_train, df_emb_test)
df_atomic_train, df_atomic_test = scale_features(df_atomic_train, df_atomic_test)

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Data Processing completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train.shape)
print(df_desc_test.shape)
print(df_fp_train.shape)
print(df_fp_test.shape)
print(df_emb_train.shape)
print(df_emb_test.shape)
print(df_atomic_train.shape)
print(df_atomic_test.shape)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_desc_train)
print(df_desc_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_fp_train)
print(df_fp_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_emb_train)
print(df_emb_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print(df_atomic_train)
print(df_atomic_test)
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')


  df_desc_train = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Train_2d_3d_all_descriptors.csv')
  df_desc_test = pd.read_csv('/home/users/akshay/PCPpred/PAMPA/features/Descriptors/Test_2d_3d_all_descriptors.csv')


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Loading completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(5568, 271)
(1392, 271)
(5568, 1128)
(1392, 1128)
(5568, 758)
(1392, 758)
(5568, 12)
(1392, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Data Processing completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
(5568, 271)
(1392, 271)
(5568, 1128)
(1392, 1128)
(5568, 758)
(1392, 758)
(5568, 12)
(1392, 12)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
        ID                                             SMILES  Permeability  \
4765     3  CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...     -7.000000   
4772     4  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...     -7.100000   
4766     6  CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...     -7.300000   
476

In [4]:
models_weak = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500),
    DecisionTreeRegressor(random_state=101),

]
models_meta = [
    lgb.LGBMRegressor(objective='regression', metric='rmse', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, random_state=101),
    DecisionTreeRegressor(random_state=101),
    RandomForestRegressor(n_jobs=-1, random_state=101),
    GradientBoostingRegressor(random_state=101),
    AdaBoostRegressor(random_state=101),
    xgb.XGBRegressor(random_state=101, eval_metric='rmse'),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=101),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
    MLPRegressor(random_state=101, max_iter=500)
]

dataframes = [(df_desc_train, df_desc_test), (df_fp_train, df_fp_test), (df_emb_train, df_emb_test), (df_atomic_train, df_atomic_test)]

In [5]:
ablation_results = {}

for ablation_idx in range(len(dataframes)):
    print(f"========== Ablation: Excluding feature at index {ablation_idx} ==========")
    feature_names = ['Descriptor', 'Fingerprints', 'Embeddings', 'Atomic']
    print(f"========== Ablation: Excluding feature :-- {feature_names[ablation_idx]} ==========")

    ablated_dataframes = [pair for i, pair in enumerate(dataframes) if i != ablation_idx]

    meta_features_train = []
    meta_features_test = []

    # Stage 1
    for df_train, df_test in tqdm(ablated_dataframes, desc="Processing ablated dataframes"):
        X_weak = df_train.drop(columns=['ID', 'SMILES', target_column])
        y_weak = df_train[target_column]
        X_eval = df_test.drop(columns=['ID', 'SMILES', target_column])
        y_eval = df_test[target_column]

        kf = KFold(n_splits=10, shuffle=True, random_state=101)

        fold_meta_features_train = np.zeros((X_weak.shape[0], len(models_weak)))
        fold_meta_features_test = np.zeros((X_eval.shape[0], len(models_weak)))

        for i, model in tqdm(enumerate(models_weak), desc="Training weak models", total=len(models_weak)):
            fold_predictions = np.zeros(X_weak.shape[0])
            test_predictions_folds = []

            for train_index, val_index in kf.split(X_weak):
                X_train, X_val = X_weak.iloc[train_index], X_weak.iloc[val_index]
                y_train, y_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

                model.fit(X_train, y_train)

                fold_predictions[val_index] = np.clip(model.predict(X_val), -10, -3.9)
                test_predictions_fold = np.clip(model.predict(X_eval), -10, -3.9)
                test_predictions_folds.append(test_predictions_fold)

            fold_meta_features_train[:, i] = fold_predictions
            fold_meta_features_test[:, i] = np.mean(test_predictions_folds, axis=0)
            print(f'Model training done {i}: {model.__class__.__name__}')

        meta_features_train.append(fold_meta_features_train)
        meta_features_test.append(fold_meta_features_test)
        print('Dataframe training completed')

    # Stack all meta-features
    meta_features_train = np.hstack(meta_features_train)
    meta_features_test = np.hstack(meta_features_test)

    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
    print('Stage 1 completed (Weak Learners)')
    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

    # Stage 2
    results = {}
    kf = KFold(n_splits=10, shuffle=True, random_state=101)

    for model in models_meta:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []
        test_predictions_folds = []

        for train_index, val_index in kf.split(meta_features_train):
            X_fold_train, X_fold_val = meta_features_train[train_index], meta_features_train[val_index]
            y_fold_train, y_fold_val = y_weak.iloc[train_index], y_weak.iloc[val_index]

            model.fit(X_fold_train, y_fold_train)
            y_pred_fold = np.clip(model.predict(X_fold_val), -10, -3.9)

            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_fold_val)

            test_predictions_fold = model.predict(meta_features_test)
            test_predictions_fold = np.clip(test_predictions_fold, -10, -3.9)
            test_predictions_folds.append(test_predictions_fold)

        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)

        mse_test = mean_squared_error(y_eval, predictions_test_mean)
        mae_test = mean_absolute_error(y_eval, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_eval, predictions_test_mean)
        pearson_test, _ = pearsonr(y_eval, predictions_test_mean)
        spearman_test, _ = spearmanr(y_eval, predictions_test_mean)

        results[model_name] = {
            'Train MSE (10 fold CV)': mse_train,
            'Train MAE (10 fold CV)': mae_train,
            'Train RMSE (10 fold CV)': rmse_train,
            'Train R2 (10 fold CV)': r2_train,
            'Train PCC (10 fold CV)': pearson_train,
            'Train SCC (10 fold CV)': spearman_train,
            'Test MSE': mse_test,
            'Test MAE': mae_test,
            'Test RMSE': rmse_test,
            'Test R2': r2_test,
            'Test PCC': pearson_test,
            'Test SCC': spearman_test,
        }

    ablation_results[f"Ablation_{feature_names[ablation_idx]}"] = pd.DataFrame(results).T

print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
print('Ablation Study Completed')
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# To view the results
ablation_results_df = {key: value for key, value in ablation_results.items()}




Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4196
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4200
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1105
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:11<01:41, 11.30s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:17<01:06,  8.35s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [01:28<04:19, 37.09s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [02:53<05:34, 55.71s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [03:00<03:12, 38.45s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [03:06<01:49, 27.41s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [03:08<00:57, 19.05s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [04:57<01:34, 47.45s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [05:51<00:49, 49.69s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [05:54<00:00, 35.49s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [05:54<11:49, 354.87s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] S


Training weak models:  10%|█         | 1/10 [00:08<01:13,  8.21s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [01:04<04:52, 36.54s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [24:26<1:16:59, 659.91s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [29:44<52:29, 524.98s/it]  [A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [30:11<28:47, 345.54s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [30:24<15:30, 232.52s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [30:26<07:50, 156.97s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [31:27<04:12, 126.41s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [32:13<01:41, 101.33s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [33:07<00:00, 198.79s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [39:02<21:55, 1315.50s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.123457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training weak models:  10%|█         | 1/10 [00:05<00:47,  5.23s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:09<00:35,  4.48s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:10<00:21,  3.05s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:11<00:12,  2.08s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:12<00:09,  1.97s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:15<00:09,  2.32s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:16<00:05,  1.70s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:24<00:07,  3.64s/it][A

Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:39<00:00,  3.99s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [39:42<00:00, 794.23s/it] 

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7559
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747671





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7558
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.741826




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7554
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.740654




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7555
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.749020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7552
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.737478




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7551
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747212




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7553
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.750687




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7555
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.743084




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7548
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.737254




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7555
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.734177






Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56912
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 262
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56937
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56933
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:07<01:09,  7.74s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:22<01:36, 12.11s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [04:32<14:02, 120.39s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [05:47<10:16, 102.70s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [05:55<05:43, 68.60s/it] [A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [06:01<03:08, 47.15s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [06:02<01:36, 32.19s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [06:32<01:02, 31.44s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [06:58<00:29, 29.59s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [07:09<00:00, 42.91s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [07:09<14:18, 429.08s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] S


Training weak models:  10%|█         | 1/10 [00:08<01:14,  8.31s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [01:08<05:09, 38.63s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [24:30<1:17:08, 661.18s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [29:44<52:24, 524.13s/it]  [A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [30:09<28:41, 344.23s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [30:23<15:27, 231.86s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [30:24<07:49, 156.52s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [31:24<04:11, 125.81s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [32:08<01:40, 100.28s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [33:03<00:00, 198.31s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [40:12<22:23, 1343.20s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012734 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training weak models:  10%|█         | 1/10 [00:05<00:46,  5.18s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:09<00:35,  4.44s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:10<00:21,  3.03s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:11<00:12,  2.07s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:12<00:09,  1.94s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:15<00:09,  2.33s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:16<00:05,  1.71s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:24<00:07,  3.64s/it][A

Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:39<00:00,  3.98s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [40:51<00:00, 817.33s/it] 

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7554





[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747671




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7552
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.741826




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7547
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.740654




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7548
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.749020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7544
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.737478




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7544
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747212




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7546
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.750687




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7549
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.743084




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7543
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.737254




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7548
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.734177






Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56912
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 262
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56937
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56933
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:08<01:14,  8.25s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:22<01:34, 11.86s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [04:31<13:59, 119.93s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [05:46<10:12, 102.13s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [05:53<05:39, 67.93s/it] [A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [05:58<03:06, 46.61s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [05:59<01:35, 31.81s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [06:30<01:02, 31.29s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [06:54<00:29, 29.10s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [07:05<00:00, 42.53s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [07:05<14:10, 425.27s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042009 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4196
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4200
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1105
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:11<01:40, 11.15s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:17<01:06,  8.33s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [01:27<04:16, 36.65s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [02:26<04:32, 45.43s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [02:34<02:39, 31.87s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [02:40<01:32, 23.04s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [02:42<00:48, 16.02s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [04:27<01:29, 44.60s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [05:21<00:47, 47.58s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [05:25<00:00, 32.50s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [12:30<06:06, 366.32s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 9
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y


Training weak models:  10%|█         | 1/10 [00:05<00:47,  5.30s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:09<00:36,  4.52s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [00:10<00:21,  3.07s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [00:11<00:12,  2.10s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [00:12<00:09,  1.95s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [00:15<00:09,  2.27s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [00:16<00:04,  1.67s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [00:24<00:07,  3.62s/it][A

Model training done 7: SVR



Training weak models: 100%|██████████| 10/10 [00:39<00:00,  3.97s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [13:10<00:00, 263.33s/it]

Model training done 8: MLPRegressor
Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064722 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7560





[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747671




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7560
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.741826




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7553
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.740654




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7557
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.749020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7552
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.737478




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7552
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747212




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7553
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.750687




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7555
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.743084




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7553
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.737254




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7557
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.734177






Processing ablated dataframes:   0%|          | 0/3 [00:00<?, ?it/s]
Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56912
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 262
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56937
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56933
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 263
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:07<01:10,  7.87s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:22<01:35, 11.98s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [04:31<13:59, 119.93s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [05:45<10:12, 102.08s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [05:53<05:40, 68.00s/it] [A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [05:59<03:07, 46.83s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [06:00<01:35, 31.96s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [06:30<01:02, 31.29s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [06:54<00:29, 29.10s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [07:05<00:00, 42.58s/it][A
Processing ablated dataframes:  33%|███▎      | 1/3 [07:05<14:11, 425.76s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4196
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1106
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4200
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 1105
[LightGBM] [Info] Star


Training weak models:  10%|█         | 1/10 [00:10<01:36, 10.77s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [00:17<01:04,  8.11s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [01:27<04:15, 36.45s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [02:25<04:31, 45.24s/it][A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [02:33<02:38, 31.80s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [02:39<01:31, 22.99s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [02:41<00:47, 15.97s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [04:20<01:25, 42.50s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [05:16<00:46, 46.65s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [05:19<00:00, 31.94s/it][A
Processing ablated dataframes:  67%|██████▋   | 2/3 [12:25<06:03, 363.21s/it]

Model training done 9: DecisionTreeRegressor
Dataframe training completed



Training weak models:   0%|          | 0/10 [00:00<?, ?it/s][A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027447 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.747671
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] Start training from score -5.741826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 192525
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 755
[LightGBM] [Info] S


Training weak models:  10%|█         | 1/10 [00:08<01:13,  8.18s/it][A

Model training done 0: LGBMRegressor



Training weak models:  20%|██        | 2/10 [01:07<05:05, 38.22s/it][A

Model training done 1: RandomForestRegressor



Training weak models:  30%|███       | 3/10 [24:27<1:17:01, 660.26s/it][A

Model training done 2: GradientBoostingRegressor



Training weak models:  40%|████      | 4/10 [29:40<52:18, 523.06s/it]  [A

Model training done 3: AdaBoostRegressor



Training weak models:  50%|█████     | 5/10 [30:05<28:37, 343.55s/it][A

Model training done 4: XGBRegressor



Training weak models:  60%|██████    | 6/10 [30:18<15:24, 231.16s/it][A

Model training done 5: ExtraTreesRegressor



Training weak models:  70%|███████   | 7/10 [30:20<07:48, 156.04s/it][A

Model training done 6: KNeighborsRegressor



Training weak models:  80%|████████  | 8/10 [31:22<04:12, 126.28s/it][A

Model training done 7: SVR



Training weak models:  90%|█████████ | 9/10 [32:10<01:41, 101.71s/it][A

Model training done 8: MLPRegressor



Training weak models: 100%|██████████| 10/10 [33:05<00:00, 198.50s/it][A
Processing ablated dataframes: 100%|██████████| 3/3 [45:30<00:00, 910.07s/it] 

Model training done 9: DecisionTreeRegressor
Dataframe training completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Stage 1 completed (Weak Learners)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7633
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747671





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7633
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.741826




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7631
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.740654




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7631
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.749020




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7631
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.737478




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7629
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.747212




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7630
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.750687




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7632
[LightGBM] [Info] Number of data points in the train set: 5011, number of used features: 30
[LightGBM] [Info] Start training from score -5.743084




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7629
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.737254




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7631
[LightGBM] [Info] Number of data points in the train set: 5012, number of used features: 30
[LightGBM] [Info] Start training from score -5.734177




XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Ablation Study Completed
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [6]:
ablation_results

{'Ablation_Descriptor':                            Train MSE (10 fold CV)  Train MAE (10 fold CV)  \
 LGBMRegressor                            0.124547                0.257339   
 DecisionTreeRegressor                    0.271159                0.378883   
 RandomForestRegressor                    0.127370                0.258899   
 GradientBoostingRegressor                0.124091                0.256223   
 AdaBoostRegressor                        0.173011                0.317938   
 XGBRegressor                             0.141366                0.274843   
 ExtraTreesRegressor                      0.127268                0.258200   
 LinearRegression                         0.124801                0.255590   
 KNeighborsRegressor                      0.151503                0.284705   
 SVR                                      0.134686                0.262531   
 MLPRegressor                             0.135549                0.271661   
 
                            Train RMSE 

In [7]:
import os
import pickle

ablation_result_dir = '/home/users/akshay/PCPpred/PAMPA/Results/Ablation/'
os.makedirs(ablation_result_dir, exist_ok=True)

pickle_path = os.path.join(ablation_result_dir, 'ablation_results.pkl')
with open(pickle_path, 'wb') as f:
    pickle.dump(ablation_results, f)


with open(pickle_path, 'rb') as f:
    ablation_results = pickle.load(f)


ablation_results

{'Ablation_Descriptor':                            Train MSE (10 fold CV)  Train MAE (10 fold CV)  \
 LGBMRegressor                            0.124547                0.257339   
 DecisionTreeRegressor                    0.271159                0.378883   
 RandomForestRegressor                    0.127370                0.258899   
 GradientBoostingRegressor                0.124091                0.256223   
 AdaBoostRegressor                        0.173011                0.317938   
 XGBRegressor                             0.141366                0.274843   
 ExtraTreesRegressor                      0.127268                0.258200   
 LinearRegression                         0.124801                0.255590   
 KNeighborsRegressor                      0.151503                0.284705   
 SVR                                      0.134686                0.262531   
 MLPRegressor                             0.135549                0.271661   
 
                            Train RMSE 

In [8]:
ablation_result_dir = '/home/users/akshay/PCPpred/PAMPA/Results/Ablation/'
os.makedirs(ablation_result_dir, exist_ok=True)

for ablation_label, df in ablation_results.items():
    print(f"Results for {ablation_label}: \n")
    safe_label = ablation_label.replace(" ", "_").replace("/", "_")
    file_path = os.path.join(ablation_result_dir, f"{safe_label}.csv")
    df.to_csv(file_path)

Results for Ablation_Descriptor: 

Results for Ablation_Fingerprints: 

Results for Ablation_Embeddings: 

Results for Ablation_Atomic: 



In [9]:
from IPython.display import display
for ablation_label, df in ablation_results.items():
    print(f"Results for {ablation_label}: \n")
    display(df)

Results for Ablation_Descriptor: 



Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.124547,0.257339,0.352913,0.800023,0.894462,0.87764,0.193359,0.309692,0.439726,0.694453,0.833456,0.82065
DecisionTreeRegressor,0.271159,0.378883,0.520729,0.564619,0.784426,0.759031,0.216569,0.32704,0.465369,0.657777,0.812709,0.799445
RandomForestRegressor,0.12737,0.258899,0.356889,0.795491,0.891936,0.875326,0.19671,0.31136,0.44352,0.689158,0.830406,0.816779
GradientBoostingRegressor,0.124091,0.256223,0.352266,0.800755,0.894851,0.876724,0.194348,0.310251,0.440849,0.69289,0.832579,0.819734
AdaBoostRegressor,0.173011,0.317938,0.415946,0.722208,0.859052,0.850397,0.220874,0.352171,0.469972,0.650974,0.814813,0.809462
XGBRegressor,0.141366,0.274843,0.375987,0.773019,0.880031,0.862635,0.197695,0.31384,0.444629,0.687601,0.829688,0.816433
ExtraTreesRegressor,0.127268,0.2582,0.356747,0.795654,0.892023,0.874573,0.196641,0.311836,0.443442,0.689267,0.830498,0.816118
LinearRegression,0.124801,0.25559,0.353271,0.799617,0.894215,0.877585,0.195866,0.310431,0.442568,0.690491,0.831149,0.816576
KNeighborsRegressor,0.151503,0.284705,0.389234,0.756742,0.869998,0.849301,0.199434,0.320926,0.446581,0.684853,0.827842,0.80584
SVR,0.134686,0.262531,0.366995,0.783745,0.885559,0.87433,0.195332,0.30707,0.441963,0.691336,0.832594,0.818707


Results for Ablation_Fingerprints: 



Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.125781,0.257481,0.354656,0.798042,0.893349,0.876897,0.191068,0.306364,0.437114,0.698072,0.835614,0.820365
DecisionTreeRegressor,0.252359,0.368848,0.502353,0.594805,0.799321,0.775558,0.208692,0.322135,0.456828,0.670224,0.820248,0.799778
RandomForestRegressor,0.127854,0.259588,0.357567,0.794714,0.891494,0.874756,0.194917,0.309333,0.441494,0.691991,0.83214,0.815763
GradientBoostingRegressor,0.125087,0.257019,0.353676,0.799157,0.893964,0.876321,0.192349,0.30766,0.438576,0.696049,0.834473,0.820331
AdaBoostRegressor,0.173866,0.31979,0.416972,0.720836,0.859506,0.851227,0.214561,0.347395,0.463208,0.660949,0.82144,0.812258
XGBRegressor,0.141224,0.276066,0.375798,0.773246,0.879863,0.859806,0.195996,0.309806,0.442714,0.690286,0.831231,0.815661
ExtraTreesRegressor,0.127932,0.259208,0.357676,0.794588,0.891426,0.874096,0.193109,0.308256,0.439442,0.694847,0.833804,0.81799
LinearRegression,0.125036,0.255569,0.353604,0.799239,0.894004,0.877131,0.192681,0.308454,0.438955,0.695524,0.834174,0.819747
KNeighborsRegressor,0.149706,0.283514,0.386919,0.759627,0.871711,0.852315,0.207293,0.323132,0.455295,0.672434,0.821086,0.803682
SVR,0.13261,0.259854,0.364156,0.787078,0.887462,0.876649,0.194822,0.30677,0.441386,0.692141,0.833169,0.81886


Results for Ablation_Embeddings: 



Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.197204,0.325799,0.444077,0.683363,0.826659,0.798828,0.207748,0.326902,0.455794,0.671715,0.819593,0.806872
DecisionTreeRegressor,0.400688,0.45813,0.632999,0.356643,0.680875,0.656305,0.24974,0.355437,0.49974,0.605359,0.780632,0.762524
RandomForestRegressor,0.193206,0.322189,0.439552,0.689782,0.830666,0.804122,0.208753,0.326254,0.456895,0.670127,0.818713,0.803257
GradientBoostingRegressor,0.198992,0.327857,0.446085,0.680493,0.824943,0.795793,0.20371,0.324691,0.451342,0.678096,0.82348,0.808707
AdaBoostRegressor,0.25956,0.402047,0.50947,0.583243,0.789216,0.761066,0.247968,0.388195,0.497964,0.608159,0.804846,0.791111
XGBRegressor,0.219899,0.344418,0.468934,0.646924,0.806869,0.778609,0.219158,0.334292,0.468143,0.653685,0.809387,0.795459
ExtraTreesRegressor,0.186845,0.315897,0.432256,0.699996,0.836786,0.811588,0.211241,0.328015,0.45961,0.666195,0.816328,0.802155
LinearRegression,0.196513,0.325,0.443298,0.684472,0.827342,0.797443,0.198451,0.318054,0.445478,0.686406,0.828584,0.814382
KNeighborsRegressor,0.222483,0.343583,0.471681,0.642775,0.802678,0.773852,0.235183,0.350971,0.484956,0.628363,0.794114,0.776606
SVR,0.208088,0.328868,0.456167,0.665888,0.817145,0.792233,0.206904,0.320719,0.454867,0.673048,0.821811,0.811838


Results for Ablation_Atomic: 



Unnamed: 0,Train MSE (10 fold CV),Train MAE (10 fold CV),Train RMSE (10 fold CV),Train R2 (10 fold CV),Train PCC (10 fold CV),Train SCC (10 fold CV),Test MSE,Test MAE,Test RMSE,Test R2,Test PCC,Test SCC
LGBMRegressor,0.125275,0.257694,0.353942,0.798855,0.893805,0.877471,0.193031,0.308192,0.439353,0.694971,0.833787,0.820005
DecisionTreeRegressor,0.263743,0.374735,0.513559,0.576526,0.792243,0.768514,0.212271,0.322619,0.460729,0.664568,0.816842,0.803776
RandomForestRegressor,0.127749,0.259039,0.357421,0.794882,0.891594,0.875051,0.195506,0.309744,0.442161,0.69106,0.831602,0.816087
GradientBoostingRegressor,0.125566,0.257346,0.354353,0.798388,0.893527,0.876989,0.191852,0.308384,0.438009,0.696834,0.834908,0.819767
AdaBoostRegressor,0.178632,0.325035,0.422649,0.713183,0.85731,0.848489,0.220008,0.355082,0.46905,0.652342,0.817703,0.807913
XGBRegressor,0.141496,0.275541,0.376159,0.77281,0.879515,0.862526,0.197481,0.312857,0.444389,0.687939,0.829882,0.815774
ExtraTreesRegressor,0.126231,0.257413,0.35529,0.79732,0.892954,0.875876,0.194766,0.309874,0.441323,0.692229,0.832291,0.815771
LinearRegression,0.124419,0.255014,0.352731,0.800229,0.894557,0.877798,0.192792,0.307843,0.439081,0.695348,0.834029,0.817838
KNeighborsRegressor,0.151292,0.285708,0.388962,0.757082,0.870208,0.848718,0.20165,0.322872,0.449054,0.681351,0.825972,0.803488
SVR,0.131419,0.259541,0.362517,0.78899,0.88853,0.877179,0.192793,0.305884,0.439083,0.695346,0.834871,0.81894
