In [93]:
import pandas as pd
import numpy as np
import os

RANDOM_STATE = 404

In [94]:
df = pd.read_csv('data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [95]:
X = df.drop(columns=['cardio'])
y = df['cardio']

## Data standardization

In [96]:
from sklearn.preprocessing import StandardScaler

# Select columns for scaling (excluding binary or categorical features)
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform your data (only for numeric columns)
X_scaled = X.copy()
X_scaled[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Apply one-hot encoding to categorical columns
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
X_encoded = pd.get_dummies(X[categorical_columns])

# Concatenate scaled numeric columns with one-hot encoded categorical columns
X_final = pd.concat([X_scaled[numeric_columns], X_encoded], axis=1)

# Ensure X_final is a DataFrame
X_final = pd.DataFrame(X_final)
X = X_final

X

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,gluc,smoke,alco,active
0,-0.436062,0.443452,-0.847873,-0.122182,-0.088238,2,1,1,0,0,1
1,0.307686,-1.018168,0.749831,0.072610,-0.035180,1,3,1,0,0,1
2,-0.247997,0.078047,-0.708942,0.007679,-0.141297,1,3,1,0,0,0
3,-0.748152,0.565254,0.541435,0.137541,0.017879,2,1,1,0,0,1
4,-0.808543,-1.018168,-1.264666,-0.187113,-0.194356,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,-0.092762,0.443452,0.124642,-0.057251,-0.088238,2,1,1,1,0,1
69996,1.269492,-0.774565,3.597913,0.072610,-0.035180,1,2,2,0,0,1
69997,-0.163286,2.270477,2.139139,0.332333,-0.035180,2,3,1,0,1,0
69998,1.200589,-0.165556,-0.153219,0.040145,-0.088238,1,1,2,0,0,0


In [97]:
X.describe()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,gluc,smoke,alco,active
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,5.272227e-16,1.450116e-15,-2.905105e-16,7.623108000000001e-17,1.7459050000000003e-17,1.349571,1.366871,1.226457,0.088129,0.053771,0.803729
std,1.000007,1.000007,1.000007,1.000007,1.000007,0.476838,0.68025,0.57227,0.283484,0.225568,0.397179
min,-3.514407,-13.32014,-4.460075,-1.810381,-0.8841161,1.0,1.0,1.0,0.0,0.0,0.0
25%,-0.7315341,-0.652763,-0.639477,-0.05725127,-0.0882385,1.0,1.0,1.0,0.0,0.0,1.0
50%,0.09489744,0.07804703,-0.1532192,-0.05725127,-0.0882385,1.0,1.0,1.0,0.0,0.0,1.0
75%,0.7531244,0.6870554,0.5414349,0.07261016,-0.03517999,2.0,2.0,1.0,0.0,0.0,1.0
max,1.720199,10.43119,8.738353,103.1826,57.85165,2.0,3.0,3.0,1.0,1.0,1.0


### Using BIC to get the optimal number of components for GMM

In [98]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """
    Computes the Bayesian Information Criterion (BIC) for Gaussian Mixture Models with different numbers of components.
    
    Parameters:
        X (array-like): Input data.
        n_components_range (range): Range of number of components to evaluate.
        
    Returns:
        list: BIC values for each number of components.
    """
    bic = []  # List to store BIC values
    for n_components in n_components_range:
        # Create Gaussian Mixture Model with specified number of components
        gmm = GaussianMixture(n_components=n_components, random_state=RANDOM_STATE)
        gmm.fit(X)  # Fit the model to the data
        bic.append(gmm.bic(X))  # Calculate BIC and add to list
    return bic  # Return list of BIC values

optimal_n_components = 26

if optimal_n_components is None:
    n_components_range = range(1, 51)  # Range of number of components to evaluate
    bic_values = compute_bic(X, n_components_range)  # Compute BIC values
    optimal_n_components = n_components_range[np.argmin(bic_values)]  # Determine optimal number of components

    # Plotting BIC values
    plt.plot(n_components_range, bic_values, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('BIC Value')
    plt.title('BIC for Gaussian Mixture Models')
    plt.grid(True)
    plt.show()

In [99]:
# Create Gaussian Mixture Model with optimal number of components
gmm = GaussianMixture(n_components=optimal_n_components, random_state=RANDOM_STATE)

# Fit the Gaussian Mixture Model to the imputed DataFrame
gmm.fit(X)

### Generating 10 subsets with randomly removed number of features

In [100]:
subsets_fraction = 0.01

# Function that randomly removes features and replace their values with NaN
def remove_features(data, num_features_to_remove):
    subset = data.sample(frac=subsets_fraction, random_state=RANDOM_STATE)
    features_to_remove = np.random.choice(subset.columns[:-1], num_features_to_remove, replace=False)
    subset = subset.astype(object)
    subset.loc[:, features_to_remove] = np.nan
    return subset

subsets = []

for _ in range(2):
    subsets.append(remove_features(X, 1))

for _ in range(2):
    subsets.append(remove_features(X, 2))

for _ in range(6):
    num_features_to_remove = np.random.randint(3, min(5, len(X.columns) - 1))
    subsets.append(remove_features(X, num_features_to_remove))

for subset_index, subset in enumerate(subsets):
    nan_columns = subset.columns[subset.isnull().all()]
    print(f"Subset {subset_index+1} has missing values in: {', '.join(nan_columns)}")

Subset 1 has missing values in: weight
Subset 2 has missing values in: smoke
Subset 3 has missing values in: height, weight
Subset 4 has missing values in: age, smoke
Subset 5 has missing values in: age, gluc, alco
Subset 6 has missing values in: weight, ap_lo, gluc, smoke
Subset 7 has missing values in: age, weight, ap_lo, gender
Subset 8 has missing values in: age, gluc, alco
Subset 9 has missing values in: height, gender, gluc, alco
Subset 10 has missing values in: height, ap_hi, gender


In [101]:
import copy

# Creating a deep copy for further use similar for VAE
vae_subsets = copy.deepcopy(subsets)

## Conditional GMM imputation

In [102]:
from ConditionalGMM.condGMM import CondGMM
import json

number_of_samples = 100

for subset in subsets:
    index = 0
    for row_index, row in subset.iterrows():
        # Get indices and values of unknown and known features
        unknown_features_indexes = [row.index.get_loc(col) for col in row.index if pd.isna(row[col])]
        
        # Find indices of known features
        known_features_indexes = list(set(range(subset.shape[1])) - set(unknown_features_indexes))
        
        # Extract values of known features for the given row
        known_features_values = subset.iloc[index, known_features_indexes]
        
        # If all features are known, continue
        if len(unknown_features_indexes) == 0:
            continue
        
        # Initialize CondGMM
        cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)
        
        # Generate samples using Conditional GMM
        sampled_data = cGMM.rvs(known_features_values, size=number_of_samples, random_state=RANDOM_STATE)
        
        # Update unknown features with sampled data
        for feature_index in range(len(unknown_features_indexes)):
            if unknown_features_indexes[feature_index] in categorical_columns:
                # Approximate categorical values to the nearest whole number
                sampled_data[:, feature_index] = np.round(sampled_data[:, feature_index])
            subset.iloc[index, unknown_features_indexes[feature_index]] = json.dumps([sampled_data[sample_index][feature_index] for sample_index in range(sampled_data.shape[0])])
            
        index += 1

### Calculate MSE for imputed values

In [103]:
import json
from sklearn.metrics import mean_squared_error

# Define folder and file paths
folder_path = "results/without_missingness"
output_file_path = os.path.join(folder_path, "cgmm_mse_scores.txt")

# Check if folder exists, if not, create it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Open the file in "append" mode and create if it doesn't exist
with open(output_file_path, "a+") as f:
    f.write(f"Results for {number_of_samples} samples with {subsets_fraction} fraction of data:\n")

for subset_index, subset in enumerate(subsets):
    # Determine unknown features indexes dynamically for each subset
    unknown_features_indexes = [col_index for col_index, col in enumerate(subset.columns) if subset[col].apply(lambda x: isinstance(x, str)).any()]

    if not unknown_features_indexes:
        continue  # Skip if there are no missing values
    
    # Initialize dictionary to store MSE and NMSE values for each feature in the subset
    feature_mse = {}
    
    # Iterate through rows in the subset DataFrame
    for index, row in subset.iterrows():
        # Extract original values for the current row from X
        original_values = X.iloc[index, unknown_features_indexes].values
        
        # Compute MSE for each feature separately
        for feature_index in range(len(unknown_features_indexes)):
            # Extract generated samples for unknown features and drop NaN values
            generated_samples_raw = json.loads(row.iloc[unknown_features_indexes].values[feature_index])
            generated_samples = [sample for sample in generated_samples_raw if not pd.isna(sample)]
            
            # Extract original value for the current feature
            original_value = original_values[feature_index]
            
            # Calculate variance of generated samples
            var_generated = np.var(generated_samples)
            
            for sample in generated_samples:
                if not pd.isna(sample):
                    # Ensure both original_value and sample are arrays of the same length
                    original_value_array = np.full_like(np.array(sample), original_value)
                    mse_value = mean_squared_error(original_value_array.flatten(), np.array(sample).flatten())
                    
                    # Add MSE value to the dictionary under the corresponding feature index
                    if unknown_features_indexes[feature_index] not in feature_mse:
                        feature_mse[unknown_features_indexes[feature_index]] = []
                        
                    feature_mse[unknown_features_indexes[feature_index]].append(mse_value)
    
    # Print the mean squared error  for each feature and save to file
    with open(output_file_path, "a") as f:
        print(f"MSE for Subset {subset_index + 1}:")
        f.write(f"MSE for Subset {subset_index + 1}:\n")
        for feature_index, mse_values in feature_mse.items():
            mean_mse = np.mean(mse_values)
            # mean_nmse = np.mean(feature_nmse[feature_index])
            f.write(f'Feature {df.columns[feature_index]} MSE: {mean_mse}\n')
            print(f"Feature {df.columns[feature_index]}: MSE = {mean_mse}")
            

with open(output_file_path, "a") as f:
    f.write(f"\n\n")

MSE for Subset 1:
Feature height: MSE = 1.5931680256498213
MSE for Subset 2:
Feature smoke: MSE = 0.13755234323605026
MSE for Subset 3:
Feature gender: MSE = 1.3307465083507877
Feature height: MSE = 1.2802487354787104
MSE for Subset 4:
Feature age: MSE = 1.2596629166744562
Feature smoke: MSE = 0.0900889994224423
MSE for Subset 5:
Feature age: MSE = 1.0800861604639225
Feature gluc: MSE = 1.651756252288043
Feature alco: MSE = 0.058127976007216864
MSE for Subset 6:
Feature height: MSE = 0.9881225302713966
Feature ap_hi: MSE = 2.935008048726963
Feature gluc: MSE = 1.700670973953893
Feature smoke: MSE = 0.09155933883343995
MSE for Subset 7:
Feature age: MSE = 1.4919109467280252
Feature height: MSE = 1.1022474265185724
Feature ap_hi: MSE = 2.9310921690919747
Feature ap_lo: MSE = 1.1715889053551538
MSE for Subset 8:
Feature age: MSE = 1.0800861604639225
Feature gluc: MSE = 1.651756252288043
Feature alco: MSE = 0.058127976007216864
MSE for Subset 9:
Feature gender: MSE = 1.190344643160313
Feat

### Classification after cGMM

In [104]:
# Iterate through each subset and convert strings to lists
for subset in subsets:
    for col in subset.columns:
        subset[col] = subset[col].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

In [105]:
# from joblib import load
# import warnings

# # Suppress all warnings related to feature names
# warnings.filterwarnings('ignore', message="X does not have valid feature names")

# # Load the classifier
# classifier = load('classifiers\cardio_classifier.h5')

# cgmm_classification_results = []

# # Iterate through each subset
# for subset_index, subset in enumerate(subsets):
#     subset_results = []  # Initialize results for this subset
    
#     # Iterate over each row in the subset
#     for row_index, row in subset.iterrows():
#         serialized_arrays = []
#         non_serialized_values = []
        
#         # Separate serialized arrays from non-serialized values
#         for col, value in row.items():
#             if isinstance(value, list):
#                 serialized_arrays.append((col, value))
#             else:
#                 non_serialized_values.append((col, value))
        
#         # Initialize an empty list to store results for this row
#         row_results = []
        
#         # Iterate over each index of serialized arrays
#         for i in range(number_of_samples):
#             # Initialize a combined row with non-serialized values
#             combined_row = non_serialized_values.copy()
            
#             # Append the entry at index i of each serialized array to the combined row
#             for col, serialized_array in serialized_arrays:
#                 if i < len(serialized_array):
#                     combined_row.append((col, serialized_array[i]))
            
#             # Convert combined_row to an array
#             combined_row_array = [value for _, value in combined_row]
            
#             try:
#                 # Run the combined row through the classifier
#                 result_array = classifier.predict([combined_row_array])
#                 row_results.append(result_array)
#             except Exception as e:
#                 # Handle any potential errors
#                 print(f"Error processing row {row_index}: {e}")
#                 row_results.append(None)
        
#         # Append the row results to the subset results
#         subset_results.append(row_results)
    
#     # Append the subset results to the overall results
#     cgmm_classification_results.append(subset_results)  

In [106]:
# from sklearn.metrics import accuracy_score

# # Initialize lists to store accuracy per subset
# accuracy_per_subset = []

# # Iterate through each subset and its corresponding results
# for subset_index, subset_results in enumerate(cgmm_classification_results):
#     true_labels = y.loc[subsets[subset_index].index]  # Get true labels for the current subset
    
#     # Initialize list to store predicted labels for this subset
#     subset_predicted_labels = []
    
#     # Iterate through each row and its corresponding results
#     for row_results in subset_results:
#         # Get the predicted label for each row (assuming binary classification)
#         predicted_label = 1 if row_results[0] > 0.5 else 0
#         subset_predicted_labels.append(predicted_label)
    
#     # Calculate accuracy for this subset
#     subset_accuracy = accuracy_score(true_labels, subset_predicted_labels)
    
#     # Append the accuracy for this subset to accuracy_per_subset
#     accuracy_per_subset.append(subset_accuracy)

# # Print accuracy per subset
# for subset_index, accuracy_subset in enumerate(accuracy_per_subset):
#     print("Subset", subset_index, "accuracy:", accuracy_subset)


## VAE imputation

In [107]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
    
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Define the encoder
input_dim = X.shape[1]
latent_dim = 10

inputs = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(inputs)
encoded = Dense(32, activation='relu')(encoded)
z_mean = Dense(latent_dim)(encoded)
z_log_var = Dense(latent_dim)(encoded)

# Reparameterization trick
def sampling(args):
    z_mean, z_log_var = args
    epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], latent_dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling)([z_mean, z_log_var])

# Define the decoder
decoded = Dense(32, activation='relu')(z)
decoded = Dense(64, activation='relu')(decoded)
outputs = Dense(input_dim)(decoded)

# Create the VAE model
vae = Model(inputs, outputs)

# Compile the model
vae.compile(optimizer='adam', loss='mse')  # Use MSE as the reconstruction loss

# Train the model
history = vae.fit(X_train, X_train, epochs=20, batch_size=32, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [108]:
number_of_samples = 10

# Iterate through each subset
for subset_index, subset in enumerate(vae_subsets):
    index = 0
    # Determine unknown features indexes dynamically for each subset
    unknown_features_indexes = np.where(subset.isnull().any())[0]

    # Iterate through each row
    for row_index, row in subset.iterrows():
        sampled_data = np.empty((number_of_samples, len(unknown_features_indexes)))
        # Repeat the prediction process for the specified number of samples
        for _ in range(number_of_samples):
            imputed_values_row = []
            # Impute missing values for each feature index
            for feature_index in unknown_features_indexes:
                # Impute missing value using the VAE for the current feature and row
                imputed_value = vae.predict(row.values.reshape(1, -1).astype(np.float32), verbose=0)[0, feature_index]
                imputed_values_row.append(imputed_value)
            sampled_data[_] = imputed_values_row
        
        for feature_index in range(len(unknown_features_indexes)):
            if unknown_features_indexes[feature_index] in categorical_columns:
                # Approximate categorical values to the nearest whole number
                sampled_data[:, feature_index] = np.round(sampled_data[:, feature_index])
            subset.iloc[index, unknown_features_indexes[feature_index]] = json.dumps([sampled_data[sample_index][feature_index] for sample_index in range(sampled_data.shape[0])])

        index += 1

### Calculating MSE for VAE imputation

In [109]:
import json
from sklearn.metrics import mean_squared_error

output_file_path = os.path.join("results/without_missingness", "vae_mse_scores.txt")
with open(output_file_path, "a+") as f:
    f.write(f"Results for {number_of_samples} samples with {subsets_fraction} fraction of data:\n")

for subset_index, subset in enumerate(vae_subsets):
    # Determine unknown features indexes dynamically for each subset
    unknown_features_indexes = [col_index for col_index, col in enumerate(subset.columns) if subset[col].apply(lambda x: isinstance(x, str)).any()]

    if not unknown_features_indexes:
        continue  # Skip if there are no missing values
    
    # Initialize dictionary to store MSE and NMSE values for each feature in the subset
    feature_mse = {}
    
    # Iterate through rows in the subset DataFrame
    for index, row in subset.iterrows():
        # Extract original values for the current row from X
        original_values = X.iloc[index, unknown_features_indexes].values
        
        # Compute MSE for each feature separately
        for feature_index in range(len(unknown_features_indexes)):
            # Extract generated samples for unknown features and drop NaN values
            generated_samples_raw = json.loads(row.iloc[unknown_features_indexes].values[feature_index])
            generated_samples = [sample for sample in generated_samples_raw if not pd.isna(sample)]
            
            # Extract original value for the current feature
            original_value = original_values[feature_index]
            
            # Calculate variance of generated samples
            var_generated = np.var(generated_samples)
            
            for sample in generated_samples:
                if not pd.isna(sample):
                    # Ensure both original_value and sample are arrays of the same length
                    original_value_array = np.full_like(np.array(sample), original_value)
                    mse_value = mean_squared_error(original_value_array.flatten(), np.array(sample).flatten())
                    
                    # Add MSE value to the dictionary under the corresponding feature index
                    if unknown_features_indexes[feature_index] not in feature_mse:
                        feature_mse[unknown_features_indexes[feature_index]] = []
                        
                    feature_mse[unknown_features_indexes[feature_index]].append(mse_value)
    
    # Print the mean squared error  for each feature and save to file
    with open(output_file_path, "a") as f:
        print(f"MSE for Subset {subset_index + 1}:")
        f.write(f"MSE for Subset {subset_index + 1}:\n")
        for feature_index, mse_values in feature_mse.items():
            mean_mse = np.mean(mse_values)
            f.write(f'Feature {df.columns[feature_index]} MSE: {mean_mse}\n')
            print(f"Feature {df.columns[feature_index]}: MSE = {mean_mse}")
            

with open(output_file_path, "a") as f:
    f.write(f"\n\n")

MSE for Subset 1:
Feature height: MSE = 1.327385834058073
MSE for Subset 2:
Feature smoke: MSE = 0.213240934433591
MSE for Subset 3:
Feature gender: MSE = 1.4378163572569738
Feature height: MSE = 1.3003734495152457
MSE for Subset 4:
Feature age: MSE = 1.150825195530349
Feature smoke: MSE = 0.21091146399868282
MSE for Subset 5:
Feature age: MSE = 1.1520055093253845
Feature gluc: MSE = 1.4563990899388597
Feature alco: MSE = 0.08009747038850727
MSE for Subset 6:
Feature height: MSE = 1.3264861759008881
Feature ap_hi: MSE = 3.315431838826927
Feature gluc: MSE = 1.4630357201957633
Feature smoke: MSE = 0.21238083045462372
MSE for Subset 7:
Feature age: MSE = 1.1568424326367113
Feature height: MSE = 1.3283215112535052
Feature ap_hi: MSE = 3.317003367408704
Feature ap_lo: MSE = 2.061657052946286
MSE for Subset 8:
Feature age: MSE = 1.1606346838454302
Feature gluc: MSE = 1.4571027430470982
Feature alco: MSE = 0.08083754107069166
MSE for Subset 9:
Feature gender: MSE = 1.4353797169967468
Feature

### Classification after VAE

In [110]:
# Iterate through each subset and convert strings to lists
for subset in vae_subsets:
    for col in subset.columns:
        subset[col] = subset[col].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

In [111]:
from joblib import load
import warnings

# Suppress all warnings related to feature names
warnings.filterwarnings('ignore', message="X does not have valid feature names")

# Load the classifier
classifier = load('classifiers\cardio_classifier.h5')

vae_classification_results = []

# Iterate through each subset
for subset_index, subset in enumerate(subsets):
    subset_results = []  # Initialize results for this subset
    
    # Iterate over each row in the subset
    for row_index, row in subset.iterrows():
        serialized_arrays = []
        non_serialized_values = []
        
        # Separate serialized arrays from non-serialized values
        for col, value in row.items():
            if isinstance(value, list):
                serialized_arrays.append((col, value))
            else:
                non_serialized_values.append((col, value))
        
        # Initialize an empty list to store results for this row
        row_results = []
        
        # Iterate over each index of serialized arrays
        for i in range(number_of_samples):
            # Initialize a combined row with non-serialized values
            combined_row = non_serialized_values.copy()
            
            # Append the entry at index i of each serialized array to the combined row
            for col, serialized_array in serialized_arrays:
                if i < len(serialized_array):
                    combined_row.append((col, serialized_array[i]))
            
            # Convert combined_row to an array
            combined_row_array = [value for _, value in combined_row]
            
            try:
                # Run the combined row through the classifier
                result_array = classifier.predict([combined_row_array])
                row_results.append(result_array)
            except Exception as e:
                # Handle any potential errors
                print(f"Error processing row {row_index}: {e}")
                row_results.append(None)
        
        # Append the row results to the subset results
        subset_results.append(row_results)
    
    # Append the subset results to the overall results
    vae_classification_results.append(subset_results)  

In [112]:
from sklearn.metrics import accuracy_score

# Initialize lists to store accuracy per subset
accuracy_per_subset = []

# Iterate through each subset and its corresponding results
for subset_index, subset_results in enumerate(vae_classification_results):
    true_labels = y.loc[subsets[subset_index].index]  # Get true labels for the current subset
    
    # Initialize list to store predicted labels for this subset
    subset_predicted_labels = []
    
    # Iterate through each row and its corresponding results
    for row_results in subset_results:
        # Get the predicted label for each row (assuming binary classification)
        predicted_label = 1 if row_results[0] > 0.5 else 0
        subset_predicted_labels.append(predicted_label)
    
    # Calculate accuracy for this subset
    subset_accuracy = accuracy_score(true_labels, subset_predicted_labels)
    
    # Append the accuracy for this subset to accuracy_per_subset
    accuracy_per_subset.append(subset_accuracy)

# Print accuracy per subset
for subset_index, accuracy_subset in enumerate(accuracy_per_subset):
    print("Subset", subset_index, "accuracy:", accuracy_subset)

Subset 0 accuracy: 0.5771428571428572
Subset 1 accuracy: 0.6928571428571428
Subset 2 accuracy: 0.5185714285714286
Subset 3 accuracy: 0.5471428571428572
Subset 4 accuracy: 0.5585714285714286
Subset 5 accuracy: 0.5185714285714286
Subset 6 accuracy: 0.5157142857142857
Subset 7 accuracy: 0.5585714285714286
Subset 8 accuracy: 0.5971428571428572
Subset 9 accuracy: 0.5185714285714286
