In [None]:
import pandas as pd
import numpy as np
import os

# Seed passed to every explicitly seeded operation in this notebook (sampling, splitting, GMM fits)
RANDOM_STATE = 404
# Number of imputed samples generated for each row with missing values
number_of_samples = 10

In [None]:
# Read the semicolon-separated dataset and discard the row identifier column
df = pd.read_csv('data/cardio_train.csv', sep=';').drop(columns=['id'])
df.head(5)

In [None]:
# Feature matrix: every column except the target
X = df.drop(columns=['cardio'])
# Target vector
y = df['cardio']

In [None]:
# Create an empty dictionary to store the results after each method.
# Each entry maps a method name to {'score': ..., 'accuracy': ...}.
results_dict = {}

## Data manipulation

#### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Select columns to be scaled (numeric_columns) and columns to be label-encoded (categorical_columns)
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'gender', 'cholesterol']
categorical_columns = ['gluc', 'smoke', 'alco', 'active']

# Fit and transform your data (only for numeric columns)
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the train/test split
# performed later in the notebook, so test-set statistics influence the scaling.
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Apply label encoding (integer codes, not one-hot) to categorical columns.
# The same encoder object is re-fitted for every column, so after the loop it only
# holds the classes of the last column.
label_encoder = LabelEncoder()
for col in categorical_columns:
    X[col] = label_encoder.fit_transform(X[col])

X.head(5)

In [None]:
# Summary statistics of the transformed features
X.describe()

#### Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test (25% of the rows go to the test set; seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)
X_train.shape, X_test.shape

## Preparing 10 subsets with removed features

In [None]:
# Function that randomly removes features and replaces their values with NaN
def remove_features(num_features_to_remove=None, feature_indices_to_remove=None):
    """
    Sample a small subset of the training data and blank out some of its features.

    The rows are drawn from X_train with a fixed seed (RANDOM_STATE), so every call
    returns the same rows; only the removed columns differ between calls.

    Parameters:
        num_features_to_remove (int, optional): Number of features to remove randomly.
            Only used when feature_indices_to_remove is None. When both arguments are
            None, the number is drawn with np.random.randint.
        feature_indices_to_remove (array-like, optional): Positional indices of the
            features to remove. Takes precedence over num_features_to_remove.

    Returns:
        pandas.DataFrame: Object-dtype subset of the data in which the selected
        features contain only NaN values.
    """
    # Sample a subset of data (0.1% of the training rows)
    subset = X_train.sample(frac=0.001, random_state=RANDOM_STATE)

    # Determine features to remove based on number or indices provided
    if feature_indices_to_remove is not None:
        features_to_remove = subset.columns[feature_indices_to_remove]
    else:
        if num_features_to_remove is None:
            num_features_to_remove = np.random.randint(1, min(5, len(X_train.columns) - 1))
        # The last column is excluded from the candidates (columns[:-1]).
        features_to_remove = np.random.choice(subset.columns[:-1], num_features_to_remove, replace=False)

    # Replace values of selected features with NaN; the object dtype lets the imputation
    # steps later store non-numeric placeholders (e.g. JSON strings) in the same cells.
    subset = subset.astype(object)
    subset.loc[:, features_to_remove] = np.nan

    return subset

# Seed NumPy's global generator so the same features are removed on every run of this cell
np.random.seed(RANDOM_STATE)

list_of_subsets = []

# Generate subsets with varying numbers of removed features:
# two subsets each with 1, 2, 3 and 4 removed features ...
for removed_count in (1, 2, 3, 4):
    for _ in range(2):
        list_of_subsets.append(remove_features(removed_count))

# ... and two subsets with a randomly chosen 5 or 6 removed features
for _ in range(2):
    list_of_subsets.append(remove_features(np.random.randint(5, 7)))

# Print information about subsets and their missing columns
print(f'Subsets with {list_of_subsets[0].shape[0]} datapoints and their columns with missing values:')
for subset_index, subset in enumerate(list_of_subsets):
    nan_columns = subset.columns[subset.isnull().all()]
    print(f"Subset {subset_index+1}: {', '.join(nan_columns)}")

## Shared functions

#### Imputation

In [None]:
# from ConditionalGMM.condGMM import CondGMM
# import json

# def imputing_subsets(subsets, method, model):
#     for subset in subsets:
#         row_in_subset_index = 0
#         for row in subset.iterrows():
#             # Get indices and values of unknown and known features
#             unknown_features_indexes = [row.index.get_loc(col) for col in row.index if pd.isna(row[col])]
            
#             # Find indices of known features
#             known_features_indexes = list(set(range(subset.shape[1])) - set(unknown_features_indexes))
            
#             # Extract values of known features for the given row
#             known_features_values = subset.iloc[row_in_subset_index, known_features_indexes]
            
#             # If all features are known, continue
#             if len(unknown_features_indexes) == 0:
#                 continue
            
#             sampled_data = None
            
#             if method == 'simple':
#                 # Simple Imputer
#                 continue
#             elif method == 'multivariate' or method == 'cgmm':
#                 # Multivariate Imputer or Conditional GMM
#                 sampled_data = cgmm_impute(model, unknown_features_indexes, known_features_values)
#                 continue
#             elif method == 'vae':
#                 # Variational AutoEncoder
#                 continue
            
#             # Update unknown features with sampled data
#             for feature_index in range(len(unknown_features_indexes)):
#                 if unknown_features_indexes[feature_index] in categorical_columns:
#                     # Approximate categorical values to the nearest whole number
#                     sampled_data[:, feature_index] = np.round(sampled_data[:, feature_index])
#                 subset.iloc[row_in_subset_index, unknown_features_indexes[feature_index]] = json.dumps([sampled_data[sample_index][feature_index] for sample_index in range(sampled_data.shape[0])])
                
#             row_in_subset_index += 1
            
# def cgmm_impute(gmm, known_features_indexes, known_features_values):
#     # Initialize CondGMM
#     cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)
    
#     # Generate samples using Conditional GMM
#     sampled_data = cGMM.rvs(known_features_values, size=number_of_samples, random_state=RANDOM_STATE)
    
#     return sampled_data
    

#### Scoring

In [None]:
from sklearn.metrics import mean_squared_error
import json

def get_scoring(subsets, method, should_print=False):
    """
    Calculate Mean Squared Error (MSE) scores for the imputed features in subsets of data.

    Every imputed value is compared with the value stored in the complete feature
    matrix `X` for the same row label and column.

    Parameters:
        subsets (list): List of imputed subsets (DataFrames). For every method other
            than 'simple', cells holding JSON strings are decoded in place.
        method (str): Method used for imputation ('simple', 'multivariate', 'cgmm', or 'vae').
            For 'simple', the imputed columns are taken from the all-NaN columns of the
            matching entry of `list_of_subsets`; otherwise they are the columns that
            contain lists of samples.
        should_print (bool): Whether to print the mean MSE per feature for each subset.

    Returns:
        dict: Maps the positional index of each imputed feature to the list of MSE
        values collected for it over all subsets, rows and samples.
    """
    method = method.lower()

    # Every sampling-based method stores its samples as JSON strings, so decode them for
    # all non-'simple' methods (not only 'multivariate' and 'cgmm').
    if method != 'simple':
        for subset in subsets:
            for col in subset.columns:
                subset[col] = subset[col].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

    feature_mse = {}  # MSE values per feature, accumulated over all subsets

    # Iterate through subsets
    for subset_index, subset in enumerate(subsets):
        # Determine unknown features indexes dynamically for each subset
        if method == 'simple':
            reference = list_of_subsets[subset_index]
            unknown_features_indexes = [col_index for col_index, col in enumerate(reference.columns) if reference[col].isnull().all()]
        else:
            unknown_features_indexes = [col_index for col_index, col in enumerate(subset.columns) if subset[col].apply(lambda x: isinstance(x, list)).any()]

        if not unknown_features_indexes:
            continue  # Skip if there are no missing values

        unknown_columns = subset.columns[unknown_features_indexes]
        subset_mse = {}  # MSE values of this subset only, used for printing

        # Iterate through rows in the subset DataFrame
        for index, row in subset.iterrows():
            # `index` is a row label, so look the original row up by label rather than by position
            original_values = X.loc[index, unknown_columns].values
            imputed_values = row.iloc[unknown_features_indexes].values

            # Compute MSE for each feature separately
            for position, feature_index in enumerate(unknown_features_indexes):
                if method == 'simple':
                    generated_samples = [imputed_values[position]]
                else:
                    generated_samples_raw = imputed_values[position]
                    if not isinstance(generated_samples_raw, list):
                        continue  # No samples were generated for this cell
                    generated_samples = [sample for sample in generated_samples_raw if not pd.isna(sample)]

                # Grab the original value of the feature
                original_value = original_values[position]

                for sample in generated_samples:
                    # Force a float array so an integer-typed sample cannot truncate the original value
                    sample_array = np.asarray(sample, dtype=float).flatten()
                    original_value_array = np.full_like(sample_array, original_value)

                    mse_value = mean_squared_error(original_value_array, sample_array)

                    feature_mse.setdefault(feature_index, []).append(mse_value)
                    subset_mse.setdefault(feature_index, []).append(mse_value)

        # Print MSE scores of the current subset if required
        if should_print:
            print(f"MSE for Subset {subset_index + 1}:")
            for feature_index, mse_values in subset_mse.items():
                mean_mse = np.mean(mse_values)
                print(f"Feature {subset.columns[feature_index]}: MSE = {mean_mse}")

    # Return dictionary containing MSE scores
    return feature_mse


#### Classification

In [None]:
from joblib import load
from sklearn.metrics import accuracy_score
import warnings

# Suppress all warnings related to feature names
warnings.filterwarnings('ignore', message="X does not have valid feature names")

# Load the classifier.
# Forward slashes work on every platform, whereas a backslash separator is Windows-only
# and '\c' is an invalid escape sequence in a normal string literal.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
classifier = load('classifiers/cardio_classifier.h5')

def get_accuracy(subsets, method, should_print=False):
    """
    Classify every row of every subset and compute the accuracy per subset.

    For 'simple', each row is passed to the classifier unchanged. For any other method,
    cells that hold a list of imputed samples are expanded into complete rows (one row
    per sample, at most `number_of_samples`), each row is classified, and the
    predictions are averaged before being thresholded at 0.5.

    Parameters:
        subsets (list): List of imputed subsets (DataFrames). For every method other
            than 'simple', cells holding JSON strings are decoded in place.
        method (str): Method used for imputation ('simple', 'multivariate', 'cgmm', or 'vae').
        should_print (bool): Whether to print the accuracy of each subset.

    Returns:
        list: Accuracy of the classifier on each subset, in the order of `subsets`.

    Raises:
        ValueError: If no prediction could be obtained for a row.
    """
    method = method.lower()
    if method != 'simple':
        # Every sampling-based method stores its samples as JSON strings; convert them to lists
        for subset in subsets:
            for col in subset.columns:
                subset[col] = subset[col].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

    accuracy_per_subset = []

    # Iterate through each subset
    for subset_index, subset in enumerate(subsets):
        subset_predicted_labels = []

        # Iterate over each row in the subset
        for row_index, row in subset.iterrows():
            row_results = []

            if method != 'simple':
                row_values = row.tolist()
                sample_counts = [len(value) for value in row_values if isinstance(value, list)]
                # Without any sampled cell the row is complete and a single prediction suffices
                candidate_count = min([number_of_samples] + sample_counts) if sample_counts else 1

                for i in range(candidate_count):
                    # Keep the original column order: the classifier expects the features
                    # in the same positions it was trained on.
                    combined_row_array = [value[i] if isinstance(value, list) else value for value in row_values]

                    try:
                        # Run the combined row through the classifier
                        result_array = classifier.predict([combined_row_array], verbose=0)
                        row_results.append(result_array)
                    except Exception as e:
                        # Report the failing sample and continue with the remaining ones
                        print(f"Error processing row {row_index}: {e}")
            else:
                result_array = classifier.predict([row.values.tolist()], verbose=0)
                row_results.append(result_array)

            if not row_results:
                raise ValueError(f"No prediction could be obtained for row {row_index} of subset {subset_index + 1}")

            # Aggregate over all samples instead of using only the first one
            # (assuming binary classification with a single output per row)
            mean_prediction = np.mean([np.ravel(result)[0] for result in row_results])
            subset_predicted_labels.append(1 if mean_prediction > 0.5 else 0)

        # Calculate accuracy for this subset against the true labels of its rows
        true_labels = y.loc[subset.index]
        accuracy_per_subset.append(accuracy_score(true_labels, subset_predicted_labels))

    if should_print:
        # Print accuracy per subset
        for subset_index, subset_accuracy in enumerate(accuracy_per_subset):
            print("Subset", subset_index+1, "accuracy:", subset_accuracy)

    return accuracy_per_subset

## SimpleImputer with mean strategy

In [None]:
import copy

# Work on an independent copy so the subsets with missing values stay untouched for the other methods
imputer_subsets = copy.deepcopy(list_of_subsets)

#### Imputation

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: NaN entries are replaced with a column mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

for subset in imputer_subsets:
    for col in subset.columns:
        if pd.isna(subset[col]).any():
            # The mean is learned from the training data of the same column,
            # then used to fill the missing values of the subset
            imp.fit(X_train[[col]])
            subset[col] = imp.transform(subset[[col]])

imputer_subsets[0].head(5)

#### Scoring

In [None]:
# Per-feature MSE values of the mean-imputed subsets
simple_imputer_score = get_scoring(imputer_subsets, 'simple')

#### Classification

In [None]:
# Classifier accuracy per mean-imputed subset (True: also print the accuracies)
simple_imputer_accuracy = get_accuracy(imputer_subsets, 'simple', True)

In [None]:
# Store the results of the SimpleImputer baseline
results_dict['simple_imputer'] = {'score': simple_imputer_score, 'accuracy': simple_imputer_accuracy}

## Multivariate normal distribution

In [None]:
# Independent copy of the subsets with missing values; the imputation below writes into it in place
multivariate_subsets = copy.deepcopy(list_of_subsets)

In [None]:
from sklearn.mixture import GaussianMixture

# Create Gaussian Mixture Model with a single component
# (a one-component mixture is a single multivariate normal distribution)
gmm = GaussianMixture(n_components=1, random_state=RANDOM_STATE)
gmm.fit(X_train)

#### Imputation

In [None]:
from ConditionalGMM.condGMM import CondGMM

def cgmm_generate_samples(subsets, gmm_weights, gmm_means, gmm_covariances):
    """
    Impute the missing features of every row with samples from a conditional GMM.

    For each row with missing values, a CondGMM is conditioned on the row's known
    features and `number_of_samples` samples are drawn for the unknown ones. Each
    missing cell is overwritten in place with a JSON string holding the list of
    sampled values for that feature.

    Parameters:
        subsets (list): List of DataFrames with NaN in the features to impute. Modified in place.
        gmm_weights (array-like): Component weights of the fitted mixture.
        gmm_means (array-like): Component means of the fitted mixture.
        gmm_covariances (array-like): Component covariances of the fitted mixture.

    Returns:
        None
    """
    for subset in subsets:
        column_count = subset.shape[1]

        # enumerate keeps the positional counter aligned with the row even when a row is skipped
        for index, (row_index, row) in enumerate(subset.iterrows()):
            # Get positional indices of the unknown features
            unknown_features_indexes = [row.index.get_loc(col) for col in row.index if pd.isna(row[col])]

            # If all features are known, continue
            if len(unknown_features_indexes) == 0:
                continue

            # Find indices of known features, in ascending column order
            unknown_lookup = set(unknown_features_indexes)
            known_features_indexes = [col_index for col_index in range(column_count) if col_index not in unknown_lookup]

            # Extract values of known features for the given row
            known_features_values = subset.iloc[index, known_features_indexes]

            # Initialize CondGMM
            cGMM = CondGMM(gmm_weights, gmm_means, gmm_covariances, known_features_indexes)

            # Generate samples using Conditional GMM
            sampled_data = cGMM.rvs(known_features_values, size=number_of_samples, random_state=RANDOM_STATE)

            # Update unknown features with sampled data
            for feature_index, column_position in enumerate(unknown_features_indexes):
                # categorical_columns holds column names, so compare by name rather than by position
                if subset.columns[column_position] in categorical_columns:
                    # Approximate categorical values to the nearest whole number
                    sampled_data[:, feature_index] = np.round(sampled_data[:, feature_index])
                subset.iloc[index, column_position] = json.dumps([sampled_data[sample_index][feature_index] for sample_index in range(sampled_data.shape[0])])

In [None]:
# Impute in place using the parameters of the single-component mixture fitted above
cgmm_generate_samples(multivariate_subsets, gmm.weights_, gmm.means_, gmm.covariances_)

multivariate_subsets[0].head(5)

#### Scoring

In [None]:
# Per-feature MSE values of the multivariate-normal samples
multivariate_score = get_scoring(multivariate_subsets, 'multivariate')

#### Classification

In [None]:
# Classifier accuracy per subset (True: also print the accuracies)
multivariate_accuracy = get_accuracy(multivariate_subsets, 'multivariate', True)

In [None]:
# Store the results of the multivariate normal method
results_dict['multivariate'] = {'score': multivariate_score, 'accuracy': multivariate_accuracy}

## Conditional GMM

In [None]:
# Independent copy of the subsets with missing values; the imputation below writes into it in place
cgmm_subsets = copy.deepcopy(list_of_subsets)

### Using BIC to get the optimal number of components for GMM

In [60]:
import matplotlib.pyplot as plt

def compute_bic(data, n_components_range):
    """
    Fit one Gaussian Mixture Model per candidate component count and collect the
    Bayesian Information Criterion (BIC) of each fit.

    Parameters:
        data (array-like): Input data the models are fitted to and scored on.
        n_components_range (range): Range of number of components to evaluate.

    Returns:
        list: BIC values, in the same order as n_components_range.
    """
    def _bic_for(component_count):
        # Fit a mixture with the requested number of components and score it on the same data
        mixture = GaussianMixture(n_components=component_count, random_state=RANDOM_STATE)
        mixture.fit(data)
        return mixture.bic(data)

    return [_bic_for(component_count) for component_count in n_components_range]

n_components_range = range(1, 51)  # Range of number of components to evaluate
bic_values = compute_bic(X_train, n_components_range)  # Compute BIC values (one model fit per candidate)
optimal_n_components = n_components_range[np.argmin(bic_values)]  # Determine optimal number of components

# Plotting BIC values
plt.plot(n_components_range, bic_values, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('BIC Value')
plt.title('BIC for Gaussian Mixture Models')
plt.grid(True)
# savefig does not create missing directories, so make sure the target folder exists
os.makedirs('images', exist_ok=True)
plt.savefig('images/BIC_without_missingness.png')
plt.show()

In [None]:
# Create Gaussian Mixture Model with optimal number of components.
# This rebinds `gmm`, which previously referred to the single-component model.
gmm = GaussianMixture(n_components=optimal_n_components, random_state=RANDOM_STATE)
gmm.fit(X_train)

#### Imputation

In [None]:
# Impute in place using the parameters of the mixture with the BIC-selected number of components
cgmm_generate_samples(cgmm_subsets, gmm.weights_, gmm.means_, gmm.covariances_)

cgmm_subsets[0].head(5)

#### Scoring

In [None]:
# Per-feature MSE values of the conditional-GMM samples
cgmm_score = get_scoring(cgmm_subsets, 'cgmm')

#### Classification

In [None]:
# Classifier accuracy per subset (True: also print the accuracies)
cgmm_accuracy = get_accuracy(cgmm_subsets, 'cgmm', True)

In [None]:
# Store the results of the conditional GMM method
results_dict['cgmm'] = {'score': cgmm_score, 'accuracy': cgmm_accuracy}

## Variational Autoencoder

In [None]:
# Independent copy of the subsets with missing values (created again at the start of the imputation cell)
vae_subsets = copy.deepcopy(list_of_subsets)

#### Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model

# Negative-log-likelihood style score derived from the reconstruction error
def compute_nll(model, X_test):
    """
    Score a model by the mean of 0.5 * log(2 * pi * mse) over all rows.

    Parameters:
        model: Object with a `predict` method that returns reconstructions of its input.
        X_test (array-like): Data to reconstruct; one sample per row.

    Returns:
        float: Mean score over the rows, where `mse` is the row-wise mean squared
        reconstruction error.
    """
    residuals = X_test - model.predict(X_test)
    per_row_mse = np.mean(np.square(residuals), axis=1)
    return np.mean(0.5 * np.log(2 * np.pi * per_row_mse))

# Seed TensorFlow's global random generator so repeated runs are more comparable
tf.random.set_seed(RANDOM_STATE)

# Define range of latent space dimensionalities to try
latent_dim_range = range(2, 5)


def sampling(args):
    """Reparameterization trick: z = z_mean + exp(0.5 * z_log_var) * epsilon, epsilon ~ N(0, I)."""
    z_mean, z_log_var = args
    # Take the noise shape from z_mean itself instead of reading the loop variable
    # `latent_dim`: the function may be invoked again after the loop has moved on,
    # at which point the loop variable no longer matches the model being evaluated.
    epsilon = tf.random.normal(shape=tf.shape(z_mean))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon


# NOTE(review): the models are compiled with a plain MSE reconstruction loss; no
# KL-divergence term is added, so z_log_var is only trained through the reconstruction.
# NOTE(review): configurations are compared on X_test, so the selected model has seen
# the test set during model selection.

# Train VAE models with different latent space dimensionalities and model capacities
vae_models = {}
input_dim = X_train.shape[1]
for latent_dim in latent_dim_range:
    for num_layers in [1, 2, 3]:
        for num_neurons in [32, 64]:
            # Define the encoder
            inputs = Input(shape=(input_dim,))
            encoded = inputs
            for _ in range(num_layers):
                encoded = Dense(num_neurons, activation='relu')(encoded)
            z_mean = Dense(latent_dim)(encoded)
            z_log_var = Dense(latent_dim)(encoded)

            # Sample the latent vector
            z = Lambda(sampling)([z_mean, z_log_var])

            # Define the decoder
            decoded = z
            for _ in range(num_layers):
                decoded = Dense(num_neurons, activation='relu')(decoded)
            outputs = Dense(input_dim)(decoded)

            # Create the VAE model
            vae = Model(inputs, outputs)

            # Compile the model
            vae.compile(optimizer='adam', loss='mse')  # Use MSE as the reconstruction loss

            # Train the model
            history = vae.fit(X_train, X_train, epochs=10, batch_size=32, verbose=0)

            # Evaluate performance on test set
            nll_test = compute_nll(vae, X_test)
            print(f"Latent Dim: {latent_dim}, Num Layers: {num_layers}, Num Neurons: {num_neurons}, Test NLL: {nll_test}")

            # Store model and its performance
            vae_models[(latent_dim, num_layers, num_neurons)] = {'model': vae, 'nll_test': nll_test}

# Select model with lowest mean NLL on test set
best_config = min(vae_models, key=lambda x: vae_models[x]['nll_test'])
best_model = vae_models[best_config]['model']

print(f"Best Model Configuration: Latent Dim = {best_config[0]}, Num Layers = {best_config[1]}, Num Neurons = {best_config[2]}, Test NLL = {vae_models[best_config]['nll_test']}")

#### Imputation

In [None]:
vae_subsets = copy.deepcopy(list_of_subsets)

# Training-set column means, used as placeholders for the missing inputs of the network
train_feature_means = X_train.mean()

# Iterate through each subset
for subset in vae_subsets:
    # Determine unknown features indexes dynamically for each subset
    unknown_features_indexes = np.where(subset.isnull().any())[0]
    if len(unknown_features_indexes) == 0:
        continue  # Nothing to impute in this subset

    # Iterate through each row
    for row_position, (row_index, row) in enumerate(subset.iterrows()):
        # A NaN fed into the network contaminates the whole reconstruction, so the
        # missing entries are replaced with the training mean of their column first.
        model_input = row.astype(float).fillna(train_feature_means).to_numpy(dtype=np.float32)

        # One batched prediction yields all samples: every batch row gets its own latent
        # noise, and all imputed features of a sample come from the same reconstruction.
        batch = np.repeat(model_input.reshape(1, -1), number_of_samples, axis=0)
        reconstructions = np.asarray(best_model.predict(batch, verbose=0), dtype=np.float64)
        sampled_data = reconstructions[:, unknown_features_indexes]

        for feature_index, column_position in enumerate(unknown_features_indexes):
            # categorical_columns holds column names, so compare by name rather than by position
            if subset.columns[column_position] in categorical_columns:
                # Approximate categorical values to the nearest whole number
                sampled_data[:, feature_index] = np.round(sampled_data[:, feature_index])
            subset.iloc[row_position, column_position] = json.dumps(sampled_data[:, feature_index].tolist())

#### Scoring

In [None]:
# Score the VAE-imputed subsets
vae_score = get_scoring(vae_subsets, 'vae')

#### Classification

In [None]:
# Classifier accuracy per subset (True: also print the accuracies)
vae_accuracy = get_accuracy(vae_subsets, 'vae', True)

In [None]:
# Store the results of the VAE method
results_dict['vae'] = {'score': vae_score, 'accuracy': vae_accuracy}

## Comparison of results

In [None]:
from tabulate import tabulate

# Convert accuracy values to percentages in a separate dictionary: results_dict keeps
# the raw fractions, so re-running this cell does not multiply the values by 100 again.
accuracy_percent = {
    key: [round(acc * 100, 2) for acc in value["accuracy"]]
    for key, value in results_dict.items()
}

# Create a table with one row per subset and one column per method
row_count = min((len(values) for values in accuracy_percent.values()), default=0)
table = [[""] + list(accuracy_percent.keys())]
for i in range(row_count):
    table.append([i+1] + [accuracy_percent[key][i] for key in accuracy_percent])

# Print the table
print(tabulate(table, headers="firstrow", tablefmt="grid"))