This notebook is used to perform a validation of methods used for multiple missing data imputation on a Cardiovascular Disease dataset.

In [27]:
import pandas as pd
import numpy as np
import copy
import os
import sys

# Add the parent directory to the system path
module_path = os.path.abspath(os.getcwd() + '\\..')
if module_path not in sys.path:
    sys.path.append(module_path)

RANDOM_STATE = 404

In [28]:
df = pd.read_csv('../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [29]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [30]:
# Create an empty dictionary to store the results after each method
results_dict = {}

# Define the number of samples and the fraction of data to use
default_number_of_samples = 100
fraction_of_data = 0.01

## Data manipulation

#### Standardization

In [31]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Select columns to be scaled
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_columns = ['gluc', 'smoke', 'alco', 'active', 'cholesterol', 'gender']

# Initialize the scalers
standard_scaler = StandardScaler()

# Fit and transform your data
X[numeric_columns] = standard_scaler.fit_transform(X[numeric_columns])

X.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.436062,2,0.443452,-0.847873,-0.122182,-0.088238,1,1,0,0,1
1,0.307686,1,-1.018168,0.749831,0.07261,-0.03518,3,1,0,0,1
2,-0.247997,1,0.078047,-0.708942,0.007679,-0.141297,3,1,0,0,0
3,-0.748152,2,0.565254,0.541435,0.137541,0.017879,1,1,0,0,1
4,-0.808543,1,-1.018168,-1.264666,-0.187113,-0.194356,1,1,0,0,0


In [32]:
X.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,5.272227e-16,1.349571,1.450116e-15,-2.905105e-16,7.623108000000001e-17,1.7459050000000003e-17,1.366871,1.226457,0.088129,0.053771,0.803729
std,1.000007,0.476838,1.000007,1.000007,1.000007,1.000007,0.68025,0.57227,0.283484,0.225568,0.397179
min,-3.514407,1.0,-13.32014,-4.460075,-1.810381,-0.8841161,1.0,1.0,0.0,0.0,0.0
25%,-0.7315341,1.0,-0.652763,-0.639477,-0.05725127,-0.0882385,1.0,1.0,0.0,0.0,1.0
50%,0.09489744,1.0,0.07804703,-0.1532192,-0.05725127,-0.0882385,1.0,1.0,0.0,0.0,1.0
75%,0.7531244,2.0,0.6870554,0.5414349,0.07261016,-0.03517999,2.0,1.0,0.0,0.0,1.0
max,1.720199,2.0,10.43119,8.738353,103.1826,57.85165,3.0,3.0,1.0,1.0,1.0


#### Visualization

In [33]:
import matplotlib.pyplot as plt

# Check if the file exists
file_path = "../images/without_missingness/density_plots.png"
if not os.path.isfile(file_path):
    # Set the size of the figure
    plt.figure(figsize=(15, 8))

    # Loop through each column in X and generate a density plot
    for i, feature_name in enumerate(X.columns):
        # Set the subplot and plot the density of the column
        plt.subplot(4, 4, i + 1)  # 4x4 grid, current subplot index
        X[feature_name].plot(kind='density', color='blue', label=feature_name)
        plt.title(feature_name)
        plt.xlabel('Scaled Value')
        plt.ylabel('Density')

    # Adjust the layout of the subplots
    plt.tight_layout()

    # Save the figure as an image
    plt.savefig(file_path)

    # Display the figure
    plt.show()

In [34]:
import seaborn as sns

# Check if the file exists
file_path = "../images/without_missingness/correlation_matrix.png"
if not os.path.isfile(file_path):
    # Set the size of the figure
    plt.figure(figsize=(10, 8))

    # Draw correlation matrix
    sns.heatmap(X.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

    # Show the figure
    plt.title('Correlation Matrix')

    # Save the figure as an image
    plt.savefig(file_path)

    # Display the figure
    plt.show()

#### Splitting

In [35]:
from sklearn.model_selection import train_test_split

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train.shape, X_test.shape

((56000, 11), (14000, 11))

## Preparing 10 subsets with removed features

In [36]:
def remove_features(num_features_to_remove=None, feature_indices_to_remove=None):
    """
    Randomly removes features from a subset of data and replaces their values with NaN.
    
    Parameters:
        num_features_to_remove (int): Number of features to remove randomly.
        feature_indices_to_remove (array-like): Indices of features to remove.
        
    Returns:
        pandas.DataFrame: Subset of data with removed features and NaN values.
    """
    # Sample a subset of data
    subset = X_test.sample(frac=fraction_of_data, random_state=RANDOM_STATE)
    
    # Determine features to remove based on number or indices provided
    if feature_indices_to_remove is None:
        if num_features_to_remove is None:
            num_features_to_remove = np.random.randint(1, min(5, len(X_test.columns) - 1))
        else:
            features_to_remove = np.random.choice(subset.columns[:-1], num_features_to_remove, replace=False)
    else:
        features_to_remove = subset.columns[feature_indices_to_remove]
    
    # Replace values of selected features with NaN
    for feature in features_to_remove:
        subset[feature] = np.NaN
    
    return subset.astype('object')

list_of_subsets = []
subset_without_changes = X_test.sample(frac=fraction_of_data, random_state=RANDOM_STATE)

# Generate subsets with varying numbers of removed features
list_of_subsets.append(remove_features(1, [4]))

for _ in range(2):
    list_of_subsets.append(remove_features(1))

for _ in range(3):
    list_of_subsets.append(remove_features(2))

for _ in range(3):
    list_of_subsets.append(remove_features(3))

for _ in range(1):
    list_of_subsets.append(remove_features(4))

# Print information about subsets and their missing columns
print(f'Subsets with {list_of_subsets[0].shape[0]} datapoints and their columns with missing values:')
for subset_index, current_row in enumerate(list_of_subsets):
    nan_columns = current_row.columns[current_row.isnull().all()]
    print(f"Subset {subset_index+1}: {', '.join(nan_columns)}")

Subsets with 140 datapoints and their columns with missing values:
Subset 1: ap_hi
Subset 2: gluc
Subset 3: gender
Subset 4: ap_lo, gluc
Subset 5: gluc, smoke
Subset 6: weight, smoke
Subset 7: weight, cholesterol, gluc
Subset 8: age, gender, ap_lo
Subset 9: ap_hi, gluc, smoke
Subset 10: weight, ap_hi, cholesterol, alco


## Shared functions

### Imputation

##### Simple Imputer

In [37]:
from sklearn.impute import SimpleImputer

def simple_impute(current_subset):
    """
    Impute missing values using SimpleImputer with mean strategy.

    Parameters:
        current_subset (pandas.DataFrame): Subset of data with missing values.

    This function iterates over each column in the given DataFrame and imputes missing values using
    the SimpleImputer class from scikit-learn. The imputer is initialized with the 'mean' strategy,
    which replaces missing values with the mean of the non-missing values in the column.
    """

    # Create a SimpleImputer object with 'mean' strategy
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

    # Iterate over each column in the DataFrame
    for col in current_subset.columns:
        # Check if any values in the column are missing
        if pd.isna(current_subset[col]).any():
            # Fit the imputer to the non-missing values in the column
            imp.fit(subset_without_changes[[col]])
            # Transform the missing values in the column
            current_subset[col] = imp.transform(current_subset[[col]])
            
            # if col in categorical_columns:
            #     # Approximate categorical values to the nearest whole number
            #     current_subset[col] = np.round(current_subset[col])

#### Multivariate normal distribution & cGMM

In [38]:
from helpers.ConditionalGMM.condGMM import CondGMM

def cgmm_impute(gmm, missing_features_indices, current_row, number_of_samples):
    """
    Impute missing values using Conditional GMM, returning parameters of the predictive distribution.
    
    Parameters:
        gmm (GMM): Gaussian Mixture Model.
        missing_features_indices (list): Indices of missing features.
        current_row (pandas.Series): Current row with missing values.
        number_of_samples (int): Number of samples to generate.
        
    Returns:
        dict: A dictionary containing samples, means ('mu'), and covariances ('sigma').
    """
    # Find indices of known features
    known_features_indices = [i for i in range(len(current_row)) if i not in missing_features_indices]
    
    # Extract values of known features for the given row
    known_features_values = current_row.iloc[known_features_indices].values
    
    # Initialize CondGMM
    cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indices)
    
    # Generate samples using Conditional GMM
    generated_samples = cGMM.rvs(known_features_values, size=number_of_samples, random_state=RANDOM_STATE)
    
    # Make sure that the generated samples do not contain NaN values, if it does then change it's value to 0
    generated_samples[np.isnan(generated_samples)] = 0
    
    # Extract mean and covariance for the conditional distribution
    mus = cGMM.conditional_component_means(known_features_values)
    c_weights = cGMM.conditional_weights(known_features_values)
    c_weights = c_weights[:, np.newaxis]  # Ensure weights are aligned for broadcasting
    mu = np.sum(c_weights * mus, axis=0)  # Weighted sum across the correct axis
    sigma = cGMM.conditional_component_covs()

    return {
        "samples": generated_samples,
        "mu": mu,
        "sigma": sigma
    }

#### VAEAC & GAIN

In [39]:
def vaeac_gain_impute(model, missing_features_indices, current_row, number_of_samples):
    """
    Impute missing values using a Variational Autoencoder or Generative Adversarial Imputation Network.

    Args:
        model (keras.Model): Trained Variational Autoencoder or GAIN model.
        missing_features_indices (list): Indices of missing features.
        current_row (pandas.Series): Current row with missing values.
        number_of_samples (int): Number of samples to generate.

    Returns:
        list: Generated data with imputed missing values.
    """
    generated_samples = np.empty((number_of_samples, len(missing_features_indices)))
    
    # Repeat the prediction process for the specified number of samples
    for i in range(number_of_samples):
        # Impute missing values for each feature index
        # Reshape the missing values to a 2D array with one row and all missing features
        missing_features_values = model.predict(current_row.values.reshape(1, -1).astype(np.float32), verbose=0)
        
        # Store the generated data in the array
        generated_samples[i] = missing_features_values[:, missing_features_indices]
    
    return {
        "samples": generated_samples
    }

#### Impute multiple missing data function

In [40]:
import json

def imputing_missing_data(subsets, method='simple', number_of_samples=default_number_of_samples, model=None):
    """
    Impute missing data in subsets using different imputation methods.
    
    Parameters:
        subsets (list): List of subsets of data.
        method (str): Imputation method ('simple', 'multivariate', 'cgmm', 'vaeac', or 'gain').
        model: Trained model for certain imputation methods.
    """
    for subset_index, subset in enumerate(subsets):
        if method == 'simple':
            # Simple Imputer
            generated_data = simple_impute(subset)
        else:
            # Initialize to keep track of actual row index, because indices were shuffled
            row_in_subset_index = 0
            
            for row_index, row in subset.iterrows():
                # Get indices of unknown features
                missing_features_indices = [row.index.get_loc(col) for col in row.index if pd.isna(row[col])]
                
                # If all features are known, continue   
                if len(missing_features_indices) == 0:
                    continue
                
                generated_data = None
                
                if method == 'multivariate' or method == 'cgmm':
                    # Multivariate Imputer or Conditional GMM
                    generated_data = cgmm_impute(model, missing_features_indices, row, number_of_samples)
                elif method == 'vaeac' or method == 'gain':
                    # Variational AutoEncoder or Generative Adversarial Imputation Network
                    generated_data = vaeac_gain_impute(model, missing_features_indices, row, number_of_samples)
                    
                # Update unknown features with sampled data
                for feature_index in range(len(missing_features_indices)):
                    # Check if generated_data is a dictionary
                    if isinstance(generated_data, dict):
                        if 'mu' in generated_data and 'sigma' in generated_data:
                            # Convert mu and sigma to lists if they are numpy arrays
                            mu = generated_data['mu'].tolist() if isinstance(generated_data['mu'], np.ndarray) else generated_data['mu']
                            sigma = generated_data['sigma'].tolist() if isinstance(generated_data['sigma'], np.ndarray) else generated_data['sigma']
                            samples = [sample[feature_index] for sample in generated_data['samples']]
                            
                            data_to_insert = json.dumps({
                                "samples": samples,
                                "mu": mu,
                                "sigma": sigma
                            })
                        else:
                            samples = [sample[feature_index] for sample in generated_data['samples']]
                            data_to_insert = json.dumps({
                                "samples": samples
                            })
                        
                        subset.at[row_index, subset.columns[missing_features_indices[feature_index]]] = data_to_insert
                
                row_in_subset_index += 1

### Scoring

In [41]:
from scipy.stats import norm
from properscoring import crps_ensemble

def get_scoring(subsets, method='simple', print_results=False):
    """
    Calculate scores (NMSE, Log Score, and CRPS) for features in subsets of data.
    
    Parameters:
        subsets (list): List of subsets of data.
        method (str): Imputation method ('simple', 'multivariate', 'cgmm', 'vaeac', or 'gain').
        print_results (bool): Whether to print scores or not.
        
    Returns:
        dict: Dictionary containing scores for each feature in the subsets organized by score type.
    """
    method = method.lower()

    # Deserialize any strings in subsets
    for subset in subsets:
        for col in subset.columns:
            subset[col] = subset[col].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
    
    all_subsets_scores = {}

    # TODO: confirm that the order of missing features is correct
    # Iterate through subsets
    for subset_index, subset in enumerate(subsets):
        feature_scores = {}  # Dictionary for each type of score per feature
        
        # Identify features with missing values
        if method == 'simple':
            missing_features_indices = [col_index for col_index, col in enumerate(list_of_subsets[subset_index].columns) if list_of_subsets[subset_index][col].isnull().all()]
        else:
            missing_features_indices = [col_index for col_index, col in enumerate(subset.columns) if subset[col].apply(lambda x: isinstance(x, (list, dict))).any()]

        if not missing_features_indices:
            continue  # Skip if no missing values
        
        for row_index, row in subset.iterrows():
            original_values = X.iloc[row_index, missing_features_indices].values
            
            for feature_index, col_index in enumerate(missing_features_indices):
                feature_name = subset.columns[col_index]
                generated_samples = row.iloc[col_index]
                original_value = original_values[feature_index]
                
                # Initialize dictionary only with NMSE, others will be added as needed
                if feature_name not in feature_scores:
                    feature_scores[feature_name] = {'nmse': []}
                
                if not isinstance(generated_samples, dict):
                    generated_samples = {'samples': [generated_samples]}

                if 'mu' in generated_samples and 'sigma' in generated_samples:
                    epsilon = 1e-10
                    
                    # Read mean and covariance
                    mu = np.mean(generated_samples['mu']) if isinstance(generated_samples['mu'], (list, np.ndarray)) else generated_samples['mu']
                    sigma = np.mean(generated_samples['sigma']) if isinstance(generated_samples['sigma'], (list, np.ndarray)) else generated_samples['sigma']
                    sigma = max(sigma, epsilon)  # Ensure sigma is positive
                    
                    # Log score calculation
                    log_score = -np.log(norm.pdf(original_value, loc=mu, scale=sigma) + epsilon)
                    feature_scores[feature_name].setdefault('log_score', []).append(log_score)
                
                elif 'samples' in generated_samples:
                    # CRPS for ensemble predictions
                    samples_size = len(generated_samples['samples'])
                    original_value_array = np.repeat(original_value, samples_size)
                    crps_score = crps_ensemble(original_value_array, generated_samples['samples'])
                    feature_scores[feature_name].setdefault('crps', []).append(crps_score)
                
                # MSE calculation
                squared_errors = [(original_value - x)**2 for x in generated_samples['samples']]
                feature_scores[feature_name]['nmse'].append(squared_errors)
        
        # Average scores for each feature
        for feature_name, scores in feature_scores.items():
            if feature_name == 'ap_hi':
                print("ap_hi")
            for score_type, values in scores.items():
                if values:
                    mean_score = np.mean(values)
                    
                    if score_type == 'nmse':
                        variance = np.var(subset_without_changes[feature_name])
                        
                        if variance == 0:
                            if mean_score != 0:
                                raise Exception("Mean Squared Error cannot be different than 0 when variance is equal to 0!")
                            else:
                                mean_score = 0
                        else:
                            mean_score = mean_score / variance
                            
                    feature_scores[feature_name][score_type] = np.round(mean_score, 3)
        
        all_subsets_scores[subset_index] = feature_scores
        
        # Print scores if required
        if print_results:
            print(f"Scores for Subset {subset_index + 1}:")
            for feature_name, scores in feature_scores.items():
                print(f"Feature {feature_name}: ", end="")
                for score_type, score_value in scores.items():
                    print(f"{score_type.upper()} = {score_value}, ", end="")
                print()  # New line for each feature
            
    return all_subsets_scores


### Classification

In [42]:
from joblib import load
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings

warnings.filterwarnings('ignore', message="X does not have valid feature names")

# Load the trained classifier model
classifier = load('..\helpers\predictive_models\cardio_classifier.h5')

def get_classification_result(subsets, method='simple', should_print=False):
    """
    Calculate AUC scores for subsets of data using a trained classifier.
    
    Parameters:
        subsets (list): List of subsets of data.
        method (str): Imputation method ('simple', 'multivariate', 'cgmm', 'vaeac', or 'gain').
        should_print (bool): Whether to print AUC scores or not.
        
    Returns:
        list: List of AUC scores for each subset.
    """
    method = method.lower()
    
    # Deserialize any strings in subsets
    for subset in subsets:
        for col_index in subset.columns:
            subset[col_index] = subset[col_index].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
    
    classification_results = []  # Initialize list to store classification results
    accuracy_per_subset = []  # Initialize list to store accuracy scores
    auc_per_subset = []  # Initialize list to store AUC scores

    # Iterate through subsets
    for subset_index, subset in enumerate(subsets):
        subset_results = []  # Initialize list to store results for the current subset
        
        # Iterate through rows in the subset DataFrame
        for row_index, row in subset.iterrows():
            output_probs = []  # Initialize list to store results for the current row
            
            # Process each row based on the method used
            if method != 'simple':
                serialized_arrays = []
                non_serialized_values = []
                
                # Split row values into serialized arrays and non-serialized values
                for col_index, value in enumerate(row):
                    if isinstance(value, dict):
                        serialized_arrays.append((col_index, value['samples']))
                    else:
                        non_serialized_values.append((col_index, value))
                
                # Generate combined rows by combining serialized arrays with non-serialized values
                for i in range(default_number_of_samples):
                    combined_row = non_serialized_values.copy()
                    
                    for col_index, serialized_array in serialized_arrays:
                        assert len(serialized_array) == default_number_of_samples
                        combined_row.append((col_index, serialized_array[i]))
                    
                    # TODO: confirm that combined_row_array comes out in proper order
                    combined_row_array = np.zeros(shape=len(combined_row))
                    for col_index, value in combined_row:
                        combined_row_array[col_index] = value
                    
                    output_probs.append(combined_row_array)
            else:
                output_probs.append(row.values.tolist())
            
            predicted_probs = classifier.predict_proba(np.vstack(output_probs))
                
            subset_results.append(predicted_probs)
        
        classification_results.append(subset_results)

    # Create an empty list to store dictionaries of results
    results_list = []

    # Calculate AUC scores and accuracy for each subset
    for subset_index, subset_results in enumerate(classification_results):
        true_labels = y.loc[subsets[subset_index].index]
        
        subset_predicted_probs = []  # Initialize list to store predicted probabilities for the subset
        
        # Determine predicted probabilities for each row in the subset
        for output_probs in subset_results:
            predicted_prob = np.mean(output_probs[:, 1])  # Assuming the second column contains probabilities of the positive class
            subset_predicted_probs.append(predicted_prob)
        
        # Convert probabilities to binary predictions based on the threshold
        subset_predictions = [1 if prob > 0.5 else 0 for prob in subset_predicted_probs]
        
        subset_accuracy = accuracy_score(true_labels, subset_predictions)  # Calculate accuracy score for the subset
        subset_auc = roc_auc_score(true_labels, subset_predicted_probs)  # Calculate AUC score for the subset
        
        accuracy_per_subset.append(round(subset_accuracy, 2))  # Append accuracy score to the list
        auc_per_subset.append(round(subset_auc, 2))  # Append AUC score to the list
        
        # Append results to the list of dictionaries
        results_list.append({'Accuracy': np.round(subset_accuracy * 100, 2), 'AUC': np.round(subset_auc, 3)})

    # Convert the list of dictionaries to a DataFrame
    results_table = pd.DataFrame(results_list)

    # Print classification scores in a table
    if should_print:
        print(results_table)
        
    return accuracy_per_subset, auc_per_subset  # Return list of AUC scores for each subset

## SimpleImputer with mean strategy

#### Preparation

In [43]:
imputer_subsets = copy.deepcopy(list_of_subsets)

#### Imputation

In [44]:
imputing_missing_data(imputer_subsets, 'simple')

imputer_subsets[0].head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
2258,1.164922,1,-1.87078,-1.125735,-0.017783,-0.088238,2,3,0,0,1
24996,-1.175756,1,-0.530961,0.194108,-0.017783,-0.03518,1,1,0,0,0
53089,0.360782,2,0.930659,-0.014288,-0.017783,-0.03518,1,1,0,0,1
17450,-1.643486,1,-0.40916,1.444485,-0.017783,-0.141297,1,1,0,1,1
24702,1.599416,1,-0.530961,2.555932,-0.017783,-0.03518,1,1,0,0,1


#### Scoring

In [45]:
simple_imputer_score = get_scoring(imputer_subsets, 'simple')

ap_hi
ap_hi
ap_hi


#### Classification

In [46]:
simple_imputer_accuracy, simple_imputer_auc = get_classification_result(imputer_subsets, 'simple', True)

   Accuracy    AUC
0     74.29  0.762
1     76.43  0.824
2     75.71  0.833
3     73.57  0.828
4     76.43  0.818
5     76.43  0.812
6     77.86  0.820
7     74.29  0.802
8     70.71  0.740
9     62.86  0.705


In [47]:
results_dict['simple_imputer'] = {'score': simple_imputer_score, 'accuracy': simple_imputer_accuracy, 'auc': simple_imputer_auc}

## Multivariate normal distribution

#### Preparation

In [48]:
multivariate_subsets = copy.deepcopy(list_of_subsets)

In [49]:
from sklearn.mixture import GaussianMixture

# Create Gaussian Mixture Model with a single component
gmm = GaussianMixture(n_components=1, random_state=RANDOM_STATE)
gmm.fit(X_train)

#### Imputation

In [50]:
imputing_missing_data(multivariate_subsets, 'multivariate', default_number_of_samples, gmm)

multivariate_subsets[0].head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
2258,1.164922,1,-1.87078,-1.125735,"{""samples"": [0.8396580083006695, -1.9221773806...",-0.088238,2,3,0,0,1
24996,-1.175756,1,-0.530961,0.194108,"{""samples"": [0.8142443832080619, -1.9475910057...",-0.03518,1,1,0,0,0
53089,0.360782,2,0.930659,-0.014288,"{""samples"": [0.8405931506134794, -1.9212422383...",-0.03518,1,1,0,0,1
17450,-1.643486,1,-0.40916,1.444485,"{""samples"": [0.840238835059227, -1.92159655390...",-0.141297,1,1,0,1,1
24702,1.599416,1,-0.530961,2.555932,"{""samples"": [0.9206615919666465, -1.8411737970...",-0.03518,1,1,0,0,1


#### Scoring

In [52]:
multivariate_score = get_scoring(multivariate_subsets, 'multivariate', True)

ap_hi
Scores for Subset 1:
Feature ap_hi: NMSE = 147.863, LOG_SCORE = 1.138, 
Scores for Subset 2:
Feature gluc: NMSE = 1.397, LOG_SCORE = 1.406, 
Scores for Subset 3:
Feature gender: NMSE = 1.552, LOG_SCORE = 2.324, 
Scores for Subset 4:
Feature ap_lo: NMSE = 3.005, LOG_SCORE = 2.928, 
Feature gluc: NMSE = 1.279, LOG_SCORE = 3.493, 
Scores for Subset 5:
Feature gluc: NMSE = 1.3, LOG_SCORE = 12.414, 
Feature smoke: NMSE = 1.72, LOG_SCORE = 20.818, 
Scores for Subset 6:
Feature weight: NMSE = 1.981, LOG_SCORE = 5.41, 
Feature smoke: NMSE = 1.717, LOG_SCORE = 0.137, 
Scores for Subset 7:
Feature weight: NMSE = 1.819, LOG_SCORE = 9.639, 
Feature cholesterol: NMSE = 1.889, LOG_SCORE = 4.131, 
Feature gluc: NMSE = 1.85, LOG_SCORE = 2.866, 
Scores for Subset 8:
Feature age: NMSE = 1.766, LOG_SCORE = 7.524, 
Feature gender: NMSE = 1.573, LOG_SCORE = 7.904, 
Feature ap_lo: NMSE = 2.889, LOG_SCORE = 2.521, 
ap_hi


KeyboardInterrupt: 

#### Classification

In [None]:
multivariate_accuracy, multivariate_auc = get_classification_result(multivariate_subsets, 'multivariate', True)

In [None]:
results_dict['multivariate'] = {'score': multivariate_score, 'accuracy': multivariate_accuracy, 'auc': multivariate_auc}

## Conditional GMM

#### Preparation

In [None]:
cgmm_subsets = copy.deepcopy(list_of_subsets)

In [None]:
import matplotlib.pyplot as plt

def compute_bic(data, n_components_range):
    """
    Computes the Bayesian Information Criterion (BIC) for Gaussian Mixture Models
    with different numbers of components.

    Parameters:
        data (array-like): Input data.
        n_components_range (range): Range of number of components to evaluate.

    Returns:
        list: BIC values for each number of components.
    """
    # List to store BIC values
    bic = []
    
    # Loop through number of components and compute BIC for each
    for n_components in n_components_range:
        # Create Gaussian Mixture Model with specified number of components
        gmm = GaussianMixture(n_components=n_components, random_state=RANDOM_STATE)
        gmm.fit(data)  # Fit the model to the data
        bic.append(gmm.bic(data))  # Calculate BIC and add to list
    
    return bic  # Return list of BIC values

# Used to simplify getting optimal number of components based on previous run
optimal_n_components = 46

if optimal_n_components is None:
    # Range of number of components to evaluate
    n_components_range = range(1, 51)

    # Compute BIC values
    bic_values = compute_bic(X_train, n_components_range)

    # Optimal number of components
    optimal_n_components = n_components_range[np.argmin(bic_values)]

    # Plotting BIC values
    plt.plot(n_components_range, bic_values, marker='o', label='BIC Values')
    plt.xlabel('Number of Components')
    plt.ylabel('BIC Value')
    plt.title('BIC for Gaussian Mixture Models')
    plt.grid(True)

    # Add legend
    plt.legend()
    plt.savefig('../images/without_missingness/BIC.png')
    plt.show()

In [None]:
# Create Gaussian Mixture Model with optimal number of components
gmm = GaussianMixture(n_components=optimal_n_components, random_state=RANDOM_STATE)
gmm.fit(X_train)

#### Imputation

In [None]:
imputing_missing_data(cgmm_subsets, 'cgmm', default_number_of_samples, gmm)

cgmm_subsets[0].head(5)

#### Scoring

In [None]:
cgmm_score = get_scoring(cgmm_subsets, 'cgmm')

#### Classification

In [None]:
cgmm_accuracy, cgmm_auc = get_classification_result(cgmm_subsets, 'cgmm', True)

In [None]:
results_dict['cgmm'] = {'score': cgmm_score, 'accuracy': cgmm_accuracy, 'auc': cgmm_auc}

## Variational Autoencoder with Arbitrary Conditioning

#### Preparation

In [None]:
vaeac_subsets = copy.deepcopy(list_of_subsets)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Perform Singular Value Decomposition (SVD) on training data
svd = TruncatedSVD(n_components=min(X_train.shape), random_state=RANDOM_STATE)
svd.fit(X_train)

# Calculate cumulative explained variance
explained_variance_ratio = svd.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# Choose threshold to preserve 90% of total variance
threshold_index = np.argmax(cumulative_variance_ratio >= 0.90)
threshold = svd.singular_values_[threshold_index]

print(f"Starting threshold to preserve 90% of total variance: {threshold}")

# Analyze singular values
singular_values = svd.singular_values_
num_non_trivial = np.sum(singular_values > threshold)  # Choose a threshold to determine non-trivial singular values

# Select latent space dimensionality
latent_dim = num_non_trivial

print(f"Number of non-trivial singular values: {num_non_trivial}")
print(f"Selected latent space dimensionality: {latent_dim}")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model

input_dim = X_train.shape[1]
inputs = Input(shape=(input_dim,))
encoded = inputs
encoded = Dense(128, activation='relu')(encoded)
encoded = Dense(64, activation='relu')(encoded)
z_mean = Dense(latent_dim)(encoded)
z_log_var = Dense(latent_dim)(encoded)

# Reparameterization trick
def sampling(args):
    z_mean, z_log_var = args
    epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], latent_dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling)([z_mean, z_log_var])

# Define the decoder
decoded = z
decoded = Dense(64, activation='relu')(decoded)
decoded = Dense(128, activation='relu')(decoded)
outputs = Dense(input_dim)(decoded)

# Create the VAE model
vaeac = Model(inputs, outputs)

# Compile the model
vaeac.compile(optimizer='adam', loss='mse')  # Use MSE as the reconstruction loss

# Train the model
history = vaeac.fit(X_train, X_train, epochs=10, batch_size=32, verbose=1)

#### Imputation

In [None]:
imputing_missing_data(vaeac_subsets, 'vaeac', default_number_of_samples, vaeac)

vaeac_subsets[0].head(5)

#### Scoring

In [None]:
vaeac_score = get_scoring(vaeac_subsets, 'vaeac')

#### Classification

In [None]:
vaeac_accuracy, vaeac_auc = get_classification_result(vaeac_subsets, 'vaeac', True)

In [None]:
results_dict['vaeac'] = {'score': vaeac_score, 'accuracy': vaeac_accuracy, 'auc': vaeac_auc}

## Generative Adversarial Imputation Network

#### Preparation

In [None]:
gain_subsets = copy.deepcopy(list_of_subsets)

In [None]:
gain = load('..\helpers\generative_models\cardio_gain_generator.h5')

#### Imputation

In [None]:
imputing_missing_data(gain_subsets, 'gain', default_number_of_samples, gain)

gain_subsets[0].head(5)

#### Scoring

In [None]:
gain_score = get_scoring(gain_subsets, 'gain')

#### Classification

In [None]:
gain_accuracy, gain_auc = get_classification_result(gain_subsets, 'gain', True)

In [None]:
results_dict['gain'] = {'score': gain_score, 'accuracy': gain_accuracy, 'auc': gain_auc}

## Comparison of results

In [None]:
from tabulate import tabulate

# Define the directory where results are stored
results_directory = '..\\results\\without_missingness'

# Get the list of existing result files to determine the next run number
existing_files = os.listdir(results_directory)
run_numbers = [int(file.split("_")[1].split(".")[0]) for file in existing_files if file.startswith("run_")]

# Determine the next run number
next_run_number = max(run_numbers, default=0) + 1

# Create tables for accuracy, AUC, and scores
accuracy_table = [[""] + list(results_dict.keys())]
auc_table = [[""] + list(results_dict.keys())]
score_table = [[""] + list(results_dict.keys())]

for i in range(len(next(iter(results_dict.values()))["accuracy"])):
    accuracy_row = [i+1] + [results_dict[key]["accuracy"][i] for key in results_dict.keys()]
    auc_row = [i+1] + [results_dict[key]["auc"][i] for key in results_dict.keys()]
    score_row = [i+1] + [results_dict[key]["score"].get(i, "") for key in results_dict.keys()]
    accuracy_table.append(accuracy_row)
    auc_table.append(auc_row)
    score_table.append(score_row)

# Generate the tabulated strings for accuracy, AUC, and scores
tabulated_accuracy_table = tabulate(accuracy_table, headers="firstrow", tablefmt="grid")
tabulated_auc_table = tabulate(auc_table, headers="firstrow", tablefmt="grid")
tabulated_score_table = tabulate(score_table, headers="firstrow", tablefmt="grid")

# Define the file name for the new result
new_file_name = f"run_{next_run_number}.txt"
file_path = os.path.join(results_directory, new_file_name)

# Save accuracy, AUC, and score tables to the same file with separation
with open(file_path, "w") as file:
    file.write("Number of datapoints: " + str(X_train.shape[0] * fraction_of_data) + "\n" + "Number of samples: " + str(default_number_of_samples) + "\n\n")
    file.write("Accuracy:\n")
    file.write(tabulated_accuracy_table + "\n\n")
    file.write("AUC:\n")
    file.write(tabulated_auc_table + "\n\n")
    file.write("Scores:\n")
    file.write(tabulated_score_table)
