In [None]:
import pandas as pd
import numpy as np
import os

RANDOM_STATE = 404

In [None]:
# Read the raw HAD table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/HAD.csv')
df.head(n=5)

In [None]:
# Name of the outcome column that is split off from the feature table.
target_feature = 'MRS_90'

# Select with [[...]] rather than .filter(): a missing column then raises a KeyError
# instead of silently yielding an empty frame, so re-running this cell after the
# column was dropped can no longer overwrite the saved target file with nothing.
y = df[[target_feature]]
y.to_csv('data/HAD_target.csv', index=False)

# Remove the target and the related 'MRS_90_DICHO' column so df holds only the remaining columns.
df = df.drop(columns=[target_feature, 'MRS_90_DICHO'])

## Data standardization

In [None]:
from dill import load as dill_load

# os.path.join keeps the path valid on every OS; a hard-coded backslash separator only
# resolves on Windows and "\s" is an invalid escape sequence inside a normal string.
scalerFile = os.path.join("classifier", "scaler.pkl")

# NOTE: unpickling with dill can execute arbitrary code - only load a scaler file
# that comes from a trusted source.
with open(scalerFile, "rb") as f:
    scaler = dill_load(f)

# Hand the features to the scaler as a float array, then restore the column labels.
df_scaled = scaler.preprocess_clinical_data(np.asarray(df, dtype=float))
X = pd.DataFrame(df_scaled, columns=df.columns)
X

## Generating missing data based on sklearn imputers

In [None]:
# 'SERUM_GLUCOSE' and 'VALV_HEART' contain nothing but the '-1' placeholder, which made the
# imputer emit a warning, so they (and the glucose missingness flag) are dropped up front.
uninformative_columns = ['SERUM_GLUCOSE', 'SERUM_GLUCOSE_MISSING', 'VALV_HEART']
X = X.drop(columns=uninformative_columns)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Work on a copy so that X keeps its -1 placeholders and *_MISSING flags.
imputer_df = X.copy()

# -1 is the missing-value placeholder; the initial fill uses the column mean.
# NOTE(review): the imputer is fitted on one column at a time below, so it never sees
# any other feature - confirm whether fitting on the whole frame was intended.
imp = IterativeImputer(missing_values=-1, initial_strategy='mean', random_state=RANDOM_STATE)

# Flag columns are never imputed themselves, only the value columns are visited.
value_columns = [name for name in imputer_df.columns if not name.endswith('_MISSING')]

for col in value_columns:
    flag_col = col + "_MISSING"
    has_flag_col = flag_col in X.columns
    has_placeholder = (imputer_df[col] == -1).any()

    # With a flag column, impute only if some row is flagged AND a -1 is present;
    # without one, the presence of a -1 alone triggers imputation.
    if has_flag_col:
        needs_imputation = (imputer_df[flag_col] == 1).any() and has_placeholder
    else:
        needs_imputation = has_placeholder
    if not needs_imputation:
        continue

    # Fit on this single column and write the transformed values back.
    single_column = imputer_df[[col]]
    imp.fit(single_column)
    imputer_df[col] = imp.transform(single_column).ravel()

    if has_flag_col:
        # The column has been imputed, so its missingness flag is reset.
        imputer_df[flag_col] = 0

# Discard every column whose name contains 'MISSING'.
flag_columns = list(imputer_df.filter(regex='MISSING'))
imputer_df = imputer_df[imputer_df.columns.drop(flag_columns)]

imputer_df

### Using BIC to get the optimal number of components for GMM

In [None]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """
    Fit one Gaussian Mixture Model per candidate size and collect the BIC scores.

    Parameters:
        X (array-like): Data every model is fitted to and scored on.
        n_components_range (iterable of int): Candidate numbers of mixture components.

    Returns:
        list: One BIC value per candidate, in the order the candidates were given.
    """
    def _bic_for(n_components):
        # Each candidate gets a fresh model seeded with the notebook-wide RANDOM_STATE.
        model = GaussianMixture(n_components=n_components, random_state=RANDOM_STATE)
        return model.fit(X).bic(X)

    return [_bic_for(n_components) for n_components in n_components_range]

# Leave as None to search for the component count via BIC; the search below is
# skipped whenever a value is already set here.
optimal_n_components = None

if optimal_n_components is None:
    # Candidates: 1 to 50 components.
    n_components_range = range(1, 51)
    bic_values = compute_bic(imputer_df, n_components_range)

    # The candidate with the smallest BIC is kept.
    best_position = np.argmin(bic_values)
    optimal_n_components = n_components_range[best_position]

    # BIC curve over all candidates
    plt.plot(n_components_range, bic_values, marker='o')
    plt.title('BIC for Gaussian Mixture Models')
    plt.xlabel('Number of Components')
    plt.ylabel('BIC Value')
    plt.grid(True)
    plt.show()


### Gaussian Mixture Model fitting

In [None]:
# Final mixture model: the selected component count, fitted on the imputed table.
# fit() returns the estimator itself, so construction and fitting are chained.
gmm = GaussianMixture(n_components=optimal_n_components, random_state=RANDOM_STATE).fit(imputer_df)
gmm

### *Optional:* Set all values of a specific feature to -1 in order to display its distribution on the heatmap below

In [None]:
# Saving the values of a feature to mark them in the heatmap, e.g. CTA_CS
heatmap_feature = 'SYS_BLOOD_PRESSURE'
heatmap_feature_original_values = pd.Series(X[heatmap_feature].copy().values, index=X.index)

# Get maximum and minimum values from the column
heatmap_feature_max_value = X[heatmap_feature].max()
heatmap_feature_min_value = X[heatmap_feature].min()

X[heatmap_feature] = -1
# Check if the column exists
if heatmap_feature + '_MISSING' in X.columns:
    X[heatmap_feature + '_MISSING'] = 1
    
X

## Generating samples of missing values using conditional GMM

In [None]:
from ConditionalGMM.condGMM import CondGMM
import json

def gmm_generate_samples(n_datapoints=2, n_samples=1000, plot_histograms=False, print_index=False):
    """
    Generate samples using Conditional Gaussian Mixture Model for imputing missing data.

    Works on the first `n_datapoints` rows of the global frame X. For every row with
    unknown features, a CondGMM is built from the parameters of the global fitted `gmm`
    and the positions of the row's known features, and `n_samples` draws are requested
    given the known values. Each unknown cell is replaced by a JSON-encoded list of its
    draws; known cells are left untouched.

    Args:
        n_datapoints (int): Number of leading rows of X to process (capped at the size of X)
        n_samples (int): Number of samples to generate per row
        plot_histograms (bool): Whether to plot histograms of sampled data
        print_index (bool): Whether to print the index of the current data point
    Returns:
        pd.DataFrame: Object-dtype copy of the processed rows without the '_MISSING'
            columns, in which unknown cells hold JSON strings of the sampled values
    """
    n_datapoints = min(n_datapoints, X.shape[0])

    # Object dtype lets a cell hold a JSON string next to plain numbers
    temp_df = X.head(n_datapoints).copy().astype(object)
    # Remove missing flag columns
    temp_df = temp_df[temp_df.columns.drop(list(temp_df.filter(regex='MISSING')))]

    # Track the row position separately from its index label: every row access below goes
    # through .iloc, which is positional and only matches the label for a default RangeIndex.
    for position, (index, row) in enumerate(X.head(n_datapoints).iterrows()):
        unknown_features_indexes, known_features_indexes, known_features_values = \
            get_feature_indices_and_values(temp_df, position, row)

        # Nothing to generate for a fully known row
        if len(unknown_features_indexes) == 0:
            continue

        cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)

        # The same RANDOM_STATE is passed for every row
        sampled_data = cGMM.rvs(known_features_values, size=n_samples, random_state=RANDOM_STATE)

        # Column k of sampled_data is stored in the k-th unknown feature of this row
        for sample_column, column_position in enumerate(unknown_features_indexes):
            temp_df.iloc[position, column_position] = json.dumps(
                [sampled_data[sample_index][sample_column] for sample_index in range(n_samples)]
            )

        if plot_histograms:
            plot_histogram(temp_df, unknown_features_indexes, position, sampled_data)

        # Print index if specified, for debugging purposes
        if print_index:
            print(index)

    return temp_df
        
def get_feature_indices_and_values(temp_df, index, row):
    """
    Split the features of one row into unknown (missing) and known ones.

    A feature is unknown when its value is -1 and, if the row carries a matching
    '<feature>_MISSING' entry, that flag equals 1. A -1 whose flag is not 1 counts
    as a known value. The labels are taken from `row` itself, so the function does
    not depend on any global frame.

    Args:
        temp_df (pd.DataFrame): Frame without the '_MISSING' columns; supplies the
            column positions and the known values.
        index (int): Positional row number inside temp_df (used with .iloc).
        row (pd.Series): The same row including any '_MISSING' flag entries.

    Returns:
        tuple: (positions of unknown features, positions of known features in
            ascending order, pd.Series with the known values in that same order)
    """
    labels = row.index
    unknown_features_indexes = []

    for col in labels:
        # Flag entries only qualify their feature; they are not features themselves
        if col.endswith('_MISSING'):
            continue

        is_unknown = row[col] == -1
        flag_col = col + "_MISSING"
        if flag_col in labels:
            # With a flag present, the -1 only counts as missing when the flag is set
            is_unknown = row[flag_col] == 1 and is_unknown

        if is_unknown:
            unknown_features_indexes.append(temp_df.columns.get_loc(col))

    # Built in ascending order on purpose: iteration order of a set difference is not
    # guaranteed, and the values below must line up with these positions.
    unknown_lookup = set(unknown_features_indexes)
    known_features_indexes = [
        position for position in range(temp_df.shape[1]) if position not in unknown_lookup
    ]

    # Extract values of known features for the given row
    known_features_values = temp_df.iloc[index, known_features_indexes]

    return unknown_features_indexes, known_features_indexes, known_features_values

def plot_histogram(temp_df, unknown_features_indexes, index, sampled_data, n_bins=20):
    """
    Show one histogram panel per sampled feature for a single row.

    Args:
        temp_df (pd.DataFrame): Frame whose column names title the panels.
        unknown_features_indexes (list): Column positions of the sampled features.
        index (int): Row the samples belong to; shown in the figure title.
        sampled_data (np.ndarray): 2-D array with one column per sampled feature.
        n_bins (int): Number of histogram bins.
    """
    # One stacked panel per sampled column, 4 units of height per unknown feature
    n_panels = sampled_data.shape[1]
    figure_size = (8, len(unknown_features_indexes) * 4)
    fig, axs = plt.subplots(n_panels, 1, figsize=figure_size)

    plot_features(temp_df, sampled_data, axs, unknown_features_indexes, index, n_bins)

    # Figure-level title and shared axis captions
    fig.suptitle(f'Histograms for index {index}', fontsize=20)
    fig.text(0.5, 0.04, 'Value', ha='center', fontsize=14)
    fig.text(0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=14)
    plt.show()

    
def plot_features(temp_df, sampled_data, axs, unknown_features_indexes, index, n_bins):
    """
    Draw the sampled-vs-marginal histograms onto the given axes.

    For each column of `sampled_data` the samples are overlaid on the full column of
    the global `imputer_df`, and the value `imputer_df` holds for this row is printed
    in the panel's top-right corner.

    Args:
        temp_df (pd.DataFrame): Frame whose column names title the panels.
        sampled_data (np.ndarray): 2-D array with one column per sampled feature.
        axs: A single Axes or an array of Axes, one per sampled feature.
        unknown_features_indexes (list): Column positions of the sampled features.
        index (int): Row position looked up in imputer_df.
        n_bins (int): Number of histogram bins.
    """
    # plt.subplots hands back a bare Axes when there is a single panel; wrap it
    axes = axs if isinstance(axs, np.ndarray) else [axs]

    for i in range(sampled_data.shape[1]):
        ax = axes[i]
        column_position = unknown_features_indexes[i]
        marginal_values = imputer_df[imputer_df.columns[column_position]]
        expected_value = imputer_df.iloc[index, column_position]

        ax.hist(sampled_data[:, i], bins=n_bins, alpha=0.5, label='sampled data')
        ax.hist(marginal_values, bins=n_bins, alpha=0.2, label='marginal data')
        ax.set_title(f'{temp_df.columns[column_position]}')

        # Value from imputer_df, anchored in axes coordinates
        ax.text(
            0.95, 0.95, f'Expected value: {expected_value}',
            ha='right', va='top', transform=ax.transAxes,
            fontsize=10, bbox=dict(facecolor='white', alpha=0.5),
        )

        ax.legend(loc='upper left')

In [None]:
%load_ext autoreload
%autoreload 2

n_datapoints = X.shape[0]
number_of_samples = 100
cgmm_df = gmm_generate_samples(n_datapoints=n_datapoints, n_samples=number_of_samples)

# Save the imputed DataFrame to a CSV file
cgmm_df.to_csv('data/HAD_after_cgmm.csv', index=False)

cgmm_df.head()

### Calculate MSE for generated values

In [None]:
from sklearn.metrics import mean_squared_error

# Results are appended to a text file; create its folder first so open() cannot fail on it
output_file_path = os.path.join("results", "with_missingness.txt")
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
with open(output_file_path, "a") as f:
    f.write(f"Results for {number_of_samples} samples:\n")

# Squared errors of every generated sample, keyed by column position in cgmm_df
feature_mse = {}

# The row position is tracked explicitly because imputer_df is addressed with .iloc
for position, (_, row) in enumerate(cgmm_df.iterrows()):
    for column_position, cell in enumerate(row):
        # Generated cells hold a JSON string; any other cell is a known value and is skipped
        if not isinstance(cell, str):
            continue

        imputed_value = float(imputer_df.iloc[position, column_position])

        # NaN samples are ignored
        samples = np.asarray(json.loads(cell), dtype=float)
        samples = samples[~np.isnan(samples)]
        if samples.size == 0:
            continue

        # The MSE between two single values is their squared difference, so all samples
        # of a cell are handled in one vectorised step instead of one metric call each
        squared_errors = (samples - imputed_value) ** 2
        feature_mse.setdefault(column_position, []).extend(squared_errors.tolist())

# Print the mean squared error of each feature and append it to the results file
with open(output_file_path, "a") as f:
    for feature_index, mse_list in feature_mse.items():
        mean_mse = np.mean(mse_list)
        f.write(f'Feature {cgmm_df.columns[feature_index]} MSE: {mean_mse}\n')
        print(f'Feature {cgmm_df.columns[feature_index]} MSE: {mean_mse}')
    f.write("\n\n")

### *Optional:* Generating heatmap which helps visualize how cGMM distributes data

In [None]:
# Define bin edges and adjust number of bins
n_bins = 13
h_min = heatmap_feature_min_value
h_max = heatmap_feature_max_value
width = (h_max - h_min) / (n_bins-1)
bin_edges = np.arange(h_min - width/2, h_max +  width, width)

# Save the original values of the heatmap feature
heatmap_feature_values = cgmm_df[heatmap_feature]

# Deserializing the values of the heatmap feature
for i, entry in enumerate(heatmap_feature_values):
    if isinstance(entry, str):
        float_array = json.loads(entry)
        heatmap_feature_values[i] = float_array

# Determine the number of samples based on the length of an entry with a list of values
list_length = None
for entry in heatmap_feature_values:
    if isinstance(entry, list):
        list_length = len(entry)
        break

histograms = []

for entry in heatmap_feature_values:
    if isinstance(entry, list):
        # Apply np.histogram with specified bin edges
        hist, _ = np.histogram(entry, bins=bin_edges)
        histograms.append(hist)
    else:
        # Single-value entry: Place it in the correct bin and set frequency to list_length
        hist = np.zeros(len(bin_edges) - 1)
        index = np.digitize([entry], bin_edges)[0] - 1
        if 0 <= index < len(hist):
            hist[index] = list_length
        histograms.append(hist)

# Reshape histograms to fit imshow format (histograms should be a 2D array)
histograms_2d = np.array(histograms)

# Determine figure height based on the number of datapoints
fig_height = max(6, n_datapoints * 0.2)
fig_width = min(max(10, bin_edges.shape[0] * 1), 32)

extent = [bin_edges[0], bin_edges[-1], 0, len(histograms)]

# Create heatmap
plt.figure(figsize=(fig_width, fig_height))
plt.imshow(histograms_2d, aspect='auto', cmap='Reds', extent=extent)
plt.colorbar()
plt.xticks(bin_edges, labels=np.round(bin_edges, 2), rotation=90)
plt.yticks(range(n_datapoints), range(n_datapoints))
plt.xlabel('Values')
plt.ylabel('Index')
plt.title(heatmap_feature + ' values heatmap generated by cGMM')

# Add markers for original values of a missing feature
original_values = heatmap_feature_original_values[heatmap_feature_original_values != -1] # Selecting only the non -1 values
indexes = heatmap_feature_original_values.index[heatmap_feature_original_values != -1] + 0.5 # Indexes corresponding to non -1 values
plt.scatter(original_values, indexes, color='red', marker='o', s=80, label='Original Values')
plt.scatter(imputer_df[heatmap_feature], imputer_df.index + 0.5, color='blue', marker='x', label='Imputer Values')

plt.grid(True)
plt.legend()
plt.savefig('images/heatmaps/' + heatmap_feature + '_heatmap.png')
plt.show()