In [96]:
import pandas as pd
import numpy as np
import os

# Single seed reused by the stochastic steps of this notebook (GMM fitting,
# row sampling of the subsets, and conditional sampling).
RANDOM_STATE = 404

In [97]:
# Read the semicolon-delimited CSV and discard the 'id' column in one chained
# expression, then preview the first rows.
df = pd.read_csv('data/cardio_train.csv', delimiter=';').drop(columns=['id'])
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [98]:
# Split the frame: y holds the 'cardio' column, X holds every other column.
y = df['cardio']
X = df.drop(columns=['cardio'])

In [99]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Standardize every feature column with a freshly fitted StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled values in a DataFrame that carries the original column
# names, and use that frame as X from here on.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)
X = X_scaled_df

In [100]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """
    Fit one Gaussian Mixture Model per candidate component count and collect the BIC of each fit.

    Parameters:
        X (array-like): Data the models are fitted to and scored on.
        n_components_range (range): Candidate numbers of mixture components.

    Returns:
        list: One BIC value per candidate, in the same order as n_components_range.
    """
    def _bic_for(component_count):
        # Every candidate model is seeded with RANDOM_STATE.
        model = GaussianMixture(n_components=component_count, random_state=RANDOM_STATE)
        return model.fit(X).bic(X)

    return [_bic_for(component_count) for component_count in n_components_range]

# Component count used for the final model. While it is a number the BIC sweep
# below is skipped; set it to None to re-select it by minimum BIC.
# NOTE(review): 26 is presumably the result of an earlier sweep — confirm.
optimal_n_components = 26

if optimal_n_components is None:
    n_components_range = range(1, 51)
    bic_values = compute_bic(X, n_components_range)

    # The candidate with the lowest BIC is chosen.
    best_position = np.argmin(bic_values)
    optimal_n_components = n_components_range[best_position]

    # Show how the BIC changes with the number of components.
    fig, ax = plt.subplots()
    ax.plot(n_components_range, bic_values, marker='o')
    ax.set_xlabel('Number of Components')
    ax.set_ylabel('BIC Value')
    ax.set_title('BIC for Gaussian Mixture Models')
    ax.grid(True)
    plt.show()

In [101]:
# Build the final mixture model with the selected component count, seeded
# with RANDOM_STATE.
gmm = GaussianMixture(
    random_state=RANDOM_STATE,
    n_components=optimal_n_components,
)

# Fit on the standardized feature matrix X (complete data, nothing is missing yet).
gmm.fit(X)

### Generating 10 subsets, each with a randomly chosen set of features removed

In [102]:
def remove_features(data, num_features_to_remove, rng=None):
    """
    Draw a 1% row sample of `data` and replace randomly chosen feature columns with NaN.

    Parameters:
        data (pd.DataFrame): Frame to sample rows from.
        num_features_to_remove (int): Number of columns to overwrite with NaN.
        rng (np.random.Generator, optional): Generator used to choose the columns.
            When omitted, NumPy's global random state is used (the previous behavior),
            which is not reproducible unless the caller seeds it.

    Returns:
        pd.DataFrame: Object-dtype copy of the sampled rows in which the chosen
        columns contain only NaN.
    """
    # The row sample is seeded with RANDOM_STATE, so every call on the same
    # frame returns the same rows; only the masked columns differ between calls.
    subset = data.sample(frac=0.01, random_state=RANDOM_STATE)

    # The last column is never a candidate for removal.
    # NOTE(review): this looks like a leftover from a frame whose last column
    # was the target; with X it exempts the last feature — confirm intent.
    candidate_columns = subset.columns[:-1].to_numpy()
    if rng is None:
        features_to_remove = np.random.choice(candidate_columns, num_features_to_remove, replace=False)
    else:
        features_to_remove = rng.choice(candidate_columns, size=num_features_to_remove, replace=False)

    # Object dtype lets the masked cells hold non-numeric values later on
    # (the sampling cell stores JSON strings in them).
    subset = subset.astype(object)
    subset.loc[:, features_to_remove] = np.nan
    return subset


# One seeded generator drives every random choice in this cell, so that
# "Restart & Run All" masks the same columns on every run.
rng = np.random.default_rng(RANDOM_STATE)

# Two subsets with 1 feature removed, two with 2, and six with a random count
# from 3 up to (but not including) min(5, number of columns - 1).
removal_counts = [1, 1, 2, 2]
removal_counts += [int(rng.integers(3, min(5, len(X.columns) - 1))) for _ in range(6)]

subsets = [remove_features(X, count, rng=rng) for count in removal_counts]

# Report which columns ended up fully masked in each subset.
for subset_number, subset in enumerate(subsets, start=1):
    nan_columns = subset.columns[subset.isnull().all()]
    print(f"Subset {subset_number} has missing values in: {', '.join(nan_columns)}")
    

Subset 1 has missing values in: ap_hi
Subset 2 has missing values in: ap_hi
Subset 3 has missing values in: ap_lo, gluc
Subset 4 has missing values in: height, alco
Subset 5 has missing values in: height, weight, ap_hi
Subset 6 has missing values in: age, ap_hi, ap_lo, gluc
Subset 7 has missing values in: gender, height, ap_hi
Subset 8 has missing values in: gender, ap_lo, cholesterol, alco
Subset 9 has missing values in: age, gender, ap_lo
Subset 10 has missing values in: age, ap_hi, gluc


In [103]:
# Preview the first generated subset; its removed feature column(s) show up as NaN.
subsets[0]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
9812,-0.232189,1.364055,1.905072,1.166624,,-0.088238,-0.539322,-0.39572,3.216684,4.194906,0.494167
40122,0.945648,-0.733108,-0.043755,0.749831,,0.017879,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
56499,-0.402015,-0.733108,0.565254,0.194108,,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,-2.023607
5270,0.122053,1.364055,0.687055,0.402504,,-0.088238,-0.539322,3.099157,-0.310879,-0.238384,-2.023607
69194,-0.190037,1.364055,1.661469,0.263573,,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
...,...,...,...,...,...,...,...,...,...,...,...
34658,1.01293,-0.733108,-0.896366,1.444485,,-0.03518,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
60533,-2.009078,1.364055,1.174262,1.305554,,-0.03518,-0.539322,-0.39572,3.216684,-0.238384,0.494167
11861,1.263007,-0.733108,-1.261771,-0.29215,,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,-2.023607
46401,0.712188,1.364055,-0.165556,0.194108,,0.017879,-0.539322,-0.39572,-0.310879,-0.238384,-2.023607


In [104]:
from ConditionalGMM.condGMM import CondGMM
import json

# Number of conditional draws generated for every row with missing features.
number_of_samples = 100

for subset in subsets:
    # enumerate supplies the positional row index on every iteration, so a
    # skipped row can no longer shift later reads/writes onto the wrong row
    # (the previous manual counter was not advanced when a row was skipped).
    for row_position, (_, row) in enumerate(subset.iterrows()):
        missing_mask = row.isna().to_numpy()

        # Column positions of the masked features, in column order.
        unknown_features_indexes = np.flatnonzero(missing_mask).tolist()

        # Nothing to sample for a fully observed row.
        if not unknown_features_indexes:
            continue

        # Column positions and values of the observed features, in column order
        # (previously derived from a set, whose ordering is not guaranteed).
        known_features_indexes = np.flatnonzero(~missing_mask).tolist()
        known_features_values = row.iloc[known_features_indexes]

        # Condition the fitted mixture on the observed features of this row.
        cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)

        # The same seed is used for every row.
        # Assumes rvs returns a 2-D array of shape (number_of_samples, n_unknown),
        # as the two-index access in the original code implies — TODO confirm.
        sampled_data = cGMM.rvs(known_features_values, size=number_of_samples, random_state=RANDOM_STATE)

        # Store all draws for each masked feature as a JSON list in its cell.
        for sample_column, column_position in enumerate(unknown_features_indexes):
            subset.iloc[row_position, column_position] = json.dumps(sampled_data[:, sample_column].tolist())

In [105]:
import json
from sklearn.metrics import mean_squared_error

# All subsets are written to one report. The file is opened once, before the
# loop, so later subsets no longer overwrite the results of earlier ones, and
# the target directory is created if it does not exist yet.
os.makedirs("results", exist_ok=True)
output_file_path = os.path.join("results", "without_missingness.txt")

with open(output_file_path, "w") as f:
    for subset_index, subset in enumerate(subsets):
        # Sampled columns are recognised by the JSON strings stored in them.
        unknown_columns = [
            col for col in subset.columns
            if subset[col].map(lambda value: isinstance(value, str)).any()
        ]

        if not unknown_columns:
            continue  # Skip if there are no sampled features in this subset

        # Per-feature squared errors, pooled over all rows and all samples.
        feature_mse = {col: [] for col in unknown_columns}
        feature_nmse = {col: [] for col in unknown_columns}

        for row_label, row in subset.iterrows():
            for col in unknown_columns:
                cell = row[col]
                if not isinstance(cell, str):
                    continue  # no samples were stored for this row/feature

                samples = np.asarray(json.loads(cell), dtype=float)
                samples = samples[~np.isnan(samples)]
                if samples.size == 0:
                    continue

                # Subset rows keep their index labels from X, so the original
                # value is looked up by label and column name, not by position.
                original_value = X.loc[row_label, col]
                squared_errors = (samples - original_value) ** 2

                # NOTE(review): the error is normalised by the variance of the
                # generated samples, not of the true values; near-constant
                # samples make this ratio very large — confirm the definition.
                var_generated = samples.var()
                if var_generated == 0 or np.isnan(var_generated):
                    normalized_errors = np.zeros_like(squared_errors)
                else:
                    normalized_errors = squared_errors / var_generated

                feature_mse[col].extend(squared_errors.tolist())
                feature_nmse[col].extend(normalized_errors.tolist())

        # Print the mean errors per feature and append them to the report,
        # under a header line that identifies the subset.
        header = f"MSE for Subset {subset_index + 1}:"
        print(header)
        f.write(header + "\n")
        for col, mse_values in feature_mse.items():
            if not mse_values:
                continue
            mean_mse = np.mean(mse_values)
            mean_nmse = np.mean(feature_nmse[col])
            f.write(f'Feature {col} MSE: {mean_mse} NMSE: {mean_nmse}\n')
            print(f"Feature {col}: MSE = {mean_mse}  NMSE: {mean_nmse}")


MSE for Subset 1:
Feature ap_hi: MSE = 0.01641840020617918  NMSE: 1.9169245987062433
MSE for Subset 2:
Feature ap_hi: MSE = 0.01641840020617918  NMSE: 1.9169245987062433
MSE for Subset 3:
Feature ap_lo: MSE = 2.947109742441104  NMSE: 2483.6004893334803
Feature gluc: MSE = 1.0405200308144276  NMSE: 157709.6541841157
MSE for Subset 4:
Feature height: MSE = 1.4221132336171343  NMSE: 2.121496365312392
Feature alco: MSE = 1.718400602337973  NMSE: 1435308.9516472237
MSE for Subset 5:
Feature height: MSE = 1.464147361270429  NMSE: 2.429336915807227
Feature weight: MSE = 1.632001152997965  NMSE: 1.8960835985624045
Feature ap_hi: MSE = 0.016281141759946775  NMSE: 1.7481001411923063
MSE for Subset 6:
Feature age: MSE = 1.5773066998536223  NMSE: 1.1416673146217702
Feature ap_hi: MSE = 0.0160859145101557  NMSE: 1.2375211654614318
Feature ap_lo: MSE = 2.956270458753703  NMSE: 1470.523172271277
Feature gluc: MSE = 1.0234105833594176  NMSE: 138199.59238057537
MSE for Subset 7:
Feature gender: MSE = 0