In [36]:
import pandas as pd
import numpy as np
import os

# Seed passed as `random_state` to the GaussianMixture fits, the row sampling
# in remove_features() and the conditional GMM sampling further down.
RANDOM_STATE = 404

In [37]:
# Read the semicolon-separated dataset and discard the `id` column in one
# expression, then preview the first rows.
df = pd.read_csv('data/cardio_train.csv', sep=';').drop(columns=['id'])
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [38]:
# Separate the `cardio` column (target) from the remaining columns (features).
y = df['cardio']
X = df.drop(columns='cardio')

In [39]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()

# Standardise the features; fit_transform returns a plain array, so wrap it
# back into a DataFrame that carries the original column names.
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

# From this point on `X` refers to the standardised features.
X = X_scaled_df

In [40]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """
    Fit one Gaussian Mixture Model per candidate component count and score it with BIC.

    Parameters:
        X (array-like): Data every model is fitted to and scored on.
        n_components_range (iterable of int): Candidate numbers of mixture components.

    Returns:
        list: One BIC value per candidate, in the order the candidates were given.
    """
    # Each model is seeded with RANDOM_STATE; fit() returns the fitted estimator,
    # which lets the BIC be computed in the same expression.
    return [
        GaussianMixture(n_components=candidate, random_state=RANDOM_STATE).fit(X).bic(X)
        for candidate in n_components_range
    ]

# Number of mixture components to use. Assign None instead to run the BIC
# search below and pick the candidate with the lowest BIC.
optimal_n_components = 26

if optimal_n_components is None:
    # Candidate component counts: 1 through 50
    n_components_range = range(1, 51)
    bic_values = compute_bic(X, n_components_range)

    # The candidate at the position of the smallest BIC wins
    best_position = np.argmin(bic_values)
    optimal_n_components = n_components_range[best_position]

    # BIC curve over all candidates
    plt.plot(n_components_range, bic_values, marker='o')
    plt.title('BIC for Gaussian Mixture Models')
    plt.xlabel('Number of Components')
    plt.ylabel('BIC Value')
    plt.grid(True)
    plt.show()

In [41]:
# Build the mixture model with the selected component count and the shared seed
gmm = GaussianMixture(
    random_state=RANDOM_STATE,
    n_components=optimal_n_components,
)

# Fit it on the standardised feature matrix
gmm.fit(X)

### Generating 10 subsets, each with a randomly chosen set of features removed

In [42]:
subsets_fraction = 0.01  # default share of rows drawn into each subset


def remove_features(data, num_features_to_remove, frac=None, random_state=None):
    """
    Sample a fraction of the rows and blank out randomly chosen feature columns.

    Parameters:
        data (pd.DataFrame): Frame to sample rows from. It is not modified.
        num_features_to_remove (int): How many columns are set to NaN.
        frac (float, optional): Fraction of rows to sample. Defaults to the
            module-level `subsets_fraction`.
        random_state (int, optional): Seed for the row sampling. Defaults to
            the module-level `RANDOM_STATE`.

    Returns:
        pd.DataFrame: The sampled rows, cast to object dtype, with the chosen
        columns filled entirely with NaN.

    Raises:
        ValueError: If `num_features_to_remove` is negative or exceeds the
            number of candidate columns.
    """
    # Resolve the defaults at call time so later changes to the globals are honoured.
    if frac is None:
        frac = subsets_fraction
    if random_state is None:
        random_state = RANDOM_STATE

    subset = data.sample(frac=frac, random_state=random_state)

    # The last column is never a removal candidate.
    # NOTE(review): if `data` has no trailing target column, this excludes a
    # real feature from removal - confirm that this is intended.
    candidate_columns = subset.columns[:-1]
    if not 0 <= num_features_to_remove <= len(candidate_columns):
        raise ValueError(
            f"num_features_to_remove must be between 0 and {len(candidate_columns)}, "
            f"got {num_features_to_remove}"
        )

    # Column choice uses NumPy's global RNG, not `random_state`, so repeated
    # calls blank different columns of the same sampled rows.
    features_to_remove = np.random.choice(candidate_columns, num_features_to_remove, replace=False)

    # Object dtype lets these cells hold non-numeric values later on.
    subset = subset.astype(object)
    subset.loc[:, features_to_remove] = np.nan
    return subset

# Seed NumPy's global generator: remove_features() chooses columns with
# np.random.choice and the loop below calls np.random.randint, so without a
# seed the removed features differ on every run of the notebook.
np.random.seed(RANDOM_STATE)

subsets = []

# Two subsets with a single feature removed ...
for _ in range(2):
    subsets.append(remove_features(X, 1))

# ... two subsets with two features removed ...
for _ in range(2):
    subsets.append(remove_features(X, 2))

# ... and six subsets with a random count of at least 3 features removed.
# randint excludes its upper bound, so the count never exceeds 4.
for _ in range(6):
    num_features_to_remove = np.random.randint(3, min(5, len(X.columns) - 1))
    subsets.append(remove_features(X, num_features_to_remove))

# Report which columns are completely empty in each subset
for i, subset in enumerate(subsets):
    nan_columns = subset.columns[subset.isnull().all()]
    print(f"Subset {i+1} has missing values in: {', '.join(nan_columns)}")
    

Subset 1 has missing values in: gender
Subset 2 has missing values in: height
Subset 3 has missing values in: gender, gluc
Subset 4 has missing values in: gender, smoke
Subset 5 has missing values in: ap_hi, gluc, alco
Subset 6 has missing values in: height, weight, ap_hi
Subset 7 has missing values in: age, cholesterol, gluc
Subset 8 has missing values in: age, cholesterol, smoke, alco
Subset 9 has missing values in: ap_lo, smoke, alco
Subset 10 has missing values in: weight, ap_hi, ap_lo


In [43]:
# Display the first generated subset to inspect the blanked-out column
subsets[0]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
9812,-0.232189,,1.905072,1.166624,0.007679,-0.088238,-0.539322,-0.39572,3.216684,4.194906,0.494167
40122,0.945648,,-0.043755,0.749831,0.007679,0.017879,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
56499,-0.402015,,0.565254,0.194108,0.007679,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,-2.023607
5270,0.122053,,0.687055,0.402504,0.007679,-0.088238,-0.539322,3.099157,-0.310879,-0.238384,-2.023607
69194,-0.190037,,1.661469,0.263573,-0.057251,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
...,...,...,...,...,...,...,...,...,...,...,...
37086,0.747045,,-0.774565,0.749831,0.007679,-0.088238,2.400793,-0.39572,-0.310879,-0.238384,0.494167
32248,-1.183052,,0.687055,0.749831,0.332333,0.017879,-0.539322,-0.39572,-0.310879,-0.238384,0.494167
32319,0.377805,,-1.018168,0.402504,0.07261,-0.088238,-0.539322,1.351719,-0.310879,-0.238384,0.494167
30594,-0.825161,,1.661469,-0.222685,-0.057251,-0.088238,-0.539322,-0.39572,-0.310879,-0.238384,0.494167


In [44]:
from ConditionalGMM.condGMM import CondGMM
import json

number_of_samples = 100  # conditional draws generated for every incomplete row

for subset in subsets:
    # enumerate() supplies the positional row number used with .iloc below.
    # It stays in step with iterrows() even when a row is skipped, which a
    # counter incremented at the end of the loop body would not.
    for position, (row_index, row) in enumerate(subset.iterrows()):
        # Column positions whose value is missing in this row
        unknown_features_indexes = [
            column_position
            for column_position, value in enumerate(row)
            if pd.isna(value)
        ]

        # Nothing to sample when the row is fully observed
        if not unknown_features_indexes:
            continue

        # Remaining column positions, in ascending order so that the indices
        # given to CondGMM and the values given to rvs() line up deterministically
        known_features_indexes = sorted(
            set(range(subset.shape[1])) - set(unknown_features_indexes)
        )

        # Observed values of this row, in the same order as the indices above
        known_features_values = subset.iloc[position, known_features_indexes]

        # Condition the fitted mixture on the observed columns
        cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)

        # Draw samples for the missing columns given the observed values
        sampled_data = cGMM.rvs(known_features_values, size=number_of_samples, random_state=RANDOM_STATE)

        # Store the draws for each missing column as a JSON list in that cell;
        # column k of the draws is written to the k-th missing column position.
        for sample_column, feature_position in enumerate(unknown_features_indexes):
            subset.iloc[position, feature_position] = json.dumps(
                [draw[sample_column] for draw in sampled_data]
            )

In [45]:
import json
from sklearn.metrics import mean_squared_error

output_file_path = os.path.join("results", "without_missingness.txt")

# Opening a file in append mode does not create missing directories, so make
# sure the target folder exists before writing.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Append one report per run: a header, the per-subset MSE values, a blank gap.
with open(output_file_path, "a") as f:
    f.write(f"Results for {number_of_samples} samples with {subsets_fraction} fraction of data:\n")

    for subset_index, subset in enumerate(subsets):
        # Columns holding JSON-encoded sample lists (strings) are the ones that
        # were missing and have been filled with generated samples.
        unknown_columns = [
            column
            for column in subset.columns
            if subset[column].apply(lambda x: isinstance(x, str)).any()
        ]

        if not unknown_columns:
            continue  # Skip if there are no missing values

        # Squared errors of every generated sample, grouped by column name
        feature_mse = {}

        for row_label, row in subset.iterrows():
            for column in unknown_columns:
                # Decode the generated samples and drop NaN entries
                generated_samples = [
                    sample
                    for sample in json.loads(row[column])
                    if not pd.isna(sample)
                ]

                # Look the true value up by index label: the subset rows keep
                # the labels they had in X.
                original_value = X.loc[row_label, column]

                # Squared error of each sample against the true value; this is
                # the MSE of a single prediction.
                squared_errors = [
                    (sample - original_value) ** 2 for sample in generated_samples
                ]
                if squared_errors:
                    feature_mse.setdefault(column, []).extend(squared_errors)

        # Print the mean squared error for each feature and save to file
        print(f"MSE for Subset {subset_index + 1}:")
        f.write(f"MSE for Subset {subset_index + 1}:\n")
        for column, mse_values in feature_mse.items():
            mean_mse = np.mean(mse_values)
            f.write(f'Feature {column} MSE: {mean_mse}\n')
            print(f"Feature {column}: MSE = {mean_mse}")

    f.write("\n\n")

MSE for Subset 1:
Feature gender: MSE = 1.1702671604145851
MSE for Subset 2:
Feature height: MSE = 1.39844321836624
MSE for Subset 3:
Feature gender: MSE = 0.9440825874399931
Feature gluc: MSE = 1.0864168523378053
MSE for Subset 4:
Feature gender: MSE = 0.9850652857838303
Feature smoke: MSE = 0.9886308221857623
MSE for Subset 5:
Feature ap_hi: MSE = 0.036346094911362
Feature gluc: MSE = 1.0057685957142848
Feature alco: MSE = 1.400796016883348
MSE for Subset 6:
Feature height: MSE = 1.2728760844596043
Feature weight: MSE = 1.619188133740477
Feature ap_hi: MSE = 0.040491959972159944
MSE for Subset 7:
Feature age: MSE = 1.165482767213727
Feature cholesterol: MSE = 1.114161732432696
Feature gluc: MSE = 1.1117845426100401
MSE for Subset 8:
Feature age: MSE = 1.2328512082703134
Feature cholesterol: MSE = 1.0150128991511762
Feature smoke: MSE = 0.9591630627775334
Feature alco: MSE = 0.9058841240527284
MSE for Subset 9:
Feature ap_lo: MSE = 1.1687770693256538
Feature smoke: MSE = 1.03173825907