In [86]:
import pandas as pd
import numpy as np

# Single seed reused as `random_state` by the model fitting, row splitting and
# conditional sampling steps of this notebook.
RANDOM_STATE = 404

In [87]:
# Read the semicolon-delimited dataset and discard the `id` column in one chain,
# so re-running the cell always rebuilds `df` from the file.
df = pd.read_csv('data/cardio_train.csv', delimiter=';').drop(columns=['id'])
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [88]:
# `cardio` is the label column; everything else is kept as the feature matrix.
y = df['cardio']
X = df.drop(columns='cardio')

In [89]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """
    Fit one Gaussian Mixture Model per candidate component count and score it with BIC.

    Parameters:
        X (array-like): Data each candidate model is fitted to and scored on.
        n_components_range (iterable of int): Candidate numbers of mixture components.

    Returns:
        list: One BIC value per candidate, in the order of n_components_range.
    """
    def _score(k):
        # A fresh model per candidate, all seeded with the notebook-wide RANDOM_STATE.
        model = GaussianMixture(n_components=k, random_state=RANDOM_STATE)
        return model.fit(X).bic(X)

    return [_score(k) for k in n_components_range]

# Candidate component counts: 1 through 20 inclusive.
n_components_range = range(1, 21)

# One BIC score per candidate count.
bic_values = compute_bic(X, n_components_range)

# The selected model size is the candidate whose BIC is smallest.
best_position = np.argmin(bic_values)
optimal_n_components = n_components_range[best_position]

# BIC curve, drawn on the current axes.
ax = plt.gca()
ax.plot(n_components_range, bic_values, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('BIC Value')
ax.set_title('BIC for Gaussian Mixture Models')
ax.grid(True)
# plt.savefig('images/cGMM_without_missingness_BIC.png')
plt.show()


KeyboardInterrupt: 

In [None]:
# Final mixture model, sized by the BIC-selected number of components.
gmm = GaussianMixture(
    random_state=RANDOM_STATE,
    n_components=optimal_n_components,
)

# Fit on the complete feature matrix X.
gmm.fit(X)

In [138]:
from sklearn.model_selection import train_test_split

def remove_features(data, num_features_to_remove, rng=None):
    """
    Return a 10% row sample of `data` with randomly chosen columns blanked out.

    The rows are the "train" part of a `train_test_split` with `test_size=0.9`
    and `random_state=RANDOM_STATE`, so every call selects the same rows.
    The chosen columns are filled entirely with NaN.

    Parameters:
        data (pandas.DataFrame): Feature frame to sample rows from.
        num_features_to_remove (int): How many columns to replace with NaN.
        rng (optional): Object exposing `choice(a, size, replace=...)`, e.g. a
            `numpy.random.Generator` or `numpy.random.RandomState`. When None
            (the default) NumPy's global generator is used.

    Returns:
        pandas.DataFrame: Object-dtype copy of the sampled rows with the chosen
        columns set to NaN.

    Raises:
        ValueError: Raised by `choice` when more columns are requested than
        there are candidates.
    """
    # Only `data` is split: no global label vector is needed, and without
    # `stratify` the selected rows depend solely on len(data) and the seed.
    subset, _ = train_test_split(data, test_size=0.9, random_state=RANDOM_STATE)

    # Object dtype lets the blanked cells later hold arbitrary Python values.
    subset = subset.astype(object)

    # The last column of `data` is never a removal candidate.
    candidates = subset.columns[:-1]
    chooser = np.random if rng is None else rng
    features_to_remove = chooser.choice(candidates, num_features_to_remove, replace=False)

    subset.loc[:, features_to_remove] = np.nan
    return subset

# Seed NumPy's global generator so the randomly chosen column counts and column
# names are identical on every run of this cell, matching the seeded steps
# elsewhere in the notebook.
np.random.seed(RANDOM_STATE)

# Two subsets with one blanked column, then two subsets with two blanked columns.
subsets = [remove_features(X, 1) for _ in range(2)]
subsets += [remove_features(X, 2) for _ in range(2)]

# Six more subsets with a random number of blanked columns.
# randint's upper bound is exclusive, so the count is drawn from
# 1 .. min(5, len(X.columns) - 1) - 1.
upper_bound = min(5, len(X.columns) - 1)
for _ in range(6):
    num_features_to_remove = np.random.randint(1, upper_bound)
    subsets.append(remove_features(X, num_features_to_remove))

# Report which columns are entirely NaN in each subset.
for i, subset in enumerate(subsets, start=1):
    nan_columns = subset.columns[subset.isnull().all()]
    print(f"Subset {i} has missing values in: {', '.join(nan_columns)}")

Subset 1 has missing values in: alco
Subset 2 has missing values in: height
Subset 3 has missing values in: age, smoke
Subset 4 has missing values in: height, cholesterol
Subset 5 has missing values in: age, height, weight, ap_hi
Subset 6 has missing values in: ap_lo
Subset 7 has missing values in: age, ap_hi, alco
Subset 8 has missing values in: ap_lo, cholesterol, smoke
Subset 9 has missing values in: height, weight, ap_lo, smoke
Subset 10 has missing values in: smoke


In [140]:
from ConditionalGMM.condGMM import CondGMM
import json

%load_ext autoreload
%autoreload 2

n_samples = 1000

# The frame whose rows are read is also the frame that receives the samples.
# Reading from subsets[0] while writing into a different subset would overwrite
# observed values at column positions that are only missing in subsets[0].
target_subset = subsets[0]
n_features = target_subset.shape[1]

for row_index, (_, row) in enumerate(target_subset.iterrows()):
    # Positions and values of the features that are present in this row.
    observed = row.dropna()
    known_features_indexes = [row.index.get_loc(col) for col in observed.index]
    known_features_values = observed.values

    # Positions of the missing features, sorted so that the k-th sampled
    # column is matched to the k-th missing position deterministically.
    # NOTE(review): assumes CondGMM.rvs returns the free dimensions in
    # ascending index order; confirm against the CondGMM implementation.
    unknown_features_indexes = sorted(set(range(n_features)) - set(known_features_indexes))

    # NOTE(review): the model is rebuilt for every row; if CondGMM instances can
    # be reused across rvs calls, building one per missingness pattern would be
    # cheaper. Confirm before hoisting.
    cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)

    sampled_data = cGMM.rvs(known_features_values, size=n_samples, random_state=RANDOM_STATE)

    # Store all draws for each missing feature as a JSON list in that cell.
    for sample_column, feature_index in enumerate(unknown_features_indexes):
        draws = [sampled_data[sample_index][sample_column] for sample_index in range(n_samples)]
        target_subset.iloc[row_index, feature_index] = json.dumps(draws)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


KeyboardInterrupt: 

In [None]:
# Inspect a single cell of the first subset: row position 0, column position 4
# (`ap_hi` in the column order displayed by df.head() earlier in the notebook).
subsets[0].iloc[0, 4]

'[122.35394346083757, 130.04074916001628, 138.5493581762801, 133.49306255871366, 143.74911962179934, 150.85768057327823, 128.7121838092859, 150.97451624237115, 129.83744645384186, 136.51155369103543, 124.10400905203606, 130.67118695644388, 153.62496677203947, 129.74948483105797, 156.18766747485688, 118.27019522974624, 122.67533671784724, 135.17338471776878, 140.838052820918, 122.2819036708175, 151.47861774745127, 123.38078343940873, 138.18631842213057, 137.7515199020222, 139.2550918556589, 131.91358557451957, 127.20824259712639, 132.21140902129284, 139.53538266983375, 141.65051841267947, 141.45450305358395, 128.70940031977966, 165.22784839312524, 155.80949462433503, 132.87173786554828, 134.50264742348566, 122.53422356921364, 136.8554875407312, 123.35382337846858, 135.66178714604666, 133.76304417312278, 122.73377831894032, 132.5897990347849, 139.3897923749294, 123.65258351084708, 145.38304434978545, 137.2931890946796, 130.95454698251157, 140.20708415743783, 131.8540149411754, 144.258760