In [1]:
import pandas as pd
import numpy as np

RANDOM_STATE = 404

In [2]:
df = pd.read_csv('data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [4]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """
    Computes the Bayesian Information Criterion (BIC) for Gaussian Mixture Models with different numbers of components.
    
    Parameters:
        X (array-like): Input data.
        n_components_range (range): Range of number of components to evaluate.
        
    Returns:
        list: BIC values for each number of components.
    """
    bic = []  # List to store BIC values
    for n_components in n_components_range:
        # Create Gaussian Mixture Model with specified number of components
        gmm = GaussianMixture(n_components=n_components, random_state=RANDOM_STATE)
        gmm.fit(X)  # Fit the model to the data
        bic.append(gmm.bic(X))  # Calculate BIC and add to list
    return bic  # Return list of BIC values

optimal_n_components = 26

if optimal_n_components is None:
    n_components_range = range(1, 51)  # Range of number of components to evaluate
    bic_values = compute_bic(X, n_components_range)  # Compute BIC values
    optimal_n_components = n_components_range[np.argmin(bic_values)]  # Determine optimal number of components

    # Plotting BIC values
    plt.plot(n_components_range, bic_values, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('BIC Value')
    plt.title('BIC for Gaussian Mixture Models')
    plt.grid(True)
    plt.show()

In [5]:
# Create Gaussian Mixture Model with optimal number of components
gmm = GaussianMixture(n_components=optimal_n_components, random_state=RANDOM_STATE)

# Fit the Gaussian Mixture Model to the imputed DataFrame
gmm.fit(X)

### Generating 10 subsets with randomly removed number of features

In [6]:
# Function that randomly removes features and replace their values with NaN
def remove_features(data, num_features_to_remove):
    subset = data.sample(frac=0.1, random_state=RANDOM_STATE)
    features_to_remove = np.random.choice(subset.columns[:-1], num_features_to_remove, replace=False)
    subset = subset.astype(object)
    subset.loc[:, features_to_remove] = np.nan
    return subset

subsets = []

# for _ in range(2):
#     subsets.append(remove_features(X, 1))

# for _ in range(2):
#     subsets.append(remove_features(X, 2))

# for _ in range(6):
#     num_features_to_remove = np.random.randint(3, min(5, len(X.columns) - 1))
#     subsets.append(remove_features(X, num_features_to_remove))

# for i, subset in enumerate(subsets):
#     nan_columns = subset.columns[subset.isnull().all()]
#     print(f"Subset {i+1} has missing values in: {', '.join(nan_columns)}")
    
subsets.append(remove_features(X, 3))

In [7]:
from ConditionalGMM.condGMM import CondGMM
import json

for subset in subsets:
    index = 0
    for row_index, row in subset.iterrows():
        # Get indices and values of unknown and known features
        unknown_features_indexes = [row.index.get_loc(col) for col in row.index if pd.isna(row[col])]
        
        # Find indices of known features
        known_features_indexes = list(set(range(subset.shape[1])) - set(unknown_features_indexes))
        
        # Extract values of known features for the given row
        known_features_values = subset.iloc[index, known_features_indexes]
        
        # If all features are known, continue
        if len(unknown_features_indexes) == 0:
            continue
        
        # Initialize CondGMM
        cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)
        
        # Generate samples using Conditional GMM
        sampled_data = cGMM.rvs(known_features_values, size=100, random_state=RANDOM_STATE)
        
        # Update unknown features with sampled data
        for feature_index in range(len(unknown_features_indexes)):
            subset.iloc[index, unknown_features_indexes[feature_index]] = json.dumps([sampled_data[sample_index][feature_index] for sample_index in range(100)])
            
        index += 1

In [8]:
subsets[0]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
9812,"[16679.001436420796, 20199.748513247476, 17551...",2,180,"[105.00679744195439, 72.969365561997, 54.94076...",130,"[96.94773125442597, 86.76712634528535, 98.7154...",1,1,1,1,1
40122,"[17663.28661140646, 21147.941954760314, 18527....",1,164,"[94.02391009346127, 63.407073636278625, 45.108...",130,"[95.61748788330759, 85.86934727993639, 96.8901...",1,1,0,0,1
56499,"[17462.74887047189, 21121.005453874903, 18369....",1,169,"[96.45281578485701, 68.06989002481191, 51.0611...",130,"[95.23413588422329, 85.87069926060161, 96.6723...",1,1,0,0,0
5270,"[17875.980351601138, 21589.506373592423, 18796...",2,170,"[99.99003592822999, 75.85401540004113, 61.5047...",130,"[91.24106943187607, 84.93455914864201, 93.0497...",1,3,0,0,0
69194,"[16997.49628306699, 20655.752866470004, 17904....",2,178,"[98.8977303972049, 70.5148046371598, 53.506041...",120,"[91.52445438280051, 82.16101775917882, 92.9626...",1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
37086,"[18575.969241992127, 22060.62458534598, 19439....",1,158,"[96.05331686069542, 65.43648040351276, 47.1381...",130,"[95.8816427384584, 86.1335021350872, 97.154307...",3,1,0,0,1
32248,"[19104.061476913736, 22762.31806031675, 20010....",2,170,"[106.45594525858117, 78.07301949853607, 61.064...",180,"[116.30969805499231, 106.94626143137063, 117.7...",1,1,0,0,1
32319,"[18204.400854456788, 21514.0302547363, 19024.7...",1,156,"[101.06154569541663, 65.26315071147154, 44.344...",140,"[100.45020668671194, 89.76081495276836, 102.01...",1,2,0,0,1
30594,"[17075.944538424286, 20560.59988177814, 17939....",1,178,"[97.8893139315457, 67.27247747436304, 48.97412...",120,"[91.53208916034212, 81.78394855697093, 92.8047...",1,1,0,0,1


In [18]:
for index, row in subset.iterrows():
    generated_samples = row.iloc[unknown_features_indexes].values

KeyboardInterrupt: 

In [14]:
import json
from sklearn.metrics import mean_squared_error

for subset_index, subset in enumerate(subsets):
    # Determine unknown features indexes dynamically for each subset
    unknown_features_indexes = [col_index for col_index, col in enumerate(subset.columns) if subset[col].apply(lambda x: isinstance(x, str)).any()]

    if not unknown_features_indexes:
        continue  # Skip if there are no missing values
    
    # Initialize dictionary to store MSE values for each feature in the subset
    feature_mse = {}
    
    for index, row in subset.iterrows():
        # Extract original values for the current row from X
        original_values = X.iloc[index, unknown_features_indexes].values

        # Ensure sample is an array-like object
        if not isinstance(generated_samples, list):
            generated_samples = [generated_samples]
        
        # Compute MSE for each feature separately
        for feature_index in range(len(unknown_features_indexes)):
            # Extract generated samples for unknown features
            generated_samples = json.loads(row.iloc[unknown_features_indexes].values[unknown_features_indexes[feature_index]])
            original_value = original_values[feature_index]
            
            for sample in generated_samples:
                if not pd.isna(sample):
                    # Ensure both original_value and sample are arrays of the same length
                    original_value_array = np.full_like(np.array(sample), original_value)
                    mse_value = mean_squared_error(original_value_array.flatten(), np.array(sample).flatten())
                    
                    if unknown_features_indexes[feature_index] not in feature_mse:
                        feature_mse[unknown_features_indexes[feature_index]] = []
                        
                    feature_mse[unknown_features_indexes[feature_index]].append(mse_value)
    
    # Compute the mean MSE for each feature in the subset
    print(f"MSE for Subset {subset_index + 1}:")
    for feature_index, mse_values in feature_mse.items():
        mean_mse = np.mean(mse_values)
        print(f"Feature {feature_index}: {mean_mse}")


IndexError: index 3 is out of bounds for axis 0 with size 3