In [None]:
import pandas as pd
import numpy as np

# Shared seed passed as `random_state` to the imputer and the Gaussian mixture models below.
RANDOM_STATE = 404

In [None]:
# Load the raw dataset. Later cells treat the value -1 as the missing-value sentinel
# and pair value columns with `<name>_MISSING` flag columns.
df = pd.read_csv('data/HAD.csv')
df.head()

In [None]:
target_feature = 'MRS_90'
# Select with [[...]] rather than df.filter([...]): filter() silently returns an empty
# frame when the column is absent, which would write an empty target file. Indexing
# raises KeyError instead, and still yields a one-column DataFrame.
y = df[[target_feature]]
y.to_csv('data/HAD_target.csv', index=False)

#### The 'SERUM_GLUCOSE' and 'VALV_HEART' columns are removed because they contain no value other than '-1' (i.e. they are entirely missing), which triggers a warning while imputing missing values

In [None]:
# Columns removed before modelling: the two all-missing features (plus the glucose flag)
# and both target columns.
columns_to_drop = [
    'SERUM_GLUCOSE',
    'SERUM_GLUCOSE_MISSING',
    'VALV_HEART',
    'MRS_90',
    'MRS_90_DICHO',
]
df = df.drop(columns=columns_to_drop)
df.shape

### Imputing missing data with sklearn imputers

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# import warnings
# warnings.filterwarnings('ignore')

# Work on a copy so the raw frame `df` (still holding the -1 sentinels and the
# missingness flags) remains available to later cells.
imputer_df = df.copy()

MISSING_SUFFIX = '_MISSING'
missing_flag_cols = [col for col in df.columns if col.endswith(MISSING_SUFFIX)]

# Alternative univariate imputer:
# imp = SimpleImputer(missing_values=-1, strategy='median')
imp = IterativeImputer(missing_values=-1, initial_strategy='median', random_state=RANDOM_STATE)

# Iterate over the missingness-flag columns and impute the matching value column only
# when at least one row has its flag set to 1.
# NOTE(review): each value column is imputed on its own, so the iterative imputer has no
# other features to regress on — this presumably reduces to the initial median fill.
# Confirm whether a multivariate fit over all value columns was intended.
for missing_flag_col in missing_flag_cols:
    # Strip only the trailing suffix (str.replace would remove every occurrence).
    value_col = missing_flag_col[:-len(MISSING_SUFFIX)]

    # A flag column can outlive its value column (e.g. the value column was dropped
    # earlier); there is nothing to impute in that case.
    if value_col not in imputer_df.columns:
        continue

    if not (imputer_df[missing_flag_col] == 1).any():
        continue

    imputed = imp.fit_transform(imputer_df[[value_col]])

    # sklearn discards features that are entirely missing at fit time, which leaves
    # zero output columns; report it instead of failing on the assignment below.
    if imputed.shape[1] == 0:
        print(f"Skipping '{value_col}': no observed values to impute from.")
        continue

    imputer_df[value_col] = imputed.ravel()

# Drop every column whose name contains 'MISSING' so only feature values remain.
flag_like_cols = list(imputer_df.filter(regex='MISSING'))
imputer_df = imputer_df[imputer_df.columns.drop(flag_like_cols)]
imputer_df.to_csv('data/HAD_no_missingness.csv', index=False)
imputer_df.head()

### Using BIC to get the optimal number of components for GMM

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def compute_bic(X, n_components_range):
    """Fit one Gaussian mixture per candidate size and collect the BIC scores.

    Parameters
    ----------
    X : data passed unchanged to ``GaussianMixture.fit`` and ``GaussianMixture.bic``.
    n_components_range : iterable of candidate component counts.

    Returns
    -------
    list
        BIC values, in the same order as ``n_components_range``.
    """
    def _bic_for(k):
        # A fresh model per candidate, seeded identically for comparability.
        model = GaussianMixture(n_components=k, random_state=RANDOM_STATE)
        model.fit(X)
        return model.bic(X)

    return [_bic_for(k) for k in n_components_range]

# Candidate mixture sizes: 1 through 10 components.
n_components_range = range(1, 11)
bic_values = compute_bic(imputer_df, n_components_range)

# The lowest BIC wins; argmin yields its position inside the candidate range.
best_position = np.argmin(bic_values)
optimal_n_components = n_components_range[best_position]

# BIC curve across the candidate sizes.
fig, ax = plt.subplots()
ax.plot(n_components_range, bic_values, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('BIC Value')
ax.set_title('BIC for Gaussian Mixture Models')
ax.grid(True)
plt.show()

### Gaussian Mixture Model fitting

In [None]:
# Fit the final mixture with the BIC-selected component count (fit() returns the estimator).
gmm = GaussianMixture(
    n_components=optimal_n_components,
    random_state=RANDOM_STATE,
).fit(imputer_df)

### Trying to manually sample from the closest distribution

In [None]:
# Small preview frame: the first five raw rows, without any column whose name
# contains 'MISSING'. drop() returns a new frame, so `df` is left untouched.
flag_like_columns = list(df.filter(regex='MISSING').columns)
temp_df = df.iloc[:5].drop(columns=flag_like_columns)
temp_df

In [None]:
from scipy.linalg import eigh

def is_positive_definite(matrix):
    """Return True when ``matrix`` is a symmetric positive-definite matrix.

    Parameters
    ----------
    matrix : array-like
        Candidate matrix. Anything that is not a square 2-D array is reported
        as not positive definite rather than raising.

    Returns
    -------
    bool
        True only if the matrix is square, symmetric (within ``np.allclose``
        tolerance) and all of its eigenvalues are strictly positive.
    """
    matrix = np.asarray(matrix)

    # A non-square or non-2-D input cannot be positive definite; checking here
    # also avoids a broadcasting error in the symmetry test below.
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        return False

    # Check if the matrix is symmetric
    if not np.allclose(matrix, matrix.T):
        return False

    # Only the eigenvalues are needed, so skip computing eigenvectors.
    eig_vals = eigh(matrix, eigvals_only=True)

    # Convert to a plain bool so every return path has the same type.
    return bool(np.all(eig_vals > 0))

def check_covariance_matrices(covariances):
    """Print, for each matrix in ``covariances``, whether it is positive definite.

    Matrices are numbered from 1 in the printed messages. Nothing is returned.
    """
    for number, cov in enumerate(covariances, start=1):
        verdict = "is positive definite" if is_positive_definite(cov) else "is not positive definite"
        print(f"Covariance matrix {number} {verdict}.")

# Report, for every component of the fitted GMM, whether its covariance matrix
# is positive definite.
check_covariance_matrices(gmm.covariances_)


In [None]:
from ConditionalGMM.ConditionalGMM.condGMM import CondGMM

# Fill the missing entries of each preview row in `temp_df` by sampling them from the
# fitted GMM, conditioned on the features of that row that are observed.

# Seed NumPy's global RNG so repeated runs draw the same samples.
# NOTE(review): assumes CondGMM.rvs draws from NumPy's global random state when no
# explicit random_state is supplied — confirm against the ConditionalGMM source.
np.random.seed(RANDOM_STATE)

# Iterate over the rows that actually exist in `temp_df` (it is only a small slice of
# `df`); `position` is the positional row index used with .iloc, `row_label` is the
# index label used to look up the flags in `df`.
# NOTE(review): assumes `df` has a unique index so df.loc[row_label] is a single row.
for position, row_label in enumerate(temp_df.index):
    row = df.loc[row_label]

    unknown_features_indexes = []
    for missing_flag_col in missing_flag_cols:
        v_column = missing_flag_col.replace('_MISSING', '')

        # A flag column may have no matching value column left; skip it.
        if v_column not in temp_df.columns:
            continue

        if row[missing_flag_col] == 1 and row[v_column] == -1:
            unknown_features_indexes.append(temp_df.columns.get_loc(v_column))

    # Nothing to impute for this row.
    if not unknown_features_indexes:
        continue

    # Keep both index lists in ascending column order so sampled values line up with
    # the columns they are written to.
    # NOTE(review): assumes CondGMM returns the non-fixed dimensions in ascending
    # index order — TODO confirm.
    unknown_features_indexes.sort()
    unknown_lookup = set(unknown_features_indexes)
    known_features_indexes = [i for i in range(temp_df.shape[1]) if i not in unknown_lookup]

    known_features_values = temp_df.iloc[position, known_features_indexes]

    cGMM = CondGMM(gmm.weights_, gmm.means_, gmm.covariances_, known_features_indexes)

    # Sample once from the conditional Gaussian Mixture Model.
    sampled_data = cGMM.rvs(known_features_values, size=1)

    # Write the sampled values into the row's missing positions.
    temp_df.iloc[position, unknown_features_indexes] = sampled_data[0, -len(unknown_features_indexes):]

In [None]:
# # Number of components in the GMM
# num_components = 2

# # Example weights for the GMM (summing to 1)
# weights = np.array([0.5, 0.5])

# # Example means for the GMM
# means = np.array([[-2, -5, 0], [3, -3, 0]])  # Example means for each component in 3D space

# # Example covariances for the GMM
# covs = np.array([[[1, 0.5, 0], [0.5, 1, 0], [0, 0, 1]],  # Covariance matrix for the first component
#                  [[1, -0.5, 0], [-0.5, 1, 0], [0, 0, 1]]])  # Covariance matrix for the second component

# # Initialize the ConditionalGMM object
# cGMM = CondGMM(weights, means, covs, fixed_indices=[0])

# # Define the range of y and z values
# y_values = np.linspace(-12, 0, 200)
# z_values = np.linspace(-3, 3, 200)

# # Draw random samples for each value of x_obs
# N = 100
# x_obs = np.array([-1, 4, 7])

# for x in x_obs:
#     # Compute the conditional probability distributions
#     y_cpdf = np.array([cGMM.conditional_pdf([yi, 0], x) for yi in y_values])  # Fix z=0
#     z_cpdf = np.array([cGMM.conditional_pdf([0, zi], x) for zi in z_values])  # Fix y=0
    
#     # Draw random samples from the conditional GMM
#     y_rvs = cGMM.rvs([x, 0], size=N)  # Fix z=0
#     z_rvs = cGMM.rvs([x, 0], size=N)  # Fix y=0

#     # Now you can use y_rvs and z_rvs arrays containing random samples from the conditional GMM


In [None]:
# concatenated_df = pd.concat([df.head(5), imputer_df.head(5), temp_df], axis=0)
# concatenated_df['source'] = ['df'] * len(df.head(5)) + ['imputer_df'] * len(imputer_df.head(5)) + ['temp_df'] * len(temp_df)
# concatenated_df.set_index('source', inplace=True)
# concatenated_df.T.to_csv('results/comparison_df.csv')