In [1365]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

RANDOM_STATE = 404

## Wanted to check if any of the rows has a full set of features and the results is that each of them has at least one feature value missing

In [1366]:
df = pd.read_csv('data/HAD.csv')
df.head()

Unnamed: 0,AGE,AGE_MISSING,SEX_F,NIHSS_BL,NIHSS_BL_MISSING,SYS_BLOOD_PRESSURE,SYS_BLOOD_PRESSURE_MISSING,PREV_MRS,PREV_MRS_MISSING,ORAL_ANTICOAGULANT,...,ONSET_TO_ADMISSION,ONSET_TO_ADMISSION_MISSING,ONSET_TO_IMAGING,ONSET_TO_IMAGING_MISSING,ONSET_TO_TPA,ONSET_TO_TPA_MISSING,ONSET_TO_GROIN,ONSET_TO_GROIN_MISSING,MRS_90,MRS_90_DICHO
0,70,0,0,3,0,-1,1,0,0,0,...,64,0,96,0,180,0,-1,1,0,0
1,55,0,0,6,0,142,0,3,0,0,...,38,0,104,0,165,0,-1,1,4,1
2,73,0,0,3,0,170,0,0,0,1,...,-1,1,-1,1,-1,1,-1,1,2,0
3,81,0,0,10,0,-1,1,0,0,0,...,69,0,90,0,115,0,-1,1,3,1
4,81,0,1,11,0,-1,1,0,0,0,...,98,0,110,0,120,0,-1,1,0,0


In [1367]:
df_filtered = df[df.columns.drop(list(df.filter(regex='MISSING')))]
df_filtered.head()

Unnamed: 0,AGE,SEX_F,NIHSS_BL,SYS_BLOOD_PRESSURE,PREV_MRS,ORAL_ANTICOAGULANT,SERUM_GLUCOSE,HYPERTENSION,HYPERCHOL,ISCH_HEART,...,OCCLUSION_ACA,OCCLUSION_PCA,OCCLUSION_VB,CTA_CS,ONSET_TO_ADMISSION,ONSET_TO_IMAGING,ONSET_TO_TPA,ONSET_TO_GROIN,MRS_90,MRS_90_DICHO
0,70,0,3,-1,0,0,-1,1,1,0,...,0,0,0,-1,64,96,180,-1,0,0
1,55,0,6,142,3,0,-1,1,1,0,...,-1,-1,-1,-1,38,104,165,-1,4,1
2,73,0,3,170,0,1,-1,1,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,2,0
3,81,0,10,-1,0,0,-1,1,0,1,...,0,1,0,-1,69,90,115,-1,3,1
4,81,1,11,-1,0,0,-1,1,1,0,...,-1,-1,-1,-1,98,110,120,-1,0,0


In [1368]:
df_filtered = df_filtered[(df_filtered != -1).all(axis=1)]
df_filtered.head()

Unnamed: 0,AGE,SEX_F,NIHSS_BL,SYS_BLOOD_PRESSURE,PREV_MRS,ORAL_ANTICOAGULANT,SERUM_GLUCOSE,HYPERTENSION,HYPERCHOL,ISCH_HEART,...,OCCLUSION_ACA,OCCLUSION_PCA,OCCLUSION_VB,CTA_CS,ONSET_TO_ADMISSION,ONSET_TO_IMAGING,ONSET_TO_TPA,ONSET_TO_GROIN,MRS_90,MRS_90_DICHO


## Gaussian Mixed Model

In [1369]:
target_feature = 'MRS_90'
y = df.filter([target_feature])

# Because of the warning given while imputing missing values, 'SERUM_GLUCOSE' and 'VALV_HEART' columns are removed due to not having any other entry than '-1' value.
df = df.drop(columns={target_feature, 'SERUM_GLUCOSE', 'VALV_HEART'}, axis=1)

# All columns containing information about the missingness are removed
df = df[df.columns.drop(list(df.filter(regex='MISSING')))]

In [1370]:
import warnings
warnings.filterwarnings('ignore')

In [1371]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=RANDOM_STATE)
strategies = ['mean', 'median', 'most_frequent']
results_gmm = []

# Loop through the combinations of imputers and strategies
for i in range(3):
    for strategy in strategies:
        # Initialize the imputer based on the iteration
        if i == 0:
            imp = SimpleImputer(missing_values=-1, strategy=strategy)
            imputer_type = 'Simple'
        elif i == 1:
            imp = IterativeImputer(missing_values=-1, initial_strategy=strategy, random_state=RANDOM_STATE)
            imputer_type = 'Iterative (10)'
        else:
            imp = IterativeImputer(missing_values=-1, max_iter=100, initial_strategy=strategy, random_state=RANDOM_STATE)
            imputer_type = 'Iterative (100)'
            
        # Fit and transform the imputer
        imputed_df = imp.fit_transform(df)
        
        # Fit Gaussian Mixture Model (GMM) on the imputed data
        gmm = GaussianMixture(n_components=6, covariance_type='full', random_state=RANDOM_STATE)
        gmm.fit(imputed_df)

        # Predict cluster labels using GMM
        y_pred = gmm.predict(df)

        # Calculate accuracy and mean squared error (MSE) for the predictions
        accuracy = accuracy_score(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        
        # Append the results to the list
        results_gmm.append({'Imputer': imputer_type,
                        'Strategy': strategy,
                        'Accuracy (%)': round(accuracy * 100, 2),
                        'MSE': round(mse, 2)})

# Create a DataFrame from the list of results
df_results_gmm = pd.DataFrame(results_gmm)

# Print the results DataFrame
print(df_results_gmm)

           Imputer       Strategy  Accuracy (%)    MSE
0           Simple           mean         23.73  10.48
1           Simple         median         19.60   7.89
2           Simple  most_frequent         22.25   7.95
3   Iterative (10)           mean         23.73  10.49
4   Iterative (10)         median         20.13   9.39
5   Iterative (10)  most_frequent         12.18   7.88
6  Iterative (100)           mean         17.48   8.78
7  Iterative (100)         median         19.17   9.48
8  Iterative (100)  most_frequent         12.82  12.83


In [1372]:
# Only when working with 2 dimensions of data
# from matplotlib.colors import LogNorm

# imputed_df = imputed_df.filter(['AGE', 'SYS_BLOOD_PRESSURE'], axis=1)

# # display predicted scores by the model as a contour plot
# x = np.linspace(-200.0, 200.0)
# y = np.linspace(-200.0, 200.0)
# X, Y = np.meshgrid(x, y)
# XX = np.array([X.ravel(), Y.ravel()]).T
# Z = -gmm.score_samples(XX)
# Z = Z.reshape(X.shape)

# CS = plt.contour(
#     X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10)
# )
# CB = plt.colorbar(CS, shrink=0.8, extend="both")
# plt.scatter(imputed_df.values[:, 0], imputed_df.values[:, 1], 0.8)

# plt.title("Negative log-likelihood predicted by a GMM")
# plt.axis("tight")
# plt.show()

## Kernel Density Estimation

In [1374]:
from sklearn.neighbors import KernelDensity

# Initialize an empty list to store results
results_kde = []

# Loop through the combinations of imputers and strategies
for i in range(3):
    for strategy in strategies:
        # Initialize the imputer based on the iteration
        if i == 0:
            imp = SimpleImputer(missing_values=-1, strategy=strategy)
            imputer_type = 'Simple'
        elif i == 1:
            imp = IterativeImputer(missing_values=-1, initial_strategy=strategy, random_state=RANDOM_STATE)
            imputer_type = 'Iterative (10)'
        else:
            imp = IterativeImputer(missing_values=-1, max_iter=100, initial_strategy=strategy, random_state=RANDOM_STATE)
            imputer_type = 'Iterative (100)'
            
        # Fit and transform the imputer
        imputed_df = imp.fit_transform(df)
        
        # Fit Kernel Density Estimation (KDE) on the imputed data
        kde = KernelDensity(bandwidth=0.1)
        kde.fit(imputed_df)

        # Sample from the KDE to get density estimates
        density_estimates = kde.score_samples(df)

        # Assign cluster labels based on density estimates
        threshold = 0 
        y_pred = (density_estimates < threshold).astype(int)

        # Calculate accuracy and mean squared error (MSE) for the predictions
        accuracy = accuracy_score(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        
        # Append the results to the list
        results_kde.append({'Imputer': imputer_type,
                        'Strategy': strategy,
                        'Accuracy (%)': round(accuracy * 100, 2),
                        'MSE': mse})

# Create a DataFrame from the list of results
df_results_kde = pd.DataFrame(results_kde)

# Print the results DataFrame
print(df_results_kde)


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets