In [None]:
# Switch to the parent directory: later cells use paths relative to it
# (e.g. "tutorials/cluster_results.csv" and "paper_figures/").
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level each time -- run it once per kernel session.
%cd ..

# Tutorial: Cluster incomplete multi-modal data

## Prerequisites

We will need the following libraries installed in addition to the main package: seaborn, tqdm and tueplots.

## Step 1: Import required libraries

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datasets import LoadDataset
from imml.preprocessing import MultiViewTransformer, NormalizerNaN
from imml.ampute import Amputer
from imml.cluster import IMSR, PIMVC

In [None]:
from tueplots import axes, bundles

# Apply the tueplots ICML 2022 bundle together with its line settings.
plt.rcParams.update(**bundles.icml2022(), **axes.lines())

# Enlarge every text element on top of the bundle defaults; the legend gets a
# smaller increase than the other elements.
font_size_increase = {
    "axes.labelsize": 6,
    "axes.titlesize": 6,
    "font.size": 6,
    "legend.fontsize": 3,
    "xtick.labelsize": 6,
    "ytick.labelsize": 6,
}
for rc_name, increase in font_size_increase.items():
    plt.rcParams[rc_name] = plt.rcParams[rc_name] + increase

## Step 2: Load the dataset

Load the bbcsport dataset and obtain the true labels and number of clusters.

In [None]:
# Load every modality of the "bbcsport" dataset together with the true labels.
Xs, y = LoadDataset.load_dataset(dataset_name="bbcsport", return_y=True)

# Short summary of what was loaded.
n_samples = len(Xs[0])
n_modalities = len(Xs)
n_features_per_modality = [X.shape[1] for X in Xs]
print("Samples:", n_samples, "\t", "Modalities:", n_modalities, "\t", "Features:", n_features_per_modality)

# Squeeze the labels and use the number of distinct labels as the number of clusters.
y = y.squeeze()
n_clusters = int(y.nunique())

# Displayed as the cell output: how many samples each label has.
y.value_counts()

## Step 3: Benchmarking

Define the parameters for the experiment:
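- ps: the fractions of samples to be made incomplete (from 0% to 80% in 20% steps).
- mechanisms: the different types of missing data mechanisms to simulate.
- n_times: the number of times to repeat the experiment (with a different random seed each time) for averaging the results.
- algorithms: the clustering algorithms to benchmark (IMSR and PIMVC).
- methods: how missing data is handled: "Original" runs the algorithm directly on the incomplete data, while "Baseline" applies mean imputation first.
- all_metrics: an empty dictionary to store the performance metrics (Adjusted Rand Index).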

In [None]:
# What is compared: two clustering algorithms, each run directly ("Original")
# and after mean imputation ("Baseline").
algorithms = ["IMSR", "PIMVC"]
methods = ["Original", "Baseline"]

# Missingness settings: fractions 0.0, 0.2, ..., 0.8 (the stop value 1.0 is
# excluded) under each missing-data mechanism.
ps = np.arange(0.0, 1.0, 0.2)
mechanisms = ["um", "pm", "mcar", "mnar"]

# Number of repetitions per configuration.
n_times = 50

# Filled by the benchmark loop: algorithm -> method -> mechanism -> percentage -> run.
all_metrics = {}

Run the benchmark for the clustering algorithms to be compared: IMSR and PIMVC.
Both algorithms are designed for clustering incomplete multi-view data.

In [None]:
# Explicit name -> class lookup. This replaces eval() on the algorithm names;
# extend the mapping when adding an entry to `algorithms`.
algorithm_classes = {"IMSR": IMSR, "PIMVC": PIMVC}

# Fill all_metrics[algorithm][method][mechanism][percentage][run] with the
# Adjusted Rand Index of each run.
for algorithm in tqdm(algorithms):
    alg = algorithm_classes[algorithm]
    all_metrics[algorithm] = {}
    for method in tqdm(methods):
        all_metrics[algorithm][method] = {}
        for mechanism in mechanisms:
            all_metrics[algorithm][method][mechanism] = {}
            for p in ps:
                # Round before converting: int() alone truncates, so a fraction
                # whose product lands just below an integer (e.g. 0.29 * 100)
                # would be stored under the wrong percentage.
                missing_percentage = int(round(p * 100))
                if (p == 0) and (mechanism != mechanisms[0]):
                    # With p == 0 the results are taken from the first mechanism's
                    # "Original" runs instead of being recomputed.
                    all_metrics[algorithm][method][mechanism][0] = all_metrics[algorithm]["Original"][mechanisms[0]][0]
                else:
                    all_metrics[algorithm][method][mechanism][missing_percentage] = {}
                    for i in range(n_times):
                        # Pipeline order: ampute -> (mean-impute, Baseline only)
                        # -> normalize -> cluster. The run index seeds both the
                        # amputation and the clustering algorithm.
                        steps = [Amputer(p=p, mechanism=mechanism, random_state=i)]
                        if method == "Baseline":
                            steps.append(MultiViewTransformer(SimpleImputer().set_output(transform="pandas")))
                        steps.append(MultiViewTransformer(NormalizerNaN().set_output(transform="pandas")))
                        steps.append(alg(n_clusters=n_clusters, random_state=i))
                        pipeline = make_pipeline(*steps)

                        clusters = pipeline.fit_predict(Xs)
                        metric = adjusted_rand_score(labels_true=y, labels_pred=clusters)
                        all_metrics[algorithm][method][mechanism][missing_percentage][i] = metric

The "Baseline" method serves as a comparison for each algorithm (IMSR and PIMVC).
It uses SimpleImputer to fill missing data, followed by normalization and clustering.

## Step 4: Transform results

Flatten the metrics dictionary into a format suitable for DataFrame creation. Each entry in the DataFrame will represent an algorithm, method, mechanism, and percentage of incomplete samples, and is then melted so that every row holds the score of a single iteration.

In [None]:
# Raw string: "\%" is not a valid Python escape sequence, so a normal string
# literal triggers an invalid-escape warning. The runtime value is unchanged
# (backslash followed by percent sign).
# NOTE(review): the backslash is presumably there for LaTeX-rendered axis labels -- confirm.
incomplete_col = r"Incomplete samples (\%)"

# One record per (algorithm, method, mechanism, percentage); the per-run scores
# in p_dict are spread into one column per run index.
flattened_data = [
    {
        "Algorithm": algorithm,
        "Method": method,
        incomplete_col: p,
        "Mechanism": mechanism,
        **p_dict,
    }
    for algorithm, algorithm_dict in all_metrics.items()
    for method, method_dict in algorithm_dict.items()
    for mechanism, mechanism_dict in method_dict.items()
    for p, p_dict in mechanism_dict.items()
]
df = pd.DataFrame(flattened_data)

# Wide -> long: one row per run, with the run index in "Iteration".
df = df.melt(id_vars=["Algorithm", "Method", "Mechanism", incomplete_col],
             var_name="Iteration", value_name="Adjusted Rand Index")
df = df.sort_values(["Algorithm", "Method", "Mechanism", incomplete_col, "Iteration"],
                    ascending=[True, False, False, True, True])

# index=False is the documented way to omit the row index from the file.
df.to_csv("tutorials/cluster_results.csv", index=False)
df

In [None]:
# Read the benchmark results back from the CSV file written to "tutorials/".
# NOTE(review): presumably this lets the plotting cells run without repeating
# the benchmark -- confirm.
df = pd.read_csv(filepath_or_buffer="tutorials/cluster_results.csv")
df

## Step 5: Visualize results

Create a visualization using seaborn's FacetGrid. We are plotting the Adjusted Rand Index (clustering performance) vs. percentage of incomplete samples. Each subplot corresponds to a different missing data mechanism.

In [None]:
# "Split" identifies each (algorithm, method) combination, e.g. "IMSROriginal".
df["Split"] = df["Algorithm"] + df["Method"]
colorblind_palette = sns.color_palette("colorblind")
# NOTE: despite its name, n_estimators holds the algorithm names, not a count.
n_estimators = df["Algorithm"].unique()


def _styles_by_split(styles_by_name):
    """Map every distinct Split value to the style of the name it contains."""
    return {
        split: style
        for split in df["Split"].unique()
        for name, style in styles_by_name.items()
        if name in split
    }


# Colour and marker encode the algorithm; the line style encodes the method.
estimator_colors_dict = _styles_by_split(
    {estimator: colorblind_palette[i] for i, estimator in enumerate(n_estimators)}
)
estimator_markers_dict = _styles_by_split(
    {estimator: ["o", "+"][i] for i, estimator in enumerate(n_estimators)}
)
method_linestyles_dict = _styles_by_split({"Original": "-", "Baseline": "--"})

# Panel titles for each missing-data mechanism.
mechanism_names = {
    "um": "Unpaired missing",
    "pm": "Partial missing",
    "mnar": "Missing not at random",
    "mcar": "Missing completely at random",
}

In [None]:
def _style_of(name, styles_by_split):
    """Return the style of the first Split key that contains `name`."""
    for split, style in styles_by_split.items():
        if name in split:
            return style
    raise KeyError(name)


# Raw string: "\%" is not a valid Python escape sequence; the runtime value
# (backslash + percent sign) is the same as the column name in df.
x_col = r"Incomplete samples (\%)"

# One panel per missing-data mechanism; every (algorithm, method) split is a
# separate hue level.
g = sns.FacetGrid(data=df, col="Mechanism", legend_out=False, despine=False,
                  xlim=(-5, 150), ylim=(-0.05, 1.05))
g.map_dataframe(sns.pointplot, x=x_col, y="Adjusted Rand Index", hue="Split",
                capsize=0.05, seed=42,
                palette=estimator_colors_dict,
                linestyles=list(method_linestyles_dict.values()),
                markers=list(estimator_markers_dict.values()))

# Algorithm legend: one handle per algorithm. The style dictionaries are keyed
# by Split, so each algorithm's marker and colour are looked up explicitly.
# Zipping the algorithms with the per-Split value lists would pair the second
# algorithm with the first algorithm's marker.
handles = []
for algorithm in n_estimators:
    col = _style_of(algorithm, estimator_colors_dict)
    marker = _style_of(algorithm, estimator_markers_dict)
    handles.append(plt.Line2D([0], [0], marker=marker, color=col, lw=0,
                              markersize=5, markerfacecolor=col))
g.fig.legend(handles=handles, labels=n_estimators.tolist(), title="Algorithm",
             bbox_to_anchor=(0.99, 0.7), loc="center left", frameon=False)

# Imputation legend: exactly one handle per method, so the number of handles
# matches the number of labels.
imputation_labels = {"Original": "No", "Baseline": "Yes"}
handles = [plt.Line2D([0], [0], color='black', lw=2,
                      linestyle=_style_of(method, method_linestyles_dict))
           for method in imputation_labels]
leg = g.fig.legend(handles=handles, labels=list(imputation_labels.values()),
                   title="Mean \n imputation",
                   bbox_to_anchor=(.99, 0.42), loc="center left", frameon=False)
leg.get_title().set_multialignment('center')

# Title each panel with the readable mechanism name and override the x-limits
# passed to FacetGrid with (-0.2, 4.2).
for ax, mechanism in zip(g.axes.flatten(), df["Mechanism"].unique()):
    ax.set_title(mechanism_names[mechanism])
    ax.set_xlim((-0.2, 4.2))

plt.tight_layout()
plt.savefig("paper_figures/cluster.pdf")
plt.savefig("paper_figures/cluster.svg")