In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import adjusted_mutual_info_score
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from imvc.datasets import LoadDataset
from imvc.preprocessing import MultiViewTransformer, NormalizerNaN
from imvc.ampute import Amputer
from imvc.cluster import LFIMVC

In [2]:
# Load the "bbcsport" dataset: `Xs` is the collection of views, `y` the labels.
Xs, y = LoadDataset.load_dataset(dataset_name="bbcsport", return_y=True)

# Cast every view to float before it is fed to the pipelines.
Xs = [view.astype(float) for view in Xs]

# Squeeze the labels, then use the number of distinct label values
# as the number of clusters to look for.
y = y.squeeze()
n_clusters = y.nunique()

In [3]:
# Values of `p` handed to the Amputer: 0.0 up to (but excluding) 1.0 in steps of 0.2.
ps = np.arange(0., 1., 0.2)

# One AMI score per entry of `ps`, for LFIMVC run without any imputation step.
metrics = []
for p in ps:
    # The same random_state is used at every rate, so `p` is the only thing that varies.
    amputer = Amputer(p=p, mechanism="MCAR", random_state=42)
    normalizer = MultiViewTransformer(NormalizerNaN().set_output(transform="pandas"))
    clusterer = LFIMVC(n_clusters=int(n_clusters), random_state=42)
    pipeline = make_pipeline(amputer, normalizer, clusterer)

    # Cluster the amputed views and score the assignment against the true labels.
    clusters = pipeline.fit_predict(Xs)
    metric = adjusted_mutual_info_score(labels_true=y, labels_pred=clusters)
    metrics.append(metric)

In [4]:
# Baseline: same pipeline as the LFIMVC sweep, but with a SimpleImputer step
# applied to every view before normalisation and clustering.
# The first entry of `ps` is skipped in the loop; its score is reused from `metrics[0]` below.
baselines = []
for p in ps[1:]:
    steps = [
        Amputer(p=p, mechanism="MCAR", random_state=42),
        MultiViewTransformer(SimpleImputer().set_output(transform="pandas")),
        MultiViewTransformer(NormalizerNaN().set_output(transform="pandas")),
        LFIMVC(n_clusters=int(n_clusters), random_state=42),
    ]
    pipeline = make_pipeline(*steps)

    # Cluster the imputed views and score the assignment against the true labels.
    clusters = pipeline.fit_predict(Xs)
    baseline = adjusted_mutual_info_score(labels_true=y, labels_pred=clusters)
    baselines.append(baseline)

# Prepend the score of the first rate so `baselines` lines up with `ps` entry for entry.
baselines = [metrics[0], *baselines]

In [5]:
# `ps` comes from a float `np.arange`, so `p * 100` carries floating-point noise
# (e.g. 60.00000000000001). Round to whole percentages for clean index labels.
missing_pct = np.round(ps * 100).astype(int)

# One row per missing rate, one column per method.
scores = pd.DataFrame({"LFIMVC": metrics, "Baseline": baselines}, index=missing_pct)

ax = scores.plot(style="-o", ylim=(0, 1), ylabel="AMI", xlabel="% Missing Rate", rot=0)

# Give the second curve (Baseline) a distinct marker, then redraw the legend
# so that it picks up the changed marker.
ax.get_lines()[1].set_marker("X")
_ = ax.legend()