In [1]:
# Load packages
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pyreadr
import seaborn as sns

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

from matplotlib import colors as mcolors
# Handle on R's `readRDS`, used to read the `.rds` result files.
readRDS = robjects.r['readRDS']

# Results directory (the loading cells currently spell this path out literally).
PATH_RESULTS = './results/'

# Use the PGF backend and let LaTeX typeset the text of the figures.
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

# Colour palette shared by the figures of this notebook. (An earlier
# assignment from `mcolors.BASE_COLORS` was immediately overwritten by this
# list and has been dropped.)
COLORS = ["#377eb8", "#ff7f00", "#4daf4a",
          "#f781bf", "#a65628", "#984ea3",
          "#999999", "#e41a1c", "#dede00"]
# `sns.set_palette` returns None, so keep the palette object itself in
# `custom_palette` and register it as the default colour cycle separately.
custom_palette = sns.color_palette(COLORS)
sns.set_palette(custom_palette)

In [2]:
# Load results
def _read_pickle(path):
    """Return the object stored in the pickle file located at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Results saved as pickle files.
results_fcubt = _read_pickle('./results/results_fcubt.pkl')
results_FPCA_GMM = _read_pickle('./results/results_FPCA_GMM.pkl')
results_grow = _read_pickle('./results/results_fcubt_grow.pkl')
results_kmeans = _read_pickle('./results/results_kmeans.pkl')
results_kmeans_derivative = _read_pickle('./results/results_kmeans_derivative.pkl')

# Results saved as `.rds` files: read them through R, then convert the R
# data frames to pandas.
# ('./results/results_funhddc.rds' is no longer loaded.)
results_funhddc_review = pandas2ri.rpy2py_dataframe(
    readRDS('./results/results_funhddc_review.rds')
)
results_funclust = pandas2ri.rpy2py_dataframe(
    readRDS('./results/results_funclust.rds')
)

In [3]:
# FCUBT: 'n_clusters' and 'ARI' entries, one per element of `results_fcubt`
n_clusters_fcubt = np.array([run['n_clusters'] for run in results_fcubt])
ARI_fcubt = np.array([run['ARI'] for run in results_fcubt])

In [4]:
# FPCA+GMM: for each element, the key holding the largest value, and that value
n_clusters_FPCA_GMM = np.array([max(scores, key=scores.get) for scores in results_FPCA_GMM])
ARI_FPCA_GMM = np.array([max(scores.values()) for scores in results_FPCA_GMM])

In [5]:
# GROW: 'n_clusters' and 'ARI' entries, one per element of `results_grow`
n_clusters_grow = np.array([run['n_clusters'] for run in results_grow])
ARI_grow = np.array([run['ARI'] for run in results_grow])

In [6]:
# KMEANS: for each element, the key holding the largest value, and that value
n_clusters_kmeans = np.array([max(scores, key=scores.get) for scores in results_kmeans])
ARI_kmeans = np.array([max(scores.values()) for scores in results_kmeans])

In [7]:
# KMEANS DERIVATIVE: for each element, the key holding the largest value, and that value
n_clusters_kmeans_deriv = np.array(
    [max(scores, key=scores.get) for scores in results_kmeans_derivative]
)
ARI_kmeans_deriv = np.array(
    [max(scores.values()) for scores in results_kmeans_derivative]
)

In [8]:
# FUNHDDC
# n_clusters_funhddc = np.array(results_funhddc['n_cluster'])
# ARI_funhddc = np.array(results_funhddc['ARI'])

In [8]:
# FUNHDDC review
# Only the "_B" columns are extracted; the "_A" counterparts stay disabled:
# n_clusters_funhddc_A = np.array(results_funhddc_review['n_cluster_A'])
# ARI_funhddc_A = np.array(results_funhddc_review['ARI_A'])
n_clusters_funhddc_B, ARI_funhddc_B = (
    np.array(results_funhddc_review[column])
    for column in ('n_cluster_B', 'ARI_B')
)

In [9]:
# FUNCLUST
# NOTE(review): `ARI_funclsut` is misspelled (sic), but the cell building the
# `ARI` data frame reads this exact name, so both must be renamed together.
n_clusters_funclust, ARI_funclsut = (
    np.array(results_funclust[column])
    for column in ('n_cluster', 'ARI')
)

In [10]:
# Number of clusters obtained by each method, one column per method.
# The labels are raw strings: in a plain literal, '\t' is a TAB escape, so
# '\texttt{...}' would lose its backslash and no longer be a LaTeX command.
n_clusters = pd.DataFrame({r'\texttt{fCUBT}': n_clusters_fcubt,
                           r'\texttt{Growing}': n_clusters_grow,
                           r'\texttt{FPCA+GMM}': n_clusters_FPCA_GMM,
                           r'\texttt{FunHDDC}_B': n_clusters_funhddc_B,
                           r'\texttt{Funclust}': n_clusters_funclust,
                           r'$k$\texttt{-means-}$d_1$': n_clusters_kmeans,
                           r'$k$\texttt{-means-}$d_2$': n_clusters_kmeans_deriv
                          })

In [11]:
# Relative frequency of each number of clusters, one row per column of
# `n_clusters`. The top-level `pd.value_counts` is deprecated, so the Series
# method is called on each column instead.
n_clusters.apply(lambda column: column.value_counts(normalize=True)).T

Unnamed: 0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0
\texttt{fCUBT},,,,0.692,0.182,0.1,0.022,0.002,0.002,,,,,,
\texttt{Growing},,,,0.516,0.184,0.12,0.086,0.034,0.024,0.014,0.008,0.006,0.004,0.002,0.002
\texttt{FPCA+GMM},,,,0.266,0.45,0.248,0.036,,,,,,,,
\texttt{FunHDDC}_B,0.012,0.006,0.002,0.018,0.04,0.054,0.038,0.242,0.588,,,,,,
\texttt{Funclust},0.284,0.174,0.152,0.156,0.136,0.072,0.026,,,,,,,,
$k$\texttt{-means-}$d_1$,,0.002,0.018,0.046,0.08,0.178,0.676,,,,,,,,
$k$\texttt{-means-}$d_2$,,0.056,0.116,0.822,0.004,0.002,,,,,,,,,


In [12]:
# ARI of each method, one column per method (labels are LaTeX snippets).
ARI = pd.DataFrame(dict([
    (r'\texttt{fCUBT}', ARI_fcubt),
    (r'\texttt{Growing}', ARI_grow),
    (r'\texttt{FPCA+GMM}', ARI_FPCA_GMM),
    (r'\texttt{FunHDDC}', ARI_funhddc_B),
    (r'\texttt{Funclust}', ARI_funclsut),
    (r'$k$\texttt{-means-}$d_1$', ARI_kmeans),
    (r'$k$\texttt{-means-}$d_2$', ARI_kmeans_deriv),
]))

In [16]:
# Horizontal boxplots of the ARI of every method, saved as EPS.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=ARI, orient='h')
bplot.set_yticklabels([])
# Recolour the boxes with the exact palette colours. Matplotlib >= 3.5 lists
# the box patches under `patches`; older versions listed them under `artists`,
# so indexing `artists` directly raises IndexError on recent versions.
boxes = list(bplot.artists) or list(bplot.patches)
for box, color in zip(boxes, COLORS[:7]):
    box.set_facecolor(color)
plt.xlabel('ARI', size=16)
plt.xlim((0, 1))
plt.savefig('./figures/ARI_scenario_2.eps', format='eps')

## Influence of $n_{comp}$

In [2]:
# Load the fCUBT results stored for n_comp = 7, 5 and 3.
with open('./results/results_fcubt_ncomp_7.pkl', mode='rb') as handle:
    results_ncomp7 = pickle.load(handle)
with open('./results/results_fcubt_ncomp_5.pkl', mode='rb') as handle:
    results_ncomp5 = pickle.load(handle)
with open('./results/results_fcubt_ncomp_3.pkl', mode='rb') as handle:
    results_ncomp3 = pickle.load(handle)

In [3]:
# FCUBT n_comp = 7: 'n_clusters' and 'ARI' entries of each element
n_clusters_ncomp7 = np.array([run['n_clusters'] for run in results_ncomp7])
ARI_ncomp7 = np.array([run['ARI'] for run in results_ncomp7])

In [4]:
# FCUBT n_comp = 5: 'n_clusters' and 'ARI' entries of each element
n_clusters_ncomp5 = np.array([run['n_clusters'] for run in results_ncomp5])
ARI_ncomp5 = np.array([run['ARI'] for run in results_ncomp5])

In [5]:
# FCUBT n_comp = 3: 'n_clusters' and 'ARI' entries of each element
n_clusters_ncomp3 = np.array([run['n_clusters'] for run in results_ncomp3])
ARI_ncomp3 = np.array([run['ARI'] for run in results_ncomp3])

In [6]:
# Number of clusters for each value of n_comp, one column per value.
n_clusters = pd.DataFrame(dict(zip(
    ['$J^{(p)} = 3$', '$J^{(p)} = 5$', '$J^{(p)} = 7$'],
    [n_clusters_ncomp3, n_clusters_ncomp5, n_clusters_ncomp7],
)))

In [7]:
# Relative frequency of each number of clusters, one row per column of
# `n_clusters`. The top-level `pd.value_counts` is deprecated, so the Series
# method is called on each column instead.
n_clusters.apply(lambda column: column.value_counts(normalize=True)).T

Unnamed: 0,4,5,6,7
$J^{(p)} = 3$,0.02,0.968,0.012,
$J^{(p)} = 5$,0.05,0.948,,0.002
$J^{(p)} = 7$,0.074,0.914,0.012,


In [8]:
# ARI for each value of n_comp, one column per value.
ARI = pd.DataFrame(dict(zip(
    ['$J^{(p)} = 3$', '$J^{(p)} = 5$', '$J^{(p)} = 7$'],
    [ARI_ncomp3, ARI_ncomp5, ARI_ncomp7],
)))

In [9]:
# Boxplots of the ARI for each column of `ARI`, saved as EPS.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=ARI)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)
# Recolour the boxes with the exact palette colours. Matplotlib >= 3.5 lists
# the box patches under `patches`; older versions listed them under `artists`,
# so indexing `artists` directly raises IndexError on recent versions.
boxes = list(bplot.artists) or list(bplot.patches)
for box, color in zip(boxes, COLORS[:3]):
    box.set_facecolor(color)
plt.ylabel('ARI', size=16)
plt.ylim((0.4, 1))
plt.savefig('./figures/ARI_ncomp_scenario_2.eps', format='eps')

## Different $n_{comp}$ for the nodes

In [2]:
# Load the review fCUBT results (this rebinds `results_fcubt`).
with open('./results/results_fcubt_review.pkl', mode='rb') as handle:
    results_fcubt = pickle.load(handle)

In [3]:
# 'n_clusters' and 'ARI' entries, one per element of `results_fcubt`
n_clusters = np.array([run['n_clusters'] for run in results_fcubt])
ARI = np.array([run['ARI'] for run in results_fcubt])

In [6]:
# Wrap the counts in a one-column frame. The label is a raw string: in a
# plain literal, '\t' is a TAB escape, so '\texttt{...}' would lose its
# backslash and no longer be a LaTeX command.
n_clusters = pd.DataFrame({r'\texttt{fCUBT}': n_clusters})

In [10]:
# LaTeX table of the relative frequency of each number of clusters. The
# top-level `pd.value_counts` is deprecated, so the Series method is called
# on each column instead.
n_clusters.apply(lambda column: column.value_counts(normalize=True)).T.to_latex()

'\\begin{tabular}{lrrrr}\n\\toprule\n{} &      5 &      6 &      7 &      8 \\\\\n\\midrule\n\\textbackslash texttt\\{fCUBT\\} &  0.958 &  0.034 &  0.006 &  0.002 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [8]:
# Wrap the ARI values in a one-column frame labelled with a LaTeX snippet.
ARI = pd.DataFrame(data={r'\texttt{fCUBT}': ARI})

In [11]:
# LaTeX table of the ARI quantiles at 11 evenly spaced levels from 0 to 1.
quantile_levels = np.linspace(0, 1, 11)
ARI.quantile(q=quantile_levels).T.to_latex()

'\\begin{tabular}{lrrrrrrrrrrr}\n\\toprule\n{} &       0.0 &       0.1 &       0.2 &       0.3 &       0.4 &       0.5 &       0.6 &       0.7 &       0.8 &       0.9 &       1.0 \\\\\n\\midrule\n\\textbackslash texttt\\{fCUBT\\} &  0.592862 &  0.867525 &  0.890652 &  0.896316 &  0.901226 &  0.903975 &  0.907945 &  0.911932 &  0.916064 &  0.921914 &  0.941173 \\\\\n\\bottomrule\n\\end{tabular}\n'

## Gaussian assumption

In [2]:
# Read the CSV file whose columns are drawn as boxplots of $p$-values below.
test_gaussian = pd.read_csv(filepath_or_buffer='./results/test_gaussian.csv')

In [5]:
# Horizontal boxplots of the columns of `test_gaussian`, saved as EPS.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=test_gaussian, orient='h')
bplot.set_yticklabels([])
# Recolour the boxes with the exact palette colours. Matplotlib >= 3.5 lists
# the box patches under `patches`; older versions listed them under `artists`,
# so indexing `artists` directly raises IndexError on recent versions.
boxes = list(bplot.artists) or list(bplot.patches)
for box, color in zip(boxes, COLORS[:5]):
    box.set_facecolor(color)
plt.xlabel('$p$-values', size=16)
plt.xlim((0, 1))
# Vertical reference line at 0.05.
plt.axvline(x=0.05, color='red', linestyle='-.')
plt.savefig('./figures/test_gaussian.eps', format='eps')

## Comparison with classification

In [25]:
# Load the results of the comparison with supervised classifiers.
with open('./results/results_fcubt_classif.pkl', mode='rb') as handle:
    results_classif = pickle.load(handle)

In [26]:
# Keep only the entries whose 'n_clusters' equals 5.
res = [entry for entry in results_classif if entry['n_clusters'] == 5]

In [27]:
# Report how many entries were kept.
print('Number of selected dataset {}.'.format(len(res)))

Number of selected dataset 172.


In [28]:
# Turn the kept entries into a frame, give the ARI columns LaTeX labels,
# drop 'n_clusters' and fix the column order.
latex_labels = {'ARI_fcubt': r'\texttt{fCUBT}',
                'ARI_gp': r'\texttt{GPC}',
                'ARI_rf': r'\texttt{Random Forest}'}
res = (
    pd.DataFrame(res)
    .rename(columns=latex_labels)
    .drop(['n_clusters'], axis=1)
    .reindex(columns=[r'\texttt{fCUBT}', r'\texttt{GPC}', r'\texttt{Random Forest}'])
)

In [29]:
# Boxplots of the ARI for each column of `res`, saved as EPS.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=res)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)

# Recolour the three boxes: the first palette colour for the first box, the
# last two palette colours for the other two. Matplotlib >= 3.5 lists the box
# patches under `patches`; older versions listed them under `artists`, so
# indexing `artists` directly raises IndexError on recent versions.
boxes = list(bplot.artists) or list(bplot.patches)
for box, color in zip(boxes, (COLORS[0], COLORS[-2], COLORS[-1])):
    box.set_facecolor(color)

plt.ylabel('ARI', size=16)
plt.ylim((0, 1))
plt.savefig('./figures/comparison_scenario_2.eps', format='eps')

## Comparison with classification (review)

In [2]:
# Load the review version of the comparison with supervised classifiers.
with open('./results/results_fcubt_classif_review.pkl', mode='rb') as handle:
    results_classif_review = pickle.load(handle)

In [3]:
# Keep only the entries whose 'n_clusters' equals 5.
res = [entry for entry in results_classif_review if entry['n_clusters'] == 5]

In [4]:
# Report how many entries were kept.
print('Number of selected dataset {}.'.format(len(res)))

Number of selected dataset 436.


In [5]:
# Turn the kept entries into a frame, give the ARI columns LaTeX labels,
# drop 'n_clusters' and fix the column order.
latex_labels = {'ARI_fcubt': r'\texttt{fCUBT}',
                'ARI_gp': r'\texttt{GPC}',
                'ARI_rf': r'\texttt{Random Forest}'}
res = (
    pd.DataFrame(res)
    .rename(columns=latex_labels)
    .drop(['n_clusters'], axis=1)
    .reindex(columns=[r'\texttt{fCUBT}', r'\texttt{GPC}', r'\texttt{Random Forest}'])
)

In [6]:
# Boxplots of the ARI for each column of `res`, saved as EPS.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=res)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)

# Recolour the three boxes: the first palette colour for the first box, the
# last two palette colours for the other two. Matplotlib >= 3.5 lists the box
# patches under `patches`; older versions listed them under `artists`, so
# indexing `artists` directly raises IndexError on recent versions.
boxes = list(bplot.artists) or list(bplot.patches)
for box, color in zip(boxes, (COLORS[0], COLORS[-2], COLORS[-1])):
    box.set_facecolor(color)

plt.ylabel('ARI', size=16)
plt.ylim((0.4, 1))
plt.savefig('./figures/comparison_review_scenario_2.eps', format='eps')

## Prediction

In [35]:
# Load the prediction results stored for 200, 500 and 1000.
with open('./results/results_fcubt_prediction_200.pkl', mode='rb') as handle:
    results_pred_200 = pickle.load(handle)
with open('./results/results_fcubt_prediction_500.pkl', mode='rb') as handle:
    results_pred_500 = pickle.load(handle)
with open('./results/results_fcubt_prediction_1000.pkl', mode='rb') as handle:
    results_pred_1000 = pickle.load(handle)

In [36]:
# Gather the three sets of curves in one array: axis 0 indexes the elements
# of the result lists, axis 2 the setting (0: 200, 1: 500, 2: 1000).
# The first axis is sized from the data rather than hard-coded to 500, so no
# row of the `np.empty` buffer stays uninitialised (which would corrupt the
# means plotted later) and the loop cannot run past the end of the array.
n_simulations = len(results_pred_200)
results_pred_array = np.empty((n_simulations, 999, 3))
for idx in range(n_simulations):
    # The last value of the 200 and 1000 curves is dropped, presumably because
    # they hold one more point than the 500 curves -- TODO confirm.
    results_pred_array[idx, :, 0] = results_pred_200[idx][:-1]
    results_pred_array[idx, :, 1] = results_pred_500[idx]
    results_pred_array[idx, :, 2] = results_pred_1000[idx][:-1]

In [37]:
# Individual curves (grey) and their pointwise mean (red) for slot 0 of the
# last axis, which was filled from the 200 results.
plt.figure(figsize=(10, 6.21), constrained_layout=True)
curves_200 = results_pred_array[:, :, 0]
for curve in curves_200[:len(results_pred_200)]:
    plt.plot(curve, c=COLORS[6])
plt.plot(curves_200.mean(axis=0), c=COLORS[-2])

plt.ylabel('ARI', size=16)
plt.xlabel('Size of the online dataset', size=16)
plt.savefig('./figures/prediction_200_scenario_2.eps', format='eps')

In [38]:
# Individual curves (grey) and their pointwise mean (red) for slot 1 of the
# last axis, which was filled from the 500 results.
plt.figure(figsize=(10, 6.21), constrained_layout=True)
curves_500 = results_pred_array[:, :, 1]
for curve in curves_500[:len(results_pred_500)]:
    plt.plot(curve, c=COLORS[6])
plt.plot(curves_500.mean(axis=0), c=COLORS[-2])

plt.ylabel('ARI', size=16)
plt.xlabel('Size of the online dataset', size=16)
plt.savefig('./figures/prediction_500_scenario_2.eps', format='eps')

In [39]:
# Individual curves (grey) and their pointwise mean (red) for slot 2 of the
# last axis, which was filled from the 1000 results.
plt.figure(figsize=(10, 6.21), constrained_layout=True)
curves_1000 = results_pred_array[:, :, 2]
for curve in curves_1000[:len(results_pred_1000)]:
    plt.plot(curve, c=COLORS[6])
plt.plot(curves_1000.mean(axis=0), c=COLORS[-2])

plt.ylabel('ARI', size=16)
plt.xlabel('Size of the online dataset', size=16)
plt.savefig('./figures/prediction_1000_scenario_2.eps', format='eps')

In [40]:
# Boxplots of the last value of every curve, one box per setting.
res = pd.DataFrame(results_pred_array[:, -1, :])
res.rename(columns = {0: '$N = 200$',
                      1: '$N = 500$',
                      2: '$N = 1000$'}, inplace = True)
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=res)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)
# Recolour the boxes with the exact palette colours. Matplotlib >= 3.5 lists
# the box patches under `patches`; older versions listed them under `artists`,
# so indexing `artists` directly raises IndexError on recent versions.
boxes = list(bplot.artists) or list(bplot.patches)
for box, color in zip(boxes, COLORS[:3]):
    box.set_facecolor(color)
plt.xticks(size=20)
plt.ylabel('ARI', size=16)
plt.ylim((0, 1))
# Explicit format, consistent with the other figures of the notebook.
plt.savefig('./figures/prediction_scenario_2.eps', format='eps')