In [1]:
# Load packages
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pyreadr
import seaborn as sns

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

from matplotlib import colors as mcolors
# Matplotlib's base colours as a list.
# NOTE(review): COLORS is reassigned to an explicit hex palette further down in
# this cell, so this first value is not the one the figures end up using.
COLORS = list(mcolors.BASE_COLORS.values())

# R's `readRDS` function, looked up through rpy2; used to read the .rds files.
readRDS = robjects.r['readRDS']

# Directory of the result files (the loading cells below spell the paths out
# literally rather than referencing this constant).
PATH_RESULTS = './results/'

# Use the PGF backend with pdflatex so that figure text is typeset by LaTeX.
matplotlib.use("pgf")
_PGF_RC_PARAMS = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
matplotlib.rcParams.update(_PGF_RC_PARAMS)

# Explicit hex palette; the plotting cells index into it to colour boxes and lines.
COLORS = ["#377eb8", "#ff7f00", "#4daf4a",
          "#f781bf", "#a65628", "#984ea3",
          "#999999", "#e41a1c", "#dede00"]
# `sns.set_palette` returns None, so build the palette first and keep a real
# reference to it instead of binding `custom_palette` to None.
custom_palette = sns.color_palette(COLORS)
sns.set_palette(custom_palette)

In [2]:
# Load results
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Unpickling can execute arbitrary code: only use it on files produced by
    this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


results_fcubt = _load_pickle('./results/results_fcubt.pkl')
results_FPCA_GMM = _load_pickle('./results/results_FPCA_GMM.pkl')
results_grow = _load_pickle('./results/results_fcubt_grow.pkl')
results_kmeans = _load_pickle('./results/results_kmeans.pkl')
results_kmeans_derivative = _load_pickle('./results/results_kmeans_derivative.pkl')

#results_funhddc = readRDS('./results/results_funhddc.rds')
#results_funhddc = pandas2ri.rpy2py_dataframe(results_funhddc)
# Read each .rds file with R's readRDS and convert the result with
# rpy2's `rpy2py_dataframe` so it can be indexed by column name below.
results_funhddc_review = pandas2ri.rpy2py_dataframe(
    readRDS('./results/results_funhddc_review.rds')
)
results_funclust = pandas2ri.rpy2py_dataframe(
    readRDS('./results/results_funclust.rds')
)

In [3]:
# FCUBT
# One entry per element of `results_fcubt`: its 'n_clusters' and 'ARI' fields.
n_clusters_fcubt = np.array([run['n_clusters'] for run in results_fcubt])
ARI_fcubt = np.array([run['ARI'] for run in results_fcubt])

In [4]:
# FPCA+GMM
# For each simulation, keep the key with the largest value and that largest value.
# Presumably each simulation maps a number of clusters to its ARI -- TODO confirm.
n_clusters_FPCA_GMM = np.array([max(scores, key=scores.get)
                                for scores in results_FPCA_GMM.values()])
ARI_FPCA_GMM = np.array([max(scores.values())
                         for scores in results_FPCA_GMM.values()])

In [5]:
# GROW
# One entry per element of `results_grow`: its 'n_clusters' and 'ARI' fields.
n_clusters_grow = np.array([run['n_clusters'] for run in results_grow])
ARI_grow = np.array([run['ARI'] for run in results_grow])

In [6]:
# KMEANS
# For each simulation, keep the key with the largest value and that largest value.
# Presumably each simulation maps a number of clusters to its ARI -- TODO confirm.
n_clusters_kmeans = np.array([max(scores, key=scores.get)
                              for scores in results_kmeans])
ARI_kmeans = np.array([max(scores.values()) for scores in results_kmeans])

In [7]:
# KMEANS DERIVATIVE
# Same extraction as for k-means, applied to `results_kmeans_derivative`:
# the key with the largest value, and that largest value, per simulation.
n_clusters_kmeans_deriv = np.array([max(scores, key=scores.get)
                                    for scores in results_kmeans_derivative])
ARI_kmeans_deriv = np.array([max(scores.values())
                             for scores in results_kmeans_derivative])

In [10]:
# FUNHDDC
# n_clusters_funhddc = np.array(results_funhddc['n_cluster'])
# ARI_funhddc = np.array(results_funhddc['ARI'])

In [8]:
# FUNHDDC review
# n_clusters_funhddc_A = np.array(results_funhddc_review['n_cluster_A'])
# ARI_funhddc_A = np.array(results_funhddc_review['ARI_A'])
# Only the "_B" columns of the review results are used.
n_clusters_funhddc_B, ARI_funhddc_B = (
    np.array(results_funhddc_review[column])
    for column in ('n_cluster_B', 'ARI_B')
)

In [9]:
# FUNCLUST
# Copy the 'n_cluster' and 'ARI' columns into NumPy arrays.
n_clusters_funclust, ARI_funclust = (
    np.array(results_funclust[column]) for column in ('n_cluster', 'ARI')
)

In [10]:
# Number of clusters per simulation, one column per method.
# The backslashes are doubled on purpose: in a regular string literal '\t' is a
# TAB character, which would turn the LaTeX label into "<TAB>exttt{...}".
n_clusters = pd.DataFrame({'\\texttt{fCUBT}': n_clusters_fcubt,
                           '\\texttt{Growing}': n_clusters_grow,
                           '\\texttt{FPCA+GMM}': n_clusters_FPCA_GMM,
                           '\\texttt{FunHDDC}_B': n_clusters_funhddc_B,
                           '\\texttt{Funclust}': n_clusters_funclust,
                           '$k$\\texttt{-means-}$d_1$': n_clusters_kmeans,
                           '$k$\\texttt{-means-}$d_2$': n_clusters_kmeans_deriv
                          })

In [11]:
# Relative frequency of each number of clusters, one row per column of `n_clusters`.
# `Series.value_counts` replaces the deprecated top-level `pandas.value_counts`.
n_clusters.apply(lambda column: column.value_counts(normalize=True)).T

Unnamed: 0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,16.0
\texttt{fCUBT},,,,0.982,0.018,,,,,,,,,
\texttt{Growing},,,,0.596,0.234,0.08,0.028,0.026,0.012,0.01,0.006,0.004,0.002,0.002
\texttt{FPCA+GMM},,0.002,0.012,0.53,0.316,0.11,0.03,,,,,,,
\texttt{FunHDDC}_B,0.346,0.45,0.184,0.016,0.004,,,,,,,,,
\texttt{Funclust},0.444,0.442,0.106,0.006,0.002,,,,,,,,,
$k$\texttt{-means-}$d_1$,,0.15,0.154,0.606,0.016,0.07,0.004,,,,,,,
$k$\texttt{-means-}$d_2$,,,0.002,0.05,0.29,0.362,0.296,,,,,,,


In [13]:
# ARI per simulation, one column per method (column names are LaTeX labels).
_ari_by_method = {
    '\\texttt{fCUBT}': ARI_fcubt,
    '\\texttt{Growing}': ARI_grow,
    '\\texttt{FPCA+GMM}': ARI_FPCA_GMM,
    '\\texttt{FunHDDC}': ARI_funhddc_B,
    '\\texttt{Funclust}': ARI_funclust,
    '$k$\\texttt{-means-}$d_1$': ARI_kmeans,
    '$k$\\texttt{-means-}$d_2$': ARI_kmeans_deriv,
}
ARI = pd.DataFrame(_ari_by_method)

In [23]:
# Horizontal box plot with one box per column of `ARI`.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=ARI, orient='h')
bplot.set_yticklabels(bplot.get_yticklabels(), size=15)
# Older matplotlib/seaborn releases expose the boxes in `Axes.artists`, newer
# ones in `Axes.patches`; fall back so that an empty `artists` list does not
# raise IndexError. NOTE(review): confirm against the installed versions.
boxes = bplot.artists if len(bplot.artists) > 0 else bplot.patches
for i in range(ARI.shape[1]):
    mybox = boxes[i]
    mybox.set_facecolor(COLORS[i])
plt.xlabel('ARI', size=16)
plt.xlim((0, 1))
plt.savefig('./figures/ARI_scenario_1.eps', format='eps')

## Gaussian assumption

In [2]:
# NOTE(review): this rebinds `results_fcubt`, replacing the object loaded from
# results_fcubt.pkl at the top of the notebook.
with open('./results/results_fcubt_review.pkl', 'rb') as handle:
    results_fcubt = pickle.load(handle)

In [4]:
# One entry per element of `results_fcubt`: its 'n_clusters' and 'ARI' fields.
n_clusters_fcubt = np.array([run['n_clusters'] for run in results_fcubt])
ARI_fcubt = np.array([run['ARI'] for run in results_fcubt])

In [7]:
# Doubled backslash: '\t' in a regular string literal is a TAB character, which
# would turn the LaTeX label into "<TAB>exttt{fCUBT}".
n_clusters = pd.DataFrame({'\\texttt{fCUBT}': n_clusters_fcubt})

In [23]:
# Relative frequency of each number of clusters, one row per column of `n_clusters`.
# `Series.value_counts` replaces the deprecated top-level `pandas.value_counts`.
n_clusters.apply(lambda column: column.value_counts(normalize=True)).T

'\\begin{tabular}{lrrrrr}\n\\toprule\n{} &      5 &      6 &     7 &      8 &      9 \\\\\n\\midrule\n\\textbackslash texttt\\{fCUBT\\} &  0.532 &  0.352 &  0.08 &  0.028 &  0.008 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [15]:
# Single-column frame holding `ARI_fcubt` under its LaTeX label.
ARI = pd.DataFrame(data={'\\texttt{fCUBT}': ARI_fcubt})

In [22]:
# Quantiles of each column of `ARI` at levels 0, 0.1, ..., 1, shown one row per column.
decile_levels = np.linspace(0, 1, 11)
ARI.quantile(q=decile_levels).T

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
\texttt{fCUBT},0.489384,0.653061,0.677296,0.692782,0.70648,0.718602,0.727707,0.738394,0.751004,0.770572,0.823693


In [54]:
# Table plotted in the next cell; presumably p-values (that plot labels its
# axis '$p$-values') -- TODO confirm the column layout.
test_gaussian = pd.read_csv(filepath_or_buffer='./results/test_gaussian.csv')

In [59]:
# Horizontal box plot of `test_gaussian`, with a vertical line at 0.05.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=test_gaussian, orient='h')
bplot.set_yticklabels(bplot.get_yticklabels(), size=15)
# Older matplotlib/seaborn releases expose the boxes in `Axes.artists`, newer
# ones in `Axes.patches`; fall back so that an empty `artists` list does not
# raise IndexError. NOTE(review): confirm against the installed versions.
boxes = bplot.artists if len(bplot.artists) > 0 else bplot.patches
# Assumes the CSV yields exactly five boxes -- TODO confirm.
for i in range(5):
    mybox = boxes[i]
    mybox.set_facecolor(COLORS[i])
plt.xlabel('$p$-values', size=16)
plt.xlim((0, 1))
plt.axvline(x=0.05, color='red', linestyle='-.')
plt.savefig('./figures/test_gaussian.eps', format='eps')

## Computation time

In [48]:
with open('./results/results_fcubt_comptime.pkl', 'rb') as handle:
    results_comptime = pickle.load(handle)
# Keep only the 'comp_time' field of every loaded entry.
results_comptime = np.array([run['comp_time'] for run in results_comptime])

In [49]:
# Violin plot of the computation times.
# The former `for i in range(0)` recolouring loop never executed and was removed.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.violinplot(x=results_comptime)
bplot.set_yticklabels(bplot.get_yticklabels(), size=15)
plt.xlabel('Computation time', size=16)
plt.savefig('./figures/comptime_scenario_1.eps', format='eps')

## Influence of $n_{comp}$

In [29]:
# Load the fCUBT results stored in the ncomp_7, ncomp_5 and ncomp_3 pickle files.
with open('./results/results_fcubt_ncomp_7.pkl', 'rb') as handle:
    results_ncomp7 = pickle.load(handle)
with open('./results/results_fcubt_ncomp_5.pkl', 'rb') as handle:
    results_ncomp5 = pickle.load(handle)
with open('./results/results_fcubt_ncomp_3.pkl', 'rb') as handle:
    results_ncomp3 = pickle.load(handle)

In [30]:
# FCUBT n_comp = 7
# Both arrays must come from `results_ncomp7`; the number of clusters was
# previously read from `results_ncomp5` (copy-paste slip).
n_clusters_ncomp7 = np.array([simu['n_clusters'] for simu in results_ncomp7])
ARI_ncomp7 = np.array([simu['ARI'] for simu in results_ncomp7])

In [31]:
# FCUBT n_comp = 5
# One entry per element of `results_ncomp5`: its 'n_clusters' and 'ARI' fields.
n_clusters_ncomp5 = np.array([run['n_clusters'] for run in results_ncomp5])
ARI_ncomp5 = np.array([run['ARI'] for run in results_ncomp5])

In [32]:
# FCUBT n_comp = 3
# One entry per element of `results_ncomp3`: its 'n_clusters' and 'ARI' fields.
n_clusters_ncomp3 = np.array([run['n_clusters'] for run in results_ncomp3])
ARI_ncomp3 = np.array([run['ARI'] for run in results_ncomp3])

In [33]:
# Number of clusters per simulation, one column per value of n_comp.
_n_clusters_by_ncomp = {
    '$n_{comp} = 3$': n_clusters_ncomp3,
    '$n_{comp} = 5$': n_clusters_ncomp5,
    '$n_{comp} = 7$': n_clusters_ncomp7,
}
n_clusters = pd.DataFrame(_n_clusters_by_ncomp)

In [34]:
# Relative frequency of each number of clusters, one row per column of `n_clusters`.
# `Series.value_counts` replaces the deprecated top-level `pandas.value_counts`.
n_clusters.apply(lambda column: column.value_counts(normalize=True)).T

Unnamed: 0,5
$n_{comp} = 3$,1.0
$n_{comp} = 5$,1.0
$n_{comp} = 7$,1.0


In [35]:
# ARI per simulation, one column per value of n_comp.
_ari_by_ncomp = {
    '$n_{comp} = 3$': ARI_ncomp3,
    '$n_{comp} = 5$': ARI_ncomp5,
    '$n_{comp} = 7$': ARI_ncomp7,
}
ARI = pd.DataFrame(_ari_by_ncomp)

In [36]:
# Box plot with one box per column of `ARI`.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=ARI)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)
# Older matplotlib/seaborn releases expose the boxes in `Axes.artists`, newer
# ones in `Axes.patches`; fall back so that an empty `artists` list does not
# raise IndexError. NOTE(review): confirm against the installed versions.
boxes = bplot.artists if len(bplot.artists) > 0 else bplot.patches
for i in range(ARI.shape[1]):
    mybox = boxes[i]
    mybox.set_facecolor(COLORS[i])
plt.ylabel('ARI', size=16)
plt.ylim((0.4, 1))
plt.savefig('./figures/ARI_ncomp_scenario_1.eps', format='eps')

## Comparison with classification

In [32]:
# Load the object stored in results_fcubt_classif.pkl.
with open('./results/results_fcubt_classif.pkl', 'rb') as handle:
    results_classif = pickle.load(handle)

In [33]:
# Keep only the simulations whose 'n_clusters' entry equals 5.
res = [simulation for simulation in results_classif
       if simulation['n_clusters'] == 5]

In [34]:
# Report how many simulations passed the filter above.
n_selected = len(res)
print(f'Number of selected dataset {n_selected}.')

Number of selected dataset 257.


In [35]:
# Turn the selected simulations into a frame, relabel the three ARI columns with
# their LaTeX names, drop 'n_clusters' and fix the column order.
res = (
    pd.DataFrame(res)
    .rename(columns={'ARI_fcubt': '\\texttt{fCUBT}',
                     'ARI_gp': '\\texttt{GPC}',
                     'ARI_rf': '\\texttt{Random Forest}'})
    .drop(['n_clusters'], axis=1)
    .reindex(columns=['\\texttt{fCUBT}', '\\texttt{GPC}', '\\texttt{Random Forest}'])
)

In [36]:
# Box plot with one box per column of `res`.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=res)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)

# Older matplotlib/seaborn releases expose the boxes in `Axes.artists`, newer
# ones in `Axes.patches`; fall back so that an empty `artists` list does not
# raise IndexError. NOTE(review): confirm against the installed versions.
boxes = bplot.artists if len(bplot.artists) > 0 else bplot.patches
# (box position, index into COLORS): the first box takes the first palette
# colour, the other two take the last two palette colours.
for box_index, color_index in ((0, 0), (1, -2), (2, -1)):
    mybox = boxes[box_index]
    mybox.set_facecolor(COLORS[color_index])

plt.ylabel('ARI', size=16)
plt.ylim((0, 1))
plt.savefig('./figures/comparison_scenario_1.eps', format='eps')

## Comparison with classification (review)

In [24]:
# Load the object stored in results_fcubt_classif_review.pkl.
with open('./results/results_fcubt_classif_review.pkl', 'rb') as handle:
    results_classif_review = pickle.load(handle)

In [25]:
# Keep only the simulations whose 'n_clusters' entry equals 5.
res = [simulation for simulation in results_classif_review
       if simulation['n_clusters'] == 5]

In [26]:
# Report how many simulations passed the filter above.
n_selected = len(res)
print(f'Number of selected dataset {n_selected}.')

Number of selected dataset 491.


In [27]:
# Turn the selected simulations into a frame, relabel the three ARI columns with
# their LaTeX names, drop 'n_clusters' and fix the column order.
res = (
    pd.DataFrame(res)
    .rename(columns={'ARI_fcubt': '\\texttt{fCUBT}',
                     'ARI_gp': '\\texttt{GPC}',
                     'ARI_rf': '\\texttt{Random Forest}'})
    .drop(['n_clusters'], axis=1)
    .reindex(columns=['\\texttt{fCUBT}', '\\texttt{GPC}', '\\texttt{Random Forest}'])
)

In [28]:
# Box plot with one box per column of `res`.
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=res)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)

# Older matplotlib/seaborn releases expose the boxes in `Axes.artists`, newer
# ones in `Axes.patches`; fall back so that an empty `artists` list does not
# raise IndexError. NOTE(review): confirm against the installed versions.
boxes = bplot.artists if len(bplot.artists) > 0 else bplot.patches
# (box position, index into COLORS): the first box takes the first palette
# colour, the other two take the last two palette colours.
for box_index, color_index in ((0, 0), (1, -2), (2, -1)):
    mybox = boxes[box_index]
    mybox.set_facecolor(COLORS[color_index])

plt.ylabel('ARI', size=16)
plt.ylim((0.4, 1))
plt.savefig('./figures/comparison_review_scenario_1.eps', format='eps')

## Prediction

In [2]:
# Load the prediction results stored in the _200, _500 and _1000 pickle files.
with open('./results/results_fcubt_prediction_200.pkl', 'rb') as handle:
    results_pred_200 = pickle.load(handle)
with open('./results/results_fcubt_prediction_500.pkl', 'rb') as handle:
    results_pred_500 = pickle.load(handle)
with open('./results/results_fcubt_prediction_1000.pkl', 'rb') as handle:
    results_pred_1000 = pickle.load(handle)

In [3]:
# Gather the three experiments in one array: axis 0 indexes the entries of the
# result lists, axis 1 has length 1000, axis 2 is the experiment (200/500/1000).
# The first dimension is taken from the data instead of a hard-coded 500, so no
# uninitialised `np.empty` rows can leak into the means and box plot computed
# later, and longer result lists no longer raise IndexError.
n_simulations = len(results_pred_200)
results_pred_array = np.empty((n_simulations, 1000, 3))
for idx in range(n_simulations):
    results_pred_array[idx, :, 0] = results_pred_200[idx]
    results_pred_array[idx, :, 1] = results_pred_500[idx]
    results_pred_array[idx, :, 2] = results_pred_1000[idx]

In [45]:
# One curve per simulation for the first experiment (slot 0), drawn with
# COLORS[6], plus their mean over simulations drawn with COLORS[-2].
plt.figure(figsize=(10, 6.21), constrained_layout=True)
curves_200 = results_pred_array[:, :, 0]
for curve in curves_200[:len(results_pred_200)]:
    plt.plot(curve, c=COLORS[6])
plt.plot(curves_200.mean(axis=0), c=COLORS[-2])

plt.ylabel('ARI', size=16)
plt.xlabel('Size of the online dataset', size=16)
plt.savefig('./figures/prediction_200_scenario_1.eps', format='eps')

In [46]:
# One curve per simulation for the second experiment (slot 1), drawn with
# COLORS[6], plus their mean over simulations drawn with COLORS[-2].
plt.figure(figsize=(10, 6.21), constrained_layout=True)
curves_500 = results_pred_array[:, :, 1]
for curve in curves_500[:len(results_pred_500)]:
    plt.plot(curve, c=COLORS[6])
plt.plot(curves_500.mean(axis=0), c=COLORS[-2])

plt.ylabel('ARI', size=16)
plt.xlabel('Size of the online dataset', size=16)
plt.savefig('./figures/prediction_500_scenario_1.eps', format='eps')

In [47]:
# One curve per simulation for the third experiment (slot 2), drawn with
# COLORS[6], plus their mean over simulations drawn with COLORS[-2].
plt.figure(figsize=(10, 6.21), constrained_layout=True)
curves_1000 = results_pred_array[:, :, 2]
for curve in curves_1000[:len(results_pred_1000)]:
    plt.plot(curve, c=COLORS[6])
plt.plot(curves_1000.mean(axis=0), c=COLORS[-2])

plt.ylabel('ARI', size=16)
plt.xlabel('Size of the online dataset', size=16)
plt.savefig('./figures/prediction_1000_scenario_1.eps', format='eps')

In [None]:
# Values at the last position of axis 1, one column per experiment; `rename`
# returns the relabelled frame, so no in-place mutation is needed.
res = pd.DataFrame(results_pred_array[:, -1, :]).rename(
    columns={0: '$N_0 = 200$',
             1: '$N_0 = 500$',
             2: '$N_0 = 1000$'})
plt.figure(figsize=(5, 5), constrained_layout=True)
bplot = sns.boxplot(data=res)
bplot.set_xticklabels(bplot.get_xticklabels(), size=15)
# Older matplotlib/seaborn releases expose the boxes in `Axes.artists`, newer
# ones in `Axes.patches`; fall back so that an empty `artists` list does not
# raise IndexError. NOTE(review): confirm against the installed versions.
boxes = bplot.artists if len(bplot.artists) > 0 else bplot.patches
for i in range(res.shape[1]):
    mybox = boxes[i]
    mybox.set_facecolor(COLORS[i])
# This overrides the tick-label size of 15 set just above; kept as in the original.
plt.xticks(size=20)
plt.ylabel('ARI', size=16)
plt.ylim((0, 1))
plt.savefig('./figures/prediction_scenario_1.eps', format='eps')