Imports

In [132]:


import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML
import plotly.express as px
import dataframe_image as dfi
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
pd.options.mode.chained_assignment = 'warn'  # warn on chained assignment (this is also pandas' default setting)
pd.options.plotting.backend = "plotly"  # DataFrame.plot() is rendered with plotly
pd.options.display.float_format = '{:,.2f}'.format  # thousands separator, two decimals
# Results file read by the loading cell; the relative path is resolved against the current working directory.
PATH_CACHE = Path("../../data/algos_results.csv").resolve()
# Debug helper: uncomment to check the resolved location.
# print(PATH_CACHE)
pd.set_option('display.max_columns', None)  # None: no limit on the number of displayed columns

## Load original data and preprocess

In [133]:
# Read the benchmark results; index_col=False stops pandas from using the first column as index.
orig_df = pd.read_csv(PATH_CACHE, sep=",",header=0,index_col=False)

# Shorten the improvement-callback class names to the two-letter codes used in the algorithm label.
for orig_name,new_name in [("CallbackBestImprovement","BI"),("CallbackFirstImprovement","FI")]:
    orig_df.loc[orig_df.ImprClass == orig_name,"ImprClass"] = new_name

# Algorithm label = improvement strategy + iteration order, e.g. "BI_BACK".
orig_df.loc[:,"algorithm"] = orig_df["ImprClass"]+"_" + orig_df["iteration_order"]
orig_df.loc[:,"Nb. points"] = orig_df["num_points"]

# Drop the columns that were folded into the derived ones, plus those not used afterwards.
orig_df = orig_df.drop(columns=["ImprClass","iteration_order","num_iter","global_index","num_points"])

df = orig_df.copy()
# display() renders the frame and returns None, so it must not be wrapped in print()
# (doing so emits a stray "None" line under the table).
display(df.head())

Unnamed: 0,num_clusters,init_type,init_cost,end_cost,num_iter_tot,time,algorithm,Nb. points
0,2,random,25858.54,14143.12,180,0.03,BI_BACK,20
1,2,random,26804.39,15272.5,180,0.03,BI_BACK,20
2,2,random,19462.05,11661.72,180,0.03,BI_BACK,20
3,2,random,31366.33,20703.83,160,0.03,BI_BACK,20
4,2,random,29653.63,17133.42,200,0.03,BI_BACK,20


None


## Merged Data

In [134]:
# Row filter in DataFrame.query syntax, applied to the compiled results before they are styled:
# it drops the BI_BACK rows and keeps the 64-cluster runs initialised with 'kmean+'.
QUERY = "algorithm != 'BI_BACK' & num_clusters == 64 & init_type == 'kmean+'"

In [135]:
# Mean of every metric for each (clusters, points, initialisation, algorithm) combination.
grouped_cols = ["num_clusters","Nb. points","init_type","algorithm"]
df_compil = orig_df.groupby(grouped_cols).mean().reset_index()

# The BI_BACK means are the reference every algorithm is compared against.
is_reference = df_compil["algorithm"] == "BI_BACK"
df_BI = df_compil[is_reference].copy().set_index(grouped_cols)

Ldf = []
for algo in pd.unique(df_compil["algorithm"]):
    # Relabel the rows as "BI_BACK" so that their index lines up with df_BI in the arithmetic below.
    df_algo_orig = (
        df_compil[df_compil["algorithm"] == algo]
        .assign(algorithm="BI_BACK")
        .set_index(grouped_cols)
    )
    # Relative difference to the reference, in percent.
    # NOTE(review): the subtraction aligns on the union of both indexes, so a combination
    # present on only one side yields a row of NaN percentages.
    df_prct = ((df_algo_orig - df_BI)/df_BI*100).add_prefix("prct_amelioration_")
    df_algo = pd.concat([df_algo_orig,df_prct],axis=1).reset_index()
    df_algo["algorithm"] = algo  # put the real algorithm label back
    Ldf.append(df_algo)

# Stack the per-algorithm tables and derive the points-per-cluster ratio.
df_compil = pd.concat(Ldf,axis=0)
df_compil.loc[:,"Nb pts par cluster"] = df_compil["Nb. points"]/df_compil["num_clusters"]
# print(display(df_compil[df_compil["algorithm"]!="BI_BACK"].head()))

# Two sequential colormaps are sampled 128 times each and glued into one 256-colour map.
def generate_cmap(negative_good=True):
    """Build a two-sided colormap out of matplotlib's "Greens" and "Reds".

    The lower half of the result is one of the two maps sampled from position 1
    down to 0; the upper half is the other map sampled from position 0 up to 1.

    Parameters
    ----------
    negative_good : bool, default True
        If True the lower half comes from "Greens" and the upper half from
        "Reds"; if False the two maps are swapped.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        Colormap named 'my_colormap' built from the 256 stacked samples.
    """
    # Approach taken from
    # https://stackoverflow.com/questions/31051488/combining-two-matplotlib-colormaps
    if negative_good:
        lower_name, upper_name = "Greens", "Reds"
    else:
        lower_name, upper_name = "Reds", "Greens"

    lower_half = plt.get_cmap(lower_name)(np.linspace(1, 0, 128))
    upper_half = plt.get_cmap(upper_name)(np.linspace(0., 1., 128))

    # Stack both halves and interpolate a new colormap through them.
    stacked = np.vstack([lower_half, upper_half])
    return mcolors.LinearSegmentedColormap.from_list('my_colormap', stacked)
def select_col(x):
    """Return a frame of CSS strings that flags rows whose 'Diff' exceeds 'HistoricStandardDev'.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns 'Diff' and 'HistoricStandardDev'.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``x``, filled with empty strings, except the
        'Current' column, which holds 'background-color: red' on flagged rows.
    """
    # Start from an all-empty style table shaped like the input.
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    # Rows where the difference is strictly larger than the historic standard deviation.
    flagged = x['Diff'] > x['HistoricStandardDev']
    styles.loc[flagged, 'Current'] = 'background-color: red'
    return styles

# Colors
# Percentage columns that receive a background gradient.
Lindexes = [f"prct_amelioration_{col}" for col in ["end_cost","num_iter_tot","time"]]
df = df_compil.copy().reset_index()
df = df.query(QUERY)
styler = df.copy().style
for indexes,negative_good in zip(Lindexes,[False,False,False]):
    # Symmetric limits around zero so that 0 % falls on the middle of the colormap.
    maxi = df[indexes].abs().max()
    styler.background_gradient(cmap=generate_cmap(negative_good=negative_good), subset=indexes,vmin=-maxi,vmax=maxi)
# display() renders the styled table and returns None, so it must not be wrapped in print()
# (doing so emits a stray "None" line under the table).
display(styler)


Unnamed: 0,index,num_clusters,Nb. points,init_type,algorithm,init_cost,end_cost,num_iter_tot,time,prct_amelioration_init_cost,prct_amelioration_end_cost,prct_amelioration_num_iter_tot,prct_amelioration_time,Nb pts par cluster
180,86,64,70,kmean+,FI_BACK,840.282551,78.675528,43419.538,2.591514,-0.855706,234.975875,-47.726572,-71.641803,1.09375
182,88,64,80,kmean+,FI_BACK,1088.478589,237.423738,55587.466,3.838946,1.037646,79.531736,-37.140892,-65.961452,1.25
184,90,64,90,kmean+,FI_BACK,1348.03342,416.0769,71726.196,5.760208,0.469007,38.205436,-31.532135,-55.559592,1.40625
187,93,64,100,kmean+,FI_BACK,,,,,,,,,1.5625
274,86,64,70,kmean+,FI_CURR,835.260574,79.488077,41939.436,3.961745,-1.448245,238.435454,-49.508489,-56.647758,1.09375
276,88,64,80,kmean+,FI_CURR,1085.608231,243.983282,54502.194,5.875506,0.771206,84.49184,-38.368133,-47.904004,1.25
278,90,64,90,kmean+,FI_CURR,1340.102497,421.210976,69398.163,9.089143,-0.122085,39.910787,-33.754412,-29.876625,1.40625
280,92,64,100,kmean+,FI_CURR,1604.440637,620.34035,90496.420139,13.563929,-2.370049,23.575477,-30.730042,-19.790301,1.5625
368,86,64,70,kmean+,FI_RANDOM,847.661059,79.752222,41191.458,2.759026,0.014879,239.5601,-50.40899,-69.808769,1.09375
370,88,64,80,kmean+,FI_RANDOM,1093.788184,243.086221,54475.698,4.341684,1.530507,83.813513,-38.398095,-61.503854,1.25


None


# Observations


## Initialisation random

Coût final
- A partir d'environ 5 points par cluster il devient plus intéressant d'utiliser BI_BACK
- Pour 2 clusters un comportement plus instable est observé

Temps de calcul et nombre d'itérations
- à partir de 1.25 points par cluster il devient plus intéressant d'utiliser FI_BACK (si nombre de clusters >= 16)

## Initialisation KMeans+

Coût final
- BI_BACK et FI très similaires.
- BI_BACK est meilleur si nb points/cluster $\in [3.12,5.63]$ pour num_clusters >= 16
- BI_BACK est meilleur si nb points/cluster > 6.25,11.25 pour num_clusters < 16 : tendance moins visible

Temps de calcul et nombre d'itérations
- BI_BACK toujours meilleur

## Comparaison FI_BACK, FI_CURR, FI_RANDOM

- Pour nb points/cluster < 30 même ordre de grandeur pour chaque algorithme
- 

# % improvement of COLUMN = f(combination of number of points / cluster, algorithm, initialization) ?

In [136]:
# Column of df_compil whose values drive the marker colour (and the colour-bar title) of the 3D scatter plot.
COLUMN = "prct_amelioration_end_cost"

In [137]:
df = df_compil.copy().reset_index()

# Powers of ten that get a tick on the colour bar, for both signs.
ticks_pow = [-2,-1,0]
max_val = abs(min(ticks_pow))
# Extra margin below the smallest tick power; it sets the clipping floor of the log scale.
delta_zero = 0.001+1


def logscale(x):
    """Signed log10 transform used to colour the markers.

    The magnitude is log10(|x + 1e-16|), clipped from below at
    min(ticks_pow) - delta_zero and shifted by 1 - min(ticks_pow); the result
    carries the sign of ``x`` (so 0 maps to 0). Works on scalars and on
    array-likes.
    """
    lowest_pow = min(ticks_pow)
    magnitude = np.clip(np.log10(np.abs(x+1e-16)), lowest_pow-delta_zero, np.inf)
    return np.sign(x) * (1-lowest_pow+magnitude)


color_limit = max_val+delta_zero
fig = px.scatter_3d(
    df,
    x="Nb pts par cluster",
    y="init_type",
    z="algorithm",
    color=logscale(df[COLUMN]),
    hover_data=["Nb pts par cluster","init_type","algorithm",COLUMN],
    color_continuous_scale=["blue","cyan","green","yellow","red"],
    range_color=[-max_val-delta_zero,color_limit],
    template="plotly_dark",
    width=1000,
    height=900,
)


def build(f):
    """Apply ``f(sign, power)`` to every tick: negatives first, then the zero entry, then positives."""
    ticks_pos = [f(1,p) for p in ticks_pow]
    ticks_neg = [f(-1,p) for p in reversed(ticks_pow)]
    # The middle entry is always logscale(0), whatever f returns for the other ticks.
    return ticks_neg + [logscale(0)] + ticks_pos


def func(sign,p):
    """Position on the colour axis of the real value ``sign * 10**p``."""
    return logscale(sign*10**p)


# Tick positions on the transformed axis and the labels shown in their place.
Lp = build(func)
Lannot = build(lambda sign,p: ("" if sign > 0 else "-")+"1E"+str(p))
fig.update_layout(coloraxis_colorbar={
    "title": COLUMN,
    "tickvals": np.round(Lp,decimals=2),
    "ticktext": Lannot,
})
fig.show()