Imports

In [1]:


import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML
import plotly.express as px
import dataframe_image as dfi
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
# Notebook-wide pandas configuration and location of the cached results file.
pd.options.mode.chained_assignment = 'warn'  # 'warn' is the pandas default: emit SettingWithCopyWarning
# Make DataFrame.plot / Series.plot produce plotly figures.
pd.options.plotting.backend = "plotly"
# Display floats with a thousands separator and two decimals.
pd.options.display.float_format = '{:,.2f}'.format
# Results CSV; the relative path is resolved against the current working directory.
PATH_CACHE = Path("../../data/results_fi_bi.csv").resolve()
# Debug aid: uncomment to check the resolved path.
# print(PATH_CACHE)
# Never truncate the column list when displaying a DataFrame.
pd.set_option('display.max_columns', None)  

## Load original data and preprocess

In [5]:
# Integer code -> readable label for each categorical column of the raw CSV.
LABELS_BY_COLUMN = {
    "ImprClass": {0: "BI", 1: "FI", 2: "DI"},
    "iteration_order": {0: "BACK", 1: "CURR", 2: "RANDOM"},
    "init_type": {0: "random", 1: "kmeans+"},
}

orig_df = pd.read_csv(PATH_CACHE, sep=",", header=0, index_col=False)

# `replace` relabels every code of a column in one pass and leaves unknown codes
# untouched. It builds a new column instead of writing strings into an
# integer column in place (which pandas flags as an incompatible-dtype assignment).
for column, labels in LABELS_BY_COLUMN.items():
    orig_df[column] = orig_df[column].replace(labels)

# Algorithm identifier, e.g. "BI_BACK" or "FI_CURR".
orig_df["algorithm"] = orig_df["ImprClass"] + "_" + orig_df["iteration_order"]
# Display-friendly alias of num_points (used as axis / grouping label further down).
orig_df["Nb. points"] = orig_df["num_points"]
# Drop the columns that were merged into "algorithm"/"Nb. points" or are not analysed.
orig_df = orig_df.drop(columns=["ImprClass", "iteration_order", "num_iter", "num_points", "seed"])

df = orig_df.copy()
# display() already renders the frame and returns None; wrapping it in print()
# only added a stray "None" line to the cell output.
display(orig_df)

Unnamed: 0,num_clusters,init_type,init_cost,end_cost,num_iter_tot,time,algorithm,Nb. points
0,2,random,1642751.39,1642751.39,1000,16.00,BI_BACK,1000
1,2,random,1642751.39,1642751.39,1000,12.00,BI_BACK,1000
2,2,random,1642751.39,1642751.39,1000,12.00,BI_BACK,1000
3,2,random,1642751.39,1642751.39,1000,20.00,BI_BACK,1000
4,2,random,1642751.39,1642751.39,1000,12.00,BI_BACK,1000
...,...,...,...,...,...,...,...,...
27995,128,kmeans+,1423452.65,1416154.94,41259744,444638.00,FI_CURR,1000
27996,128,kmeans+,1423452.65,1416154.94,41259744,427181.00,FI_CURR,1000
27997,128,kmeans+,1423452.65,1416154.94,41259744,433083.00,FI_CURR,1000
27998,128,kmeans+,1423452.65,1416154.94,41259744,421314.00,FI_CURR,1000


None


## Merged Data

In [18]:
# pandas query expression selecting the rows of the merged table shown in the styled table.
QUERY = "algorithm == 'FI_CURR' & init_type == 'random'"

In [19]:
grouped_cols = ["num_clusters","Nb. points","init_type","algorithm"]
# Keys describing one experimental configuration (grouped_cols without the algorithm).
config_cols = [c for c in grouped_cols if c != "algorithm"]

# Mean of every measured column for each (configuration, algorithm) pair.
df_mean = orig_df.groupby(grouped_cols).mean().reset_index()

# BI_BACK is the reference every algorithm is compared against.
df_BI = df_mean[df_mean["algorithm"] == "BI_BACK"].copy().set_index(grouped_cols)
# Same values indexed by configuration only, so any algorithm aligns on it directly.
baseline = df_BI.droplevel("algorithm")

Ldf = []
for algo in pd.unique(df_mean["algorithm"]):
    df_algo_mean = (
        df_mean[df_mean["algorithm"] == algo]
        .drop(columns="algorithm")
        .set_index(config_cols)
    )
    # Relative difference to BI_BACK in percent: (algo - BI_BACK) / BI_BACK * 100.
    # Configurations missing on either side, or a 0/0 ratio, yield NaN.
    df_prct = ((df_algo_mean - baseline) / baseline * 100).add_prefix("prct_amelioration_")
    df_algo = pd.concat([df_algo_mean, df_prct], axis=1).reset_index()
    # Put the algorithm label back right after the configuration keys.
    df_algo.insert(len(config_cols), "algorithm", algo)
    Ldf.append(df_algo)

# The per-algorithm row numbers are kept as the index on purpose: later cells call
# reset_index() and show them in an "index" column.
df_compil = pd.concat(Ldf,axis=0)
df_compil["Nb pts par cluster"] = df_compil["Nb. points"]/df_compil["num_clusters"]

def generate_cmap(negative_good=True):
    """Build a diverging colormap by stacking two sequential matplotlib colormaps.

    128 colours are sampled from each of "Reds" and "Greens" (256 in total).
    The lower half is sampled from its dark end to its light end and the upper
    half from light to dark, so the light colours meet in the middle.

    Approach taken from
    https://stackoverflow.com/questions/31051488/combining-two-matplotlib-colormaps

    Parameters
    ----------
    negative_good : bool, default True
        If True the lower half uses "Greens" and the upper half "Reds";
        if False the two colormaps are swapped.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        Colormap named 'my_colormap' built from the 256 stacked colours.
    """
    if negative_good:
        lower_name, upper_name = "Greens", "Reds"
    else:
        lower_name, upper_name = "Reds", "Greens"

    # Lower half: sampled from 1 down to 0; upper half: sampled from 0 up to 1.
    lower_half = plt.get_cmap(lower_name)(np.linspace(1, 0, 128))
    upper_half = plt.get_cmap(upper_name)(np.linspace(0., 1., 128))

    stacked = np.vstack([lower_half, upper_half])
    return mcolors.LinearSegmentedColormap.from_list('my_colormap', stacked)
def select_col(x):
    """Return a frame of CSS strings highlighting 'Current' where 'Diff' exceeds 'HistoricStandardDev'.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns 'Diff' and 'HistoricStandardDev'.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``x``, filled with empty strings except for
        'background-color: red' in column 'Current' on the rows where
        ``x['Diff'] > x['HistoricStandardDev']``.
    """
    # Start from "no style" everywhere.
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    # Rows whose difference is strictly above the historic standard deviation.
    exceeds = x['Diff'].gt(x['HistoricStandardDev'])
    styles.loc[exceeds, 'Current'] = 'background-color: red'
    return styles

# Colors
# Shade the percentage columns of the rows selected by QUERY with a diverging
# colormap centred on 0.
Lindexes = [f"prct_amelioration_{col}" for col in ["end_cost","num_iter_tot","time"]]
df = df_compil.copy().reset_index()
df = df.query(QUERY)
styler = df.copy().style
# NOTE(review): negative_good is False for all three columns, i.e. negative
# percentages are drawn in red — confirm this is the intended reading.
for indexes,negative_good in zip(Lindexes,[False,False,False]):
    # Symmetric colour range so that 0 always falls in the middle of the colormap.
    maxi = df[indexes].abs().max()
    styler = styler.background_gradient(
        cmap=generate_cmap(negative_good=negative_good),
        subset=indexes,
        vmin=-maxi,
        vmax=maxi,
    )
# display() renders the table and returns None; print(display(...)) only added
# a stray "None" line to the output.
display(styler)


Unnamed: 0,index,num_clusters,Nb. points,init_type,algorithm,init_cost,end_cost,num_iter_tot,time,prct_amelioration_init_cost,prct_amelioration_end_cost,prct_amelioration_num_iter_tot,prct_amelioration_time,Nb pts par cluster
15,1,2,1000,random,FI_CURR,1642751.388,1642751.388,1000.0,17.048,0.0,0.0,0.0,9.696931,500.0
17,3,4,1000,random,FI_CURR,1640816.081,658052.565,1827877.0,22662.267,0.0,-0.633863,-13.329682,-9.256392,250.0
19,5,8,1000,random,FI_CURR,1633227.758,244865.582,10843532.0,131432.074,0.0,-1.140245,42.509292,69.28702,125.0
21,7,16,1000,random,FI_CURR,1622703.378,637282.988,22381098.0,271064.292,0.0,483.733939,5.001633,24.363133,62.5
23,9,32,1000,random,FI_CURR,1569864.574,743800.468,33160254.0,382178.848,0.0,1421.615805,-32.808693,-16.985644,31.25
25,11,64,1000,random,FI_CURR,1540827.075,113463.749,378039252.0,4221327.771,0.0,395.19066,274.570727,355.779741,15.625
27,13,128,1000,random,FI_CURR,1423452.653,164980.655,640859759.0,6078393.644,0.0,1489.471943,237.534437,291.232219,7.8125


None


In [None]:
# COLUMN is only assigned in a cell further down the notebook; fall back to the
# value used there so this cell does not raise NameError on a fresh kernel.
try:
    COLUMN
except NameError:
    COLUMN = "prct_amelioration_end_cost"

df = df_compil.copy().reset_index()
# Powers of ten shown as colour-bar ticks (on both the negative and the positive side).
ticks_pow = [-2,-1,0]
max_val = abs(min(ticks_pow))
delta_zero = 0.001+1


def logscale(x):
    """Signed log10 transform: sign(x) * (1 - min(ticks_pow) + log10(|x|)).

    log10(|x|) is clipped from below at min(ticks_pow) - delta_zero, so values
    very close to zero all collapse near 0. The epsilon is added after taking the
    absolute value so that log10 never receives exactly 0.
    """
    log_abs = np.log10(np.abs(x) + 1e-16)
    return np.sign(x) * (1 - min(ticks_pow) + np.clip(log_abs, min(ticks_pow) - delta_zero, np.inf))


fig = px.scatter_3d(df,x="Nb. points",y="init_type",z="algorithm",color=logscale(df[COLUMN]),
                    hover_data=["Nb. points","num_clusters","init_type","algorithm",COLUMN],
                    color_continuous_scale=["red","yellow","green","cyan","blue"][::-1],
                    range_color=[-max_val-delta_zero,max_val+delta_zero],
                    template="plotly_dark",width=1000,height=900)


def build(f, zero=None):
    """Evaluate f(sign, power) on every tick, ordered from most negative to most positive.

    The middle entry stands for 0: it is `zero` when given, otherwise logscale(0).
    """
    ticks_pos = [f(1,p) for p in ticks_pow]
    ticks_neg = [f(-1,p) for p in ticks_pow[::-1]]
    middle = logscale(0) if zero is None else zero
    return [*ticks_neg,middle,*ticks_pos]


def func(sign,p):
    """Position on the transformed colour axis of the real value sign * 10**p."""
    real_value = sign*10**p
    return logscale(real_value)


# Tick positions (transformed scale) and their labels (real values).
Lp = build(func)
# Label the zero tick "0" instead of the numeric value of logscale(0).
Lannot = build(lambda sign,p: "1E"+str(p) if sign > 0 else "-1E"+str(p), zero="0")
fig.update_layout(coloraxis_colorbar=dict(
    title=COLUMN,
    tickvals=np.round(Lp,decimals=2),
    ticktext=Lannot,
))
fig.show()

# Observations


## Initialisation random

Coût final
- À partir d'environ 5 points par cluster, il devient plus intéressant d'utiliser BI_BACK
- Pour 2 clusters un comportement plus instable est observé

Temps de calcul et nombre d'itérations
- À partir de 1.25 points par cluster, il devient plus intéressant d'utiliser FI_BACK (si le nombre de clusters est >= 16)

## Initialisation KMeans+

Coût final
- BI_BACK et FI très similaires.
- BI_BACK est meilleur si nb points/cluster $\in [3.12,5.63]$ pour num_clusters >= 16
- BI_BACK est meilleur si nb points/cluster > 6.25,11.25 pour num_clusters < 16 : tendance moins visible

Temps de calcul et nombre d'itérations
- BI_BACK toujours meilleur

## Comparaison FI_BACK, FI_CURR, FI_RANDOM

- Pour nb points/cluster < 30, même ordre de grandeur pour chaque algorithme
- 

# % improvement of COLUMN = f(combination of number of points / cluster, algorithm, initialization) ?

In [36]:
# Column of the merged table mapped to the marker colour (also shown in the hover data and as colour-bar title).
COLUMN = "prct_amelioration_end_cost"
# pandas query expression selecting the rows of the merged table to plot.
QUERY = "algorithm == 'FI_CURR'"

In [37]:
# Rows selected by QUERY, plotted as a 3-D scatter coloured by a signed log of COLUMN.
df = df_compil.query(QUERY)
df = df.copy().reset_index()
# Powers of ten shown as colour-bar ticks (on both the negative and the positive side).
ticks_pow = [-2,-1,0]
max_val = max(abs(x) for x in ticks_pow)
delta_zero = 0.001+1


def logscale(x):
    """Signed log10 transform: sign(x) * (1 - min(ticks_pow) + log10(|x|)).

    log10(|x|) is clipped from below at min(ticks_pow) - delta_zero, so values
    very close to zero all collapse near 0. The epsilon is added after taking the
    absolute value so that log10 never receives exactly 0.
    """
    log_abs = np.log10(np.abs(x) + 1e-16)
    return np.sign(x) * (1 - min(ticks_pow) + np.clip(log_abs, min(ticks_pow) - delta_zero, np.inf))


fig = px.scatter_3d(df,x="Nb. points",y="num_clusters",z="init_type",color=logscale(df[COLUMN]),
                    hover_data=["Nb. points","num_clusters","init_type","algorithm",COLUMN],
                    color_continuous_scale=["red","yellow","green","cyan","blue"][::-1],
                    range_color=[-max_val-delta_zero,max_val+delta_zero],
                    template="plotly_dark",width=1000,height=900)


def build(f, zero=None):
    """Evaluate f(sign, power) on every tick, ordered from most negative to most positive.

    The middle entry stands for 0: it is `zero` when given, otherwise logscale(0).
    """
    ticks_pos = [f(1,p) for p in ticks_pow]
    ticks_neg = [f(-1,p) for p in ticks_pow[::-1]]
    middle = logscale(0) if zero is None else zero
    return [*ticks_neg,middle,*ticks_pos]


def func(sign,p):
    """Position on the transformed colour axis of the real value sign * 10**p."""
    real_value = sign*10**p
    return logscale(real_value)


# Tick positions (transformed scale) and their labels (real values).
Lp = build(func)
# Label the zero tick "0" instead of the numeric value of logscale(0).
Lannot = build(lambda sign,p: "1E"+str(p) if sign > 0 else "-1E"+str(p), zero="0")
fig.update_layout(coloraxis_colorbar=dict(
    title=COLUMN,
    tickvals=np.round(Lp,decimals=2),
    ticktext=Lannot,
))
fig.show()