In [None]:
#import standard libs
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import importlib
from tqdm import tqdm
from scipy.stats import linregress
import tsfel
from sklearn.preprocessing import normalize

In [None]:
# custom utils
import mt_utils as ut
import parameter_utils as pu

In [None]:
# Re-import mt_utils (bound to `ut`) so edits to the module take effect without a kernel restart.
importlib.reload(ut)

In [None]:
# Re-import parameter_utils (bound to `pu`) so edits to the module take effect without a kernel restart.
importlib.reload(pu)

In [None]:
#import claspy
from claspy.segmentation import BinaryClaSPSegmentation
from claspy.data_loader import load_tssb_dataset
from claspy.data_loader import load_has_dataset

In [None]:
# Load both benchmark collections through claspy's data loaders.
# Later cells read a `time_series` field from each row of these tables.
tssb = load_tssb_dataset()
hasc = load_has_dataset()

In [None]:
# Intermediate results cached as pickle files by earlier processing steps.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading; only use trusted files.
# TSSB: per-row scores (tssb_res) and a second table (tssb_cfs) whose Algorithm == 'None'
# rows provide the reference score used by add_score_diffs_tssb.
tssb_res = pd.read_pickle('results/zwischenergebnisse/clasp_on_downsampled_TS_TSSB_origW_ucps.pkl')
tssb_cfs = pd.read_pickle("results/zwischenergebnisse/1a_clasp_splitting_tssb.pkl")

# HASC: the same pair of tables (hasc_cfs is read by add_score_diffs_hasc).
hasc_res = pd.read_pickle('results/zwischenergebnisse/clasp_on_downsampled_TS_HASC_dsW_ucps.pkl')
hasc_cfs = pd.read_pickle("results/zwischenergebnisse/1a_clasp_splitting_hasc.pkl")

In [None]:
# Cast the score column to float so the mean / idxmax aggregations below operate on numeric data.
tssb_res=tssb_res.astype({'score': 'float'})
hasc_res=hasc_res.astype({'score': 'float'})

In [None]:
# Downsampling algorithms in the order used for loops and result tables.
algo_order = ["EveryNth", "MinMax", "M4", "LTTB", "MinMaxLTTB", "LTD"]
# Reverse order (used for the heatmap's y-axis); derived so both lists cannot drift apart.
algo_order_reversed = algo_order[::-1]

# Functions

In [None]:
def score_heatmap(df, dataset, title=None, filepath=None):
    """Show a heatmap of the mean score per algorithm and compression ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``algo``, ``compression`` and ``score``.
    dataset : str
        Not used by the function body.
    title : str, optional
        Figure title.
    filepath : str, optional
        When given, the figure is written to this path (scale 1) and a second
        time to the same path with ``'.svg'`` replaced by ``'.png'`` (scale 3).
    """
    # Mean score for each (algorithm, compression) pair, reshaped to algorithms x ratios.
    # Combinations missing from the data are shown as 0.
    mean_scores = df.groupby(["algo", "compression"])["score"].mean().reset_index()
    score_grid = mean_scores.pivot(index='algo', columns='compression')['score'].fillna(0)
    score_grid = score_grid.reindex(algo_order_reversed)

    heatmap = go.Heatmap(
        z=score_grid,
        x=score_grid.columns,
        y=score_grid.index,
        colorbar={"title": 'Covering Score'},
        zsmooth=False,
    )
    fig = go.Figure(data=heatmap)

    fig.update_yaxes(title_text="Downsampling Algorithm")
    fig.update_xaxes(title_text="Compression Ratio")
    fig.update_layout(
        height=400,
        font={"family": "Arial", "size": 12, "color": "black"},
        title={"text": title},
        margin={"l": 0, "r": 0, "t": 30, "b": 0},
    )

    if filepath:
        export_size = {"width": 1000, "height": 400}
        fig.write_image(filepath, scale=1, **export_size)
        fig.write_image(filepath.replace('.svg', '.png'), scale=3, **export_size)
    fig.show()

In [None]:
def plot_ws_differences(df, cr, filepath=None):
    """Plot the average score difference per window size for one compression ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``compression``, ``diff_scores`` and ``window_size``.
    cr : float
        Only rows with ``compression == cr`` are plotted.
    filepath : str, optional
        When given, the figure is also written to this path.
    """
    at_ratio = df[df.compression==cr]
    score_gap_label = 'Diff. original and downsampled score'

    fig = go.Figure(
        layout={
            "xaxis": {"categoryorder": "category descending"},
            "scattermode": "group",
        }
    )

    # histfunc="avg": each bar is the mean diff_scores of the rows in that window-size bin.
    avg_gap_bars = go.Histogram(
        histfunc="avg",
        y=at_ratio.diff_scores,
        x=at_ratio.window_size,
        name=score_gap_label,
        offsetgroup=1,
    )
    fig.add_trace(avg_gap_bars)

    fig.update_yaxes(title_text=score_gap_label)
    fig.update_xaxes(title_text="Window Size")
    fig.update_layout(
        height=400,
        font={"family": "Arial", "size": 12, "color": "black"},
        title={"text": "Compression ratio = "+str(cr)},
        margin={"l": 0, "r": 0, "t": 30, "b": 0},
    )

    if filepath:
        fig.write_image(filepath, scale=1, width=1000, height=400)

    fig.show()

In [None]:
def cr_score_matrix(df_res, cr_wBased_df, cr_wBasedalgo_df, cr_ftBased_df, best_cr, filepath=None):
    """Build a table of mean scores per algorithm for each compression-ratio strategy.

    Columns, in order: all compression ratios (``score_all_cr``), the single
    ratio ``best_cr`` (``score_prev_best_cr``), and the three selections passed
    in (``score_cr_wBased``, ``score_cr_wBased_algo``, ``score_cr_ftBased``).
    Rows follow ``algo_order``.

    Every input frame needs the columns ``algo`` and ``score``; ``df_res``
    additionally needs ``compression``. When ``filepath`` is given the table is
    also written there as an Excel file.
    """
    def _mean_score_by_algo(frame, column_name):
        # Mean score per algorithm, with the score column renamed to the strategy label.
        per_algo = frame[['algo', 'score']].groupby(by='algo').mean()
        return per_algo.rename(columns={'score': column_name})

    strategy_columns = [
        _mean_score_by_algo(df_res, 'score_all_cr'),
        _mean_score_by_algo(df_res[df_res.compression==best_cr], 'score_prev_best_cr'),
        _mean_score_by_algo(cr_wBased_df, 'score_cr_wBased'),
        _mean_score_by_algo(cr_wBasedalgo_df, 'score_cr_wBased_algo'),
        _mean_score_by_algo(cr_ftBased_df, 'score_cr_ftBased'),
    ]

    all_crs = pd.concat(strategy_columns, axis=1).reindex(algo_order)
    if filepath:
        all_crs.to_excel(filepath)
    return all_crs

In [None]:
def add_score_diffs_tssb(row):
    """Return the TSSB reference score for the row's series minus the row's own score.

    The reference is the first ``tssb_cfs`` entry with ``Algorithm == 'None'``
    and a matching ``orig_TS_ID``; an IndexError is raised when none exists.
    """
    is_reference = tssb_cfs.Algorithm=='None'
    same_series = tssb_cfs.orig_TS_ID==row.orig_TS_ID
    reference_scores = tssb_cfs[is_reference & same_series].score.values
    return reference_scores[0]-row.score

def add_score_diffs_hasc(row):
    """Return the HASC reference score for the row's series minus the row's own score.

    The reference is the first ``hasc_cfs`` entry with ``Algorithm == 'None'``
    and a matching ``orig_TS_ID``; an IndexError is raised when none exists.
    """
    is_reference = hasc_cfs.Algorithm=='None'
    same_series = hasc_cfs.orig_TS_ID==row.orig_TS_ID
    reference_scores = hasc_cfs[is_reference & same_series].score.values
    return reference_scores[0]-row.score

In [None]:
def optimal_cr_w_based(df,ref_df,w_threshold,scale, ubound=False):
    """Select, per time series and algorithm, the result rows at the window-size-based compression ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per time series with a ``time_series`` field. The row index is
        matched against ``ref_df.orig_TS_ID``.
    ref_df : pandas.DataFrame
        Result table with the columns ``compression``, ``orig_TS_ID`` and ``algo``.
    w_threshold, scale
        Passed through unchanged to the ``parameter_utils`` lookup.
    ubound : bool, default False
        Use ``pu.find_cr_based_on_w_ubound`` instead of ``pu.find_cr_based_on_w``.

    Returns
    -------
    pandas.DataFrame
        The matching ``ref_df`` rows with a fresh 0..n-1 index; an empty frame
        with ``ref_df``'s columns when nothing matches.
    """
    import warnings
    # NOTE: this filter is process-wide and stays active after the call returns.
    warnings.filterwarnings('ignore')

    find_cr = pu.find_cr_based_on_w_ubound if ubound else pu.find_cr_based_on_w

    # Collect the matching slices and concatenate once at the end. Growing a
    # DataFrame with pd.concat inside the loop copies all previous rows every time.
    matches = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        for algo in algo_order:
            cr = find_cr(row.time_series, w_threshold, scale, algo)
            hit = ref_df.loc[(ref_df.compression==cr)&(ref_df.orig_TS_ID==index)&(ref_df.algo==algo)]
            # Empty slices add no rows; skipping them also keeps empty frames out of pd.concat.
            if not hit.empty:
                matches.append(hit)

    if not matches:
        return pd.DataFrame(columns=list(ref_df.columns.values))
    return pd.concat(matches, ignore_index=True)

In [None]:
def optimal_cr_w_based_summary(df,ref_df,w_thresholds, scale, ubound=False):
    """Select result rows at the window-size-based compression ratio for two threshold choices.

    For every time series in ``df`` and every algorithm in ``algo_order`` two
    compression ratios are looked up: one from the shared threshold
    ``w_thresholds['mean']`` and one from the algorithm-specific threshold
    ``w_thresholds[algo]``. If ``w_thresholds`` has no entry for an algorithm,
    the shared ``'mean'`` threshold is used for it as well.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per time series with a ``time_series`` field. The row index is
        matched against ``ref_df.orig_TS_ID``.
    ref_df : pandas.DataFrame
        Result table with the columns ``compression``, ``orig_TS_ID`` and ``algo``.
    w_thresholds : mapping
        Must contain ``'mean'``; may contain one entry per algorithm name.
    scale
        Passed through unchanged to the ``parameter_utils`` lookup.
    ubound : bool, default False
        Use ``pu.find_cr_based_on_w_ubound`` instead of ``pu.find_cr_based_on_w``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Rows selected with the shared threshold, and rows selected with the
        algorithm-specific threshold. Each has a fresh 0..n-1 index.
    """
    import warnings
    # NOTE: this filter is process-wide and stays active after the call returns.
    warnings.filterwarnings('ignore')

    find_cr = pu.find_cr_based_on_w_ubound if ubound else pu.find_cr_based_on_w
    mean_threshold = w_thresholds['mean']

    def _collect(slices):
        # One concat at the end instead of one per iteration.
        if not slices:
            return pd.DataFrame(columns=list(ref_df.columns.values))
        return pd.concat(slices, ignore_index=True)

    shared_hits = []
    algo_hits = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        for algo in algo_order:
            # Fall back to the shared threshold when no algorithm-specific one was supplied.
            algo_threshold = w_thresholds[algo] if algo in w_thresholds else mean_threshold

            cr = find_cr(row.time_series, mean_threshold, scale, algo)
            cr_algo = find_cr(row.time_series, algo_threshold, scale, algo)

            same_series_and_algo = (ref_df.orig_TS_ID==index)&(ref_df.algo==algo)
            shared_slice = ref_df.loc[(ref_df.compression==cr)&same_series_and_algo]
            algo_slice = ref_df.loc[(ref_df.compression==cr_algo)&same_series_and_algo]

            if not shared_slice.empty:
                shared_hits.append(shared_slice)
            if not algo_slice.empty:
                algo_hits.append(algo_slice)

    return _collect(shared_hits), _collect(algo_hits)

In [None]:
def get_w_thresholds_general(w_bounds, step, df, df_ref, scale, save_path, ubound=False):
    """Score every window-size threshold in ``[w_bounds[0], w_bounds[1])`` taken in ``step`` increments.

    For each threshold the rows chosen by ``optimal_cr_w_based`` are summarised
    by the mean, median and standard deviation of their ``score``. The result is
    not grouped by downsampling algorithm.

    Parameters
    ----------
    w_bounds : sequence of two numbers
        Start (inclusive) and stop (exclusive) of the threshold range.
    step : number
        Spacing between thresholds.
    df, df_ref, scale, ubound
        Forwarded to ``optimal_cr_w_based`` for every threshold.
    save_path : str or None
        When truthy, the table is also written there as an Excel file.

    Returns
    -------
    pandas.DataFrame
        Rows ``mean`` / ``median`` / ``std``, one column per threshold.

    Raises
    ------
    ValueError
        If the range contains no threshold.
    """
    ws = np.arange(w_bounds[0],w_bounds[1],step)
    if len(ws) == 0:
        raise ValueError("w_bounds/step produce no window-size thresholds: "
                         + str(list(w_bounds)) + ", step=" + str(step))

    per_threshold = []
    for w in ws:
        # ubound must be forwarded for every threshold, not only the first one.
        curr_df = optimal_cr_w_based(df,df_ref,w, scale, ubound)
        vals = {'mean':curr_df.score.mean(),'median':curr_df.score.median(),'std':curr_df.score.std()}
        per_threshold.append(pd.Series(vals,name=w))

    out = pd.concat(per_threshold,axis=1)

    if save_path:
        out.to_excel(save_path)
    return out

In [None]:
def get_best_thresholds_per_algo(threshold_df):
    """Map every row label of ``threshold_df`` to the column label holding that row's maximum."""
    labelled_rows = tqdm(threshold_df.iterrows(), total=len(threshold_df), desc="Processing rows")
    return {row_label: values.idxmax() for row_label, values in labelled_rows}

In [None]:
def mean_squared_error(ts):
    """Return the mean, over all series in ``ts``, of a length-scaled linear-fit error.

    Each series is regressed on its sample positions 0..n-1. The reported value
    per series is the standard error of the fitted slope divided by the series
    length. Despite the function name, this is not a mean squared error.

    Parameters
    ----------
    ts : numpy.ndarray
        A single 1-D series, or a 2-D array that is iterated row by row.

    Returns
    -------
    float
        Average of the per-series values.
    """
    series_rows = ts.reshape(1, -1) if ts.ndim == 1 else ts

    out = []
    for t in series_rows:
        indices = np.arange(len(t))
        # Pass x and y separately. The single stacked 2-D argument form is
        # deprecated in SciPy and is split along the wrong axis when len(t) == 2.
        out.append(linregress(indices, t).stderr/len(t))

    return np.mean(out)

In [None]:
# tsfel feature domains; each one is reduced to a single averaged value per series.
domains = ['statistical','temporal','spectral']
# Algorithms present in the TSSB results, as an array and as a plain list.
algos = tssb_res.algo.unique()
algo_names = algos.tolist()
# Feature column labels: the three domains plus the regression-based 'MSE' feature.
feature_names = domains + ['MSE']
feature_cols = np.array(feature_names)
# Columns copied from the result table, followed by the feature columns.
data_cols = ['orig_TS_ID','algo', 'compression', 'score']
columns = np.array(data_cols + feature_names)

def _series_feature_vector(ts):
    """Return the [statistical, temporal, spectral, MSE] features of one max-normalised series."""
    # sklearn's normalize scales rows. A 1-D series is treated as one row;
    # a 2-D series is transposed so that each of its columns is scaled separately.
    if ts.ndim == 1:
        ts = normalize(ts.reshape(1, -1), norm="max").squeeze()
    else:
        ts = normalize(ts.transpose(), norm="max").transpose()

    domain_fts = []
    for domain in domains:
        cfg = tsfel.get_features_by_domain(domain)
        extracted = tsfel.time_series_features_extractor(cfg, ts, verbose=0)
        # Collapse all features of the domain into one number.
        domain_fts.append(np.mean(np.asarray(extracted).squeeze()))
    return np.append(domain_fts, [mean_squared_error(ts.transpose())])


def create_feature_df(dataset,orig_data):
    """Attach time-series features to every row of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns listed in ``data_cols``; ``orig_TS_ID`` is used as a
        positional index into ``orig_data``.
    orig_data : pandas.DataFrame
        Rows expose the original series as ``time_series``.

    Returns
    -------
    list of numpy.ndarray
        One array per row: the ``data_cols`` values followed by the features.
    """
    # The features depend only on the original series, so they are computed once
    # per orig_TS_ID and reused for all rows referring to that series.
    features_by_ts = {}
    out = []
    for index, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Processing rows"):
        ts_id = row.orig_TS_ID
        if ts_id not in features_by_ts:
            features_by_ts[ts_id] = _series_feature_vector(orig_data.iloc[ts_id].time_series)
        data_curr = np.asarray(row[data_cols])
        curr_row = np.concatenate((data_curr,features_by_ts[ts_id]),axis=None)
        out.append(curr_row)
    return out

In [None]:
def create_corr_matrix_features(data, save_path=None):
    """Correlate every feature with the compression ratio, separately per algorithm.

    ``data`` needs the columns ``algo``, ``compression`` and every name in
    ``feature_cols``. Returns a DataFrame of Pearson coefficients with one row
    per algorithm and one column per feature. When ``save_path`` is given the
    table is also written there as an Excel file.
    """
    coeff_matrix = np.zeros((len(algos),len(feature_cols)))
    for row_pos, algo in enumerate(algos):
        per_algo = data[data['algo']==algo]
        compression_values = np.asarray(per_algo.compression)
        for col_pos, feature in enumerate(feature_cols):
            feature_values = np.asarray(per_algo[feature])
            # Off-diagonal entry of the 2x2 correlation matrix.
            coeff_matrix[row_pos,col_pos] = np.corrcoef(feature_values, compression_values)[0,1]

    correlations = pd.DataFrame(coeff_matrix, columns=feature_names, index=algo_names)
    if save_path:
        correlations.to_excel(save_path)
    return correlations

In [None]:
def max_scores_ts_algo(df):
    """Return, for every (orig_TS_ID, algo) pair, the row of ``df`` with the highest score.

    The previous row labels are kept in an ``index`` column of the result.
    """
    best_row_labels = df.groupby(['orig_TS_ID','algo'])['score'].idxmax()
    return df.loc[best_row_labels].reset_index()

In [None]:
def create_indicator_matrix(filename=None):
    """List, per dataset and algorithm, the feature with the largest absolute correlation.

    Reads the notebook-level tables ``corr_mat_tssb`` / ``corr_mat_hasc`` and
    ``df_features_cr_tssb`` / ``df_features_cr_hasc``. Each output row holds the
    dataset label, the algorithm, the chosen feature, its signed correlation
    coefficient, and the minimum and maximum of that feature in the dataset's
    feature table.

    When ``filename`` is given the result is also saved as
    ``results/tables/<filename>.xlsx`` and
    ``results/zwischenergebnisse/<filename>.pkl``.
    """
    sources = (
        ('TSSB', corr_mat_tssb, df_features_cr_tssb),
        ('HASC', corr_mat_hasc, df_features_cr_hasc),
    )

    records = []
    for dataset_label, corr_mat, feature_table in sources:
        for algo, coefficients in corr_mat.iterrows():
            # Choose by magnitude, but report the signed coefficient.
            best_feature = abs(coefficients).idxmax()
            records.append([
                dataset_label,
                algo,
                best_feature,
                coefficients[best_feature],
                feature_table[best_feature].min(),
                feature_table[best_feature].max(),
            ])

    indicator_mat = pd.DataFrame(
        records,
        columns=['dataset', 'algorithm', 'best_indicator', 'correlation_coefficient', 'min_feature', 'max_feature'],
    )
    if filename:
        indicator_mat.to_excel("results/tables/"+filename+".xlsx")
        indicator_mat.to_pickle("results/zwischenergebnisse/"+filename+".pkl")
    return indicator_mat

In [None]:
def find_cr_based_on_features(corr_koeff, minx, maxx, x, cr_interval=[0.1,0.9]):
    skalierungsintervall = [round((cr_interval[0]-cr_interval[1])/corr_koeff), round((cr_interval[1]-cr_interval[1])/corr_koeff)]
    skalierungsintervall = [min(skalierungsintervall), max(skalierungsintervall)]
    new_x = skalierungsintervall[0] + (((x-minx)*(skalierungsintervall[1]-skalierungsintervall[0]))/(maxx-minx))
    cr = corr_koeff*new_x+cr_interval[1]
    if cr < cr_interval[0]:
        cr = cr_interval[0]
    elif cr > cr_interval[1]:
        cr = cr_interval[1]
    return np.round(cr,1)

In [None]:
def get_feature(ts, feature_name):
    """Return one scalar feature of ``ts``.

    ``"MSE"`` is delegated to ``mean_squared_error``. Any other name is treated
    as a tsfel feature domain whose extracted features are averaged into a
    single value.

    NOTE(review): unlike create_feature_df, no max-normalisation or transpose is
    applied to ``ts`` here - confirm that callers pass comparable input.
    """
    if feature_name == "MSE":
        return mean_squared_error(ts)

    domain_cfg = tsfel.get_features_by_domain(feature_name)
    extracted = tsfel.time_series_features_extractor(domain_cfg, ts, verbose=0)
    return np.mean(np.asarray(extracted).squeeze())

In [None]:
def optimal_cr_feature_based(df, benchmark, indicator_matrix):
    """Select result rows at the compression ratio derived from each row's indicator feature.

    For every row of ``df`` the indicator entry for ``benchmark`` and the row's
    algorithm is looked up, the indicator feature is computed from
    ``row.time_series``, and it is mapped to a compression ratio with
    ``find_cr_based_on_features``. The rows of ``df`` with that ratio are then
    collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``algo``, ``time_series``, ``compression`` and ``orig_TS_ID``.
    benchmark : str
        Value matched against ``indicator_matrix.dataset``.
    indicator_matrix : pandas.DataFrame
        Needs ``dataset``, ``algorithm``, ``best_indicator``,
        ``correlation_coefficient``, ``min_feature`` and ``max_feature``.

    Returns
    -------
    pandas.DataFrame
        The collected rows with a fresh 0..n-1 index; an empty frame with
        ``df``'s columns when nothing matches.
    """
    # Collect the slices and concatenate once. Growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    selected = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        indicator_series = indicator_matrix.loc[(indicator_matrix.dataset==benchmark)&(indicator_matrix.algorithm==row.algo)]
        feature_val = get_feature(row.time_series, indicator_series.best_indicator.values[0])

        cr = find_cr_based_on_features(corr_koeff=indicator_series.correlation_coefficient.values[0],
                                       minx=indicator_series.min_feature.values[0],
                                       maxx=indicator_series.max_feature.values[0],
                                       x=feature_val)

        # NOTE(review): the filter compares orig_TS_ID with the row label of `df`
        # and does not restrict on row.algo. If `df` is a result table with a
        # running index, `row.orig_TS_ID` and `row.algo` were probably intended - confirm.
        hit = df.loc[(df.compression==cr)&(df.orig_TS_ID==index)]
        if not hit.empty:
            selected.append(hit)

    if not selected:
        return pd.DataFrame(columns=list(df.columns.values))
    return pd.concat(selected, ignore_index=True)

# Analysis

## Window Size - based Compression Ratio

### TSSB

In [None]:
# Mean score per algorithm and compression ratio on TSSB; shown inline and exported as .svg and .png.
score_heatmap(tssb_res, "TSSB", title='(a) TSSB', filepath="results/figures/heatmap_meanScore_perAlgo_compRatio_TSSB.svg")

In [None]:
# Add, per result row, the reference score of the series minus the row's own score.
# This mutates tssb_res in place; later cells see the extra column.
tssb_res['diff_scores'] = tssb_res.apply(add_score_diffs_tssb,axis=1)
# Compression ratios to plot, rounded to one decimal to remove float drift from np.arange.
# NOTE(review): with a float step, whether the stop value 0.8 is included depends on rounding - verify.
crs_lim=np.round(np.arange(0.2,0.8,0.1),1)
# One figure per ratio; the ratio is embedded in the file name without the decimal point.
for cr in crs_lim:
    plot_ws_differences(tssb_res, cr, filepath="results/figures/optimalCR/WS_scoreDependency_CR" + str(cr).replace('.','') + "_TSSB.svg")

In [None]:
# Score the window-size thresholds 20, 25, 30 and 35 on TSSB and save the table to Excel.
w_thresholds_tssb=get_w_thresholds_general([20,36], 5, tssb, tssb_res, scale=0, save_path='results/tables/window_size_threshold_TSSB.xlsx')
w_thresholds_tssb

### HASC

In [None]:
# Mean score per algorithm and compression ratio on HASC; shown inline and exported as .svg and .png.
score_heatmap(hasc_res, "HASC", title='(b) HASC', filepath="results/figures/heatmap_meanScore_perAlgo_compRatio_HASC.svg")

In [None]:
# Add, per result row, the reference score of the series minus the row's own score.
# This mutates hasc_res in place; later cells see the extra column.
hasc_res['diff_scores'] = hasc_res.apply(add_score_diffs_hasc,axis=1)
# Compression ratios to plot, rounded to one decimal to remove float drift from np.arange.
# NOTE(review): with a float step, whether the stop value 0.8 is included depends on rounding - verify.
crs_lim=np.round(np.arange(0.2,0.8,0.1),1)
# One figure per ratio; the ratio is embedded in the file name without the decimal point.
for cr in crs_lim:
    plot_ws_differences(hasc_res, cr, filepath="results/figures/optimalCR/WS_scoreDependency_CR" + str(cr).replace('.','') + "_HASC.svg")

In [None]:
# Score the window-size thresholds 30, 40, ..., 90 on HASC (scale=1, ubound=True) and save the table to Excel.
w_thresholds_hasc = get_w_thresholds_general([30,100], 10, hasc, hasc_res, scale=1, 
                                             save_path='results/tables/window_size_threshold_HASC.xlsx', ubound=True)
w_thresholds_hasc

## Feature - based Compression Ratio

### TSSB

In [None]:
# Best-scoring result row for every (time series, algorithm) pair in TSSB.
max_scores_tssb=max_scores_ts_algo(tssb_res)
max_scores_tssb

In [None]:
# Attach the time-series features to each best-scoring TSSB row and cache the table as a pickle.
# `features_cr` is reused (overwritten) by the HASC cell further down.
features_cr = create_feature_df(max_scores_tssb,tssb)
df_features_cr_tssb = pd.DataFrame(features_cr, columns=columns)
df_features_cr_tssb.to_pickle("results/zwischenergebnisse/cf_score_features_TSSB.pkl")
df_features_cr_tssb

In [None]:
# Correlation between each feature and the best compression ratio, per algorithm (TSSB); also saved to Excel.
corr_mat_tssb = create_corr_matrix_features(df_features_cr_tssb, save_path="results/tables/featureBasedCR_featureCorrelations_TSSB.xlsx")
corr_mat_tssb

### HASC

In [None]:
# Best-scoring result row for every (time series, algorithm) pair in HASC.
max_scores_hasc=max_scores_ts_algo(hasc_res)
max_scores_hasc

In [None]:
# Attach the time-series features to each best-scoring HASC row and cache the table as a pickle.
# This overwrites `features_cr` from the TSSB cell.
features_cr = create_feature_df(max_scores_hasc,hasc)
df_features_cr_hasc = pd.DataFrame(features_cr, columns=columns)
df_features_cr_hasc.to_pickle("results/zwischenergebnisse/cf_score_features_HASC.pkl")
df_features_cr_hasc

In [None]:
# Correlation between each feature and the best compression ratio, per algorithm (HASC); also saved to Excel.
corr_mat_hasc = create_corr_matrix_features(df_features_cr_hasc, save_path="results/tables/featureBasedCR_featureCorrelations_HASC.xlsx")
corr_mat_hasc

## Result Summary

In [None]:
# Strongest-correlating feature per dataset and algorithm.
# create_indicator_matrix reads corr_mat_tssb / corr_mat_hasc and df_features_cr_tssb / df_features_cr_hasc
# from the notebook namespace, so the four feature/correlation cells above must have run first.
indicator_matrix=create_indicator_matrix(filename='feature_indicator_matrix')
indicator_matrix

### TSSB

In [None]:
# For each row of w_thresholds_tssb (mean / median / std) pick the threshold column with the highest value.
# NOTE(review): the resulting dict is keyed by 'mean' / 'median' / 'std', while optimal_cr_w_based_summary
# also looks up one entry per algorithm name - confirm per-algorithm thresholds are supplied.
best_thresholds_tssb = get_best_thresholds_per_algo(w_thresholds_tssb)
best_thresholds_tssb

In [None]:
# TSSB result rows selected with the shared ('mean') threshold and with the per-algorithm thresholds.
tssb_cr_wBased_df, tssb_cr_wBased_algo_df = optimal_cr_w_based_summary(tssb,tssb_res,best_thresholds_tssb,scale=0)

In [None]:
# TSSB result rows selected with the feature-based compression ratio.
tssb_cr_ftBased_df = optimal_cr_feature_based(tssb_res, 'TSSB', indicator_matrix)

In [None]:
# Mean score per algorithm for every compression-ratio strategy on TSSB, with 0.5 as the fixed
# comparison ratio. The Excel file is written by cr_score_matrix before the 'mean' row is added.
score_matrix_tssb_crs = cr_score_matrix(tssb_res, tssb_cr_wBased_df, tssb_cr_wBased_algo_df, tssb_cr_ftBased_df, 0.5, 
                                        filepath="results/tables/compression_approaches_scores_TSSB.xlsx"
                                        )
# Append the column-wise mean over all algorithms as an extra row labelled 'mean'.
score_matrix_tssb_crs.loc['mean'] = score_matrix_tssb_crs.mean()
score_matrix_tssb_crs

In [None]:
# Summary statistics across algorithms, excluding LTD. The 'mean' row appended to the score
# matrix in the previous cell is an aggregate rather than an algorithm, so it is removed too;
# otherwise it would be counted in the mean / median / std below.
score_matrix_tssb_crs_noltd = score_matrix_tssb_crs.drop('LTD').drop('mean', errors='ignore')
summaries=pd.DataFrame(columns=score_matrix_tssb_crs_noltd.columns.values)
summaries.loc['mean'] = score_matrix_tssb_crs_noltd.mean()
summaries.loc['median'] = score_matrix_tssb_crs_noltd.median()
summaries.loc['std'] = score_matrix_tssb_crs_noltd.std()
summaries.to_excel("results/tables/summmary_statistics_compression_approaches_scores_TSSB.xlsx")
summaries

### HASC

In [None]:
# For each row of w_thresholds_hasc (mean / median / std) pick the threshold column with the highest value.
# NOTE(review): the resulting dict is keyed by 'mean' / 'median' / 'std', while optimal_cr_w_based_summary
# also looks up one entry per algorithm name - confirm per-algorithm thresholds are supplied.
best_thresholds_hasc = get_best_thresholds_per_algo(w_thresholds_hasc)
best_thresholds_hasc

In [None]:
# Silence all warnings for the rest of the session (process-wide filter).
import warnings
warnings.filterwarnings('ignore')
# HASC result rows selected with the shared ('mean') threshold and with the per-algorithm thresholds.
hasc_cr_wBased_df,hasc_cr_wBased_algo_df = optimal_cr_w_based_summary(hasc,hasc_res,best_thresholds_hasc,scale=1,ubound=True)

In [None]:
# HASC result rows selected with the feature-based compression ratio.
hasc_cr_ftBased_df = optimal_cr_feature_based(hasc_res, 'HASC', indicator_matrix)

In [None]:
# Mean score per algorithm for every compression-ratio strategy on HASC, with 0.5 as the fixed
# comparison ratio. The Excel file is written by cr_score_matrix before the 'mean' row is added.
score_matrix_hasc_crs = cr_score_matrix(hasc_res, hasc_cr_wBased_df, hasc_cr_wBased_algo_df, hasc_cr_ftBased_df, 0.5,
                                       filepath="results/tables/compression_approaches_scores_HASC.xlsx"
                                       )
# Append the column-wise mean over all algorithms as an extra row labelled 'mean'.
score_matrix_hasc_crs.loc['mean'] = score_matrix_hasc_crs.mean()
score_matrix_hasc_crs

In [None]:
# Summary statistics across algorithms, excluding LTD. The 'mean' row appended to the score
# matrix in the previous cell is an aggregate rather than an algorithm, so it is removed too;
# otherwise it would be counted in the mean / median / std below.
score_matrix_hasc_crs_noltd = score_matrix_hasc_crs.drop('LTD').drop('mean', errors='ignore')
summaries=pd.DataFrame(columns=score_matrix_hasc_crs_noltd.columns.values)
summaries.loc['mean'] = score_matrix_hasc_crs_noltd.mean()
summaries.loc['median'] = score_matrix_hasc_crs_noltd.median()
summaries.loc['std'] = score_matrix_hasc_crs_noltd.std()
summaries.to_excel("results/tables/summmary_statistics_compression_approaches_scores_HASC.xlsx")
summaries