In [None]:
#import standard libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scikit_posthocs as sp
import importlib 
from tqdm import tqdm

In [None]:
#custom function files
import plotter as pt
import mt_utils as ut

In [None]:
# Re-import the custom `plotter` module (alias `pt`) so that edits made to it
# are picked up without restarting the kernel.
importlib.reload(pt)

In [None]:
# Re-import the custom `mt_utils` module (alias `ut`) so that edits made to it
# are picked up without restarting the kernel.
importlib.reload(ut) 

In [None]:
#import claspy
from claspy.data_loader import load_tssb_dataset
from claspy.data_loader import load_has_dataset
from claspy.segmentation import BinaryClaSPSegmentation

In [None]:
# Load both benchmark collections through claspy's data loaders. The cells
# below iterate over these frames row by row and read the `time_series` and
# `cps` fields of every row.
tssb = load_tssb_dataset()
hasc = load_has_dataset()

In [None]:
# compression ratios 0.1, 0.2, ..., 0.9; rounding to one decimal removes the
# floating-point drift of np.arange (e.g. 0.30000000000000004), which matters
# because later cells filter with an exact `compression == 0.5`
crs = np.round(np.arange(0.1,1,0.1),1)
# names of the downsampling algorithms, passed as-is to ut.downsample_splits
algos = ["MinMax","MinMaxLTTB","M4","LTTB", "EveryNth","LTD"]
# column layout of the tuples produced by prepare_splits
columns = ['Algorithm', 'compression', 'orig_TS_ID', 'Time_Series', 'True_CPS']
# column layout of the tuples produced by clasp_segmentation
columns_clasp = ['Algorithm', 'compression', 'window_size', 'orig_TS_ID', 'Time_Series', 'True_CPS', 'predicted_cps', 'score', 'runtime']

# Splitting

In [None]:
# split time series in datasets at change points
def prepare_splits(dataset):
    """Downsample every time series of ``dataset`` for each algorithm and ratio.

    Each row's series is passed to ``ut.split_ts`` together with the row's
    change points. Every returned split is downsampled with
    ``ut.downsample_splits`` and the downsampled pieces are stacked back into
    a single array. The algorithms and compression ratios are taken from the
    module-level ``algos`` and ``crs``.

    Args:
        dataset: Frame-like object supporting ``iterrows()`` and ``len()``
            whose rows expose ``time_series`` (a NumPy array) and ``cps``.

    Returns:
        A list of tuples ``(algo, cr, index, ds_ts, ds_cps)`` with one entry
        per row, algorithm and compression ratio. ``ds_cps`` is the value
        returned by ``ut.downsample_splits`` for the *last* split of the row.

    Raises:
        ValueError: If ``ut.split_ts`` yields no splits for a row. Previously
            this either failed with an unbound ``ds_cps`` or silently reused
            the change points of the preceding combination.
    """
    results = []
    for index, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Processing rows"):
        is_multivariate = row.time_series.ndim > 1

        if is_multivariate:
            # multivariate TS: transpose before splitting
            # (presumably stored as (time, channels) -- TODO confirm)
            splits = ut.split_ts(np.transpose(row.time_series), row.cps)
        else:
            # univariate TS: add a leading axis so split_ts also receives a 2-D array
            splits = ut.split_ts(np.expand_dims(row.time_series, axis=0), row.cps)

        # downsample every split with every algorithm / compression ratio
        for algo in algos:
            for cr in crs:
                ds_ts_list = []
                ds_cps = None

                for split in splits:
                    ds_ts, ds_cps = ut.downsample_splits(algo, split, cr)
                    ds_ts_list.append(ds_ts)

                if not ds_ts_list:
                    raise ValueError(
                        f"No splits to downsample for time series {index!r} "
                        f"(algorithm={algo!r}, compression={cr!r})"
                    )

                # drop singleton axes (e.g. the leading axis of a univariate series)
                ds_ts = np.array(ds_ts_list).squeeze()
                if is_multivariate:
                    # undo the transpose applied before splitting
                    ds_ts = np.transpose(ds_ts)

                results.append((algo, cr, index, ds_ts, ds_cps))
    return results

In [None]:
# Downsample every TSSB series with each algorithm / compression ratio.
results_tssb = prepare_splits(tssb)

In [None]:
# Wrap the result tuples in a frame and persist it as an intermediate result
# ("zwischenergebnisse" is German for "intermediate results").
results_df_tssb = pd.DataFrame(results_tssb, columns=columns)
results_df_tssb.to_pickle("results/zwischenergebnisse/1_downsampled_splitting_tssb.pkl")
# NOTE(review): the bare expression renders the whole frame; `.head()` would
# keep the cell output small.
results_df_tssb

In [None]:
# Downsample every HASC series with each algorithm / compression ratio.
results_hasc = prepare_splits(hasc)

In [None]:
# Wrap the result tuples in a frame and persist it as an intermediate result
# ("zwischenergebnisse" is German for "intermediate results").
results_df_hasc = pd.DataFrame(results_hasc, columns=columns)
results_df_hasc.to_pickle("results/zwischenergebnisse/1_downsampled_splitting_hasc.pkl")
# NOTE(review): the bare expression renders the whole frame; `.head()` would
# keep the cell output small.
results_df_hasc

# ClaSP on split data

In [None]:
def clasp_segmentation(dataset, downsampled=True):
    """Run ``ut.evaluate_clasp`` on every row of ``dataset``.

    Args:
        dataset: Frame-like object supporting ``iterrows()`` and ``len()``.
            With ``downsampled=True`` its rows must expose ``Algorithm``,
            ``compression``, ``orig_TS_ID``, ``Time_Series`` and ``True_CPS``;
            with ``downsampled=False`` they must expose ``time_series`` and
            ``cps``.
        downsampled: Selects which of the two row layouts is read.

    Returns:
        A list of tuples ``(algorithm, compression, window_size, orig_TS_ID,
        time_series, true_cps, predicted_cps, score, runtime)``. Rows of an
        original dataset are labelled with algorithm ``'None'`` and
        compression ``1`` and use the row index as ``orig_TS_ID``.
    """
    import warnings

    results = []
    # Silence warnings only while the segmentation runs. A bare
    # simplefilter("ignore") would stay active for the rest of the kernel
    # session and hide warnings raised by every later cell.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for index, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Processing rows"):
            if downsampled:
                w, pred_cps, score, runtime = ut.evaluate_clasp(row.Time_Series, row.True_CPS)
                results.append((row.Algorithm, row.compression, w, row.orig_TS_ID, row.Time_Series, row.True_CPS, pred_cps, score, runtime))
            else:
                w, pred_cps, score, runtime = ut.evaluate_clasp(row.time_series, row.cps)
                results.append(('None', 1, w, index, row.time_series, row.cps, pred_cps, score, runtime))
    return results

## TSSB

In [None]:
# ClaSP on every downsampled TSSB series.
results_clasp_tssb = clasp_segmentation(results_df_tssb)

In [None]:
# ClaSP on the original (not downsampled) TSSB series; these rows serve as the
# baseline and are labelled Algorithm='None', compression=1.
results_clasp_original = clasp_segmentation(tssb, False)

In [None]:
# Build one frame holding the ClaSP results for the downsampled series
# followed by the results for the original series.
results_clasp_df_tssb = pd.DataFrame(results_clasp_tssb, columns=columns_clasp)
results_clasp_df_tssb_orig = pd.DataFrame(results_clasp_original, columns=columns_clasp)

# NOTE(review): clasp_segmentation already copies orig_TS_ID from every input
# row, so this index-aligned reassignment looks redundant; the HASC cell has
# no counterpart. Confirm before removing.
results_clasp_df_tssb.orig_TS_ID = results_df_tssb.orig_TS_ID
results_clasp_df_tssb_complete = pd.concat([results_clasp_df_tssb,results_clasp_df_tssb_orig]).reset_index(drop=True)

# persist as an intermediate result ("zwischenergebnisse" = intermediate results)
results_clasp_df_tssb_complete.to_pickle("results/zwischenergebnisse/1a_clasp_splitting_tssb.pkl")
results_clasp_df_tssb_complete

## HASC

In [None]:
# ClaSP on every downsampled HASC series.
results_clasp_hasc = clasp_segmentation(results_df_hasc)

In [None]:
# ClaSP on the original (not downsampled) HASC series; these rows serve as the
# baseline and are labelled Algorithm='None', compression=1.
results_clasp_original_hasc = clasp_segmentation(hasc, False)

In [None]:
# Build one frame holding the ClaSP results for the downsampled series
# followed by the results for the original series.
results_clasp_df_hasc = pd.DataFrame(results_clasp_hasc, columns=columns_clasp)
results_clasp_df_hasc_orig = pd.DataFrame(results_clasp_original_hasc, columns=columns_clasp)

results_clasp_df_hasc_complete = pd.concat([results_clasp_df_hasc,results_clasp_df_hasc_orig]).reset_index(drop=True)

# persist as an intermediate result ("zwischenergebnisse" = intermediate results)
results_clasp_df_hasc_complete.to_pickle("results/zwischenergebnisse/1a_clasp_splitting_hasc.pkl")
results_clasp_df_hasc_complete

# Analysis

## Functions

In [None]:
def group_df(df, column):
    """Aggregate the numeric columns of ``df`` per value of ``column``.

    Only numeric columns are aggregated: the result frames of this notebook
    also carry strings and arrays (algorithm name, time series, change
    points), for which a mean or median is undefined and which make
    pandas >= 2.0 raise ``TypeError`` unless ``numeric_only=True`` is given.

    Args:
        df: Frame to aggregate.
        column: Name of the column to group by.

    Returns:
        Tuple ``(mean, median)`` of two frames indexed by the values of
        ``column``.
    """
    grouped = df.groupby([column])
    return grouped.mean(numeric_only=True), grouped.median(numeric_only=True)

In [None]:
def plot_clasp_per_compression(df, title, y_title_pre, filepath=None):
    """Plot score and runtime against the index of ``df`` on two y-axes.

    The score is drawn on the primary axis and the runtime on the secondary
    axis. Vertical lines mark the index positions of the three highest scores.

    Args:
        df: Frame with ``score`` and ``runtime`` columns; its index supplies
            the x values (labelled "Compression Ratio").
        title: Figure title.
        y_title_pre: Prefix for both y-axis titles, e.g. ``'Mean'``.
        filepath: If truthy, the figure is additionally written to this path.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    # make_subplots already returns the figure, so no separate go.Figure() is needed
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    x_values = df.index.values

    fig.add_trace(go.Scatter(x=x_values, y=df.score,
                        mode='lines',
                        line=dict(color='black', width=2),
                        name='Score'),
                 secondary_y=False
                 )

    fig.add_trace(go.Scatter(x=x_values, y=df.runtime,
                        mode='lines',
                        line=dict(color='grey', width=2, dash='dot'),
                        name='Runtime'),
                 secondary_y=True
                 )

    for rank, label in enumerate(("Top Score", "2nd Top Score", "3rd Top Score"), start=1):
        # nlargest(rank).index[-1] is the index label of the rank-th best score;
        # with fewer than `rank` rows it falls back to the lowest available one
        fig.add_vline(x=df['score'].nlargest(rank).index[-1], line_width=1, line_color="black",
                      annotation_text=label, annotation_position="top")

    fig.update_yaxes(title_text= y_title_pre + " Covering Score", secondary_y=False)
    fig.update_yaxes(title_text= y_title_pre + " Runtime in Seconds", secondary_y=True)

    fig.update_xaxes(title_text="Compression Ratio")

    fig.update_layout(height=300,
                      font=dict(
                          family="Arial",
                          size=12,
                          color="black"
                      ),
                      title=dict(
                          text=title
                      ),
                      margin=dict(l=0, r=0, t=30, b=0),
                      legend=dict(
                          yanchor="top",
                          y=0.99,
                          xanchor="left",
                          x=0.01
                      )
                     )

    if filepath:
        fig.write_image(filepath, scale=1, width=1000, height=300)

    fig.show()

In [None]:
def critical_distance(df, title=None, filepath=None):
    """Draw a critical difference diagram of the algorithms' scores.

    The average rank per algorithm is the mean of the percentile ranks of
    ``score`` within each ``orig_TS_ID`` group. The pairwise p-values come
    from ``sp.posthoc_nemenyi_friedman``.

    Note: this sets ``font.family`` and ``font.size`` in the global matplotlib
    ``rcParams``, so the settings persist after the call.

    Args:
        df: Long-format frame with ``orig_TS_ID``, ``Algorithm`` and ``score``
            columns.
        title: Optional title, placed at the top left of the figure.
        filepath: If truthy, the figure is additionally saved to this path.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    avg_rank = df.groupby('orig_TS_ID').score.rank(pct=True).groupby(df.Algorithm).mean()
    p_vals = sp.posthoc_nemenyi_friedman(
        df,
        melted=True,
        block_col='orig_TS_ID',
        group_col='Algorithm',
        y_col='score',
    )

    fig = plt.figure(figsize=(10, 2), dpi=100)
    plt.rcParams["font.family"] = "Arial"
    plt.rcParams["font.size"] = "12"
    font = {'family':'arial','color':'black','size':12}
    plt.title(title, loc = 'left', fontdict=font, y=1.2)

    sp.critical_difference_diagram(avg_rank, p_vals,
                               label_props={'color': 'black'},
                               elbow_props={'color': 'gray'}
                              )

    if filepath:
        # `scale` is an argument of plotly's write_image, not of matplotlib's
        # savefig; depending on the matplotlib version it is ignored or
        # rejected, so it is not passed here.
        fig.savefig(filepath, bbox_inches='tight')

    plt.show()

## TSSB

In [None]:
# Mean and median of the TSSB results per compression ratio (the original
# series are stored with compression 1).
grouped_tssb_mean, grouped_tssb_median = group_df(results_clasp_df_tssb_complete, 'compression')

In [None]:
# Mean score and runtime per compression ratio on TSSB; also written to disk as SVG.
plot_clasp_per_compression(grouped_tssb_mean, title='(a) TSSB', y_title_pre='Mean', filepath="results/figures/mean_scores_runtimes_per_CR_TSSB.svg")

In [None]:
# Median score and runtime per compression ratio on TSSB (no title); also written to disk as SVG.
plot_clasp_per_compression(grouped_tssb_median, title='', y_title_pre='Median', filepath="results/figures/median_scores_runtimes_per_CR_TSSB.svg")

In [None]:
# Compare the algorithms at compression ratio 0.5 against the original series
# (rows labelled Algorithm == 'None').
crit_dist = pd.concat([results_clasp_df_tssb_complete[results_clasp_df_tssb_complete.compression==0.5],
                       results_clasp_df_tssb_complete[results_clasp_df_tssb_complete.Algorithm=='None']]).reset_index(drop=True)
# NOTE(review): this calls the implementation in the `plotter` module, not the
# `critical_distance` function defined in this notebook -- confirm which of
# the two is intended.
pt.critical_distance(crit_dist, title='(a) TSSB', filepath='results/figures/critical_distance_splitting_orig_scores_TSSB.svg')

## HASC

In [None]:
# Mean and median of the HASC results per compression ratio (the original
# series are stored with compression 1).
grouped_hasc_mean,grouped_hasc_median = group_df(results_clasp_df_hasc_complete, 'compression')

In [None]:
# Mean score and runtime per compression ratio on HASC; also written to disk as SVG.
plot_clasp_per_compression(grouped_hasc_mean, title='(b) HASC', y_title_pre='Mean', filepath="results/figures/mean_scores_runtimes_per_CR_HASC.svg")

In [None]:
# Median score and runtime per compression ratio on HASC (no title); also written to disk as SVG.
plot_clasp_per_compression(grouped_hasc_median, title='', y_title_pre='Median', filepath="results/figures/median_scores_runtimes_per_CR_HASC.svg")

In [None]:
# Compare the algorithms at compression ratio 0.5 against the original series
# (rows labelled Algorithm == 'None'). This rebinds the `crit_dist` name that
# the TSSB cell also assigns.
crit_dist = pd.concat([results_clasp_df_hasc_complete[results_clasp_df_hasc_complete.compression==0.5],
                       results_clasp_df_hasc_complete[results_clasp_df_hasc_complete.Algorithm=='None']]).reset_index(drop=True)
# NOTE(review): this calls the implementation in the `plotter` module, not the
# `critical_distance` function defined in this notebook -- confirm which of
# the two is intended.
pt.critical_distance(crit_dist, title='(b) HASC', filepath='results/figures/critical_distance_splitting_orig_scores_HASC.svg')

# Runtime

In [None]:
# NOTE(review): a cell renders only its last bare expression, so the TSSB
# frame on the next line is evaluated but never shown; use display(...) for
# both or split the cell if both previews are wanted.
results_clasp_df_tssb_complete
results_clasp_df_hasc_complete

## Functions

In [None]:
def add_num_found_cps(row):
    """Return the number of predicted change points stored in ``row``.

    Intended as a row-wise callback (``DataFrame.apply(..., axis=1)``); reads
    ``row.predicted_cps``.
    """
    predicted = row.predicted_cps
    return len(predicted)

In [None]:
def add_num_true_cps(row):
    """Return the number of annotated (true) change points stored in ``row``.

    Intended as a row-wise callback (``DataFrame.apply(..., axis=1)``); reads
    ``row.True_CPS``.
    """
    annotated = row.True_CPS
    return len(annotated)

In [None]:
def add_num_cp_diffs(row):
    """Return predicted minus true change point count for ``row``.

    Reads ``row.num_pred_cps`` and ``row.num_true_cps``. A positive result
    means more change points were predicted than annotated.
    """
    predicted_count, true_count = row.num_pred_cps, row.num_true_cps
    return predicted_count - true_count

In [None]:
def cp_differences_comparison_matrix(df, save_path=None):
    """Summarise ``num_pred_cps`` per compression ratio.

    Computes mean, median and standard deviation of the frame grouped by
    ``compression``, joins the three results on the compression ratio and
    transposes the outcome. The compression ratios end up as columns and the
    statistics (``mean``, ``median``, ``std``) as rows.

    Args:
        df: Frame with a ``compression`` column and a ``num_pred_cps`` column.
        save_path: If truthy, the resulting table is also written to this
            Excel file.

    Returns:
        The transposed summary frame.
    """
    per_statistic = []
    for stat in ('mean', 'median', 'std'):
        aggregated = getattr(df.groupby(by=['compression']), stat)()
        # label the aggregated column with the statistic it holds
        per_statistic.append(aggregated.rename(columns={'num_pred_cps': stat}))

    out = per_statistic[0]
    for other in per_statistic[1:]:
        out = out.merge(other, left_on='compression', right_on='compression')
    out = out.T

    if save_path:
        out.to_excel(save_path)
    return out

In [None]:
def plot_numCPS_diffs(df_tssb, df_hasc, filepath=None):
    """Bar chart of the mean number of predicted change points per ratio.

    One bar series is drawn per dataset (TSSB, HASC). The bar heights come
    from the ``'mean'`` row, the error bars from the ``'std'`` row, and the x
    positions from the frames' columns.

    Args:
        df_tssb: Summary frame for TSSB with ``'mean'`` and ``'std'`` rows and
            one column per compression ratio.
        df_hasc: Summary frame for HASC with the same layout.
        filepath: If truthy, the figure is additionally written to this path.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    base_layout = {
        "xaxis": {"categoryorder": "category descending"},
        "scattermode": "group",
    }
    fig = go.Figure(layout=base_layout)

    # one bar trace per dataset, TSSB first
    for label, stats in (("TSSB", df_tssb), ("HASC", df_hasc)):
        bar = go.Bar(
            name=label,
            x=stats.columns.values,
            y=stats.loc['mean'].values,
            error_y={"type": "data", "array": stats.loc['std'].values},
        )
        fig.add_trace(bar)

    fig.update_yaxes(title_text="Amount of predicted change points")
    fig.update_xaxes(title_text="Compression Ratio")

    font_style = {"family": "Arial", "size": 12, "color": "black"}
    legend_style = {"yanchor": "top", "y": 0.99, "xanchor": "left", "x": 0.01}
    # tick positions follow the TSSB frame's columns
    tick_style = {"tickmode": 'array', "tickvals": df_tssb.columns.values}
    fig.update_layout(height=400, font=font_style, legend=legend_style, xaxis=tick_style)

    if filepath:
        fig.write_image(filepath, scale=1, width=1000, height=400)

    fig.show()

## TSSB

In [None]:
# Add per-row change point counts to the TSSB result frame (the frame is
# extended in place): number predicted, number annotated, and their
# difference (predicted minus true). The third line depends on the two
# columns created by the first two.
results_clasp_df_tssb_complete['num_pred_cps'] = results_clasp_df_tssb_complete.apply(add_num_found_cps,axis=1)
results_clasp_df_tssb_complete['num_true_cps'] = results_clasp_df_tssb_complete.apply(add_num_true_cps,axis=1)
results_clasp_df_tssb_complete['diff_pred_true'] = results_clasp_df_tssb_complete.apply(add_num_cp_diffs,axis=1)

In [None]:
# Mean / median / std of the number of predicted change points per compression
# ratio on TSSB; the table is also exported to Excel.
cps_tssb=cp_differences_comparison_matrix(results_clasp_df_tssb_complete[['compression','num_pred_cps']], 
                                          save_path='results/tables/NumCPS_per_compression_tssb.xlsx')
cps_tssb

## HASC

In [None]:
# Add per-row change point counts to the HASC result frame (the frame is
# extended in place): number predicted, number annotated, and their
# difference (predicted minus true). The third line depends on the two
# columns created by the first two.
results_clasp_df_hasc_complete['num_pred_cps'] = results_clasp_df_hasc_complete.apply(add_num_found_cps,axis=1)
results_clasp_df_hasc_complete['num_true_cps'] = results_clasp_df_hasc_complete.apply(add_num_true_cps,axis=1)
results_clasp_df_hasc_complete['diff_pred_true'] = results_clasp_df_hasc_complete.apply(add_num_cp_diffs,axis=1)

In [None]:
# Mean / median / std of the number of predicted change points per compression
# ratio on HASC; the table is also exported to Excel.
cps_hasc=cp_differences_comparison_matrix(results_clasp_df_hasc_complete[['compression','num_pred_cps']], 
                                          save_path='results/tables/NumCPS_per_compression_hasc.xlsx')
cps_hasc

## Merged Results

In [None]:
# Side-by-side bars of the mean number of predicted change points per
# compression ratio for both datasets, with the std as error bars.
plot_numCPS_diffs(cps_tssb, cps_hasc, filepath='results/figures/Numpredicted_cps_perCR.svg')