# ProcessM

In [87]:
# Imports & params
import pandas as pd
import scipy.stats

# Directory that holds the result files read by the cells below.
path = "/Users/processm/Projects/processm/im-online"

# Event-log names; each one is the last component of a result file name.
files = [
    'BPIC15_1',
    'BPIC15_1f',
    'BPIC15_2f',
    'BPIC15_3',
    'BPIC15_3f',
    'BPIC15_4',
    'BPIC15_4f',
    'BPIC15_5',
    'BPIC15_5f',
    'CoSeLoG_WABO_1',
    'CoSeLoG_WABO_3',
    'CoSeLoG_WABO_4',
    'CoSeLoG_WABO_5',
    'Hospital_log',
    'Receipt_phase_of_an_environmental_permit_application_process_WABO_CoSeLoG_project',
    'Sepsis_Cases-Event_Log',
]
# For a quick single-log run use: files = ['BPIC15_1']

# Both values below are spliced into the result file names of the table cell.
calcParameters = 'precision'  # 'precision' or 'fitness'
mode = 'test'  # 'train' or 'test'

# Number of plots per row in the grid cells.
gridColumns = 4

# Inclusive range of run numbers (the leading component of each file name).
minRunNumber = 1
maxRunNumber = 1

# File-name key of each method -> label printed in the table header.
methods = dict(
    online="OIM",
    offlinetrue="IM",
    offlinefalse="IMD",
)

window_sizes = [10, 20, 30, 40, 50, 75, 100]

# pgfplots \addplot options emitted verbatim for each method key.
plot_styles = dict(
    offlinetrue="color=blue, mark=*,",
    offlinefalse="color=purple, mark=triangle*,",
    online="color=red,mark=square*,",
)

## Tabele
### 1. Online vs Offline precyzja & dopasowanie

In [88]:
def print_stat(mean, other_means, std):
    """Print one LaTeX table cell holding ``mean`` and a TikZ bar for ``std``.

    The cell starts with ``&``. It is shaded green when ``mean`` is strictly
    greater than every value in ``other_means`` and red when it is strictly
    smaller than every one of them; otherwise (including when ``other_means``
    is empty) no shading is emitted. ``mean`` and ``std`` are both formatted
    with three decimals; ``std`` becomes the height of a vertical line drawn
    next to the number. The output ends with a newline.

    Args:
        mean: Value shown in the cell.
        other_means: Sequence of competing values used only for the shading.
        std: Height of the bar drawn after the number.
    """
    print("&", end="")
    # Nothing to compare against -> no highlight (max()/min() would raise).
    if len(other_means) > 0:
        if mean > max(other_means):
            print("\\cellcolor{green!50!white}", end='')
        elif mean < min(other_means):
            print("\\cellcolor{red!50!white}", end='')

    print('{:.3f} '.format(mean), end='')
    # Backslashes are escaped explicitly: "\d" and "\e" are invalid escape
    # sequences that newer Python versions warn about.
    print("\\begin{tikzpicture}[y=0.75em,baseline=0.5pt]\\draw[very thick] (0,0) -- (0,", end='')
    print('{:.3f}'.format(std), end='')
    print(");\\end{tikzpicture}")

# Table preamble: one "|ccc"-style column group per window size plus one for
# the trailing "Mean" group (one "c" per method).
method_count = len(methods)
group_count = len(window_sizes) + 1
print("\\begin{tabular}{l%s}%%" % (("|" + "c" * method_count) * group_count))
print("\\hline%")

# First header row: a multicolumn title per window size, then "Mean".
size_titles = "".join(
    "&\\multicolumn{%d}{c|}{%d}" % (method_count, size) for size in window_sizes
)
print("(\\textsc{a}) Window size" + size_titles, end="")
print("&\\multicolumn{%d}{c}{Mean} \\\\" % method_count)
print("\\hline%")

# Second header row: the method labels repeated once per column group.
method_titles = "".join("&%s" % label for label in methods.values())
print("Event log" + method_titles * group_count, end="")
print("\\\\")
print("\\hline%")

columns = ['window_size', 'step', 'current_pos', 'value']
# Frames read so far, per method and across all event logs; they are merged
# into total_raws once the loop below has finished.
collected_frames = {name: [] for name in methods.keys()}

for f in files:
    # Row label: first 14 characters of the log name, underscores escaped for LaTeX
    problem_label = f[:14].replace('_', '\\_')
    print("%s " % problem_label, end="")

    # Read the result files of every run and merge them per method
    statistics = {}
    for name in methods.keys():
        run_frames = []
        for run in range(minRunNumber, maxRunNumber+1):
            df = pd.read_csv(path + "/" + str(run) + "-" + name + "-" + mode + "-" + calcParameters + "-" + f, names=columns)
            df["problem"] = problem_label
            run_frames.append(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
        # pd.concat rejects an empty list, hence the fallback for an empty run range.
        raw = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame(columns=columns)
        collected_frames[name].append(raw)

        # Mean and std of 'value' per (window_size, step), rounded to 3 decimals
        stat = raw.groupby(['window_size', 'step'], as_index=False).agg({'value': ['mean', 'std']}).sort_values(by=['window_size', 'step'])
        stat['value', 'mean'] = stat['value', 'mean'].round(decimals=3)
        stat['value', 'std'] = stat['value', 'std'].round(decimals=3)

        statistics[name] = { "raw": raw, "stat": stat }

    # One cell per (window size, method); the first matching row is used
    for window_size in window_sizes:
        window_cells = {}
        for name in methods.keys():
            stat = statistics[name]["stat"]
            row = stat[stat["window_size"] == window_size]
            window_cells[name] = (row["value", "mean"].iloc[0], row["value", "std"].iloc[0])

        for name, (mean, std) in window_cells.items():
            other_means = [window_cells[other_name][0] for other_name in methods.keys() if other_name != name]
            print_stat(mean, other_means, std)

    # row-wide means: mean/std over all raw values of this event log
    row_summaries = {
        name: statistics[name]["raw"].agg({'value': ['mean', 'std']}).round(decimals=3)["value"]
        for name in methods.keys()
    }
    for name in methods.keys():
        mean = row_summaries[name].iloc[0]
        std = row_summaries[name].iloc[1]

        other_means = [row_summaries[other_name].iloc[0] for other_name in methods.keys() if other_name != name]
        print_stat(mean, other_means, std)

    print("\\\\")

# Per-method frame holding every row read above (used by the summary rows)
total_raws = {
    name: pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
    for name, frames in collected_frames.items()
}

def _rounded_value_agg(frame, funcs):
    """Aggregate the 'value' column of ``frame`` with ``funcs``, rounded to 3 decimals.

    Returns the resulting 'value' Series, one entry per function in ``funcs``.
    """
    return frame.agg({'value': funcs}).round(decimals=3)["value"]


print("\\hline%")

# "Mean" row, part 1: one cell per (window size, method) over all event logs
print("Mean")

for size in window_sizes:
    # Rows of every method restricted to the current window size
    window_frames = {
        name: total_raws[name][total_raws[name]["window_size"] == size]
        for name in methods
    }
    for name, frame in window_frames.items():
        summary = _rounded_value_agg(frame, ['mean', 'std'])
        rival_means = [
            _rounded_value_agg(other_frame, ['mean']).iloc[0]
            for other_name, other_frame in window_frames.items()
            if other_name != name
        ]
        print_stat(summary.iloc[0], rival_means, summary.iloc[1])

# "Mean" row, part 2: one cell per method over every window size and event log
for name in methods:
    summary = _rounded_value_agg(total_raws[name], ['mean', 'std'])
    rival_means = [
        _rounded_value_agg(total_raws[other_name], ['mean']).iloc[0]
        for other_name in methods
        if other_name != name
    ]
    print_stat(summary.iloc[0], rival_means, summary.iloc[1])

print("\\\\")

def _print_pvalue_cells(method_frames):
    """Print one p-value cell per frame in ``method_frames`` (no newline).

    Each frame is reduced to the mean 'value' per 'problem'. One frame is
    picked as the reference and gets an empty cell ("&"); every other frame
    gets the p-value of a Wilcoxon signed-rank test of its per-problem means
    against the reference's per-problem means, formatted with three decimals.
    """
    per_problem = [
        frame.groupby(["problem"], as_index=False).agg({"value": ["mean"]})
        for frame in method_frames
    ]

    # NOTE(review): the reference is chosen by comparing the FIRST row of each
    # grouped frame (i.e. a single problem's mean), not an overall mean --
    # confirm this is intended.
    leading_means = [grouped["value", "mean"].iloc[0] for grouped in per_problem]
    reference = leading_means.index(max(leading_means))

    # All tests are evaluated before anything is printed
    cells = []
    for position, grouped in enumerate(per_problem):
        if position == reference:
            cells.append("&")
            continue
        result = scipy.stats.wilcoxon(
            grouped["value", "mean"],
            per_problem[reference]["value", "mean"]
        )
        cells.append("&%.3f" % result.pvalue)
    print("".join(cells), end="")


# p-value row: one group of cells per window size ...
print("p-value", end="")
for size in window_sizes:
    _print_pvalue_cells(
        [total_raws[name][total_raws[name]["window_size"] == size] for name in methods]
    )

# ... followed by one group computed over all window sizes together
_print_pvalue_cells([total_raws[name] for name in methods])

print("\\\\", end="")

print("\\hline%")
print("\\end{tabular}")

\begin{tabular}{l|ccc|ccc|ccc|ccc|ccc|ccc|ccc|ccc}%
\hline%
(\textsc{a}) Window size&\multicolumn{3}{c|}{10}&\multicolumn{3}{c|}{20}&\multicolumn{3}{c|}{30}&\multicolumn{3}{c|}{40}&\multicolumn{3}{c|}{50}&\multicolumn{3}{c|}{75}&\multicolumn{3}{c|}{100}&\multicolumn{3}{c}{Mean} \\
\hline%
Event log&OIM&IM&IMD&OIM&IM&IMD&OIM&IM&IMD&OIM&IM&IMD&OIM&IM&IMD&OIM&IM&IMD&OIM&IM&IMD&OIM&IM&IMD\\
\hline%
BPIC15\_1 &0.058 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.064);\end{tikzpicture}
&0.058 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.064);\end{tikzpicture}
&\cellcolor{green!50!white}0.113 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.162);\end{tikzpicture}
&0.033 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.039);\end{tikzpicture}
&0.033 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.039);\end{tikzpicture}
&\cellcolor{green!50!white}0.079 \be

### 2. Grid - czas

In [94]:
# Emits a LaTeX tabular of pgfplots charts (one semilog-y chart per event log)
# showing the mean 'time' value per window size for each method.
print("\\pgfplotsset{every axis/.append style={%")
print("     enlarge x limits=false,%")
print("     enlarge y limits=false,%")
print("   }%")
print("}%")

print("\\tabcolsep=0.0em%")
print("\\begin{tabular}{rrrr}")

# i is the 1-based position of the chart, used to break the grid into rows
for i, f in enumerate(files, start=1):
    print("\\begin{tikzpicture}")
    print("\\begin{semilogyaxis}[")
    print("    xtick={10,20,30,40,50,75,100},")
    print("    grid style=dashed,")
    print("    ymajorgrids=true,")
    print("    xmajorgrids=true,")
    print("    height=0.20\\textheight,")
    # "\\_" spelled out: the bare "\_" is an invalid escape sequence
    print("    extra description/.code={\\node[anchor=west,font=\\scriptsize] at (0.0, 0.1) {", f[:14].replace("_", "\\_"), "};}")
    print("  ]")

    # Read the tab-separated stats files of every run, keeping only 'time' rows
    columns = ['parameter', 'alg', 'window_size', 'step', 'current_pos', 'value']
    time_frames = { name: [] for name in methods.keys() }

    for run in range(minRunNumber, maxRunNumber + 1):
        for name in methods.keys():
            # The online method's statistics live in "<run>-online-stats-<log>"
            filename = "%s/%s-%s-%s" % (path, str(run), "online-stats" if name == "online" else name, f)
            df = pd.read_csv(filename, sep='\t', names=columns)
            time_frames[name].append(df[df['parameter'] == 'time'])

    # DataFrame.append was removed in pandas 2.0; merge with pd.concat instead
    # (with a fallback because pd.concat rejects an empty list).
    column_types = {'parameter': 'string', 'alg': 'string', 'window_size': 'int32', 'step': 'int32', 'current_pos': 'int32', 'value': 'float64'}
    statistics = {
        name: (pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=columns)).astype(column_types)
        for name, parts in time_frames.items()
    }

    for (m, df) in statistics.items():
        # Mean value per (window_size, step), rounded to whole numbers
        group = df.groupby(['window_size', 'step'], as_index=False).agg({'value': 'mean'}).round(decimals=0)

        print("  \\addplot [")
        print(plot_styles[m])
        print("    dashed")
        print("  ]")
        print("  table {")
        for index, row in group.iterrows():
            print("  ", row['window_size'], row['value'])
        print("  };")
    print("\\end{semilogyaxis}")
    print("\\end{tikzpicture}")

    # End the tabular row after every gridColumns charts
    if i % gridColumns == 0:
        print("\\\\")
    else:
        print("&")
print("\\end{tabular}")

\pgfplotsset{every axis/.append style={%
     enlarge x limits=false,%
     enlarge y limits=false,%
   }%
}%
\tabcolsep=0.0em%
\begin{tabular}{rrrr}
\begin{tikzpicture}
\begin{semilogyaxis}[
    xtick={10,20,30,40,50,75,100},
    grid style=dashed,
    ymajorgrids=true,
    xmajorgrids=true,
    height=0.20\textheight,
    extra description/.code={\node[anchor=west,font=\scriptsize] at (0.0, 0.1) { BPIC15\_1 };}
  ]
  \addplot [
color=red,mark=square*,
    dashed
  ]
  table {
   10.0 14.0
   20.0 9.0
   30.0 5.0
   40.0 3.0
   50.0 3.0
   75.0 2.0
   100.0 2.0
  };
  \addplot [
color=blue, mark=*,
    dashed
  ]
  table {
   10.0 11.0
   20.0 9.0
   30.0 5.0
   40.0 4.0
   50.0 4.0
   75.0 4.0
   100.0 4.0
  };
  \addplot [
color=purple, mark=triangle*,
    dashed
  ]
  table {
   10.0 11.0
   20.0 9.0
   30.0 5.0
   40.0 4.0
   50.0 4.0
   75.0 4.0
   100.0 4.0
  };
\end{semilogyaxis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{semilogyaxis}[
    xtick={10,20,30,40,50,75,100},
  

### 3. Statystyka przebudowania drzew

In [93]:
# Emits a LaTeX tabular of pgfplots charts (one per event log) plotting the
# 'buildFromZero', 'rebuild' and 'ignored' counters against the window size.
print("\\pgfplotsset{every axis/.append style={%")
print("     enlarge x limits=false,%")
print("     enlarge y limits=false,%")
print("   }%")
print("}%")

print("\\tabcolsep=0.0em%")
print("\\begin{tabular}{rrrr}")

counters = ['buildFromZero', 'rebuild', 'ignored']
# pgfplots colour and mark of each counter's series
series_styles = {
    'buildFromZero': ("blue", "*"),
    'rebuild': ("green", "diamond*"),
    'ignored': ("red", "square*"),
}

# i is the 1-based position of the chart, used to break the grid into rows
for i, f in enumerate(files, start=1):
    print("\\begin{tikzpicture}")
    print("\\begin{axis}[")
    print("    xtick={10,20,30,40,50,75,100},")
    print("    grid style=dashed,")
    print("    ymajorgrids=true,")
    print("    xmajorgrids=true,")
    print("    height=0.20\\textheight,")
    # "\\_" spelled out: the bare "\_" is an invalid escape sequence
    print("    extra description/.code={\\node[anchor=east,font=\\scriptsize] at (1.0, 0.9) {", f[:14].replace("_", "\\_"), "};}")
    print("  ]")

    # Read every run's file and sum the counters per window size
    columns = ['window_size', 'step', 'buildFromZero', 'rebuild', 'ignored']
    run_totals = []
    for run in range(minRunNumber, maxRunNumber + 1):
        df = pd.read_csv(path + "/" + str(run) + "-online-extra-" + f, sep='\t', names=columns)
        run_totals.append(df.groupby(['window_size'], as_index=False).agg({name: 'sum' for name in counters}))

    # DataFrame.append was removed in pandas 2.0; merge with pd.concat instead
    # (with a fallback because pd.concat rejects an empty list).
    online = pd.concat(run_totals, ignore_index=True) if run_totals else pd.DataFrame(columns=columns)
    online = online.astype({'window_size': 'int32', 'buildFromZero': 'int32', 'rebuild': 'int32', 'ignored': 'int32'})

    # Average the per-run sums across runs. The rounded means are cast back to
    # integers so the table rows print as "10 210" rather than "10.0 210.0".
    online = online.groupby(['window_size'], as_index=False).agg({name: 'mean' for name in counters}).round(decimals=0)
    online = online.astype({name: 'int64' for name in counters})

    for m in counters:
        color, mark = series_styles[m]
        print("  \\addplot [")
        print("    color=%s," % color)
        print("    mark=%s," % mark)
        print("    dashed")
        print("  ]")
        print("  table {")
        # Plot the across-run means; the previous code iterated the last run's
        # frame only and never used the averaged data.
        for index, row in online.iterrows():
            print("  ", row['window_size'], row[m])
        print("  };")

    print("\\end{axis}")
    print("\\end{tikzpicture}")

    # End the tabular row after every gridColumns charts
    if i % gridColumns == 0:
        print("\\\\")
    else:
        print("&")

print("\\end{tabular}")

\pgfplotsset{every axis/.append style={%
     enlarge x limits=false,%
     enlarge y limits=false,%
   }%
}%
\tabcolsep=0.0em%
\begin{tabular}{rrrr}
\begin{tikzpicture}
\begin{axis}[
    xtick={10,20,30,40,50,75,100},
    grid style=dashed,
    ymajorgrids=true,
    xmajorgrids=true,
    height=0.20\textheight,
    extra description/.code={\node[anchor=east,font=\scriptsize] at (1.0, 0.9) { BPIC15\_1 };}
  ]
  \addplot [
    color=blue,
    mark=*,
    dashed
  ]
  table {
   10 210
   20 143
   30 120
   40 106
   50 95
   75 80
   100 66
  };
  \addplot [
    color=green,
    mark=diamond*,
    dashed
  ]
  table {
   10 336
   20 328
   30 298
   40 281
   50 267
   75 241
   100 229
  };
  \addplot [
    color=red,
    mark=square*,
    dashed
  ]
  table {
   10 83
   20 138
   30 171
   40 182
   50 187
   75 178
   100 154
  };
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xtick={10,20,30,40,50,75,100},
    grid style=dashed,
    ymajorgrids=true,
    xma