# ProcessM

In [29]:
# Imports & params
import pandas as pd
# Directory that holds every result file read by the cells below; file names
# are built as path + "/" + <run number> + <infix> + <dataset name>.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = "/Users/bgorka/Desktop/Projects/PolitechnikaPoznanska/processm"
# Longer list of dataset names kept for reference (currently disabled).
# files = ['CoSeLoG_WABO_1', 'CoSeLoG_WABO_2', 'BPIC15_1', 'BPIC15_1f', 'BPIC15_2', 'BPIC15_2f', 'BPIC15_3', 'BPIC15_3f', 'BPIC15_4', 'BPIC15_4f', 'BPIC15_5', 'Receipt_phase_of_an_environmental_permit_application_process_WABO_CoSeLoG_project', 'BPIC15_5f', 'Sepsis_Cases-Event_Log', 'CoSeLoG_WABO_3', 'CoSeLoG_WABO_4', 'CoSeLoG_WABO_5', 'Hospital_log']
# Dataset names to process; each one is the trailing part of a result file name
# and is looped over by every cell below.
files = ['BPIC15_1']
# Quality measure to tabulate; inserted verbatim into the result file names.
calcParameters = 'precision' # or 'fitness'
# Kept as a string because it is concatenated directly after "-offline" when
# building the offline file names.
offlineStatsMode = 'true' # or 'false'
# Data split to tabulate; inserted verbatim into the result file names.
mode = 'train' # or 'test'
# Number of plots per row in the generated LaTeX grids (a row break is emitted
# after every gridColumns-th plot).
gridColumns = 4
# First and last run number to read; both ends are inclusive.
minRunNumber = 1
maxRunNumber = 1

## Tabele
### 1. Online vs Offline precyzja & dopasowanie

In [22]:
def _load_runs(infix, dataset, columns):
    """Read one result file per run and stack them into a single dataframe.

    The file for a run is ``path + "/" + str(run) + infix + dataset``; runs go
    from ``minRunNumber`` to ``maxRunNumber`` inclusive. The files have no
    header row, so ``columns`` supplies the column names.
    """
    frames = [
        pd.read_csv(path + "/" + str(run) + infix + dataset, names=columns)
        for run in range(minRunNumber, maxRunNumber + 1)
    ]
    if not frames:
        # Empty run range: keep the original "empty frame" result instead of
        # letting pd.concat raise on an empty list.
        return pd.DataFrame(columns=columns)
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame once per run.
    return pd.concat(frames, ignore_index=True)


def _summarise(samples):
    """Return per-(window_size, step) statistics, indexed and sorted by those keys.

    Column ``mean`` is the mean of ``value`` rounded to 3 decimals. Column
    ``std`` is the bar height drawn next to it: the standard deviation rounded
    to 3 decimals, doubled and capped at 1.0; an undefined standard deviation
    (e.g. a single sample in the group) is drawn as a full-height bar of 1.0.
    """
    stats = samples.groupby(['window_size', 'step'])['value'].agg(['mean', 'std']).sort_index()
    stats['mean'] = stats['mean'].round(decimals=3)
    stats['std'] = stats['std'].round(decimals=3).fillna(1.0).apply(lambda x: min(x * 2, 1.0))
    return stats


def _print_cells(own, other):
    """Print one LaTeX table cell per row of ``own``, coloured against ``other``.

    A cell is red when its mean is lower than the mean of the same
    (window_size, step) configuration in ``other``, green when it is higher,
    and uncoloured when they are equal.
    """
    # Align on the (window_size, step) keys rather than on row position, so a
    # configuration missing from one side cannot shift the comparison onto a
    # different configuration. A missing counterpart becomes NaN, which
    # compares False both ways and therefore leaves the cell uncoloured.
    rival_means = other['mean'].reindex(own.index)
    for mean, bar, rival in zip(own['mean'], own['std'], rival_means):
        if mean < rival:
            colour = "\\cellcolor{red!50!white} "
        elif mean > rival:
            colour = "\\cellcolor{green!50!white} "
        else:
            colour = ""
        # Backslashes are doubled throughout: "\d" and "\e" are invalid escape
        # sequences in a normal string literal.
        print(
            colour
            + '{:.3f} '.format(mean)
            + "\\begin{tikzpicture}[y=0.75em,baseline=0.5pt]\\draw[very thick] (0,0) -- (0,"
            + '{:.3f}'.format(bar)
            + ");\\end{tikzpicture} &"
        )


for f in files:
    print('-#########  ', f, '  ###########-')

    # Read files and merge into single pandas dataframe
    columns = ['window_size', 'step', 'current_pos', 'value']
    online = _load_runs("-online-" + mode + "-" + calcParameters + "-", f, columns)
    offline = _load_runs("-offline" + offlineStatsMode + "-" + mode + "-" + calcParameters + "-", f, columns)

    # Calculate mean and std, rounded for display
    online = _summarise(online)
    offline = _summarise(offline)

    print('------------------ OFFLINE ---------------------------------')
    _print_cells(offline, online)

    print('------------------ ONLINE ---------------------------------')
    _print_cells(online, offline)

-#########   BPIC15_1   ###########-
------------------ OFFLINE ---------------------------------
\cellcolor{red!50!white} 0.038 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.004);\end{tikzpicture} &
------------------ ONLINE ---------------------------------
\cellcolor{green!50!white} 0.041 \begin{tikzpicture}[y=0.75em,baseline=0.5pt]\draw[very thick] (0,0) -- (0,0.032);\end{tikzpicture} &


### 2. Grid - czas

In [30]:
# Emit a LaTeX tabular whose cells are pgfplots charts (one per dataset) of the
# mean 'time' statistic per window size, online (red squares) vs offline (blue dots).
print("\\pgfplotsset{every axis/.append style={%")
print("     enlarge x limits=false,%")
print("     enlarge y limits=false,%")
print("   }%")
print("}%")

print("\\tabcolsep=0.0em%")
# One right-aligned column per plot in a row, so the column spec always matches
# the row break emitted after every gridColumns-th plot.
print("\\begin{tabular}{" + "r" * gridColumns + "}")

columns = ['parameter', 'alg', 'window_size', 'step', 'current_pos', 'value']
dtypes = {'parameter': 'string', 'alg': 'string', 'window_size': 'int32', 'step': 'int32', 'current_pos': 'int32', 'value': 'float64'}

for i, f in enumerate(files, start=1):
    print("\\begin{tikzpicture}")
    print("\\begin{semilogyaxis}[")
    print("    xtick={10,20,30,40,50,75,100},")
    print("    grid style=dashed,")
    print("    ymajorgrids=true,")
    print("    xmajorgrids=true,")
    print("    height=0.20\\textheight,")
    # "\\_" escapes underscores for LaTeX; a single backslash here would be an
    # invalid Python escape sequence.
    print("    extra description/.code={\\node[anchor=west,font=\\scriptsize] at (0.0, 0.1) {", f.replace("_", "\\_"), "};}")
    print("  ]")

    # Read the tab-separated stats files of every run, keeping only 'time' rows.
    online_runs = []
    offline_runs = []
    for run in range(minRunNumber, maxRunNumber + 1):
        df = pd.read_csv(path + "/" + str(run) + "-online-stats-" + f, sep='\t', names=columns)
        online_runs.append(df[df['parameter'] == 'time'])

        df = pd.read_csv(path + "/" + str(run) + "-offline" + offlineStatsMode + "-" + f, sep='\t', names=columns)
        offline_runs.append(df[df['parameter'] == 'time'])

    # DataFrame.append was removed in pandas 2.0, so merge with one concat per
    # variant; an empty run range falls back to an empty frame.
    online = pd.concat(online_runs, ignore_index=True) if online_runs else pd.DataFrame(columns=columns)
    offline = pd.concat(offline_runs, ignore_index=True) if offline_runs else pd.DataFrame(columns=columns)

    online = online.astype(dtypes)
    offline = offline.astype(dtypes)

    for (m, df) in [('online', online), ('offline', offline)]:
        # Mean value per (window_size, step), rounded to whole numbers.
        group = df.groupby(['window_size', 'step'], as_index=False).agg({'value': 'mean'}).round(decimals=0)

        print("  \\addplot [")

        if m == 'offline':
            print("    color=blue,")
            print("    mark=*,")
        else:
            print("    color=red,")
            print("    mark=square*,")

        print("    dashed")
        print("  ]")
        print("  table {")
        for index, row in group.iterrows():
            print("  ", row['window_size'], row['value'])
        print("  };")
    print("\\end{semilogyaxis}")
    print("\\end{tikzpicture}")

    # End the tabular row after every gridColumns-th plot, otherwise move to
    # the next cell.
    if i % gridColumns == 0:
        print("\\\\")
    else:
        print("&")
print("\\end{tabular}")

\pgfplotsset{every axis/.append style={%
     enlarge x limits=false,%
     enlarge y limits=false,%
   }%
}%
\tabcolsep=0.0em%
\begin{tabular}{rrrr}
\begin{tikzpicture}
\begin{semilogyaxis}[
    xtick={10,20,30,40,50,75,100},
    grid style=dashed,
    ymajorgrids=true,
    xmajorgrids=true,
    height=0.20\textheight,
    extra description/.code={\node[anchor=west,font=\scriptsize] at (0.0, 0.1) { BPIC15\_1 };}
  ]
  \addplot [
    color=red,
    mark=square*,
    dashed
  ]
  table {
   10.0 12.0
  };
  \addplot [
    color=blue,
    mark=*,
    dashed
  ]
  table {
   10.0 78.0
  };
\end{semilogyaxis}
\end{tikzpicture}
&
\end{tabular}


### 3. Statystyka przebudowania drzew

In [32]:
# Emit a LaTeX tabular whose cells are pgfplots charts (one per dataset) of the
# buildFromZero / rebuild / ignored counters per window size, averaged over runs.
print("\\pgfplotsset{every axis/.append style={%")
print("     enlarge x limits=false,%")
print("     enlarge y limits=false,%")
print("   }%")
print("}%")

print("\\tabcolsep=0.0em%")
# One right-aligned column per plot in a row, so the column spec always matches
# the row break emitted after every gridColumns-th plot.
print("\\begin{tabular}{" + "r" * gridColumns + "}")

columns = ['window_size', 'step', 'buildFromZero', 'rebuild', 'ignored']
counters = ['buildFromZero', 'rebuild', 'ignored']

for i, f in enumerate(files, start=1):
    print("\\begin{tikzpicture}")
    print("\\begin{axis}[")
    print("    xtick={10,20,30,40,50,75,100},")
    print("    grid style=dashed,")
    print("    ymajorgrids=true,")
    print("    xmajorgrids=true,")
    print("    height=0.20\\textheight,")
    # "\\_" escapes underscores for LaTeX; a single backslash here would be an
    # invalid Python escape sequence.
    print("    extra description/.code={\\node[anchor=east,font=\\scriptsize] at (1.0, 0.9) {", f.replace("_", "\\_"), "};}")
    print("  ]")

    # Read the tab-separated file of every run and sum the counters per window size.
    per_run = []
    for run in range(minRunNumber, maxRunNumber + 1):
        df = pd.read_csv(path + "/" + str(run) + "-online-extra-" + f, sep='\t', names=columns)
        per_run.append(df.groupby(['window_size'], as_index=False).agg({c: 'sum' for c in counters}))

    # DataFrame.append was removed in pandas 2.0, so merge with a single concat;
    # an empty run range falls back to an empty frame.
    online = pd.concat(per_run, ignore_index=True) if per_run else pd.DataFrame(columns=columns)

    online = online.astype({'window_size': 'int32', 'buildFromZero': 'int32', 'rebuild': 'int32', 'ignored': 'int32'})
    # Average the per-run sums across runs, rounded to whole numbers.
    online = online.groupby(['window_size'], as_index=False).agg({c: 'mean' for c in counters}).round(decimals=0)

    for m in counters:
        print("  \\addplot [")
        if m == 'buildFromZero':
            print("    color=blue,")
            print("    mark=*,")
        elif m == 'ignored':
            print("    color=red,")
            print("    mark=square*,")
        else:
            print("    color=green,")
            print("    mark=diamond*,")
        print("    dashed")
        print("  ]")
        print("  table {")
        # Plot the across-run means. Iterating `df` here would plot only the
        # last run's sums and ignore the averaging done above.
        for index, row in online.iterrows():
            print("  ", row['window_size'], row[m])
        print("  };")

    print("\\end{axis}")
    print("\\end{tikzpicture}")

    # End the tabular row after every gridColumns-th plot, otherwise move to
    # the next cell.
    if i % gridColumns == 0:
        print("\\\\")
    else:
        print("&")

print("\\end{tabular}")

\pgfplotsset{every axis/.append style={%
     enlarge x limits=false,%
     enlarge y limits=false,%
   }%
}%
\tabcolsep=0.0em%
\begin{tabular}{rrrr}
\begin{tikzpicture}
\begin{axis}[
    xtick={10,20,30,40,50,75,100},
    grid style=dashed,
    ymajorgrids=true,
    xmajorgrids=true,
    height=0.20\textheight,
    extra description/.code={\node[anchor=east,font=\scriptsize] at (1.0, 0.9) { BPIC15\_1 };}
  ]
  \addplot [
    color=blue,
    mark=*,
    dashed
  ]
  table {
  };
  \addplot [
    color=green,
    mark=diamond*,
    dashed
  ]
  table {
  };
  \addplot [
    color=red,
    mark=square*,
    dashed
  ]
  table {
  };
\end{axis}
\end{tikzpicture}
&
\end{tabular}
