In [None]:
from header import *
from scipy import optimize
# Notebook-wide settings shared by the cells below.
timelimit = 600                  # benchmark time limit (presumably seconds -- TODO confirm)
plots_dir = Path('imgs/')        # directory the figures are written to
plt.rcParams['font.size'] = 16   # default font size for every figure

In [None]:
# Parameter sweep, and for each input (alg, cnt, n, e) the single run with the
# smallest `s` (presumably the runtime in seconds -- TODO confirm).
input_key = ['alg', 'cnt', 'n', 'e']
params = read_benchmarks('table/params_N1e7.tsv')
#params_ont = read_benchmarks('table/params_ont_N1e6.tsv')
best_params = params.loc[params.groupby(input_key)['s'].idxmin()]
#best_params_ont = params_ont.loc[params_ont.groupby(input_key)['s'].idxmin()]

# Runs of the other tools (fixed k); the slow tools live in a separate table.
tools_slow = read_benchmarks('table/tools_slow_N1e5.tsv')
tools = read_benchmarks('table/tools_N1e7.tsv')
#tools_ont = read_benchmarks('table/tools_ont_N1e6.tsv')

# Merge the best PA runs into the tool runs. Columns missing from one of the
# tables are kept and filled with NaN (outer join is pd.concat's default).
tools = (
    pd.concat([tools, tools_slow, best_params])
    .sort_values(['e', 'n', 'alg'])
)

In [None]:
def plot_scaling(df, x, y, prefix, hline=False, algo1=None, algo2=None, show_mean=False, trend_line='', xlog=False, ylog=False, title=None, split='alg', fit_min=None, cone_exp=1, cone=None):
    """Scatter the per-group mean of `y` against `x` and save the figure as a PDF.

    One series is drawn per distinct value of the `split` column. Means that
    are not strictly positive are dropped before plotting and fitting.

    Parameters
    ----------
    df : DataFrame containing at least the columns `split`, `x` and `y`.
    x, y : names of the columns shown on the horizontal / vertical axis.
    prefix : first component of the output file name.
    hline, algo1, algo2 : accepted for backward compatibility; not used.
    show_mean : accepted for backward compatibility and ignored. It used to
        append an 'all' row through `DataFrame.append` (removed in pandas 2.0),
        which mixed a string label into the numeric index; the mean was never
        drawn.
    trend_line : '' / falsy for no trend line, or 'poly' to fit and draw a
        power law y ~ x^a per series (least squares in log-log space).
    xlog, ylog : use a logarithmic x / y axis.
    title : optional axes title; also appended to the output file name.
    split : column whose values define the plotted series.
    fit_min : optional callable mapping a series name to the smallest `x`
        included in the fit and in the drawn trend line.
    cone_exp, cone : when `cone` names a series, shade the region between
        x**cone_exp and x**(cone_exp+1), anchored on that series at x = 10**4
        and at x = 10**2. An anchor that has no data point is skipped.

    Side effects
    ------------
    Prints the fitted exponents (for trend_line='poly') and writes
    `<prefix>_<x>_<y>[_<title>].pdf` into `plots_dir`, creating the directory
    if needed. Returns None.

    Raises
    ------
    ValueError : if `trend_line` is truthy but not 'poly'.
    """
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(6, 4, forward=True)

    # Data: rows are x values, columns are series, cells are the mean of y.
    # observed=True drops categorical algos that don't appear.
    mean_col = 'mean_{}'.format(y)
    d = df.groupby([split, x], group_keys=False, observed=True)[y].mean()
    d = d.reset_index(name=mean_col)
    d = d.pivot(index=x, columns=split, values=mean_col)

    # Blank out non-positive means (they cannot be log-fitted or shown on a log axis).
    d = d.where(d > 0)

    # PLOT DATA: markers only; lines are reserved for the fitted trend.
    for algo in d.columns:
        d[algo].plot(ax=ax, alpha=0.6, zorder=3, rot=0, color=algo2color(algo), marker=algo2marker(algo), ls='', legend=False)

    # DRAW CONE
    def draw_cone(x_origin, x_max=d.index.max()):
        """Shade between x**cone_exp and x**(cone_exp+1), starting at the cone series' point at x_origin."""
        # Skip the cone instead of raising KeyError when there is nothing to anchor on.
        if cone not in d.columns or x_origin not in d.index:
            return
        y_origin = d.loc[x_origin, cone]
        if pd.isna(y_origin):
            return
        x_end = 3 * x_max  # extend past the last data point
        growth = x_end / x_origin
        ax.fill_between(
            (x_origin, x_end),
            (y_origin, y_origin * growth**cone_exp),
            (y_origin, y_origin * growth**(cone_exp + 1)),
            color='green', alpha=0.1)

    if cone is not None:
        draw_cone(x_origin=10**4)
        draw_cone(x_origin=10**2)

    # FIT y = x^C
    if trend_line == "poly":
        # Straight-line fit in log-log space: log(y) = a*log(x) + b.
        z = {}
        for algo in d.columns:
            s = d[algo].dropna()
            if fit_min:
                s = s[s.index >= fit_min(algo)]
            if len(s) > 1:
                z[algo] = np.polyfit(np.log(s.index), np.log(s), 1)
        xs = list(d.index)

        # Best fit lines
        exps = {}
        for algo, (a, b) in z.items():
            fit_lo = fit_min(algo) if fit_min else None
            fit_hi = d[algo].dropna().index.max()
            # Draw the line only over [fit_min(algo), last data point of this series].
            plot_xs = [i for i in xs if (fit_lo is None or i >= fit_lo) and i <= fit_hi]
            regression_line = [(i**a) * np.exp(b) for i in plot_xs]

            weight = 'bold' if 'seeds' in algo else 'normal'
            label = ''
            if len(d[algo].dropna()) > 1:
                ax.plot(plot_xs, regression_line, linestyle='-', color=algo2color(algo), alpha=0.8)
                # Raw string: '\s' is not a valid escape sequence in a normal literal.
                label = r'$\sim n^{{{:0.2f}}}$'.format(a)
                exps[algo] = f'{a:.2f}'
            # Label the series at the right end of its trend line.
            ax.text(plot_xs[-1], regression_line[-1], algo2beautiful(algo) + label,
                    color=algo2color(algo), ha='center', va='bottom', size=15, alpha=1, weight=weight)
        print(exps)
    elif trend_line:
        # An explicit exception is not stripped by `python -O`, unlike an assert.
        raise ValueError('unknown trend_line: {!r}'.format(trend_line))

    # ENABLE LOG SCALE
    if ylog:
        ax.set_yscale('log')
    else:
        ax.set_ylim(0)

    # On log axes, leave a multiplicative margin around the data.
    if xlog:
        ax.set_xscale('log')
        ax.set_xlim(1/3*d.index.min(), 3*d.index.max())
    if ylog:
        ax.set_ylim(1/8*d.min().min(), 8*d.max().max())

    # Background
    ax.set_facecolor('#F3F3F3')

    # No border except the bottom spine
    for spine in ["top", "left", "right"]:
        ax.spines[spine].set_visible(False)

    # GRID: white major lines on the y-axis only
    ax.grid(False, axis='x', which='major')
    ax.grid(False, axis='x', which='minor')
    ax.grid(True, axis='y', which='major', color='w')
    ax.grid(False, axis='y', which='minor')

    # Ticks: hide all minor ticks on both axes
    ax.tick_params(
        axis='both',
        which='minor',
        bottom=False,
        top=False,
        left=False, right=False,
        labelbottom=False,
    )

    # Major ticks only at powers of ten that lie inside the data range.
    if x == 'n':
        ax.set_xticks([10**p for p in range(10) if d.index.min() <= 10**p <= d.index.max()])
    if y == 's_per_pair':
        ax.set_yticks([10**p for p in range(-10, 10, 2) if d.min().min() <= 10**p <= d.max().max()])
    if y == 's_per_bp':
        ax.set_yticks([10**p for p in range(-10, 10, 1) if d.min().min() <= 10**p <= d.max().max()])

    # Axis labels; the y label sits horizontally above the axis.
    ax.set_xlabel(col2name(x), size=18)
    ax.set_ylabel(col2name(y), rotation=0, ha='left', size=18)
    ax.yaxis.set_label_coords(-0.10, 1.00)

    filename = '{}_{}_{}'.format(prefix, x, y)

    if title:
        ax.set_title(title)
        filename += '_{}'.format(title)

    # Make sure the output directory exists before writing the figure.
    plots_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(plots_dir/(filename+'.pdf'), bbox_inches='tight')

In [None]:
# TOOL TIME COMPARISON -- time/alignment
experiment_name = "time"

# Leave out the gap-heuristic variants.
# (Further candidates that are currently kept: 'dijkstra', 'pa_noprune'.)
df = tools[~tools.alg.isin(['cp-csh+gap', 'csh+gap'])]

# Only plot things that didn't time out.
df = df[df.exit_status == "ok"]

def fit_from(algo):
    """Smallest n used for the power-law fit of `algo`; currently 10**4 for every algorithm."""
    return 10**4

# One figure per error rate.
for e in pd.unique(df.e):
    df_n = df[df.e == e]
    # round() rather than int(): 100*e is a float and may fall just below the
    # intended integer (e.g. 100*0.29 == 28.999...), which int() would truncate.
    percent = int(round(100*e))
    plot_scaling(df_n, y='s_per_pair', x='n', prefix=experiment_name+'e={}'.format(e), xlog=True, ylog=True, trend_line='poly', title=f'Error rate {percent}%', fit_min=fit_from, cone='csh')

In [None]:
# TOOL MEMORY COMPARISON
experiment_name = "memory"
df = tools
# Only plot things that didn't time out.
df = df[df.exit_status == "ok"]
# Leave out the gap-heuristic variants.
df = df[~df.alg.isin(['cp-csh+gap', 'csh+gap'])]
# The constant memory of 30MB (snakemake error?) is too large in small tests
df = df[df.n >= 10**4]

def fit_from(algo):
    """Smallest n used for the power-law fit of `algo`."""
    early_fit = ['dijkstra', 'dijkstra_nogreedy', 'pa_noprune', 'pa_noprune_nogreedy']
    return 10**4 if algo in early_fit else 10**5

# One figure per error rate.
for e in pd.unique(df.e):
    df_n = df[df.e == e]
    # Drop rows whose memory column holds the placeholder '-' before comparing numerically.
    df_n = df_n[df_n.max_uss != '-']
    # Ignore measurements at or below 0.025 (unit of max_uss not visible here -- TODO confirm).
    df_n = df_n[df_n.max_uss > 0.025]
    if df_n.empty:
        continue
    # round() rather than int(): 100*e is a float and may fall just below the
    # intended integer (e.g. 100*0.29 == 28.999...), which int() would truncate.
    percent = int(round(100*e))
    plot_scaling(df_n, y='max_uss', x='n', prefix=experiment_name+'e={}'.format(e), xlog=True, ylog=True, trend_line='poly', title=f'Error rate {percent}%', fit_min=fit_from)

In [None]:
# Expanded states
# NOTE(review): the prefix is "time" although this cell plots `expanded`; the
# output files still differ from the timing plots because the y column is part
# of the file name. Kept as is so existing file names do not change.
experiment_name = "time"
df = tools
# Leave out the gap-heuristic variants.
df = df[~df.alg.isin(['cp-csh+gap', 'csh+gap'])]

# Only plot things that didn't time out.
df = df[df.exit_status == "ok"]

def fit_from(algo):
    """Smallest n used for the power-law fit of `algo`; currently 10000 for every algorithm."""
    return 10000
import math

# Table of band: rows ordered by e, then n, then by the fixed algorithm order below.
# Every algorithm that has a `band` value must be listed in alg_order, otherwise
# list.index raises ValueError.
b = df[['alg', 'n', 'e', 'band']].dropna()
alg_order = ['sh', 'csh', 'csh+gap-noprune',  'dijkstra']
b = b.assign(alg_idx=b['alg'].map(alg_order.index))
b = b.sort_values(by=['e', 'n', 'alg_idx'])
display(pd.pivot_table(b, values='band', columns=['n'], index=['e', 'alg'], sort=False))

# Plots of expanded states, one figure per error rate.
for e in pd.unique(df.e):
    df_n = df[df.e == e]
    # Only runs that report an `expanded` count can be plotted.
    df_n = df_n.dropna(subset=['expanded'])
    # round() rather than int(): 100*e is a float and may fall just below the
    # intended integer (e.g. 100*0.29 == 28.999...), which int() would truncate.
    percent = int(round(100*e))
    plot_scaling(df_n, y='expanded', x='n', prefix=experiment_name+'e={}'.format(e), xlog=True, ylog=True, trend_line='poly', title=f'Error rate {percent}%', fit_min=fit_from, cone='csh')

In [None]:
# MAX MEMORY USAGE
# Table of runtime `s` and `max_uss` for three tools on the largest inputs
# (n >= 10^6) with e >= 0.1.

df = tools
is_large = df.n >= 1000000
is_noisy = df.e >= 0.1
is_selected_tool = df.alg.isin(['csh+gap', 'edlib', 'biwfa'])
df = df[is_large & is_noisy & is_selected_tool]
display(df[['alg', 'n', 'e', 's', 'max_uss']])

In [None]:
# SCALING WITH e
# Time per pair as a function of e, for n = 10000 only; no trend line is fitted.
# TODO: plot labels
experiment_name = "e_scaling"
df = tools[tools.n == 10000]
#display(df)
plot_scaling(
    df,
    x='e',
    y='s_per_pair',
    prefix=experiment_name,
    xlog=False,
    ylog=True,
    trend_line=False,
)

In [None]:
# BEST PARAMS
# Show every 'csh' parameter run at e = 0.2, followed by the selected best runs
# for the same e at n = 10000.
e = 0.2
p = params.loc[params.e == e]
display(p.loc[p.alg == 'csh'])  # all
b = best_params
display(b.loc[(b.n == 10000) & (b.e == e)])  # best


In [None]:
# SCALING WITH K
# One figure per (n, m): time per pair of 'cp-sh' as a function of k.
# TODO: Merge these lines into a single figure per e
# Observed good settings:
# e=0.01: k=31, m=0
# e=0.05: k=12, m=0
# e=0.10: ???
# e=0.20: k=8..10, m=1, for large n, k>=10
e = 0.2

t = params.loc[params.e == e]
# Loop-invariant part of the filter: 'cp-sh' runs with s_per_pair below 1000.
is_fast_cp_sh = (t.alg == 'cp-sh') & (t.s_per_pair < 1000)
for n in pd.unique(t.n):
    for m in (0, 1):
        df2 = t[is_fast_cp_sh & (t.n == n) & (t.m == m)]
        if df2.empty:
            continue
        plot_scaling(df2, x='k', y='s_per_pair', prefix='k', xlog=False, ylog=True, title=f'n = {n}, m = {m}')

In [None]:
# Inspect all 'sh' rows of the merged table at e = 0.05.
is_sh_at_5pct = (tools.alg == 'sh') & (tools.e == 0.05)
display(tools[is_sh_at_5pct])