In [1]:
from typing import List
from itertools import combinations
import numpy as np
import dask.dataframe as dd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import matplotlib.pyplot as plt

In [3]:
# Base directory of the parquet inputs.
# NOTE(review): the name `dir` shadows the Python builtin `dir()`; later cells
# read this variable, so renaming it has to be done notebook-wide.
dir = "./data/circle/"
# Lazily open the three dask frames used by the rest of the notebook.
# NOTE(review): the output recorded for this cell is `KeyError: 3`; the cause is
# not visible from the code here — confirm the column/index names against the files.
real = dd.read_parquet(dir+"main.parquet", columns=["spec","variant","real.tests", "real.downs", "size", "clocks"])
squished = dd.read_parquet(dir+"squished.parquet", columns=["spec","variant","approx.tests", "approx.downs", "limit.tests", "limit.downs"])
opti = dd.read_parquet(dir+"opti/init_weights.min_front.parquet", index=["spec"],columns=["variant","comb", "real.tests", "real.downs"])
# Collapse to one row per (spec, variant), keeping the column-wise maximum.
squished = squished.groupby(["spec", "variant"]).max()

# Same reduction for the real measurements: one row per (spec, variant).
real_max = real.groupby(["spec", "variant"]).max()

KeyError: 3

In [None]:
comparison_squished = real_max.join(squished, on=["spec", "variant"]).compute()

In [None]:
comparison_squished["limit.diff"] = ((comparison_squished["limit.tests"]-comparison_squished["real.tests"])/comparison_squished["real.tests"])*100

In [None]:
comparison_squished["approx.diff"] = (comparison_squished["approx.tests"]/comparison_squished["real.tests"]-1)*100

### Difference percentage limit vs real max

In [None]:
approximation_comp = comparison_squished[["limit.diff", "approx.diff", "size"]].groupby(["size"]).agg(["max", "min", "mean", "std"])

In [None]:
len(opti.groupby("spec").count().index)

In [None]:
approximation_comp.plot(figsize=(12,8))

In [None]:
# Plot mean / min / max of the limit-vs-real error (in percent) for each size
# and save the figure as a PDF.
#
# The font is configured first: matplotlib reads rcParams when a text artist is
# created, so calling plt.rc() after xlabel()/ylabel() leaves those labels at
# the previous size. "sans-serif" replaces the family "normal", which is not a
# font family matplotlib can resolve (it only falls back from it with a warning).
plt.rc("font", family="sans-serif", size=32)

limit_stats = approximation_comp["limit.diff"]
plt.figure(figsize=(14, 8))
for stat, fmt, label in (("mean", "k-s", "Mean"), ("min", "r-^", "Min"), ("max", "r:o", "Max")):
    plt.plot(limit_stats.index, limit_stats[stat], fmt, label=label, markersize=10)
plt.xlabel("Size")
plt.ylabel("Error (%)")
plt.legend()
plt.savefig("limit_diff_approx.pdf")

In [None]:
# Plot mean / min / max of the approximation-vs-real error (in percent) for
# each size and save the figure as a PDF.
#
# The font is configured first: matplotlib reads rcParams when a text artist is
# created, so calling plt.rc() after xlabel()/ylabel() leaves those labels at
# the previous size. "sans-serif" replaces the family "normal", which is not a
# font family matplotlib can resolve (it only falls back from it with a warning).
plt.rc("font", family="sans-serif", size=32)

approx_stats = approximation_comp["approx.diff"]
plt.figure(figsize=(12, 8))
for stat, fmt, label in (("mean", "k-s", "Mean"), ("min", "r-^", "Min"), ("max", "r:o", "Max")):
    plt.plot(approx_stats.index, approx_stats[stat], fmt, label=label, markersize=10)
plt.xlabel("Size")
plt.ylabel("Error (%)")
plt.legend()
plt.savefig("approx_diff_approx.pdf")

### Specifications that fail to be approximated correctly

In [None]:
comparison_squished[comparison_squished["approx.tests"] < comparison_squished["real.tests"]]

### Raw limit vs real max percentage

In [None]:
(comparison_squished["limit.downs"]/comparison_squished["real.downs"]).plot(figsize=(12,8))

In [None]:
# Per-spec maximum of every column loaded from the optimisation result file.
opti_max = opti.groupby("spec").max()
# Per-spec min / max / mean taken over the (spec, variant) rows of real_max;
# the result has two-level columns such as ("real.tests", "mean").
real_statistics = real_max.groupby("spec").aggregate(["min", "max", "mean"])
# Put the variant statistics and the optimiser's result side by side per spec.
compare = real_statistics.join(opti_max)
# Materialise the lazy dask graph into an in-memory frame.
comparison_opti = compare.compute()

### Specification complexity range evolution (natural order)

In [None]:
comparison_opti[["real.tests", ("real.tests", "min"), ("real.tests", "mean"), ("real.tests", "max")]].plot(figsize=(12, 8))

### Specification complexity range evolution (size)

In [None]:
comparison_opti2 = comparison_opti.sort_values(("clocks", "max"))
comparison_opti2["diff"] = comparison_opti2[("real.tests", "max")]/comparison_opti2[("real.tests", "min")]
comparison_opti2["size"] = comparison_opti2[("size","max")]
comparison_opti2["clocks"] = comparison_opti2[("clocks","max")]
comparison_opti2.sort_values("size").plot.scatter(x="size", y="diff",figsize=(12, 8))

In [None]:
comparison_opti2[["size", "diff"]].groupby(["size"]).aggregate(["min", "mean", "max"]).plot(figsize=(12, 8))

In [None]:
comparison_opti2.sort_values("clocks").plot.scatter(x="clocks", y="diff",figsize=(12, 8))

### All specifications that failed to optimize

In [None]:
comparison_opti[comparison_opti["real.tests"] > comparison_opti[("real.tests", "mean")]]


### Percentage of badly optimized specifications

In [None]:
len(comparison_opti[comparison_opti["real.tests"] > comparison_opti[("real.tests", "mean")]])/len(comparison_opti)

### Scatter plot of permutations' real test values for each specification

In [None]:
# Materialise the per-(spec, variant) maxima and move the index levels into
# ordinary columns so that "spec" can serve as the scatter plot's x axis.
frame = real_max.compute().reset_index()
frame.plot.scatter(x="spec", y="real.tests", figsize=(12, 8))

### Relative scatter plot

In [None]:
relative_dist = real_max.compute().reset_index(level="variant").join(real_statistics.compute(), on="spec")
relative_dist

In [None]:
def scale_reals(row):
    """Rescale a row's real.tests / real.downs into the row's own min-max range.

    Parameters
    ----------
    row : mapping
        Must provide "real.tests", "real.downs", "size", "clocks" and the
        tuple keys (<measure>, "min") and (<measure>, "max") for both measures.

    Returns
    -------
    dict
        "rel_tests" and "rel_downs" computed as (value - min) / (max - min),
        or 0 when min equals max, plus "size" and "clocks" copied unchanged.
    """
    scaled = {}
    for measure, target in (("real.tests", "rel_tests"), ("real.downs", "rel_downs")):
        low = row[(measure, "min")]
        high = row[(measure, "max")]
        # An empty range would divide by zero; such rows are pinned to 0.
        scaled[target] = 0 if high == low else (row[measure] - low) / (high - low)
    for passthrough in ("size", "clocks"):
        scaled[passthrough] = row[passthrough]
    return scaled
relative_dist = relative_dist.apply(scale_reals, axis=1, result_type='expand')
relative_dist

In [None]:
relative_dist4 = relative_dist[relative_dist["size"] == 4]

In [None]:
relative_dist4.plot.scatter(x="rel_tests", y="rel_downs", alpha=0.01, figsize=(12, 8))


In [None]:
relative_dist.plot.scatter(x="rel_tests", y="rel_downs", alpha=0.005, figsize=(12, 8))


In [None]:
num_points = relative_dist.groupby("spec").aggregate(len).sort_values("rel_tests")

In [None]:
relative_dist.loc[num_points.index[2]].plot.scatter(x="rel_tests", y="rel_downs", figsize=(12, 8))


In [None]:
import os

# Build one table that holds, per specification, the real.tests statistics over
# all variants plus one column per optimisation method. Each method column is
# named after its parquet file (without the extension) and contains the per-spec
# maximum of that file's "real.tests".
all_methods_comp = real_statistics
methods = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `methods` (and anything derived from a method's position in that list)
# identical across runs and machines.
for filename in sorted(os.listdir(os.path.join(dir, "opti"))):
    method, extension = os.path.splitext(filename)
    if extension != ".parquet":
        continue
    table = dd.read_parquet(os.path.join(dir, "opti", filename), index=["spec"], columns=["real.tests"])
    table = table.groupby("spec").max().rename(columns={"real.tests": method})
    methods.append(method)
    all_methods_comp = all_methods_comp.join(table)
all_methods_comp = all_methods_comp.compute()
all_methods_comp

In [None]:
def diffs(row):
    """Flag, for every entry of the global `methods`, whether it counts as a success.

    A method is flagged 1 when its value in `row` is below the row's
    ("real.tests", "mean"), or when the row's ("real.tests", "min") equals its
    ("real.tests", "max"); otherwise it is flagged 0.

    Returns a dict mapping each method name to 0 or 1.
    """
    mean = row[("real.tests", "mean")]
    return {
        name: int(row[name] < mean or row[("real.tests", "min")] == row[("real.tests", "max")])
        for name in methods
    }
diffs_comp = all_methods_comp.copy()
diffs_comp = diffs_comp.apply(diffs, axis=1, result_type='expand')
diffs_comp

### Percentage of well-optimized specifications by method

In [None]:
diffs_comp.apply(sum, axis=0).sort_values(ascending=False)/len(diffs_comp)

### Visualisation of optimization coverage

In [None]:
def scale_methods(row):
    """Multiply each method's value by the method's 1-based position in `methods`.

    Methods whose name contains "random" are left out of the result, but they
    still occupy their position, so the remaining multipliers are unchanged.

    Returns a dict mapping the kept method names to position * row[name].
    """
    return {
        name: position * row[name]
        for position, name in enumerate(methods, start=1)
        if "random" not in name
    }
without_rand = diffs_comp.apply(scale_methods, axis=1, result_type='expand')
without_rand.iloc[:100].plot(figsize=(12, 8))

### Cumulative coverage

In [None]:
cumulative_comp = diffs_comp.apply(sum, axis=1)
cumulative_comp.plot(figsize=(12, 8))

### Uncovered specifications

In [None]:
cumulative_comp[cumulative_comp == 0]

In [None]:
# Materialise real_max and turn the "variant" index level into a column,
# leaving "spec" as the index.
starting = real_max.compute().reset_index(level="variant")
# Keep only variant 0 of every spec.
# NOTE(review): presumably variant 0 is the specification as originally given — confirm.
starting = starting[starting["variant"] == 0]
# Attach the per-spec statistics and per-method results to the starting rows.
gain = starting.join(all_methods_comp)
# Keep only specs whose max/min real.tests ratio exceeds 1.05; this also
# guarantees a non-zero (max - min) range for the rows that remain.
gain = gain[(gain[("real.tests", "max")]/gain[("real.tests", "min")])>1.05]
gain

### Gain in percentage relative to input specification

In [None]:
def gains_minus_start(row):
    """Express each method's improvement over the row's own real.tests value.

    For every name in the global `methods` the result is
    (row["real.tests"] - row[name]) / (max - min) * 100, where max and min are
    the row's ("real.tests", "max") and ("real.tests", "min").

    Returns a dict mapping each method name to that percentage.
    """
    start = float(row["real.tests"])
    spread = row[("real.tests", "max")] - row[("real.tests", "min")]
    return {name: (start - float(row[name])) / spread * 100 for name in methods}
gain_from_start = gain.apply(gains_minus_start, axis=1, result_type='expand')
gain_from_start.plot.hist(bins=96, alpha=1,figsize=(12, 8))

### Gain in percentage relative to input specification (specific method)

In [None]:
gain_from_start["init_weights.dijkstra"].plot.hist(bins=96, alpha=1,figsize=(12, 8))

### What are the 0 specs?

In [None]:
strange_0_specs = gain_from_start[["init_weights.min_front", ]][gain_from_start["init_weights.min_front"] == 0]
strange_0_specs = strange_0_specs.join(real_statistics.compute()).join(starting)
strange_0_specs

In [None]:
if len(strange_0_specs) != 0:
    relative_dist.loc[strange_0_specs.index[-1]].plot.scatter(x="rel_tests", y="rel_downs", alpha=0.1,figsize=(12, 8))

### Gain in percentage relative to mean

In [None]:
def gains_from_mean(row):
    """Express each method's improvement over the row's mean real.tests value.

    For every name in the global `methods` the result is
    (mean - row[name]) / (max - min) * 100, where mean, max and min are the
    row's ("real.tests", "mean" / "max" / "min") entries.

    When max equals min the percentage is undefined, so every method gets NaN.
    Dividing anyway would raise ZeroDivisionError on Python numbers, or yield
    NaN/inf on numpy scalars; inf values break the histograms built from the
    result, whereas pandas skips NaN.

    Returns a dict mapping each method name to a float.
    """
    mean = float(row[("real.tests", "mean")])
    spread = row[("real.tests", "max")] - row[("real.tests", "min")]
    if spread == 0:
        return {name: float("nan") for name in methods}
    return {name: (mean - float(row[name])) / spread * 100 for name in methods}
gain_from_mean = all_methods_comp.apply(gains_from_mean, axis=1, result_type='expand')
gain_from_mean.plot.hist(bins=96, alpha=0.1,figsize=(12, 8))

### Gain in percentage relative to mean (specific methods)

In [None]:
gain_from_mean["random"].plot.hist(bins=96, alpha=1,figsize=(12, 8))

In [None]:
gain_from_mean["init_weights.min_front"].plot.hist(bins=96, alpha=1,figsize=(12, 8))
plt.xlabel("Improvement from mean (%)")
plt.ylabel("Frequency")
plt.savefig("optimization.pdf")

### Gain characteristics per method

In [None]:
gain_from_mean.aggregate(["mean", "std"], axis=0).transpose().sort_values("mean", ascending=False)

### Complexity growth speed (by clocks)

In [None]:
real_max[["clocks", "real.tests"]].groupby("clocks").aggregate(["min", "max", "mean"]).compute().plot(figsize=(12, 8))

### Complexity growth speed (by size)

In [None]:
real_max[["size", "real.tests"]].groupby("size").aggregate(["min", "max", "mean"]).compute().plot(figsize=(12, 8))

In [None]:
comparison_size_clock = comparison_squished.copy()
comparison_size_clock["approx.tests"] = comparison_size_clock["approx.tests"].astype(np.int64)
comparison_size_clock["diff_appr_real"] = ((comparison_size_clock["approx.tests"]-comparison_size_clock["real.tests"])/comparison_size_clock["real.tests"])
comparison_size_clock["diff_limit_real"] = ((comparison_size_clock["limit.tests"]-comparison_size_clock["real.tests"])/comparison_size_clock["real.tests"])
comparison_size_clock

In [None]:
comparison_size_clock.groupby("size").aggregate("count")
comparison_size_clock.groupby("clocks").aggregate("count")


### Growth speed of difference of real and approximate complexities (by size)

In [None]:
comparison_size_clock[["size", "diff_appr_real"]].groupby("size").agg(["max", "min", "mean"]).plot(figsize=(12,8))

In [None]:
comparison_size_clock[["size", "diff_limit_real"]].groupby("size").agg(["max", "min", "mean"]).plot(figsize=(12,8))


### Growth speed of difference of real and approximate complexities (by clocks)

In [None]:
comparison_size_clock[["clocks", "diff_appr_real"]].groupby("clocks").agg(["max", "min", "mean"]).plot(figsize=(12,8))

In [None]:
comparison_size_clock[["clocks", "diff_limit_real"]].groupby("clocks").agg(["max", "min", "mean"]).plot(figsize=(12,8))

### Check if approximations allow specification comparison

In [None]:
check_order = comparison_squished[["real.tests", "approx.tests"]].reset_index(level="variant").drop("variant", axis=1).groupby("spec").aggregate(list)
check_order

In [None]:
def order_holds(orig: List[int], aprox: List[int]) -> float:
    """Return the fraction of element pairs whose relative order is preserved.

    `orig` and `aprox` are paired position by position. Every unordered pair of
    positions is examined, and the pair "holds" when the two original values and
    the two approximate values compare the same way (both less, both greater,
    or both equal).

    Parameters
    ----------
    orig : list of int
        Reference values.
    aprox : list of int
        Approximate values, aligned with `orig`.

    Returns
    -------
    float
        hold count / pair count, or NaN when there are fewer than two paired
        values. With no pair to compare the fraction is undefined; returning NaN
        avoids a ZeroDivisionError and lets pandas' mean() skip the entry.
    """
    hold_times = 0
    count = 0
    for (lorig, laprox), (rorig, raprox) in combinations(zip(orig, aprox), 2):
        ok = (lorig < rorig and laprox < raprox) or \
          (lorig > rorig and laprox > raprox) or \
          (lorig == rorig and laprox == raprox)
        hold_times += int(ok)
        count += 1
    if count == 0:
        return float("nan")
    return hold_times / count

check_order["holds"] = check_order.apply(lambda row: order_holds(row["real.tests"], row["approx.tests"]), axis=1)
check_order.holds.mean()