# Free Computation Time and Solution Stability on Dynamic Graphs

This notebook contains the code to run all experiments and generate all figures in the poster (and its Extended Abstract). Before running the code, please update the variables in the first code cell below (`path_to_graph`, `timestamp_position`, `path_to_project` and `threads`). The results will be saved in the directory `results/2023-sc-poster/` inside `path_to_project`.

In [None]:
# --- User configuration: adjust these values before running the notebook ---
# Input graph file; handed to the runner as -g=<path>.
path_to_graph = "~/wikipedia-growth.txt"
# Handed to the runner as -pt=<value>; the flag is omitted when this is 0.
timestamp_position = 2
# Root of the repository.
path_to_project = "~/projects/lollipop"
# Number of threads; 0 to use nproc.
threads = 10

In [None]:
%pip install pandas matplotlib
import subprocess
from pathlib import Path
import shutil
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Default resolution (dots per inch) for the matplotlib figures in this notebook.
mpl.rcParams['figure.dpi'] = 600

In [None]:
# --- Latency experiment ---
# Query intervals for the latency graph, in days.
timeseries_intervals = [32, 64, 128]
# Target ingest rates in events / second. Use 0 to indicate unlimited rate.
ingest_rates = [
    10_000_000,
    8_000_000,
    6_000_000,
    4_000_000,
    2_000_000,
]
# Number of runs per (interval, rate) combination.
repeat = 10

# --- Stability experiment ---
# Query interval for the stability experiment (used with interval_type="edgecount").
stability_interval = 800_000

# Locate the Go toolchain and expand "~" in the user-supplied paths.
path_to_go = shutil.which("go")
path_to_graph = Path(path_to_graph).expanduser()
path_to_project = Path(path_to_project).expanduser()

# Stop right away if any prerequisite is missing.
assert path_to_graph.exists()
assert path_to_project.exists()
assert path_to_go

In [None]:
def create_results_directory():
    """Create (if needed) and return the output directories of this notebook.

    Reads the notebook-level ``path_to_project``.

    Returns:
        A ``(results_dir, log_dir)`` tuple, where ``results_dir`` is
        ``<path_to_project>/results/2023-sc-poster`` and ``log_dir`` is its
        ``log`` sub-directory. Both exist when the function returns.
    """
    results_dir = path_to_project / "results" / "2023-sc-poster"
    log_dir = results_dir / "log"
    # parents=True also creates missing ancestors ("results", "2023-sc-poster"),
    # so the function does not depend on the caller having created them first.
    log_dir.mkdir(parents=True, exist_ok=True)
    return results_dir, log_dir

# Ensure the shared "results" folder exists, then set up this notebook's own output folders.
path_to_project.joinpath("results").mkdir(exist_ok=True)
results_dir, log_dir = create_results_directory()

In [None]:
class Algorithm:
    """Runs one of the project's ``cmd/lp-<name>`` programs through ``go run``.

    Relies on the notebook-level ``path_to_project`` (used to locate the
    program and as the working directory of the subprocess) and ``path_to_go``
    (the Go executable).
    """

    def __init__(self, name: str, graph: Path, log_dir: Path, timeseries_output: Path, timestamp_position: int = 0, convert_to_undirected: bool = False, threads: int = 0, additional_flags: list[str] = []) -> None:
        """Store the run configuration and check that the required paths exist.

        Args:
            name: Algorithm name; the program is ``<path_to_project>/cmd/lp-<name>``
                and the name is also part of cached result file names.
            graph: Graph file, passed as ``-g=<graph>``.
            log_dir: Existing directory in which result CSVs are kept.
            timeseries_output: File the program is expected to write; it is
                deleted before each run and moved into ``log_dir`` afterwards.
            timestamp_position: Passed as ``-pt=<value>`` when non-zero.
            convert_to_undirected: Adds ``-u`` when true.
            threads: Passed as ``-t=<value>`` when non-zero.
            additional_flags: Extra flags appended to every command.

        Raises:
            AssertionError: If the graph, the log directory, the program
                directory or the parent of ``timeseries_output`` is missing.
        """
        self.name = name
        self.code_path = path_to_project / "cmd" / f"lp-{name}"
        self.log_dir = log_dir
        self.convert_to_undirected = convert_to_undirected
        self.graph = graph
        self.timestamp_position = timestamp_position
        self.threads = threads
        self.timeseries_output = timeseries_output
        # Copy the flags: storing the argument itself would alias the shared
        # default list (or the caller's list), so a later mutation of either
        # would silently change this instance's commands.
        self.additional_flags = list(additional_flags)
        assert self.graph.exists(), f"graph not found: {self.graph}"
        assert self.log_dir.exists(), f"log directory not found: {self.log_dir}"
        assert self.code_path.exists(), f"program not found: {self.code_path}"
        assert self.timeseries_output.parent.exists(), f"output directory not found: {self.timeseries_output.parent}"

    def run_timeseries(self, timeseries_interval: int, ingest_rate: int = 0, suffix: str = "", no_skip: bool = False, additional_flags: list[str] = [], interval_type: str = "timestamp", check_correctness: bool = True) -> Path:
        """Run the program once and return the path of its timeseries CSV.

        The result is stored as
        ``timeseries-<name>-<interval>-<rate>-<threads><suffix>.csv`` in
        ``log_dir``. If that file already exists the run is skipped, unless
        ``no_skip`` is true. Note that the file name does not encode
        ``interval_type`` or any extra flags; use ``suffix`` to keep such
        variants apart.

        Args:
            timeseries_interval: Passed as ``-dt=<value>`` (``"timestamp"``)
                or ``-de=<value>`` (``"edgecount"``).
            ingest_rate: Passed as ``-dr=<value>`` when non-zero.
            suffix: Appended to the result file name.
            no_skip: Run even if a result file already exists.
            additional_flags: Extra flags for this run only, appended after
                the instance-level flags.
            interval_type: ``"timestamp"`` or ``"edgecount"``.
            check_correctness: Adds ``-c`` when true.

        Returns:
            Path of the result CSV inside ``log_dir``.

        Raises:
            AssertionError: On an unknown ``interval_type``, a non-zero exit
                code, or when the program did not write ``timeseries_output``.
        """
        path_timeseries = self.log_dir / f"timeseries-{self.name}-{timeseries_interval}-{ingest_rate}-{self.threads}{suffix}.csv"
        if (not no_skip) and path_timeseries.exists():
            print(f"Skipping as result exists: {path_timeseries}")
            return path_timeseries

        interval_flags = {"timestamp": "-dt", "edgecount": "-de"}
        assert interval_type in interval_flags, f"unknown interval_type: {interval_type!r}"

        cmd = [path_to_go, "run", self.code_path, f"-g={self.graph}", "-tquery"]
        cmd.append(f"{interval_flags[interval_type]}={timeseries_interval}")
        if ingest_rate:
            cmd.append(f"-dr={ingest_rate}")
        if self.timestamp_position:
            cmd.append(f"-pt={self.timestamp_position}")
        if self.convert_to_undirected:
            cmd.append("-u")
        if self.threads:
            cmd.append(f"-t={self.threads}")
        if check_correctness:
            cmd.append("-c")
        # Unpacking accepts any iterable of flags and never mutates the inputs.
        cmd = [*cmd, *self.additional_flags, *additional_flags]

        print(f"Command: {cmd}")
        # Remove stale output so the existence check below proves this run wrote it.
        self.timeseries_output.unlink(missing_ok=True)
        returncode = subprocess.run(cmd, cwd=path_to_project).returncode
        assert returncode == 0, f"command exited with code {returncode}"

        assert self.timeseries_output.exists(), f"expected output was not written: {self.timeseries_output}"
        return self.timeseries_output.rename(path_timeseries)
    
# PageRank on the graph as given (no undirected conversion).
pagerank = Algorithm(
    name="pagerank",
    graph=path_to_graph,
    log_dir=log_dir,
    timeseries_output=path_to_project / "results" / "timeseries.csv",
    timestamp_position=timestamp_position,
    convert_to_undirected=False,
    threads=threads,
)

# Colouring on the undirected version of the graph.
colouring = Algorithm(
    name="colouring",
    graph=path_to_graph,
    log_dir=log_dir,
    timeseries_output=path_to_project / "results" / "colouring-timeseries.csv",
    timestamp_position=timestamp_position,
    convert_to_undirected=True,
    threads=threads,
)

# Same colouring program run with the extra "-o" flag; it writes to a different output file.
colouring_oracle = Algorithm(
    name="colouring",
    graph=path_to_graph,
    log_dir=log_dir,
    timeseries_output=path_to_project / "results" / "colouring-snapshot-timeseries.csv",
    timestamp_position=timestamp_position,
    additional_flags=["-o"],
    convert_to_undirected=True,
    threads=threads,
)

In [None]:
class LatencyExperiment:
    """Runs an algorithm over a grid of query intervals and ingest rates.

    One row per run is appended to ``results_db``, holding the run's settings,
    its number of queries, its mean ``qLatencyMS`` and the path of its
    timeseries CSV.
    """

    def __init__(self, log_dir: Path, algorithm: Algorithm) -> None:
        """Remember the algorithm and start with an empty results table.

        Raises:
            AssertionError: If ``log_dir`` does not exist.
        """
        self.algorithm = algorithm
        self.log_dir = log_dir
        column_names = [
            "Algorithm",
            "Query Interval",
            "Number of Queries",
            "Ingest Rate",
            "Average Latency",
            "Threads",
            "Timeseries Path",
        ]
        self.results_db = pd.DataFrame(columns=column_names)
        assert self.log_dir.exists()

    def run(self, timeseries_intervals: list[int], ingest_rates: list[int], repeat: int=1) -> pd.DataFrame:
        """Run every (interval, rate) combination ``repeat`` times.

        Each repetition ``r`` is run with the suffix ``-<r>``. Rows are
        appended to ``results_db``, so calling ``run`` again adds to the
        rows already collected.

        Returns:
            The accumulated ``results_db``.
        """
        for query_interval in timeseries_intervals:
            for target_rate in ingest_rates:
                for run_index in range(repeat):
                    csv_path = self.algorithm.run_timeseries(
                        timeseries_interval=query_interval,
                        ingest_rate=target_rate,
                        suffix=f"-{run_index}",
                    )
                    run_df = pd.read_csv(csv_path)
                    row = {
                        "Algorithm": self.algorithm.name,
                        "Query Interval": query_interval,
                        "Number of Queries": run_df.shape[0],
                        "Ingest Rate": target_rate,
                        "Average Latency": run_df.loc[:, 'qLatencyMS'].mean(),
                        "Threads": self.algorithm.threads,
                        "Timeseries Path": csv_path
                    }
                    # Append at the next free integer label.
                    next_label = len(self.results_db)
                    self.results_db.loc[next_label] = row
        return self.results_db

# Latency experiment for the colouring algorithm.
colouring_experiment = LatencyExperiment(log_dir=log_dir, algorithm=colouring)

In [None]:
# Runs (or reuses cached results for) every interval / rate / repetition combination.
colouring_results = colouring_experiment.run(
    timeseries_intervals=timeseries_intervals,
    ingest_rates=ingest_rates,
    repeat=repeat,
)

In [None]:
def get_average_latency(timeseries_paths: "list[str] | pd.Series") -> pd.DataFrame:
    """Average the ``qLatencyMS`` column row by row over several timeseries CSVs.

    Args:
        timeseries_paths: Non-empty sequence of CSV paths (a list or a pandas
            Series, whatever its index). Every file must have ``ts`` and
            ``qLatencyMS`` columns and the same ``ts`` values, row for row.

    Returns:
        DataFrame with ``ts`` (converted with ``pd.to_datetime``) and
        ``qLatencyMS`` (the per-row mean over all files).

    Raises:
        AssertionError: If no path is given or a file's ``ts`` column differs
            from that of the first file.
    """
    # Materialise as a list so lists and Series behave the same and the
    # "remaining files" slice is positional regardless of any Series index.
    paths = list(timeseries_paths)
    assert len(paths) > 0, "at least one timeseries path is required"

    output_df = pd.read_csv(paths[0])[['ts', 'qLatencyMS']].copy()

    for path in paths[1:]:
        df = pd.read_csv(path)
        assert (df['ts'] == output_df['ts']).all(), f"query timestamps differ in {path}"
        output_df['qLatencyMS'] = output_df['qLatencyMS'].add(df['qLatencyMS'], fill_value=0)

    output_df['ts'] = pd.to_datetime(output_df['ts'])
    output_df['qLatencyMS'] = output_df['qLatencyMS'] / len(paths)
    return output_df

def plot_latency(timeseries_results, y_max, x_min, figsize, legend_box):
    """Plot a grid of latency-over-time panels, one per (interval, ingest rate).

    Rows follow the notebook-level ``timeseries_intervals`` and columns follow
    ``ingest_rates``; only results whose ``Threads`` equals the notebook-level
    ``threads`` are used. Each panel shows the averaged query latency and, on
    a secondary axis, the fraction of edges added so far.

    Args:
        timeseries_results: DataFrame with the columns ``Query Interval``,
            ``Ingest Rate``, ``Threads`` and ``Timeseries Path``.
        y_max: Upper limit of the latency axis.
        x_min: Lower limit of the time axis.
        figsize: Figure size passed to ``plt.subplots``.
        legend_box: ``bbox_to_anchor`` of the figure legend.

    Returns:
        The created matplotlib figure.
    """
    # The edge-count curve is taken from one run at the smallest query interval.
    timeseries_path = timeseries_results.query(f'`Query Interval` == {min(timeseries_intervals)}').iloc[0]["Timeseries Path"]
    df_timeseries = pd.read_csv(timeseries_path)
    ts, e = pd.to_datetime(df_timeseries['ts']), df_timeseries['EC']
    min_ts, max_ts, max_e = min(ts), max(ts), max(e)
    # Print the summary value only, not the whole EC column.
    print(f"min(ts)={min_ts} max(ts)={max_ts} max(e)={max_e}")

    # squeeze=False keeps axs two-dimensional even with a single interval or a
    # single rate, which the row/column indexing below depends on.
    fig, axs = plt.subplots(len(timeseries_intervals), len(ingest_rates), sharex=True, figsize=figsize, squeeze=False)

    for interval, axs_y in zip(timeseries_intervals, axs):
        for rate, ax in zip(ingest_rates, axs_y):
            timeseries_filtered = timeseries_results.query(f'`Query Interval` == {interval} and `Ingest Rate` == {rate} and `Threads` == {threads}')

            # Latency
            df_averaged = get_average_latency(timeseries_filtered["Timeseries Path"])
            x, y1 = df_averaged['ts'], df_averaged['qLatencyMS']
            ax.patch.set_visible(False)
            ax.set(xlim=[x_min, max_ts], ylim=[0,y_max], zorder=2, xticks=[])
            ax.plot(x, y1, marker = '.', markersize = 5, color="chocolate", label="Latency (ms) (left)")
            ax.fill_between(x, y1, alpha=0.2, facecolor="red", edgecolor=None)

            # Only the first column keeps the latency tick labels.
            if rate != ingest_rates[0]:
                ax.set_yticks([])

            # % of edges added
            ax = ax.twinx()
            ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
            ax.set(ylim=[0,1], zorder=1)
            ax.fill_between(ts, e / max_e, alpha=0.2, facecolor="green", edgecolor=None, label=f"% of edges added (right)")
            # Only the last column keeps the percentage tick labels.
            if rate != ingest_rates[-1]:
                ax.set_yticks([])

            # Target rate not achieved.
            if rate == ingest_rates[0] or (rate == ingest_rates[1] and interval <= 32): # TODO: automate this
                ax = ax.twinx().twiny() # don't share any axes
                ax.set(xlim=[0,1], ylim=[0,1], zorder=0, xticks=[], yticks=[])
                ax.scatter(0.2, 0.85, marker="x", color="brown", s=100, label="Target rate not achieved")

    for interval, ax in zip(timeseries_intervals, axs[:, 0]):
        ax.set_ylabel(f"{interval} days", labelpad=7)
    for rate, ax in zip(ingest_rates, axs[-1]):
        ax.set_xlabel(f"{rate/1e6}m e/s", labelpad=7)

    # Legend: gather handles from every axes and keep one entry per label.
    handles, labels = [sum(x, []) for x in zip(*[ax.get_legend_handles_labels() for ax in fig.axes])]
    handles_labels = dict(zip(labels, handles))
    fig.legend(handles_labels.values(), handles_labels.keys(), framealpha=0.9,bbox_to_anchor=legend_box)

    fig.tight_layout(pad=0.5)
    return fig

# The two variants differ only in where the legend is anchored.
fig_abstract = plot_latency(
    colouring_results,
    y_max=1000,
    x_min=pd.to_datetime("2005-03-15"),
    figsize=(6, 4),
    legend_box=(1, 1),
)
fig_poster = plot_latency(
    colouring_results,
    y_max=1000,
    x_min=pd.to_datetime("2005-03-15"),
    figsize=(6, 4),
    legend_box=(1.4, 1),
)
fig_abstract.savefig(results_dir / "rate-limiting-abstract.pdf")
fig_poster.savefig(results_dir / "rate-limiting-poster.pdf")

In [None]:
# Stability runs over all vertices, queried every `stability_interval` edges.
ts_stability_all_dynamic = colouring_oracle.run_timeseries(
    timeseries_interval=stability_interval,
    suffix="-stability-dynamic",
    interval_type="edgecount",
)
ts_stability_all_async = colouring_oracle.run_timeseries(
    timeseries_interval=stability_interval,
    suffix="-stability-async",
    interval_type="edgecount",
    additional_flags=["-SnapshotOracle"],
)
# The wait-count variant is run without the "-c" flag.
ts_stability_all_async_wc = colouring_oracle.run_timeseries(
    timeseries_interval=stability_interval,
    suffix="-stability-async_wc",
    interval_type="edgecount",
    additional_flags=["-SnapshotOracle", "-WaitCount"],
    check_correctness=False,
)

df_ts_stability_all_dynamic = pd.read_csv(ts_stability_all_dynamic)
df_ts_stability_all_async = pd.read_csv(ts_stability_all_async)
df_ts_stability_all_async_wc = pd.read_csv(ts_stability_all_async_wc)

In [None]:
def plot_stability_percentage(data: list[tuple[pd.DataFrame, str]], y_max: int, figsize):
    """Plot the ``pctSame`` and ``colourCount`` columns of several runs against edge count.

    ``pctSame`` is drawn as lines on the primary axis (labelled as the
    percentage of vertex colours unchanged) and ``colourCount`` as filled
    areas on a secondary axis.

    Args:
        data: Non-empty list of ``(DataFrame, label)`` pairs. Each DataFrame
            needs ``pctSame`` and ``colourCount`` columns; the first one also
            provides the shared x-axis through its ``EC`` column, so all
            DataFrames are expected to have the same number of rows.
        y_max: Upper limit applied to both y-axes.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The created matplotlib figure.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("data must contain at least one (DataFrame, label) pair")

    # Take the x-axis from the argument rather than from a notebook global, so
    # the function can plot any set of runs.
    x = data[0][0]["EC"] / 1000000
    fig, ax = plt.subplots(figsize=figsize)
    for df, label in data:
        ax.plot(x, df["pctSame"], marker = '.', markersize = 5, label=f"{label} (%)")
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(100))
    ax.tick_params(axis='x', labelrotation = 90)
    ax.xaxis.set_major_locator(mtick.MultipleLocator(base=3.2))
    # The x-range starts at the second sample; .iloc makes that positional.
    ax.set(xlim=[x.iloc[1], x.max()], ylim=[0, y_max], ylabel=f"% of Vertex Colours Unchanged", zorder=1)
    ax.patch.set_visible(False)
    ax.xaxis.grid(alpha=0.3)
    ax.yaxis.grid(alpha=0.3)
    ax.set_xlabel("Edge Count (millions)", labelpad=7)

    # Secondary axis, drawn behind the lines (zorder=0).
    ax = ax.twinx()
    ax.set(zorder=0, ylim=[0, y_max], ylabel=f"# of Colours Used")
    for df, label in data:
        ax.fill_between(x, df["colourCount"], label=f"{label} (#)", alpha=0.6)

    fig.legend(bbox_to_anchor=(0.51, 1.03), loc='center', ncol=3, framealpha=1, prop={'size': 7.5})
    fig.tight_layout(pad=0.5)
    return fig

# Both figures plot the same three runs and differ only in figure size.
stability_series = [
    (df_ts_stability_all_dynamic, "Dynamic"),
    (df_ts_stability_all_async, "Static"),
    (df_ts_stability_all_async_wc, "Static (Wait Count)"),
]
fig_poster = plot_stability_percentage(stability_series, 100, (6, 3))
fig_abstract = plot_stability_percentage(stability_series, 100, (9, 3))
fig_poster.savefig(results_dir / "stability-all-vertices-poster.pdf", bbox_inches='tight')
fig_abstract.savefig(results_dir / "stability-all-vertices-abstract.pdf", bbox_inches='tight')

In [None]:
# Same three stability runs as before, each with the extra "-UseInterest" flag.
ts_stability_interest_dynamic = colouring_oracle.run_timeseries(
    timeseries_interval=stability_interval,
    suffix="-stability-dynamic-interest",
    interval_type="edgecount",
    additional_flags=["-UseInterest"],
)
ts_stability_interest_async = colouring_oracle.run_timeseries(
    timeseries_interval=stability_interval,
    suffix="-stability-async-interest",
    interval_type="edgecount",
    additional_flags=["-UseInterest", "-SnapshotOracle"],
)
# The wait-count variant is run without the "-c" flag.
ts_stability_interest_async_wc = colouring_oracle.run_timeseries(
    timeseries_interval=stability_interval,
    suffix="-stability-async_wc-interest",
    interval_type="edgecount",
    additional_flags=["-UseInterest", "-SnapshotOracle", "-WaitCount"],
    check_correctness=False,
)

df_ts_stability_interest_dynamic = pd.read_csv(ts_stability_interest_dynamic)
df_ts_stability_interest_async = pd.read_csv(ts_stability_interest_async)
df_ts_stability_interest_async_wc = pd.read_csv(ts_stability_interest_async_wc)


In [None]:
def plot_colours_of_vertices(df_list: list[pd.DataFrame], titles: list[str], figsize, y_max):
    """Plot one panel per run showing the per-vertex columns against edge count.

    Every panel also gets a horizontal line, annotated with its value, at the
    largest value found in that run's plotted columns.

    Args:
        df_list: Non-empty list of DataFrames with an ``EC`` column. The
            plotted columns are those from two positions after ``EC`` up to,
            but excluding, the last column (presumably one column per vertex
            of interest; verify against the CSV writer). The first DataFrame
            provides the shared x-axis.
        titles: One panel title per DataFrame.
        figsize: Figure size passed to ``plt.subplots``.
        y_max: Upper limit of the shared y-axis.

    Returns:
        The created matplotlib figure.
    """
    x = df_list[0]["EC"] / 1000000
    # Positional access, so the function does not depend on the index labels.
    x_first, x_last = x.iloc[0], x.iloc[-1]
    x_annotation = x.iloc[min(2, len(x) - 1)]
    # squeeze=False keeps the result an array even for a single DataFrame.
    fig, axes = plt.subplots(1, len(df_list), sharex=True, sharey=True, figsize=figsize, squeeze=False)
    axes = axes[0]
    max_colour_line = []

    for df, title, ax in zip(df_list, titles, axes):
        vertices = df.columns[df.columns.get_loc("EC")+2:-1]
        max_y = 0
        for v in vertices:
            y = df[v]
            ax.plot(x, y, alpha=0.8, linewidth=1)
            # Series.max() skips NaN, whereas the builtin max() over a Series
            # gives an order-dependent result when NaN is present. Keeping
            # max_y as the first argument leaves it unchanged when a column is
            # entirely NaN.
            max_y = max(max_y, y.max())
        ax.set(xlabel="Edge Count (millions)")
        ax.title.set_text(title)
        ax.yaxis.set_major_locator(mtick.MultipleLocator(10))
        ax.yaxis.set_minor_locator(mtick.AutoMinorLocator(10))
        ax.yaxis.grid(alpha=0.3, which='minor')
        ax.yaxis.grid(alpha=1, which='major')
        max_colour_line = ax.plot([x_first, x_last], [max_y, max_y], alpha=1, linewidth=1, color="chocolate")
        ax.annotate(f"{max_y}", (x_annotation, max_y+1))

    axes[0].set(xlim=[x_first, x_last], ylim=[0, y_max], xlabel="Edge Count (millions)", ylabel=f"Colour ID")
    # The line drawn in the last panel is the legend handle.
    fig.legend(max_colour_line, ["Maximum # of Colours Used"], bbox_to_anchor=(0.995, 0.95), loc='upper right')
    fig.tight_layout(pad=0.5)
    return fig

# One panel per run for the vertices-of-interest experiment.
fig = plot_colours_of_vertices(
    df_list=[
        df_ts_stability_interest_dynamic,
        df_ts_stability_interest_async,
        df_ts_stability_interest_async_wc,
    ],
    titles=["Dynamic", "Static", "Static (Wait Count)"],
    figsize=(11, 5),
    y_max=100,
)
fig.savefig(results_dir / "stability-interest-colours.pdf", bbox_inches='tight')