In [None]:
%pip install pandas matplotlib
import subprocess
from pathlib import Path
import shutil
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Render every figure in this notebook at 300 dpi.
mpl.rcParams["figure.dpi"] = 300

In [None]:
# --- Experiment configuration -------------------------------------------------
# Every (timeseries interval, ingest rate) pair below is benchmarked once.
timeseries_intervals = [16, 32, 64, 128, 256]
ingest_rates = [6_400_000, 3_200_000, 1_600_000, 800_000]  # 0 to indicate no limit
timestamp_position = 2  # forwarded to the benchmark as -pt=<value>
threads = 10  # 0 to use nproc
path_to_go = shutil.which("go")  # None when `go` is not on PATH

# Build the Path objects directly so these names never hold a plain str.
path_to_graph = Path("~/wikipedia-growth.txt").expanduser()
path_to_project = Path("~/projects/lollipop").expanduser()

# Fail early, and say which prerequisite is missing.
assert path_to_graph.exists(), f"Graph file not found: {path_to_graph}"
assert path_to_project.exists(), f"Project directory not found: {path_to_project}"
assert path_to_go, "Could not find the `go` executable on PATH"

In [None]:
def create_results_directory():
    """Create (if needed) and return ``<path_to_project>/results/2023-sc-poster``.

    Missing parent directories (including ``results``) are created as well, so
    the function does not depend on another cell having created them first.
    Calling it again when the directory already exists is a no-op.

    Returns:
        Path: the results directory.
    """
    path_to_results = path_to_project / "results" / "2023-sc-poster"
    path_to_results.mkdir(parents=True, exist_ok=True)
    return path_to_results

# Make sure the top-level results folder is present before creating the
# experiment-specific folder inside it.
path_to_project.joinpath("results").mkdir(exist_ok=True)
results_dir = create_results_directory()  # or replace with existing result folder

In [None]:
class Algorithm:
    """A benchmark program under ``<path_to_project>/cmd/lp-<name>`` and how to run it.

    The program is launched with ``go run`` from the project directory. It is
    expected to write its timeseries CSV to ``timeseries_output``; after a
    successful run that file is moved into ``results_dir`` under a name that
    encodes the run parameters, so later calls can reuse it.

    Relies on the notebook-level ``path_to_project`` and ``path_to_go``.
    """

    def __init__(self, name: str, graph: Path, results_dir: Path, timeseries_output: Path, timestamp_position: int, convert_to_undirected: bool = False, threads: int = 0) -> None:
        """Store the run configuration and check that all required paths exist.

        Args:
            name: Algorithm name; the program is looked up at ``cmd/lp-<name>``.
            graph: Input graph file, passed to the program as ``-g=<graph>``.
            results_dir: Existing directory where renamed result CSVs are kept.
            timeseries_output: File the program writes its timeseries to. Its
                parent directory must already exist.
            timestamp_position: Passed to the program as ``-pt=<value>``.
            convert_to_undirected: When true, ``-u`` is added to the command.
            threads: Passed as ``-t=<threads>``; ``0`` omits the flag.

        Raises:
            AssertionError: If the graph, results directory, program path or
                the parent of ``timeseries_output`` does not exist.
        """
        self.name = name
        self.code_path = path_to_project / "cmd" / f"lp-{name}"
        self.results_dir = results_dir
        self.convert_to_undirected = convert_to_undirected
        self.graph = graph
        self.timestamp_position = timestamp_position
        self.threads = threads
        self.timeseries_output = timeseries_output
        assert self.graph.exists(), f"Graph file not found: {self.graph}"
        assert self.results_dir.exists(), f"Results directory not found: {self.results_dir}"
        assert self.code_path.exists(), f"Algorithm source not found: {self.code_path}"
        assert self.timeseries_output.parent.exists(), f"Output directory not found: {self.timeseries_output.parent}"

    def run_timeseries(self, timeseries_interval: int, ingest_rate: int = 0, no_skip: bool = False) -> Path:
        """Run the program once and return the path of its timeseries CSV.

        The result is stored as
        ``timeseries-<name>-<interval>-<ingest_rate>-<threads>.csv`` inside
        ``results_dir``. If that file already exists the run is skipped and the
        existing file is returned, unless ``no_skip`` is true.

        Args:
            timeseries_interval: Passed to the program as ``-dt=<value>``.
            ingest_rate: Passed as ``-dr=<value>``; ``0`` omits the flag.
            no_skip: Re-run and overwrite even if a stored result exists.

        Returns:
            Path of the stored timeseries CSV.

        Raises:
            subprocess.CalledProcessError: If the program exits with a non-zero
                status. Nothing is stored in ``results_dir`` in that case.
            AssertionError: If the program succeeded but did not produce
                ``timeseries_output``.
        """
        path_timeseries = self.results_dir / f"timeseries-{self.name}-{timeseries_interval}-{ingest_rate}-{self.threads}.csv"
        if path_timeseries.exists() and not no_skip:
            print(f"Skipping as result exists: {path_timeseries}")
            return path_timeseries

        cmd = [path_to_go, "run", self.code_path, "-c", "-tquery", f"-pt={self.timestamp_position}", f"-g={self.graph}", f"-dt={timeseries_interval}"]
        if ingest_rate:
            cmd.append(f"-dr={ingest_rate}")
        if self.convert_to_undirected:
            cmd.append("-u")
        if self.threads:
            cmd.append(f"-t={self.threads}")

        print(f"Command: {cmd}")
        # Remove any stale output so the existence check below can only be
        # satisfied by this run.
        self.timeseries_output.unlink(missing_ok=True)
        # check=True: a failed run must not have its (possibly partial) output
        # stored and later mistaken for a valid cached result.
        subprocess.run(cmd, cwd=path_to_project, check=True)

        assert self.timeseries_output.exists(), f"Run finished but produced no output at {self.timeseries_output}"
        # replace() overwrites an existing target on every platform, which is
        # what a no_skip re-run needs; rename() fails on Windows in that case.
        return self.timeseries_output.replace(path_timeseries)
    
# PageRank: run without the -u flag (graph is not converted to undirected).
pagerank = Algorithm(
    name="pagerank",
    graph=path_to_graph,
    results_dir=results_dir,
    timeseries_output=path_to_project.joinpath("results", "timeseries.csv"),
    timestamp_position=timestamp_position,
    convert_to_undirected=False,
    threads=threads,
)

# Colouring: run with -u; its timeseries is collected from a different file
# name than PageRank's.
colouring = Algorithm(
    name="colouring",
    graph=path_to_graph,
    results_dir=results_dir,
    timeseries_output=path_to_project.joinpath("results", "colouring-timeseries.csv"),
    timestamp_position=timestamp_position,
    convert_to_undirected=True,
    threads=threads,
)

In [None]:
class Experiment:
    """Runs an algorithm over a parameter sweep and records one summary row per run.

    Summary rows live in ``<results_dir>/timeseries-db.csv``. That file is
    shared by every ``Experiment`` pointing at the same directory, so it is
    re-read before each write instead of being overwritten from a snapshot
    taken at construction time.
    """

    _COLUMNS = ["Algorithm", "Query Interval", "Number of Queries", "Ingest Rate", "Average Latency", "Threads", "Timeseries Path"]

    def __init__(self, results_dir: Path, algorithm: "Algorithm") -> None:
        """Bind the experiment to a results directory and an algorithm.

        Creates an empty ``timeseries-db.csv`` (header only) if none exists.

        Args:
            results_dir: Existing directory holding ``timeseries-db.csv``.
            algorithm: Object providing ``name``, ``threads`` and
                ``run_timeseries(timeseries_interval=..., ingest_rate=...)``.

        Raises:
            AssertionError: If ``results_dir`` does not exist.
        """
        # Check first: otherwise writing the CSV below fails with a less
        # helpful error before this check is ever reached.
        assert results_dir.exists(), f"Results directory not found: {results_dir}"
        self.algorithm = algorithm
        self.results_dir = results_dir
        self.path_results_db = results_dir / "timeseries-db.csv"
        db_existed = self.path_results_db.exists()
        self.results_db = self._read_db()
        if not db_existed:
            self.results_db.to_csv(self.path_results_db, index=False)

    def _read_db(self) -> pd.DataFrame:
        """Return the current on-disk results table (empty if the file is missing)."""
        if self.path_results_db.exists():
            return pd.read_csv(self.path_results_db)
        return pd.DataFrame(columns=self._COLUMNS)

    def _record(self, row: dict) -> None:
        """Insert ``row`` into the on-disk table, replacing any row for the same run."""
        # Reload so rows written by other Experiment instances are preserved.
        db = self._read_db()
        same_run = (
            (db["Algorithm"] == row["Algorithm"])
            & (db["Query Interval"] == row["Query Interval"])
            & (db["Ingest Rate"] == row["Ingest Rate"])
            & (db["Threads"] == row["Threads"])
        )
        # Drop the previous entry for this configuration so re-running the
        # notebook does not pile up duplicate rows.
        db = db.loc[~same_run].reset_index(drop=True)
        db.loc[len(db)] = row
        db.to_csv(self.path_results_db, index=False)
        self.results_db = db

    def run(self, timeseries_intervals: list[int], ingest_rates: list[int]) -> Path:
        """Run the algorithm for every (interval, rate) pair and record the results.

        For each run the timeseries CSV returned by the algorithm is read; its
        row count is stored as "Number of Queries" and the mean of its
        ``qLatencyMS`` column as "Average Latency". The table is written to
        disk after every run, so completed runs survive an interruption.

        Args:
            timeseries_intervals: Values passed as ``timeseries_interval``.
            ingest_rates: Values passed as ``ingest_rate``.

        Returns:
            Path of the results table CSV.
        """
        for interval in timeseries_intervals:
            for rate in ingest_rates:
                timeseries = self.algorithm.run_timeseries(timeseries_interval=interval, ingest_rate=rate)
                df_timeseries = pd.read_csv(timeseries)
                self._record({
                    "Algorithm": self.algorithm.name,
                    "Query Interval": interval,
                    "Number of Queries": df_timeseries.shape[0],
                    "Ingest Rate": rate,
                    "Average Latency": df_timeseries.loc[:, 'qLatencyMS'].mean(),
                    "Threads": self.algorithm.threads,
                    # Stored as text so in-memory rows match rows reloaded from CSV.
                    "Timeseries Path": str(timeseries),
                })
        return self.path_results_db

    def load_results(self) -> pd.DataFrame:
        """Read the results table from disk.

        Note: the table contains rows for every algorithm recorded in this
        results directory, not only ``self.algorithm``.
        """
        return pd.read_csv(self.path_results_db)

# One experiment per algorithm; both record into the same results directory.
pagerank_experiment, colouring_experiment = (
    Experiment(results_dir, algorithm) for algorithm in (pagerank, colouring)
)

In [None]:
# PageRank sweep is currently disabled; uncomment to run it as well.
# pagerank_experiment.run(timeseries_intervals, ingest_rates)
# pagerank_results = pagerank_experiment.load_results()
# pagerank_results

# Run the colouring sweep, then show the recorded results table.
colouring_experiment.run(
    timeseries_intervals=timeseries_intervals,
    ingest_rates=ingest_rates,
)
colouring_results = colouring_experiment.load_results()
colouring_results

In [None]:
def plot(timeseries_results, max_latency_ms=1200):
    """Draw a grid of query-latency timeseries, one panel per (interval, rate).

    Rows follow the notebook-level ``timeseries_intervals`` and columns follow
    ``ingest_rates``. Each panel shows the ``qLatencyMS`` column of the
    matching run over its ``ts`` column. Behind it, on a twin y-axis, the ``e``
    column of the run with the smallest query interval is drawn as a green
    area; the same background is used in every panel.

    Args:
        timeseries_results: DataFrame with "Query Interval", "Ingest Rate" and
            "Timeseries Path" columns. When several rows match a panel, the
            first one is used.
        max_latency_ms: Upper limit of the latency y-axis.

    Returns:
        The matplotlib Figure.

    Raises:
        IndexError: If no row matches one of the (interval, rate) pairs.
    """
    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column, so the row/column indexing below always works.
    fig, axs = plt.subplots(len(timeseries_intervals), len(ingest_rates), sharex=True, sharey=True, figsize=(10, 10), squeeze=False)

    # Background series, taken from the run with the smallest query interval.
    timeseries_path = timeseries_results.query(f'`Query Interval` == {min(timeseries_intervals)}').iloc[0]["Timeseries Path"]
    df_timeseries = pd.read_csv(timeseries_path)
    ts, e = pd.to_datetime(df_timeseries['ts']), df_timeseries.loc[:, 'e']
    max_e = e.max()
    for ax in axs.flat:
        background = ax.twinx()
        background.set_yticks([])
        background.margins(x=0)
        background.set_ylim([0, max_e])
        background.fill_between(ts, e, alpha=0.2, facecolor="green", edgecolor=None)

    for interval, axs_y in zip(timeseries_intervals, axs):
        for rate, ax in zip(ingest_rates, axs_y):
            timeseries_q = timeseries_results.query(f'`Query Interval` == {interval} and `Ingest Rate` == {rate}')
            timeseries_path = timeseries_q.iloc[0]["Timeseries Path"]

            df_timeseries = pd.read_csv(timeseries_path)
            x, y1 = pd.to_datetime(df_timeseries['ts']), df_timeseries.loc[:, 'qLatencyMS']

            # Draw the latency axes above the twin axes and make their
            # background transparent so the green area stays visible.
            ax.set_zorder(1)
            ax.patch.set_visible(False)
            ax.set_xticks([])
            ax.margins(x=0)
            ax.set_ylim([0, max_latency_ms])
            ax.plot(x, y1, marker='.', markersize=5, color="chocolate")
            ax.fill_between(x, y1, alpha=0.2, facecolor="red", edgecolor=None)

    # Row labels on the first column, column labels on the last row.
    for interval, ax in zip(timeseries_intervals, axs[:, 0]):
        plt.setp(ax, ylabel=f"{interval} days")
    for rate, ax in zip(ingest_rates, axs[-1]):
        # Rates are events/sec; divide by one million for the "m" suffix.
        # A rate of 0 means the ingest rate was not limited.
        label = f"{rate / 1e6:g}m events/sec" if rate else "no ingest limit"
        plt.setp(ax, xlabel=label)

    fig.tight_layout()
    return fig

# Render the latency grid for the colouring results.
fig = plot(timeseries_results=colouring_results)