In [1]:
from sketching import settings
from sketching.datasets import Dataset, Covertype_Sklearn, KDDCup_Sklearn, Webspam_libsvm, Synthetic_Dataset, NoisyDataset, Synthetic_Dataset_Cohen 

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib

In [2]:
# Make sure the plot output directory exists before any figure is saved.
# parents=True also creates missing parent directories, and exist_ok=True makes
# the call a no-op when the directory is already there (no check-then-create race).
settings.PLOTS_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
def get_results_df(
    dataset: Dataset,
    time_column,
    methods=("cosketching1", "cosketching2", "cosketching5", "cosketching10"),
):
    """Load the per-method result CSVs of a dataset and reduce them to per-size medians.

    For every method, the file ``settings.RESULTS_DIR / "<dataset name>_<method>.csv"``
    is read, restricted to the columns ``time_column``, ``"size"`` and ``"ratio"``,
    grouped by ``"size"`` and aggregated with the median. Each per-method frame gets
    a ``"method"`` column, and all frames are stacked into a single frame.

    Args:
        dataset: Dataset whose ``get_name()`` provides the prefix of the result files.
        time_column: Name of the timing column to keep, e.g. ``"sampling_time_s"``
            or ``"total_time_s"``.
        methods: Iterable of method names to load, in output order. Defaults to the
            four ``cosketching*`` result sets; pass e.g.
            ``["uniform", "l2s", "sketching"]`` to load other result files instead.

    Returns:
        A DataFrame with one row per (method, size) combination, holding the median
        of ``time_column`` and ``"ratio"`` plus the ``"method"`` label. Note that
        ``DataFrame.filter(items=...)`` silently skips requested columns that are
        absent from a CSV.

    Raises:
        FileNotFoundError: If the result file of one of the methods does not exist.
        ValueError: If ``methods`` is empty, because there is nothing to concatenate.
    """
    file_prefix = dataset.get_name()
    kept_columns = [time_column, "size", "ratio"]

    per_method_medians = []
    for method in methods:
        medians = (
            pd.read_csv(settings.RESULTS_DIR / f"{file_prefix}_{method}.csv")
            .filter(items=kept_columns)
            .groupby(["size"], as_index=False)
            .median()
            .assign(method=method)
        )
        per_method_medians.append(medians)

    return pd.concat(per_method_medians, ignore_index=True)

#get_results_df(Covertype_Sklearn(), "sampling_time_s")

In [4]:
def make_plot(dataset, x_min, x_max, y_min, y_max, sampling_time=False, font_size=18, font_size_title=23):
    """Scatter-plot the median approximation ratio against the median running time.

    The per-method medians come from ``get_results_df``. One scatter series is drawn
    per method, the figure is saved as a PDF in ``settings.PLOTS_DIR`` and then shown.

    Args:
        dataset: Dataset whose ``get_name()`` selects the result files, the plot
            title and the output file name.
        x_min, x_max: Limits handed to ``ax.set_xlim``; callers pass ``None`` when
            they do not want to fix a limit.
        y_min, y_max: Limits handed to ``ax.set_ylim``; same convention as above.
        sampling_time: If true, plot the ``"sampling_time_s"`` column and save to
            ``<name>_sampling_time_plot.pdf``; otherwise plot ``"total_time_s"``
            and save to ``<name>_total_time_plot.pdf``.
        font_size: Base font size applied through ``plt.rc("font", ...)``.
        font_size_title: Font size of the plot title.

    Side effects:
        Sets ``text.usetex`` and the font size in matplotlib's global rc settings,
        writes a PDF file and displays the figure.
    """
    # Everything that depends on the chosen time measure is decided in one place.
    if sampling_time:
        time_column = "sampling_time_s"
        x_label = "median sampling time (s)"
        file_suffix = "sampling_time"
    else:
        time_column = "total_time_s"
        x_label = "median absolute running time (s)"
        file_suffix = "total_time"

    dataset_name = dataset.get_name()
    results_df = get_results_df(dataset, time_column=time_column)

    # use TeX for typesetting
    plt.rcParams["text.usetex"] = True
    plt.rc("font", size=font_size)

    fig, ax = plt.subplots()

    # plt.get_cmap is available on old and new matplotlib alike, whereas
    # matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9.
    colormap = plt.get_cmap("tab10")
    colors = {
        "cosketching1": colormap(3),
        "cosketching2": colormap(0),
        "cosketching5": colormap(1),
        "cosketching10": colormap(2),
        #"cosketching20": colormap(1),
    }

    # The insertion order of this mapping is the order of the legend entries.
    labels = {
        "cosketching1": "Sketch (old)",
        "cosketching2": "Sketch2",
        "cosketching5": "Sketch5",
        "cosketching10": "Sketch10",
        #"cosketching20": "Sketch20"
    }

    # Raw strings: "\%" must reach TeX as a literal backslash-percent, and "\%" in a
    # normal string literal is an invalid escape sequence that newer Pythons warn about.
    titles = {
        "covertype_sklearn": "Covertype",
        "covertype_sklearn_noisy": r"Covertype, 1\% noisy",
        "kddcup_sklearn": "Kddcup",
        "kddcup_sklearn_noisy": r"Kddcup, 1\% noisy",
        "webspam_libsvm_desparsed": "Webspam",
        "webspam_libsvm_desparsed_noisy": r"Webspam, 1\% noisy",
        "synthetic_n_100000": "Synthetic",
        "synthetic_n_20000_d_100": "Synthetic data"
    }

    for cur_method, cur_label in labels.items():
        cur_results = results_df.loc[results_df["method"] == cur_method]
        ax.scatter(
            cur_results[time_column],
            cur_results["ratio"],
            color=colors[cur_method],
            label=cur_label,
        )

    ax.set_xlim(left=x_min, right=x_max)
    ax.set_ylim(bottom=y_min, top=y_max)

    ax.set_xlabel(x_label)
    ax.set_ylabel("median approximation ratio")

    # Datasets without a hand-written title fall back to their name instead of
    # failing with a KeyError; underscores are escaped because TeX typesets the title.
    title = titles.get(dataset_name, dataset_name.replace("_", r"\_"))
    ax.set_title(title, fontsize=font_size_title)

    ax.legend(loc="upper right", frameon=True)

    fig.tight_layout()

    # Save through the figure object rather than pyplot's implicit "current figure".
    fig.savefig(settings.PLOTS_DIR / f"{dataset_name}_{file_suffix}_plot.pdf")

    plt.show()


In [None]:
# Synthetic_Dataset_Cohen(20000, 100): total-time plot with the y-axis capped at 10,
# then the sampling-time plot without fixed axis limits.
dataset = Synthetic_Dataset_Cohen(20000, 100)
for use_sampling_time, y_upper in ((False, 10), (True, None)):
    make_plot(dataset, x_min=None, x_max=None, y_min=None, y_max=y_upper, sampling_time=use_sampling_time)

In [None]:
# Covertype: total-time plot first, then sampling-time plot, both without fixed axis limits.
dataset = Covertype_Sklearn()
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=None, y_max=None, sampling_time=use_sampling_time)

In [None]:
# KDDCup: total-time plot first, then sampling-time plot, both without fixed axis limits.
dataset = KDDCup_Sklearn()
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=None, y_max=None, sampling_time=use_sampling_time)

In [None]:
# Webspam: total-time plot first, then sampling-time plot, both with the y-axis fixed to [1, 2].
dataset = Webspam_libsvm()
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=1, y_max=2, sampling_time=use_sampling_time)

In [None]:
# Synthetic_Dataset with n_rows=100000: total-time plot first, then sampling-time plot,
# both without fixed axis limits.
dataset = Synthetic_Dataset(n_rows=100000)
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=None, y_max=None, sampling_time=use_sampling_time)

In [None]:
# Noisy Webspam (percentage=0.01, std=10): the y-axis starts at 1 in both plots and is
# capped at 20 for the total-time plot and at 50 for the sampling-time plot.
dataset = NoisyDataset(dataset=Webspam_libsvm(), percentage=0.01, std=10)
for use_sampling_time, y_upper in ((False, 20), (True, 50)):
    make_plot(dataset, x_min=None, x_max=None, y_min=1, y_max=y_upper, sampling_time=use_sampling_time)

In [None]:
# Noisy Covertype (percentage=0.01, std=10): total-time plot first, then sampling-time plot,
# both with the y-axis fixed to [1, 5].
dataset = NoisyDataset(dataset=Covertype_Sklearn(), percentage=0.01, std=10)
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=1, y_max=5, sampling_time=use_sampling_time)

In [None]:
# Noisy KDDCup (percentage=0.01, std=10): total-time plot first, then sampling-time plot,
# both with the y-axis fixed to [1, 5].
dataset = NoisyDataset(dataset=KDDCup_Sklearn(), percentage=0.01, std=10)
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=1, y_max=5, sampling_time=use_sampling_time)