In [None]:
%%time
import bokeh.io
import datamapplot as dmp
import duckdb
from fast_hdbscan import HDBSCAN
from hashlib import md5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import panel as pn
import shlex
import thisnotthat as tnt
from tqdm.auto import tqdm
import umap
# import vectorizers as vz
import vectorizers.transformers as vzt

import eng.bpe

In [None]:
# Route Bokeh output to the notebook and load the Panel extension so the
# panes built in later cells can render inline.
bokeh.io.output_notebook()
pn.extension()

In [None]:
# Register tqdm with pandas; this is what provides the `progress_apply`
# method used when the hover text is built.
tqdm.pandas()

In [None]:
# Open an in-memory DuckDB database and expose each parquet export as a view
# carrying the same name as the file it scans.
db = duckdb.connect(":memory:")
for table in ("process", "process_path"):
    db.execute(
        f"create or replace view {table} as select * from parquet_scan('/data/acme3/stdview-20231105-20231120/{table}.parquet')"
    )

In [None]:
# Define the `process_interesting` view: every `process` row together with the
# `ptree` of its matching `process_path` row (matched on pid_hash), keeping
# only rows whose ptree contains neither 'wintap' nor 'amazon-ssm'.
db.execute(
    """
    create or replace view process_interesting as
    select process.*, process_path.ptree as ptree
    from process
    inner join process_path using (pid_hash)
    where process_path.ptree not like '%wintap%' and process_path.ptree not like '%amazon-ssm%'
    """
)

In [None]:
%%time
# Pull the columns used downstream out of the view into a pandas DataFrame,
# ordered by pid_hash.
metadata_interesting = db.execute(
    """
    select pid_hash, parent_pid_hash, process_name, process_path, args, ptree
    from process_interesting
    order by pid_hash
    """
).df()
metadata_interesting

In [None]:
# Build one command-line string per process: "<process_path> <args>".
# - The constant `space` column lets the row-wise `sum` concatenate the three
#   string columns in the listed order.
# - Rows without a process_path are dropped, so `cmdlines` can be shorter than
#   `metadata_interesting`; it keeps the index labels of the rows that remain.
# - A missing `args` becomes "" and the resulting trailing space is stripped.
# - The encode/decode round trip rewrites non-ASCII characters as backslash
#   escapes, leaving pure-ASCII text.
cmdlines = (
    metadata_interesting
    .assign(space=" ")[["process_path", "space", "args"]]
    .dropna(subset=["process_path"]).fillna("")
    .sum(axis=1)
    .str.strip()
    .str.encode("ascii", errors="backslashreplace")
    .str.decode("ascii")
)
cmdlines

In [None]:
%%time
# Train on the distinct command lines only.
# NOTE(review): `eng.bpe.train` is project code not shown here (presumably
# byte-pair encoding, going by the module name); the meaning of its three
# return values is inferred from their names - confirm.
tokens, code_list, cmdlines_compressed = eng.bpe.train(cmdlines.unique().tolist(), vocab_size=600, max_char_code=127)
# Displayed: the size of `code_list` and the length of the shortest entry of
# `cmdlines_compressed`.
len(code_list), np.min([len(cc) for cc in cmdlines_compressed])

In [None]:
%%time
# Vectorise every command line (duplicates included) using the `code_list`
# produced by training, with the same max_char_code.
# NOTE(review): later cells call `.tolil()` on a transform of this result, so
# it is presumably a scipy sparse matrix with one row per command line - confirm.
cmdlines_vec = eng.bpe.vectorize(cmdlines.tolist(), code_list, max_char_code=127)
cmdlines_vec

In [None]:
%%time
# Re-weight the vectorised command lines with the `vectorizers`
# InformationWeightTransformer, fitted on this same matrix with default settings.
cmdlines_iwt = vzt.InformationWeightTransformer().fit_transform(cmdlines_vec)
cmdlines_iwt

In [None]:
%%time
# De-duplicate identical rows of `cmdlines_iwt`.
# Each row is fingerprinted by the MD5 digest of its LIL column indices followed
# by the MD5 digest of its stored values (16 + 16 bytes, unpacked into one list
# of ints). `np.unique` over those fingerprints (axis=0) then yields:
#   i_unique       - position of the first row carrying each distinct fingerprint
#   i_deunique     - for every original row, the position of its fingerprint
#                    among the unique ones
#   cmdline_counts - how many rows share each fingerprint
_lil = cmdlines_iwt.tolil()
_, i_unique, i_deunique, cmdline_counts = np.unique(
    [
        [*md5(np.asarray(ind)).digest(), *md5(np.asarray(dat)).digest()]
        for ind, dat in tqdm(zip(_lil.rows, _lil.data), total=cmdlines_iwt.shape[0])
    ],
    axis=0,
    return_index=True,
    return_inverse=True,
    return_counts=True
)
len(i_unique)

In [None]:
%%time
# Fit UMAP on the de-duplicated rows only, then expand the embedding back to one
# coordinate pair per original row through `i_deunique`.
# NOTE(review): no random_state is passed, so the layout may differ between runs.
U_bow = umap.UMAP(metric="hellinger", init="pca", verbose=True).fit(cmdlines_iwt[i_unique, :])
cmdlines_bow = U_bow.embedding_[i_deunique, :]
cmdlines_bow.shape

In [None]:
# The data map uses the fitted embedding itself, i.e. one point per distinct
# vector rather than one per process.
datamap_bow = U_bow.embedding_
datamap_bow.shape

In [None]:
def summarize_args(cmdline_args):
    """Condense an argument string into a short suffix for display.

    The string is tokenised with ``shlex.split``, first in non-POSIX mode and,
    if that raises ``ValueError``, again in POSIX mode.

    Returns:
        " (no arg)"           when the input is empty/falsy or yields no token,
        " <token>"            when there is exactly one token,
        " (+<n> args)"        when there are ``n`` > 1 tokens,
        " (unparsable args)"  when both parsing modes fail.
    """
    if not cmdline_args:
        return " (no arg)"

    for posix_mode in (False, True):
        try:
            tokens = shlex.split(cmdline_args, posix=posix_mode)
        except ValueError:
            # This mode could not tokenise the string; try the next one.
            continue
        break
    else:
        # Neither mode produced a token list.
        return " (unparsable args)"

    token_count = len(tokens)
    if token_count > 1:
        return f" (+{token_count} args)"
    if token_count == 1:
        return f" {tokens[0]}"
    return " (no arg)"

In [None]:
# Hover text for each point of the data map:
# "[<count>x] <process_name><args summary>".
#
# `i_unique` holds positions into the vectorised command lines, i.e. into
# `cmdlines`, which omits rows lacking a process_path. The matching metadata
# rows are therefore re-selected by index label before the positions are
# applied; indexing `metadata_interesting` directly would pick the wrong rows
# as soon as a single row had been dropped.
hover_text = (
    metadata_interesting
    .loc[cmdlines.index]
    .iloc[i_unique]
    .assign(
        # Summarise arguments for the representative rows only, not the whole table.
        args_summary=lambda rows: rows["args"].fillna("").progress_apply(summarize_args),
        # Multiplicity prefix, shown only for vectors that occur more than once.
        count_summary=[f"[{c}x] " if c > 1 else "" for c in cmdline_counts],
    )[["count_summary", "process_name", "args_summary"]]
    # Row-wise sum concatenates the three string columns in the listed order.
    .sum(axis=1)
    .tolist()
)
len(hover_text), hover_text[-10:]

In [None]:
# Every point of the data map starts out with the same placeholder label.
labels = ["???" for _ in range(datamap_bow.shape[0])]
len(labels)

In [None]:
# Interactive scatter of the data map (one point per distinct vector) with the
# placeholder labels and the per-point hover text.
plot = tnt.BokehPlotPane(
    data=datamap_bow,
    labels=labels,
    hover_text=hover_text,
    width=800,
    height=800,
    show_legend=False
)
# Label editor built from the plot's labels and linked to the plot.
# NOTE(review): linking presumably keeps edits and the plot in sync - confirm
# against the thisnotthat documentation.
editor = tnt.LabelEditorWidget(plot.labels)
editor.link_to_plot(plot)
# Show plot and editor side by side.
pn.Row(plot, editor)

In [None]:
# Minimal three-point BokehPlotPane, independent of the process data.
# NOTE(review): this rebinds `plot`, so after this cell the name no longer
# refers to the pane that `editor` was linked to. It looks like leftover
# scratch work - confirm whether the cell should be kept or use its own name.
dataa = np.array([[3, 4], [8, 9], [1, -1]])
plot = tnt.BokehPlotPane(dataa, width=600, height=600, show_legend=False)
pn.Row(plot)

In [None]:
# Bare reference to the datamapplot module: this only displays the module
# object; no plot is built from it here.
dmp