In [None]:
%%time
import datamapplot as dmp
import duckdb
from fast_hdbscan import HDBSCAN
from hashlib import md5
import joblib as jl
from matplotlib import colormaps
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.sparse as sp
import shlex
from sklearn.decomposition import TruncatedSVD
from tqdm.auto import tqdm
import umap
import vectorizers as vz
import vectorizers.transformers as vzt

import eng.bpe

In [None]:
# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()

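Download the data from https://gdo168.llnl.gov/data/ACME-2023/stdview-20231105-20231120.tar and extract it under `~/data/acme3/`: the views created below read `~/data/acme3/stdview-20231105-20231120/<table>.parquet`.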

In [None]:
# In-memory DuckDB database; the notebook only defines views over Parquet files in it.
db = duckdb.connect(":memory:")

In [None]:
# Expose each Parquet file of the dataset as a DuckDB view named after the file.
for table in ("process", "process_path", "process_image_load"):
    parquet_path = f"~/data/acme3/stdview-20231105-20231120/{table}.parquet"
    db.execute(f"create or replace view {table} as select * from parquet_scan('{parquet_path}')")

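Drop the processes whose process tree involves the Amazon SSM or Wintap agents, and keep only processes started on or after 2023-11-05 whose start time equals their `first_seen` timestamp.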

In [None]:
# `process_interesting`: every `process` row joined (on pid_hash) to its process-tree string `ptree`,
# excluding rows whose tree mentions "wintap" or "amazon-ssm", and restricted to processes started
# on/after 2023-11-05 whose `process_started` equals `first_seen`.
db.execute(
    """
    create or replace view process_interesting as
    select process.*, process_path.ptree as ptree
    from process
    inner join process_path using (pid_hash)
    where process_path.ptree not like '%wintap%' and process_path.ptree not like '%amazon-ssm%'
    and process_started >= '2023-11-05'::timestamp
    and process_started = first_seen
    """
)

Here are the _interesting_ processes:

In [None]:
%%time
metadata_interesting = db.execute(
    """
    select pid_hash, parent_pid_hash, process_name, process_path, args, ptree
    from process_interesting
    order by pid_hash
    """
).df()
metadata_interesting

# Command line embedding

In [None]:
# One command-line string per process: "<process_path> <args>" (missing args -> empty string).
# Rows with a null process_path are dropped, so `cmdlines` can be shorter than
# `metadata_interesting`; it keeps the index labels of the rows that survive.
_with_path = metadata_interesting.dropna(subset=["process_path"])
_joined = _with_path["process_path"] + " " + _with_path["args"].fillna("")
# ASCII round-trip: any non-ASCII character is replaced by its backslash escape.
cmdlines = (
    _joined
    .str.strip()
    .str.encode("ascii", errors="backslashreplace")
    .str.decode("ascii")
)
cmdlines

In [None]:
%%time
tokens, code_list, cmdlines_compressed = eng.bpe.train(cmdlines.unique().tolist(), vocab_size=600, max_char_code=127)
len(code_list), np.min([len(cc) for cc in cmdlines_compressed])

In [None]:
%%time
# Vectorize every command line (duplicates included) against the trained code list.
# NOTE(review): the result is later passed to InformationWeightTransformer and `.tolil()`, so it is
# presumably a scipy sparse matrix with one row per entry of `cmdlines` — confirm in eng.bpe.
cmdlines_vec = eng.bpe.vectorize(cmdlines.tolist(), code_list, max_char_code=127)
cmdlines_vec

In [None]:
%%time
# Re-weight the token columns with vectorizers' InformationWeightTransformer (default settings).
cmdlines_iwt = vzt.InformationWeightTransformer().fit_transform(cmdlines_vec)
cmdlines_iwt

In [None]:
%%time
_lil = cmdlines_iwt.tolil()
_, i_unique, i_deunique, cmdline_counts = np.unique(
    [
        [*md5(np.asarray(ind)).digest(), *md5(np.asarray(dat)).digest()]
        for ind, dat in tqdm(zip(_lil.rows, _lil.data), total=cmdlines_iwt.shape[0])
    ],
    axis=0,
    return_index=True,
    return_inverse=True,
    return_counts=True
)
len(i_unique)

In [None]:
%%time
U_bow = umap.UMAP(
    n_neighbors=50,
    metric="hellinger",
    n_epochs=500,
    verbose=True
).fit(cmdlines_iwt[i_unique, :])#, y=metadata_interesting["process_name"].iloc[i_unique].map(process_name2label))
cmdlines_bow = U_bow.embedding_[i_deunique, :]
cmdlines_bow.shape

In [None]:
# UMAP coordinates of the *distinct* command lines only (one point per entry of `i_unique`);
# `cmdlines_bow` is the same embedding expanded back to every command line.
datamap_bow = U_bow.embedding_
datamap_bow.shape

In [None]:
# Hover text for each distinct command line: "(Nx) <process_name> <args>", cut to 100 characters.
#
# `i_unique` holds row positions in `cmdlines` (the vectorized input), and `cmdlines` drops rows
# with a null process_path. Positions therefore do not necessarily match positions in
# `metadata_interesting`; translate them to index labels via `cmdlines.index` before the lookup.
_unique_rows = metadata_interesting.loc[cmdlines.index[i_unique]]
hover_text = (
    _unique_rows
    .assign(
        # Prefix shown only when several processes share the same vectorized command line.
        count_summary=pd.Series(cmdline_counts, index=_unique_rows.index).apply(lambda c: f"({c}x) " if c > 1 else ""),
        space=" "
    )[["count_summary", "process_name", "space", "args"]]
    .fillna("")
    .sum(axis=1)
    .apply(lambda cl: cl[:100] + "[...]" if len(cl) > 100 else cl)
    .tolist()
)
pd.Series(hover_text)

In [None]:
%%time
plot_bow = dmp.create_interactive_plot(
    datamap_bow,
    metadata_interesting.iloc[i_unique]["process_name"],
    hover_text = hover_text,
    initial_zoom_fraction=0.75,
    font_family="Roboto",
    title="ACME 3 processes",
    sub_title="Processes as bags of cooccurrence vectors of command line tokens",
    darkmode=True,
)
plot_bow

In [None]:
# BPE-encode each distinct command line, spreading the work over all CPU cores.
_encode = jl.delayed(eng.bpe.bpe_encode)
cmdlines_bpe = jl.Parallel(n_jobs=os.cpu_count())(
    _encode(cmdline, code_list, max_char_code=127)
    for cmdline in tqdm(cmdlines.iloc[i_unique])
)
pd.Series(cmdlines_bpe)

In [None]:
%%time
# Fit token co-occurrence statistics over the BPE-encoded distinct command lines.
vz_cooc = vz.TokenCooccurrenceVectorizer(n_threads=4, n_iter=3).fit(cmdlines_bpe)
tokens_cooc = vz_cooc.cooccurrences_
tokens_cooc

In [None]:
%%time
# Compress each token's co-occurrence row into a dense 512-component vector.
# No random_state is passed, so the result may differ slightly between runs.
tokens_svd = TruncatedSVD(n_components=512).fit_transform(tokens_cooc.tocsr())
tokens_svd.shape

In [None]:
# Sparse (command line x token) count matrix: entry (r, t) is how often token t occurs in the
# r-th BPE-encoded command line. Column t must match row t of the token vectors, so tokens are
# mapped through the fitted co-occurrence vectorizer's own token -> index dictionary
# (the same lookup the DLL section uses).
_token_index = vz_cooc.token_label_dictionary_
coo_distrib = np.array([
    [row, _token_index[token], count]
    for row, cc in enumerate(tqdm(cmdlines_bpe))
    for token, count in zip(*np.unique(cc, return_counts=True))
])
# Explicit shape: keeps one row per command line and one column per known token even if the
# last command line is empty or the highest token index never occurs.
distrib = sp.coo_matrix(
    (coo_distrib[:, 2], (coo_distrib[:, 0], coo_distrib[:, 1])),
    shape=(len(cmdlines_bpe), len(_token_index)),
)
distrib

In [None]:
%%time
# Embed each command line (a distribution over tokens) with the token SVD vectors as token positions.
# max_distribution_size is set to the number of token vectors — presumably so that no
# distribution is truncated; TODO confirm against the vectorizers documentation.
cmdlines_hg = vz.WassersteinVectorizer(max_distribution_size=tokens_svd.shape[0]).fit_transform(distrib, vectors=tokens_svd)
cmdlines_hg.shape

In [None]:
%%time
# Project the Wasserstein embedding of the distinct command lines with UMAP (cosine metric).
datamap_hg = umap.UMAP(n_neighbors=100, metric="cosine", n_epochs=500, verbose=True).fit_transform(cmdlines_hg)
datamap_hg.shape

In [None]:
# Interactive map of the co-occurrence (Wasserstein) embedding: one point per distinct command
# line, labelled by process name.
# `i_unique` indexes rows of `cmdlines`, which may have fewer rows than `metadata_interesting`
# (null process_path rows are dropped), so the labels are looked up through `cmdlines.index`.
plot_hg = dmp.create_interactive_plot(
    datamap_hg,
    metadata_interesting.loc[cmdlines.index[i_unique], "process_name"],
    hover_text=hover_text,
    font_family="Roboto",
    title="ACME 3 processes",
    sub_title="Processes as bags of cooccurrence vectors of command line tokens",
    enable_search=True,
    darkmode=True,
)
plot_hg

---

In [None]:
# Load the magic_duckdb extension, which provides the %dql / %%dql magics used below.
%load_ext magic_duckdb

In [None]:
# NOTE(review): presumably points the %dql magics at the existing `db` connection so they see
# the views defined above — confirm the `-co` flag against the magic_duckdb documentation.
%dql -co db

In [None]:
%%dql
-- Number of distinct processes per host, over the unfiltered `process` view.
select hostname, count(distinct pid_hash)
from process
group by hostname

In [None]:
# Count the interesting processes started on each host on each day...
procs = db.execute(
    """
    select hostname, day, count(distinct pid_hash) as num
    from (
        select pid_hash, hostname, cast(datetrunc('day', process_started) as date) as day
        from process_interesting
        where process_started = first_seen
    )
    group by hostname, day
    order by hostname, day
    """
).df()
# ...then pivot to a host x day table, with 0 where a host started nothing on a given day.
procs_nona = procs.dropna(subset=["day"])
_in_window = procs_nona["day"] >= pd.Timestamp("2023-11-05")
procs_per_hostday = (
    procs_nona.loc[_in_window]
    .set_index(["hostname", "day"])
    .unstack("day")
    .fillna(0)
)
procs_per_hostday

---

In [None]:
%%dql
-- Inspect the columns of the image-load table.
describe process_image_load

In [None]:
%%dql
-- One row per image load performed by an "interesting" process; the inner join discards
-- loads whose process is not in process_interesting.
create or replace view image_load as
select pid_hash, process_image_load.hostname, process_image_load.first_seen as timestamp, process_interesting.process_name, process_image_load.filename
from process_image_load
inner join process_interesting using (pid_hash)

In [None]:
%%dql -o process_x_image
-- One row per process: the images it loaded, in load order, with the matching timestamps.
-- Both aggregates use the same ORDER BY so images[i] and timestamps[i] stay paired, and the
-- outer ORDER BY fixes the row order. Without them the order is unspecified, which makes the
-- order-sensitive steps that follow (token co-occurrence, "first 5 images" hover text) vary
-- from run to run.
select
    pid_hash,
    process_name,
    array_agg(filename order by timestamp, filename) as images,
    array_agg(timestamp order by timestamp, filename) as timestamps
from image_load
group by pid_hash, process_name
order by pid_hash, process_name

In [None]:
%%time
# Fit an n-gram vectorizer (default settings) on each process's list of loaded images.
vz_ngram = vz.NgramVectorizer().fit(process_x_image["images"])
vz_ngram

In [None]:
# Reuse the matrix built during fit instead of transforming the same data again.
# NOTE(review): `_train_matrix` is a private attribute of the vectorizer and may change between
# library versions; `fit_transform` in the previous cell would be the public route.
procdlls_vec = vz_ngram._train_matrix
procdlls_vec

In [None]:
%%time
# Re-weight the image columns with vectorizers' InformationWeightTransformer (default settings).
procdlls_iwt = vzt.InformationWeightTransformer().fit_transform(procdlls_vec)
procdlls_iwt

In [None]:
%%time
_lil = procdlls_iwt.tolil()
_, i_unique, i_deunique, procdlls_counts = np.unique(
    [
        [*md5(np.asarray(ind)).digest(), *md5(np.asarray(dat)).digest()]
        for ind, dat in tqdm(zip(_lil.rows, _lil.data), total=procdlls_iwt.shape[0])
    ],
    axis=0,
    return_index=True,
    return_inverse=True,
    return_counts=True
)
len(i_unique)

In [None]:
%%time
# UMAP projection of the distinct bag-of-DLL rows (Hellinger metric).
# Some rows can come back as NaN coordinates, hence the `i_good` filter computed afterwards.
procdlls_bow_map = umap.UMAP(
    n_neighbors=15,
    metric="hellinger",
    verbose=True
).fit_transform(procdlls_iwt[i_unique, :])
procdlls_bow_map

In [None]:
# Positions of the embedded points whose first coordinate is not NaN.
_has_coords = ~np.isnan(procdlls_bow_map[:, 0])
i_good = np.flatnonzero(_has_coords)
i_good

In [None]:
# Give every process name that appears among the plotted points its own "plasma" color:
# names are sorted, spread evenly over [0, 1), and passed through the colormap.
cmap = colormaps.get("plasma")
_plotted_names = process_x_image["process_name"].iloc[i_unique].iloc[i_good]
procdlls_labels = pd.DataFrame({"label": np.sort(_plotted_names.unique())})
_n_labels = len(procdlls_labels)
procdlls_labels["n"] = np.arange(_n_labels) / _n_labels
procdlls_labels["color"] = procdlls_labels["n"].map(cmap)
display(procdlls_labels)
label2color = procdlls_labels.set_index("label")["color"].to_dict()

In [None]:
def _dll_hover_line(count, name, imgs):
    """Format "(Nx) name: a.dll b.dll ... +K" from a duplicate count, process name and image paths."""
    prefix = f"({count}x) " if count > 1 else ""
    # Keep only the file name of each Windows path, and only the first five of them.
    shown = " ".join([p.split("\\")[-1] for p in imgs][:5])
    overflow = f" +{len(imgs) - 5}" if len(imgs) > 5 else ""
    return prefix + f"{name}: " + shown + overflow


# One hover line per plotted (distinct, non-NaN) bag of DLLs.
_plotted = process_x_image.iloc[i_unique].iloc[i_good][["process_name", "images"]]
hover_text = [
    _dll_hover_line(count, name, imgs)
    for count, (name, imgs) in zip(procdlls_counts[i_good], _plotted.itertuples(index=False))
]
pd.Series(hover_text)

In [None]:
# Interactive map of the bag-of-DLLs embedding: only the distinct rows with valid (non-NaN)
# coordinates are plotted, labelled by process name and colored through `label2color`.
plot_bow2 = dmp.create_interactive_plot(
    procdlls_bow_map[i_good, :],
    process_x_image["process_name"].iloc[i_unique].iloc[i_good],
    hover_text=hover_text,
    label_color_map=label2color,
    font_family="Roboto",
    title="ACME 3 processes",
    sub_title="Processes as bags of DLLs",
    enable_search=True,
    darkmode=True,
)
plot_bow2

In [None]:
%%time
# Fit co-occurrence statistics between images over every process's image list (default settings).
dll_cooc = vz.TokenCooccurrenceVectorizer().fit(process_x_image["images"])
dll_cooc.cooccurrences_

In [None]:
%%time
# Compress each image's co-occurrence row into a dense 512-component vector.
# No random_state is passed, so the result may differ slightly between runs.
dll_svd = TruncatedSVD(n_components=512).fit_transform(dll_cooc.cooccurrences_)
dll_svd.shape

In [None]:
%%time
coo_distrib = np.array([
    (row, dll_cooc.token_label_dictionary_[token], count)
    for row, images in enumerate(process_x_image["images"])
    for token, count in zip(*np.unique(images, return_counts=True))
])
distrib = sp.coo_matrix((coo_distrib[:, 2], (coo_distrib[:, 0], coo_distrib[:, 1])))
distrib

In [None]:
%%time
# Embed each process (a distribution over images) with the image SVD vectors as token positions.
# Unlike the command-line section, max_distribution_size is left at its default here.
procdlls_hg = vz.WassersteinVectorizer().fit_transform(distrib, vectors=dll_svd)
procdlls_hg.shape

In [None]:
%%time
# Project the Wasserstein embedding of every process (not deduplicated) with UMAP, cosine metric.
procdlls_hg_map = umap.UMAP(metric="cosine", verbose=True).fit_transform(procdlls_hg)

In [None]:
def _process_hover_line(name, imgs):
    """Format "name: a.dll b.dll ... +K" from a process name and its image paths."""
    # Keep only the file name of each Windows path, and only the first five of them.
    shown = " ".join([p.split("\\")[-1] for p in imgs][:5])
    overflow = f" +{len(imgs) - 5}" if len(imgs) > 5 else ""
    return f"{name}: " + shown + overflow


# One hover line per process (no deduplication in this view).
_name_and_images = process_x_image[["process_name", "images"]]
hover_text = [
    _process_hover_line(name, imgs)
    for name, imgs in _name_and_images.itertuples(index=False)
]
pd.Series(hover_text)

In [None]:
# Interactive map of the DLL co-occurrence (Wasserstein) embedding: one point per process,
# labelled by process name.
plot_hg2 = dmp.create_interactive_plot(
    procdlls_hg_map,
    process_x_image["process_name"],
    hover_text=hover_text,
    font_family="Roboto",
    title="ACME 3 processes",
    sub_title="Processes as bags of DLL cooccurrence vectors",
    enable_search=True,
    darkmode=True,
)
plot_hg2