All text annotations are temporary, and for guiding John.

In [None]:
import bokeh.io
from collections import defaultdict
from dask import delayed
from dask.distributed import LocalCluster, Client, as_completed
import gzip
from hashlib import md5
import json
import numpy as np
import os
import pandas as pd
import panel as pn
from pathlib import Path
import scipy.sparse as ss
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
import struct
import sys
import thisnotthat as tnt
from tqdm.auto import tqdm
import umap
import vectorizers as vz
import vectorizers.transformers as vzt
from vectorizers.transformers import CategoricalColumnTransformer
import zstandard as zstd

In [None]:
# Route Bokeh output to the notebook and load the Panel notebook extension.
bokeh.io.output_notebook()
pn.extension()

Dask makes things go zzzzzoom

In [None]:
# Start a local Dask cluster whose workers each run a single thread,
# and connect a client to it.
cluster = LocalCluster(threads_per_worker=1)
client = Client(cluster)
# Last expression of the cell: displays the client in the notebook.
client

We will work on host 501, processing all days. One can also choose days between 18 and 25.

In [None]:
# Host number under study.
HOST = 501
# Day sub-directories to process; each entry is used as a glob pattern,
# so "*" matches every available day.
DAYS = ["*"]
# Host name with the host number zero-padded to 4 digits.
HOSTNAME = f"SysClient{HOST:04d}.systemia.com"

The data lives as Zstd-compressed JSON-lines chunks. The following data engineering goes much less deep into token generation than the work we have presented so far, so as to put the emphasis on the vectorization (not the data engineering).

In [None]:
# Feature schemas, keyed by the event's "object" type, consumed by extract_features.
# Each feature is either a plain field name (used as both the event field and
# the token kind) or a (field, kind) pair.
# - A list value is applied with "actorID" as the field holding the process identifier.
# - A dict value (PROCESS) is keyed by the event's "action", then by the event
#   field that holds the process identifier; each such field yields its own row.
schemas = {
    "FLOW": ["object", "action", ("src_ip", "ip"), ("dest_ip", "ip"), ("src_port", "port"), ("dest_port", "port"), "l4protocol", "direction"],
    "FILE": ["object", "action", ("file_path", "path"), "info_class", ("new_path", "path")],
    "HOST": ["object", "action"],
    "MODULE": ["object", "action", ("module_path", "path")],
    "REGISTRY": ["object", "action", ("key", "registry-key"), ("value", "registry-value"), ("type", "registry-type")],
    "SERVICE": ["object", "action", ("name", "service-name")],
    "SHELL": ["object", "action"],
    "TASK": ["object", "action", "path", ("task_name", "task-name")],
    "THREAD": ["object", "action"],
    "USER_SESSION": ["object", "action", ("user", "user-domain"), ("requesting_domain", "domain"), ("requesting_user", "user"), ("src_ip", "ip"), ("src_port", "port")],
    "PROCESS": {
        "CREATE": {
            "actorID": ["object", "action", ("image_path", "child"), ("image_path", "path")],
            "objectID": [("parent_image_path", "parent"), ("image_path", "process"), ("user", "user-domain")]
        },
        "OPEN": {
            "actorID": ["object", "action"]
        },
        "TERMINATE": {
            "actorID": ["object", "action"]
        }
    }
}

In [None]:
def iter_events(path_chunk):
    """Yield the events stored in a Zstd-compressed JSON-lines chunk.

    Parameters
    ----------
    path_chunk:
        Path of the compressed chunk; it is opened in UTF-8 text mode.

    Yields
    ------
    The decoded JSON value of every line that parses; lines that are not
    valid JSON are silently skipped.
    """
    with zstd.open(path_chunk, mode="rt", encoding="utf-8") as stream:
        for raw_line in stream:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Ill-formed record: ignore it and move on to the next line.
                continue
            yield record

In [None]:
def extract_features(path_chunk):
    """Yield one (timestamp, process identifier, tokens) triple per schema entry of each event.

    Events whose "object" type has no entry in ``schemas`` are skipped.
    For the others, every (identifier field, feature list) pair of the
    applicable schema produces a triple:

    * the event "timestamp" parsed as a ``pd.Timestamp``;
    * the value of the identifier field of the event;
    * the list of ``(kind, value)`` tokens for the features whose event
      value is present and truthy.
    """
    for event in iter_events(path_chunk):
        object_type = event["object"]
        if object_type not in schemas:
            continue

        entry = schemas[object_type]
        if isinstance(entry, dict):
            # Action-specific schema; unknown actions yield nothing.
            by_identifier = entry.get(event["action"], {})
        else:
            # Flat schema: features are attributed to the acting process.
            by_identifier = {"actorID": entry}

        for identifier, features in by_identifier.items():
            tokens = []
            for feature in features:
                if isinstance(feature, str):
                    field = kind = feature
                else:
                    field, kind = feature
                value = event.get(field, "")
                if value:
                    tokens.append((kind, value))
            yield (pd.Timestamp(event["timestamp"]), event[identifier], tokens)

Given a data chunk, we return a data frame where its categorical tokens are already in the list form suitable for one-hot vectorization.

In [None]:
def tabulate_features(path_chunk):
    """Return the features of a chunk as a data frame.

    The frame has columns "timestamp", "process_id" and "tokens", one row
    per triple produced by ``extract_features``; "process_id" is stored
    with the categorical dtype.
    """
    column_names = ["timestamp", "process_id", "tokens"]
    table = pd.DataFrame(data=extract_features(path_chunk), columns=column_names)
    return table.astype({"process_id": "category"})

Where are my data chunks?

In [None]:
# Directory holding the data of the selected host.
ROOT_HOSTNAME = Path("/data/optc/scipy2023") / HOSTNAME
# Sorted paths of all chunk files found under the selected day patterns.
CHUNKS = sorted(
    path_chunk
    for day in DAYS
    for path_chunk in ROOT_HOSTNAME.glob(f"{day}/optc-eng.*.json.zstd")
)
len(CHUNKS)

The vectorization gambit is to do it by chunks, and combine the resulting sparse matrices afterwards, using nifty NgramVectorizer addition.

In [None]:
def vectorize_features(path_chunk):
    """Return an ``NgramVectorizer`` fitted on the token lists of one chunk."""
    vectorizer = vz.NgramVectorizer()
    token_lists = tabulate_features(path_chunk)["tokens"]
    return vectorizer.fit(token_lists)

In [None]:
%%time
# Tree reduction: level 0 holds one delayed fitted vectorizer per chunk; each
# further level holds the sums of adjacent pairs of the level before it, until
# a single summand remains.
summands = [[delayed(vectorize_features)(chunk) for chunk in CHUNKS]]
while len(summands[-1]) > 1:
    to_sum = summands[-1]
    sums = [left + right for left, right in zip(to_sum[0::2], to_sum[1::2])]
    if len(to_sum) % 2 == 1:
        # Odd one out: carried over unchanged to the next level.
        sums.append(to_sum[-1])
    summands.append(sums)

# Submit every level at once; the loop only drives the progress bar.
futs = client.compute(sum(summands, []))
for fut in tqdm(as_completed(futs), total=sum(len(level) for level in summands)):
    pass

# The last future is the single summand of the final level.
vzr_all = futs[-1].result()
event_matrix = vzr_all._train_matrix
event_matrix

Let's now group events by process.

In [None]:
def summarize_processes(metadata):
    """Collapse a frame of per-event metadata into one row per process.

    Rows are grouped by "process_id"; "timestamp" is reduced to its
    minimum and every other column is summed.
    """
    grouping_columns = {"timestamp", "process_id"}
    aggregations = {"timestamp": "min"}
    for column in metadata.columns:
        if column not in grouping_columns:
            aggregations[column] = "sum"
    grouped = metadata.groupby("process_id", as_index=False)
    return grouped.agg(aggregations)

In [None]:
def events_by_process(path_chunk):
    """Summarize the events of a chunk per process.

    Each event row contributes a 1.0 in the column named after each of its
    "object" token values, plus its row index (wrapped in a list) in the
    "event_index" column.  The rows are then aggregated per process with
    ``summarize_processes``.
    """
    features = tabulate_features(path_chunk)

    def object_indicators(tokens):
        return {value: 1.0 for kind, value in tokens if kind == "object"}

    indicators = pd.DataFrame(
        data=iter(features["tokens"].apply(object_indicators))
    )
    metadata = features[["timestamp", "process_id"]].join(indicators, how="inner")
    # One-element lists, so that summing them per process concatenates the indices.
    metadata["event_index"] = pd.Series(metadata.index).apply(lambda x: [x])
    return summarize_processes(metadata)

In [None]:
%%time
# Try the per-process summary on the first chunk only.
events_by_process(CHUNKS[0])

In [None]:
%%time
# process2ievent maps each process ID to the indices of its events, shifted by
# the number of events seen in the chunks processed before.
process2ievent = {}
total_events = 0
metadata_processes = pd.DataFrame()
for fut in tqdm(client.map(events_by_process, CHUNKS), total=len(CHUNKS)):
    processes = fut.result()

    # Fold the chunk's per-process summary into the running summary.
    stacked = pd.concat(
        [metadata_processes, processes.drop(columns=["event_index"])],
        ignore_index=True,
    )
    metadata_processes = summarize_processes(stacked.fillna(0.0))

    for process_id, indices in processes[["process_id", "event_index"]].itertuples(index=False):
        shifted = process2ievent.setdefault(process_id, [])
        for index_row_chunk in indices:
            shifted.append(index_row_chunk + total_events)

    total_events += processes["event_index"].apply(len).sum()

len(process2ievent)

In [None]:
# Display the per-process summary accumulated over all chunks.
metadata_processes

That's a *lot* of processes. Let's prune off those for which we don't have enough features (by weight) to reliably describe their behaviour.

In [None]:
# Row sums of the event matrix: total feature weight of each event.
features_per_event = np.array(event_matrix.sum(axis=1)).squeeze()
# Total feature weight of each process: sum over the events indexed for it.
features_per_process = pd.Series({process_id: sum([features_per_event[i] for i in indices]) for process_id, indices in tqdm(process2ievent.items())})
features_per_process

Distribution of number of features per process:

In [None]:
# Histogram of log10(total feature weight), with one bin per decade from 10^-1 to 10^5.
features_per_process.apply(np.log10).hist(bins=range(-1, 6))

It does not make much sense to me to keep processes described by a total number of categorical features less than 10. So let's drop the guys from the first column.

We will do that while also putting together process vectors by summing event vectors.
This means a linear combination of the rows of the event matrix.
The fastest way of achieving that is by computing a projection matrix that we will multiply on the left of the event matrix.

In [None]:
%%time
# Coordinates of the ones of the projection matrix: row = retained process,
# column = event belonging to that process.
irows = []
icols = []
process2irow = {}
irow2process = {}
irow_next = 0
for process_id, indices in tqdm(process2ievent.items()):
    if features_per_process.loc[process_id] < 10:
        # Too little feature weight to describe this process: prune it.
        continue
    irow = irow_next
    irow_next += 1
    irows.extend([irow] * len(indices))
    icols.extend(indices)
    process2irow[process_id] = irow
    irow2process[irow] = process_id

ones = np.ones((len(irows),), dtype=np.int32)
shape_projection = (len(process2irow), event_matrix.shape[0])
projection = ss.coo_matrix((ones, (irows, icols)), shape=shape_projection).tocsr()
# Every event must be attributed to at most one retained process.
assert set(np.array(projection.sum(axis=0)).squeeze()) <= {0, 1}
projection

In [None]:
# Each process row is the sum of the event rows selected by the projection.
process_matrix = (projection @ event_matrix).astype(np.float32)
process_matrix

In [None]:
# (row index, process ID) pairs of the retained processes, ordered by row index.
pruned = sorted(list(irow2process.items()))
# Metadata of the retained processes, in the row order of the process matrix.
metadata_pruned = metadata_processes.set_index("process_id").loc[[process_id for _, process_id in pruned]].copy().reset_index()
# The positional index of the metadata must coincide with the matrix row indices.
assert pd.Series(metadata_pruned.index).equals(pd.Series([i for i, _ in pruned]))
metadata_pruned

The categories (_labels_) for our process instances are either the command line by which they were started, or when we can't find that, their related image path.
The former can only be found in `PROCESS-CREATE` events.
The latter is a field common to all events, and its value should be shared by all events generated by any given process instance.

The way we associate labels to process instances is thus to extract the best label we can from every event.
We then tabulate these in association with their process ID, and use an *importance* ordinal to denote which label should take precedence.
We sort this table by importance, and drop process ID duplicates: what remains is the best guess we can make as to the label of every process.

In [None]:
def filter_labels(proposals):
    """Keep the best label proposal of every process.

    Parameters
    ----------
    proposals:
        Data frame with at least the columns "process_id" and "importance";
        a lower importance value takes precedence.

    Returns
    -------
    A data frame with one row per process ID (its lowest-importance
    proposal) and a fresh 0..n-1 index.  Among proposals of equal
    importance, the one appearing first in ``proposals`` is kept.
    """
    # A stable sort is required for the tie-break to be deterministic: the
    # default algorithm of sort_values may reorder rows of equal importance.
    ranked = proposals.sort_values("importance", ascending=True, kind="mergesort")
    return ranked.drop_duplicates(subset=["process_id"], keep="first", ignore_index=True)

In [None]:
def label_processes(path_chunk):
    """Return the best label proposal found in a chunk for each process.

    Proposals are (process ID, importance, label) rows:

    * PROCESS-CREATE events propose, for the created process ("objectID"),
      the command line (importance 0) or, failing that, the image path
      (importance 10); they also propose the parent image path
      (importance 10) for the creating process ("actorID").
    * Any other event proposes its image path (importance 10) for the
      acting process ("actorID").

    The proposals are reduced with ``filter_labels``.
    """
    proposals = []
    for event in iter_events(path_chunk):
        is_creation = event["object"] == "PROCESS" and event["action"] == "CREATE"
        if not is_creation:
            image_path = event.get("image_path", "")
            if image_path:
                proposals.append((event["actorID"], 10, image_path))
            continue

        command_line = event.get("command_line", "")
        if command_line:
            proposals.append((event["objectID"], 0, command_line))
        else:
            image_path = event.get("image_path", "")
            if image_path:
                proposals.append((event["objectID"], 10, image_path))

        parent_image_path = event.get("parent_image_path", "")
        if parent_image_path:
            proposals.append((event["actorID"], 10, parent_image_path))

    table = pd.DataFrame(data=proposals, columns=["process_id", "importance", "label"])
    return filter_labels(table)

We then run this filtering iteratively across best proposals from every chunk, and come out the other end with every process instance labeled... or nearly.

In [None]:
# Merge the per-chunk best proposals into a running table of best labels.
labels_known = pd.DataFrame()
for fut in tqdm(client.map(label_processes, CHUNKS), total=len(CHUNKS)):
    labels_known = filter_labels(pd.concat([labels_known, fut.result()], ignore_index=True))
labels_known

Any process missing a label, now, we just consider we **don't know** what they are about.
Let's bin these together.

In [None]:
# One row per retained process; the left merge keeps processes without a known
# label, whose missing values are then replaced with "(unknown)".
labels = pd.Series(irow2process, name="process_id").to_frame().merge(labels_known[["process_id", "label"]], on="process_id", how="left").fillna("(unknown)")
labels

Now, not all features are _useful_ for characterizing the process instances.
*Orphan features* are too few for their sharing to denote similarity between more than a very small group of processes.
*Spurious features* are too often associated to processes to help differentiate between them (like stop words).
So a quick thresholding might help compress our very large feature space.

In [None]:
# Column sums of the process matrix: total weight of each feature over the retained processes.
feature_importance = pd.Series(np.array(process_matrix.sum(axis=0)).squeeze())
# Number of features left with zero total weight.
sum(feature_importance == 0)

So, the pruning of the set of processes already leaves 41 features completely useless.

In [None]:
# Histogram of log10(total weight) over the features with non-zero weight, one bin per decade.
feature_importance.loc[feature_importance > 0].apply(np.log10).hist(bins=[-2,-1,0,1,2,3,4,5])

Most features, by a large factor, are orphans; we seem not to have any spurious feature, as none is associated to more than 10000 process instances.

Let's take a more detailed look at the first column of the previous histogram.

In [None]:
# Histogram of the features of total weight below 10, with unit-wide bins
# centred on the integers 0..9 (edges -0.5, 0.5, ..., 9.5).  The previous
# np.linspace(0, 10, 10) produced only 10 edges spaced 10/9 apart, so the
# bins did not line up with the integer weights.
feature_importance.loc[feature_importance < 10].hist(bins=np.arange(11) - 0.5)

Again, most of these rarely used features are literal orphans: associated to one or two processes.
Let's cut off any that's not tied to at least 3 processes.

In [None]:
%%time
# Keep the features whose total weight (column sum of the process matrix) is
# at least 3, and renumber them compactly.
# - col2token: new column index -> token
# - token2col: token -> new column index
# - indices_keep: original column indices that are retained, in order
col2token = []
token2col = {}
indices_keep = []
for i, count in enumerate(feature_importance):
    # ">= 3" implements the stated cut-off ("at least 3"); the former "> 3"
    # also discarded the features of weight exactly 3.
    if count >= 3:
        indices_keep.append(i)
        token = vzr_all.column_index_dictionary_[i]
        index_new = len(col2token)
        col2token.append(token)
        token2col[token] = index_new

reduced_matrix = process_matrix[:, indices_keep].copy()
reduced_matrix

Ok, has this feature space reduction killed the representation of processes?
I'm hoping the total feature weight for any process is at least 5 (e.g. 5 tokens associated to it across all events that characterize it).

In [None]:
# Total feature weight left to each process after the feature pruning.
features_per_process_redux = np.array(reduced_matrix.sum(axis=1)).squeeze()
# Every process must retain a total weight of at least 5; ">=" matches that
# requirement (the former ">" rejected a weight of exactly 5).
assert np.min(features_per_process_redux) >= 5.0

Now, it's always easier to compute the compressed vector representation on the subset of unique process vectors.

In [None]:
%%time

def md5_list(it):
    """Return the MD5 digest of ``np.array(it)``'s buffer as two unsigned 64-bit integers.

    The 16 digest bytes are unpacked as a pair of little-endian ``Q`` values.
    """
    buffer = memoryview(np.array(it))
    digest = md5(buffer).digest()
    return struct.unpack("<QQ", digest)

# Fingerprint every process row with four 64-bit words: the MD5 of its column
# indices followed by the MD5 of its values (LIL format exposes both as lists).
reduced_lil = reduced_matrix.tolil()
hh = np.zeros(shape=(reduced_matrix.shape[0], 4), dtype=np.uint64)
for i, indices_values in enumerate(zip(reduced_lil.rows, reduced_lil.data)):
    # Concatenate the two 2-tuples into one 4-tuple.
    hh[i, :] = sum((md5_list(it) for it in indices_values), ())
# Rows with equal fingerprints are treated as duplicates of one another.
# index_u: first row of each unique fingerprint; inverse_u: for every row, the
# position of its fingerprint among the unique ones; counts_u: multiplicities.
_, index_u, inverse_u, counts_u = np.unique(hh, axis=0, return_index=True, return_inverse=True, return_counts=True)
index_u.shape, inverse_u.shape

In [None]:
# One representative row per unique fingerprint.
unique_matrix = reduced_matrix[index_u, :]
unique_matrix

I have tried running the information weight transform on the matrix of unique process vectors,
but the result seems to confuse UMAP **a lot**.
UMAP would crash on that matrix by putting way too many vectors under one particular leaf of the RP tree:
Leland mused that the hyperplanes used to spread the vectors between the RP trees were doing a poor job.
I didn't have the time to truly debug this, so I moved on with directly compressing the matrix of unique process vectors.

The protomap only contains the unique vectors.

In [None]:
%%time
# Embed the L1-normalized unique process vectors in 2 dimensions with UMAP
# (cosine metric, densMAP enabled).
process_protomap = umap.UMAP(n_components=2, metric="cosine", densmap=True, dens_lambda=4, n_epochs=800, verbose=True).fit_transform(Normalizer(norm="l1").fit_transform(unique_matrix))
process_protomap

The full map is the protomap reduplicated.

In [None]:
# Give every process the coordinates of the unique vector it maps to.
process_map = process_protomap[inverse_u, :]
process_map.shape

The following will visualize a map of all process instances, where we color the most frequent process classes (top 12).

In [None]:
# The 12 labels carried by the largest number of processes; after the
# aggregation the "process_id" column holds that number.
processes_top12 = labels.groupby("label", as_index=False).agg({"process_id": "count"}).sort_values("process_id", ascending=False).head(12)
processes_top12

### Build some custom summarizers
ThisNotThat hasn't yet integrated a few of the data summarization views that we'd like to use to explore our data.  As such we'll build them ourselves for the moment.  Once they prove generally useful we'll contribute them back to the ThisNotThat project via a pull request. That will allow our work to be used by both ourselves and others in the future.

In [None]:
# Scratch check: zip(*pairs) transposes a list of pairs into a tuple of firsts
# and a tuple of seconds, here [(0, 1, 3), (3, 2, 4)].
list(zip(*[(0, 3), (1, 2), (3, 4)]))

In [None]:
class SparseSupportSummarizer:
    """
    Summarizer for a DataSummaryPane.
    This takes a sparse matrix of counts or importances.  Then for any selection of data it computes the
    column marginals of that matrix and finds the columns with the largest marginals.

    It returns a DataFrame with the top max_features features along with their column marginals and support.

    Parameters
    ----------

    matrix: a sparse matrix
        This is the matrix which we will use for computing the marginals
    column_index_dictionary: dict
         A dictionary mapping from column indices to (kind, value) pairs describing the column
    max_features: int <default: 10>
        The number of features to return
    proportional_support: bool <default: True>
        Should the support be normalized by the number of selected rows (True) or left as a raw count (False)
    """
    def __init__(
        self,
        matrix,
        column_index_dictionary,
        max_features= 10,
        proportional_support = True
    ):
        self.matrix = matrix
        self.column_index_dictionary = column_index_dictionary
        self.max_features = max_features
        self.proportional_support = proportional_support

    def summarize(self, selected):
        """
        Summarize the rows of the matrix designated by ``selected``.

        Parameters
        ----------

        selected: sequence of int
            Indices of the selected rows.

        Returns
        -------

        A DataFrame with one row per reported feature, heaviest first, and the columns
        'Kind' and 'Value' (from column_index_dictionary), 'Total weight' (column marginal
        over the selection) and 'support' (number, or proportion, of selected rows in which
        the feature is positive).  The frame is empty when nothing is selected.
        """
        columns = ['Kind', 'Value', 'Total weight', 'support']
        selected = np.asarray(selected)
        if selected.size == 0:
            return pd.DataFrame(columns=columns)

        # Use the selection handed to us, not the state of some global plot.
        data = self.matrix[selected, :]
        if data.shape[0] == 0:
            return pd.DataFrame(columns=columns)

        # ravel() keeps the marginals 1-D even for a single-column matrix.
        column_marginal = np.asarray(data.sum(axis=0)).ravel()
        largest_indices = np.argsort(column_marginal)[::-1][:self.max_features]
        features = [self.column_index_dictionary[x] for x in largest_indices]
        kinds = [kind for kind, _ in features]
        values = [value for _, value in features]
        importance = column_marginal[largest_indices]

        # Support must be read at the same columns as the reported features;
        # sorting it on its own would pair features with unrelated supports.
        support = np.asarray((data > 0).sum(axis=0)).ravel()[largest_indices]
        if self.proportional_support:
            support = support / data.shape[0]
        return pd.DataFrame({'Kind': kinds, 'Value': values, 'Total weight': importance, 'support': support})

The current [FeatureImportanceSummarizer](https://thisnotthat.readthedocs.io/en/latest/plotsummarypane_feature_importance.html) doesn't take sparse matrices as input.  As such we grabbed the class and modified it to suit our needs.  Again we'll integrate this back into the library later if it is generally useful.

In [None]:
import bokeh.plotting as bpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
class SparseFeatureImportanceSummarizer:
    """
    Summarizer for the PlotSummaryPane that constructs a class balanced, L1 penalized,
    logistic regression between the selected points and the remaining data.

    This version takes a sparse feature matrix and column_index_dictionary which maps from the
    indices of the matrix to the set of feature names.

    Then it displays that feature importance in a bar plot.
    The title is colour coded by model accuracy in order to give a rough approximation of
    how much trust you should put in the model.

    All of the standard caveats with using the coefficients of a linear model as a feature
    importance measure should be included here.

    It might be worth reading the sklearn documentation on the
    common pitfalls in the interpretation of coefficients of linear models
    (https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html)

    Parameters
    ----------

    data: sparse_matrix
        A sparse_matrix corresponding to the plot points.
    column_index_dictionary: dict
        A dictionary mapping from column indices to (kind, value) pairs describing the column
    max_features: int <default: 15>
        The maximum number of features to display the importance for.
    tol_importance_relative: float <default: 0.01>
        The minimum absolute coefficient, relative to the largest absolute coefficient,
        for a feature to be considered important.

    """

    def __init__(
        self,
        data,
        column_index_dictionary,
        max_features: int = 15,
        tol_importance_relative: float = 0.01,
    ):

        self.data = data  # Indexed 0 to length.
        self.max_features = max_features
        self.tol_importance_relative = tol_importance_relative
        self._features = column_index_dictionary
        # Classifier and class vector of the latest call to summarize().
        self._classifier = None
        self._classes = None

    def summarize(self, selected, width: int = 600, height: int = 600):
        """
        Fit the selected-versus-rest classifier and plot its largest coefficients.

        Parameters
        ----------

        selected: sequence of int
            Indices of the selected rows of the data matrix.
        width: int <default: 600>
            Width of the figure.
        height: int <default: 600>
            Height of the figure.

        Returns
        -------

        A Bokeh figure with one horizontal bar per important feature, titled with the
        mean accuracy of the classifier on the data it was fitted on.
        """
        # Class 1 for the selected rows, class 0 for all others.
        classes = np.zeros((self.data.shape[0],), dtype="int32")
        classes[selected] = 1
        classifier = LogisticRegression(
            penalty="l1", solver="liblinear", class_weight="balanced"
        ).fit(self.data, classes)
        self._classifier = classifier
        self._classes = classes
        assert classifier.coef_.shape[0] == 1 or classifier.coef_.ndim == 1
        # ravel() keeps the coefficients 1-D even when there is a single feature.
        importance = np.ravel(classifier.coef_)

        # Candidate features, by decreasing absolute coefficient.
        index_importance = np.argsort(-np.abs(importance))[: self.max_features]
        importance_abs = np.abs(importance)[index_importance]
        peak = importance_abs.max() if importance_abs.size else 0.0
        if peak > 0:
            # The magnitudes are sorted, so the candidates passing the
            # relative threshold form a prefix: counting them is enough.
            n_important = int(
                np.count_nonzero(importance_abs / peak > self.tol_importance_relative)
            )
        else:
            # All coefficients are zero: nothing to show (and no 0/0 division).
            n_important = 0
        index_important = index_importance[:n_important]

        selected_columns_tuples = [self._features[x] for x in index_important]
        selected_columns = [f"{kind}: {value}" for kind, value in selected_columns_tuples]

        model_acc = classifier.score(self.data, classes)
        fig = bpl.figure(
            y_range=selected_columns,
            width=width,
            height=height,
        )
        if model_acc > 0.9:
            fig.title = f"Estimated Feature Importance\nTrustworthiness high ({model_acc:.4} mean accuracy)"
            fig.title.text_color = "green"
        elif model_acc > 0.8:
            fig.title = f"Estimated Feature Importance\nTrustworthiness medium ({model_acc:.4} mean accuracy)"
            fig.title.text_color = "yellow"
        elif model_acc > 0.5:
            fig.title = f"Estimated Feature Importance\nTrustworthiness low ({model_acc:.4} mean accuracy)"
            fig.title.text_color = "orange"
        else:
            fig.title = f"Estimated Feature Importance\nTrustworthiness low ({model_acc:.4} mean accuracy)"
            fig.title.text_color = "red"

        fig.hbar(
            y=selected_columns,
            right=importance[index_important],
            height=0.8,
        )
        # Label the Bokeh axis itself: the former plt.xlabel() call acted on
        # matplotlib and never reached this figure.  The text describes what is
        # plotted here, i.e. the raw coefficients (no std-dev scaling is applied).
        fig.xaxis.axis_label = "Logistic regression coefficient"
        return fig

Explore the most frequent labels to find interesting ways to subset our data for exploration.

In [None]:
# Display the 12 most frequent labels.
list(processes_top12["label"])

Perhaps we only want to study particular labels

In [None]:
# Rows of the processes carrying one specific label, and their map coordinates.
indices_of_interest = labels.loc[labels["label"] == 'C:\\Windows\\SYSTEM32\\cmd.exe /c "C:\\ncr\\DeleteArchiveSecurity.bat"'].index
process_map_study = process_map[indices_of_interest, :]
process_map_study.shape

### Let's perform a more detailed exploration on the 12 most frequent labels

In [None]:
# Copy of the labels in which every label outside the top 12 is replaced with "(other)".
labels_top12_ = set(processes_top12["label"])
labels_top12 = labels[["process_id"]].copy()
labels_top12["label"] = labels["label"].apply(lambda lb: lb if lb in labels_top12_ else "(other)")
labels_top12

In [None]:
# Keep only the processes carrying one of the top-12 labels.
# NOTE: this rebinds indices_of_interest, which the following cells rely on.
labels_top12_only = labels_top12.loc[labels_top12["label"] != "(other)"]
indices_of_interest = labels_top12_only.index
process_map_top12_only = process_map[indices_of_interest, :]

Build a few matrices and column dictionary for helping us summarize our various selections of data.

In [None]:
# Column index -> token value (the second element of each (kind, value) token).
column_index_dictionary = {x:y[1] for x,y in enumerate(col2token)}
# Feature rows of the processes of interest.
sparse_features_top12_only = reduced_matrix[indices_of_interest,:]
# The same rows after the information weight transform, cast to float32.
infoweight_matrix = vzt.InformationWeightTransformer().fit_transform(reduced_matrix[indices_of_interest,:]).astype(np.float32)

Subselect the rows that we are interested and fold together the various bits of metadata associated with these processes.

In [None]:
# Per-process summary built from the stringified metadata columns from the third one onwards.
simple_process_summary = CategoricalColumnTransformer(object_column_name='process_id', descriptor_column_name=list(metadata_pruned.columns[2:]), include_column_name=True).fit_transform(metadata_pruned.astype('str'))
simple_process_summary = simple_process_summary.reset_index()
simple_process_summary.columns = ['process_id', 'event_summary']
# Metadata rows of the processes of interest, selected by position.
metadata_pruned_top12 = metadata_pruned.iloc[indices_of_interest]
# Both merges rely on the columns the frames have in common (no explicit key is given).
metadata_pruned_top12 = pd.merge(metadata_pruned_top12, simple_process_summary, how='left')
metadata_pruned_top12 = pd.merge(metadata_pruned_top12, labels_top12_only, how='left')
# HTML-friendly rendering of the event summary, one entry per line.
metadata_pruned_top12['event_summary_string'] = ["<br>".join(x) for x in metadata_pruned_top12.event_summary]
# Constant column, later passed as count_column to the time series summarizer.
metadata_pruned_top12['freq'] = 1
metadata_pruned_top12.head(2)

Construct a custom data map explorer that is tailored to our data for helping us both explore and label processes.

### Build ThisNotThat dashboard
This code constructs an interactive dashboard for visualizing and exploring our selected HBS data of interest. It depends on a few objects to better summarize our embedding.  I'll include a list of the objects that are used below in case you'd like to apply a similar dashboard to your data.

* ```process_map_top12_only```: This is an n by 2 numpy array that corresponds to x,y coordinates of our data.  In this case it is generated by UMAP
* ```labels_top12_only``` is a pandas data frame with n rows.  It has a label column with a label per data point that we'll use for hovering over and passing to a ValueCountsSummarizer for displaying what points have been selected.
* ```metadata_pruned_top12``` is a pandas data frame with n rows and all the various bits of metadata we might want to use to summarize our processes.
* ```metadata_pruned.iloc[indices_of_interest]``` is the subset of the rows of ```metadata_pruned``` that aligns with our ```process_map_top12_only```.
* ```sparse_features_top12_only``` is a scipy sparse matrix with one row corresponding to each of our processes and one column associated with each of our tokens.
* ```column_index_dictionary``` is a dictionary mapping between integers representing column ids and strings representing the feature associated with each column.

Build a label annotation layer.  This particular annotation layer requires a high dimensional dense process embedding, which we will generate via an InformationWeightTransformer applied to our Ngram matrix, followed by a TruncatedSVD to find a 1024 dimensional dense representation.  It also requires our data map (```process_map_top12_only```) and our sparse representation (```sparse_features_top12_only```) along with its column labels (```trimmed_path_dict```).  We'll use the trimmed paths of our features to avoid cluttering our data map.

In [None]:
# Column index -> (kind, value), where the value is cut down to the part
# following its last backslash.
trimmed_path_dict = {i:(kind, value.split("\\")[-1]) for i, (kind, value) in enumerate(col2token)}

In [None]:
%%time
# Dense 1024-component representation of the information-weighted features.
dense_vectors = TruncatedSVD(n_components=1024).fit_transform(infoweight_matrix)
# Label layers built from the dense vectors, the map coordinates, the sparse
# features and the trimmed value of each column.
label_layer = tnt.SparseMetadataLabelLayers(dense_vectors, process_map_top12_only, sparse_features_top12_only, {i: value for i, (_, value) in trimmed_path_dict.items()}, cluster_map_representation=False, random_state=42)

Now we'll construct a markdown template which will leverage fields in our metadata data frame (```metadata_pruned_top12```) to give a detailed description of our process.  More advanced users might want to include a hyperlink to a more powerful process exploration tool or to simply include the entire event list in this markdown description.

In [None]:
# Markdown template of the information pane; the placeholders are named after
# columns of metadata_pruned_top12.
template = """
# {label}

## {process_id}

---

{event_summary_string}

"""

Finally we will build our basic plot and its various exploratory widgets to help us analyse and label our data.

In [None]:
# Main map of the processes of interest, with a label editor and cluster labels.
plot = tnt.BokehPlotPane(process_map_top12_only, labels=labels_top12_only["label"], width=800, height=800, show_legend=False, tools="pan,wheel_zoom,lasso_select,tap,reset")
editor = tnt.LabelEditorWidget(plot.labels, selectable_legend=True)
editor.link_to_plot(plot)
plot.add_cluster_labels(label_layer, max_text_size=24)

# Markdown description of a process, rendered from the template.
info_pane = tnt.InformationPane(metadata_pruned_top12, markdown_template=template, width=600)
info_pane.link_to_plot(plot)

# Counts of the labels.
value_summarizer = tnt.summary.dataframe.ValueCountsSummarizer(labels_top12_only["label"])
value_summary_plot = tnt.summary.dataframe.DataSummaryPane(value_summarizer)
value_summary_plot.link_to_plot(plot)

# NOTE(review): this sets 'freq' on metadata_pruned, whereas the summarizer
# below reads metadata_pruned_top12, which already has its own 'freq' column;
# the assignment looks redundant -- confirm before removing.
metadata_pruned['freq'] = 1
time_summarizer = tnt.summary.plot.TimeSeriesSummarizer(metadata_pruned_top12, time_column='timestamp', count_column='freq')
time_summary_plot = tnt.summary.plot.PlotSummaryPane(time_summarizer)
time_summary_plot.link_to_plot(plot)

# Plot controls driven by the per-object-type event counts.
control_df = metadata_pruned_top12["THREAD,FLOW,PROCESS,FILE,REGISTRY,TASK,MODULE,USER_SESSION,SERVICE,SHELL,HOST".split(',')]
control = tnt.PlotControlWidget(raw_dataframe=control_df)
control.link_to_plot(plot)

# Table of the heaviest features, using the custom sparse summarizer.
support_summarizer = SparseSupportSummarizer(sparse_features_top12_only, trimmed_path_dict, max_features=10)
support_summary_df = tnt.summary.dataframe.DataSummaryPane(support_summarizer, width=600, sizing_mode=None)
support_summary_df.link_to_plot(plot)

# info_summarizer = SparseSupportSummarizer(infoweight_matrix, trimmed_path_dict, max_features=10)
# info_summary_df = tnt.summary.dataframe.DataSummaryPane(info_summarizer, width=400, sizing_mode='stretch_width')
# info_summary_df.link_to_plot(plot)

# This can be a bit expensive for large selections, which can cause a lag, especially when a search widget is used alongside this widget.
feature_summarizer = SparseFeatureImportanceSummarizer(sparse_features_top12_only, trimmed_path_dict, max_features=6)
feature_summary_plot = tnt.summary.plot.PlotSummaryPane(feature_summarizer, width=800, height=400, sizing_mode="stretch_both")
feature_summary_plot.link_to_plot(plot)

# This is one of the simplest search widgets. Please see the ThisNotThat Read the Docs page for more powerful and flexible search options.
search = tnt.KeywordSearchWidget(labels_top12_only["label"])
search.link_to_plot(plot)

# Lay out the widgets of interest with Panel's Row, Column and Tabs containers.
# pn.Row(plot, editor, pn.Tabs(("search", pn.Column(search, time_summary_plot, value_summary_plot, )), ("support", pn.Column(support_summary_df, info_summary_df)), ("feature importance", feature_summary_plot), ('control', control), (info_pane)))
pn.Column(
    pn.Row(plot, pn.Column(search, pn.Row(editor, control))),
    pn.Tabs(
        ("Chronology", pn.Row(time_summary_plot, value_summary_plot)),
        # ("Support", pn.Row(support_summary_df, info_summary_df)),
        ("Feature importance", pn.Row(feature_summary_plot, support_summary_df)),
        ("Details", info_pane))
)

Ready for demos of exploring the map contents using summarizers.