In [None]:
import bokeh.io
import bokeh.plotting as bpl
import cloudpickle as cpkl
import fsspec
import gzip
import itertools as it
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import panel as pn
from pathlib import Path
import scipy.sparse as ss
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
import thisnotthat as tnt
from vectorizers.transformers import CategoricalColumnTransformer, InformationWeightTransformer

In [None]:
# Initialise the notebook front-ends once, before any figure or widget is created:
# Bokeh output is directed to the notebook and the Panel extension is loaded.
bokeh.io.output_notebook()
pn.extension()

In [None]:
# Artefacts this notebook reads. When every one of them exists in the working
# directory the local copies are used; otherwise the canned copies hosted in the
# Azure container are used. FS is the filesystem handle and ROOT the directory
# (or container) under which the artefacts live.
files_vectors = [
    "manifest.json",
    "features.npz",
    "map2d.npz",
    "metadata.csv.gz",
    "labels.csv.gz",
    "col2token.pkl.gz",
]
if not all(Path(name).is_file() for name in files_vectors):
    print("Using CANNED process map and metadata (from Azure container).")
    FS = fsspec.filesystem("abfs", account_name="scipy2023")
    ROOT = "map"
else:
    print("Using process map and metadata stored LOCALLY.")
    FS = fsspec.filesystem("file")
    ROOT = "."

In [None]:
# Read the manifest describing the context (host and days) of the process map.
# The path is resolved under ROOT ("." locally, "map" on the Azure container) so
# that both storage back-ends look in the right place.
with FS.open(f"{ROOT}/manifest.json", "rt", encoding="utf-8") as file:
    manifest = json.load(file)
HOST = manifest["host"]
DAYS = manifest["days"]
print(f"Context: host {HOST}; days {', '.join(DAYS)}")

In [None]:
# Load the sparse feature matrix (one row per process, one column per token).
# The path is resolved under ROOT so the local and Azure back-ends both work.
with FS.open(f"{ROOT}/features.npz", "rb") as file_features:
    features = ss.load_npz(file_features)
features

In [None]:
# Load the column-index -> token mapping that names the columns of `features`.
# The path is resolved under ROOT so the local and Azure back-ends both work.
# NOTE(review): unpickling executes arbitrary code from the file. This is only
# acceptable because the artefact comes from a trusted source; do not point this
# cell at an untrusted container or file.
with (
    FS.open(f"{ROOT}/col2token.pkl.gz", "rb") as file_compressed,
    gzip.open(file_compressed, "rb") as file_pkl
):
    col2token = dict(enumerate(cpkl.load(file_pkl)))

assert len(col2token) == features.shape[1]
# Preview the first 25 entries only.
for k, v in it.islice(col2token.items(), 25):
    print(k, ":", v)

In [None]:
# Load the per-process metadata table; it must have one row per row of `features`.
# The path is resolved under ROOT so the local and Azure back-ends both work.
with FS.open(f"{ROOT}/metadata.csv.gz", "rb") as file_metadata:
    metadata = pd.read_csv(file_metadata, parse_dates=["timestamp"], compression="gzip")
assert metadata.shape[0] == features.shape[0]
metadata

In [None]:
# Load the per-process labels; there must be one row per row of `features`.
# The path is resolved under ROOT so the local and Azure back-ends both work.
with FS.open(f"{ROOT}/labels.csv.gz", "rb") as file_labels:
    labels = pd.read_csv(file_labels, compression="gzip")
assert labels.shape[0] == features.shape[0]
labels

In [None]:
# Load the 2-D coordinates of the process map (one (x, y) pair per process).
# The path is resolved under ROOT so the local and Azure back-ends both work.
with FS.open(f"{ROOT}/map2d.npz", "rb") as file_vectors:
    process_map = np.load(file_vectors)["process_map"]
assert process_map.shape == (features.shape[0], 2)

In [None]:
# Count the processes carrying each label and keep the 15 most frequent labels.
top15_labels = (
    labels
    .groupby("label", as_index=False)
    .agg({"process_id": "count"})
    .sort_values("process_id", ascending=False)
    .head(15)
)
top15_labels

In [None]:
# Restrict the labels to the 15 most frequent ones. The original row positions
# are remembered in `indices_top15` (to slice the other artefacts the same way)
# before the restricted frame is re-indexed from 0.
labels_top15 = labels.loc[
    labels["label"].isin(set(top15_labels["label"]))
].copy()
indices_top15 = labels_top15.index.copy()
labels_top15 = labels_top15.reset_index(drop=True)
labels_top15

In [None]:
# Slice the map coordinates, the features and the metadata down to the rows that
# carry one of the top-15 labels, then show the three resulting shapes.
metadata_top15 = metadata.loc[indices_top15].copy()
features_top15 = features[indices_top15, :]
processes_top15 = process_map[indices_top15, :]
tuple(part.shape for part in (processes_top15, features_top15, metadata_top15))

In [None]:
class SparseSupportSummarizer:
    """
    Summarizer for a DataSummaryPane.
    This takes a sparse matrix of counts or importances.  Then for any selection of data it computes the
    column marginals of that matrix and finds the columns with the largest marginals.

    It returns a DataFrame with the top max_features features along with their column marginals and support.

    Parameters
    ----------

    matrix: a sparse matrix
        This is the matrix which we will use for computing the marginals
    column_index_dictionary: dict
         A dictionary mapping from column indices to (kind, value) column names
    max_features: int <default: 10>
        The number of features to return
    proportional_support: bool <default: True>
        Should the support be normalized by the number of selected rows (True)
        or left as a raw count of rows (False)
    """

    _COLUMNS = ('Kind', 'Value', 'Total weight', 'support')

    def __init__(
        self,
        matrix,
        column_index_dictionary,
        max_features=10,
        proportional_support=True
    ):
        self.matrix = matrix
        self.column_index_dictionary = column_index_dictionary
        self.max_features = max_features
        self.proportional_support = proportional_support

    def summarize(self, selected):
        """
        Summarize the rows of the matrix given by ``selected``.

        Parameters
        ----------
        selected: sequence of int
            Row indices of the selected points.

        Returns
        -------
        pandas.DataFrame
            One row per retained feature, ordered by decreasing column marginal, with
            columns 'Kind', 'Value', 'Total weight' (the column marginal over the
            selection) and 'support' (the number, or proportion, of selected rows in
            which that same feature is strictly positive).  An empty selection yields
            an empty frame with the same columns.
        """
        # Nothing selected: there is nothing to rank, and a proportion over zero
        # rows would be undefined.
        if np.size(selected) == 0:
            return pd.DataFrame({name: [] for name in self._COLUMNS})

        # Use the selection handed to us rather than reaching for a global plot.
        data = self.matrix[selected, :]
        # ravel (not squeeze) keeps these 1-D even for a single-column matrix.
        column_marginal = np.asarray(data.sum(axis=0)).ravel()
        column_support = np.asarray((data > 0).sum(axis=0)).ravel()

        largest_indices = np.argsort(column_marginal)[::-1][:self.max_features]
        features = [self.column_index_dictionary[x] for x in largest_indices]
        kinds, values = zip(*features) if features else ((), ())
        importance = column_marginal[largest_indices]
        # Report the support of the very columns listed, so each row of the
        # result describes a single feature.
        support = column_support[largest_indices]
        if self.proportional_support:
            support = support / data.shape[0]
        return pd.DataFrame({'Kind': kinds, 'Value': values, 'Total weight': importance, 'support': support})

In [None]:
class SparseFeatureImportanceSummarizer:
    """
    Summarizer for the PlotSummaryPane that constructs a class balanced, L1 penalized,
    logistic regression between the selected points and the remaining data.

    This version takes a sparse feature matrix and column_index_dictionary which maps from the
    indices of the matrix to the set of feature names.

    Then it displays that feature importance in a bar plot.
    The title is colour coded by model accuracy in order to give a rough approximation of
    how much trust you should put in the model.

    All of the standard caveats with using the coefficients of a linear model as a feature
    importance measure should be included here.

    It might be worth reading the sklearn documentation on the
    common pitfalls in the interpretation of coefficients of linear models
    (https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html)

    Parameters
    ----------

    data: sparse_matrix
        A sparse_matrix corresponding to the plot points.
    column_index_dictionary: dict
        A dictionary mapping from column indices to (kind, value) column names
    max_features: int <default: 15>
        The maximum number of features to display the importance for.
    tol_importance_relative: float <default: 0.01>
        A feature is displayed only if the absolute value of its coefficient, divided by
        the largest absolute coefficient, is strictly greater than this threshold.

    """

    def __init__(
        self,
        data,
        column_index_dictionary,
        max_features: int = 15,
        tol_importance_relative: float = 0.01,
    ):

        self.data = data  # Indexed 0 to length.
        self.max_features = max_features
        self.tol_importance_relative = tol_importance_relative
        self._features = column_index_dictionary
        # The last fitted model and its target vector are kept for inspection.
        self._classifier = None
        self._classes = None

    def summarize(self, selected, width: int = 600, height: int = 600):
        """
        Fit selected-vs-rest and return a Bokeh bar plot of the strongest coefficients.

        Parameters
        ----------
        selected: sequence of int
            Row indices of the selected points (the positive class).
        width, height: int
            Size of the returned figure.

        Returns
        -------
        A Bokeh figure with one horizontal bar per retained feature.
        """
        classes = np.zeros((self.data.shape[0],), dtype="int32")
        classes[selected] = 1
        classifier = LogisticRegression(
            penalty="l1",
            solver="liblinear",
            class_weight="balanced",
            tol=1e-3,
            max_iter=20
        ).fit(self.data, classes)
        self._classifier = classifier
        self._classes = classes
        assert classifier.coef_.shape[0] == 1 or classifier.coef_.ndim == 1
        # ravel (not squeeze) keeps this 1-D even when there is a single feature.
        importance = np.ravel(classifier.coef_)

        # Candidate features, by decreasing absolute coefficient.
        index_importance = np.argsort(-np.abs(importance))[: self.max_features]
        importance_abs = np.abs(importance)[index_importance]
        largest = np.max(importance_abs) if importance_abs.size else 0.0
        if largest > 0:
            # The candidates are sorted, so those above the relative threshold
            # are exactly the first n_kept of them.
            n_kept = int(np.count_nonzero(importance_abs / largest > self.tol_importance_relative))
        else:
            # Every coefficient is zero: nothing is worth displaying.
            n_kept = 0
        kept = index_importance[:n_kept]

        selected_columns = [
            f"{kind}: {value}" for kind, value in (self._features[x] for x in kept)
        ]

        model_acc = classifier.score(self.data, classes)
        fig = bpl.figure(
            y_range=selected_columns,
            width=width,
            height=height,
        )
        # Title wording and colour reflect the training accuracy of the model.
        if model_acc > 0.9:
            trust, colour = "high", "green"
        elif model_acc > 0.8:
            trust, colour = "medium", "yellow"
        elif model_acc > 0.5:
            trust, colour = "low", "orange"
        else:
            trust, colour = "low", "red"
        fig.title = f"Estimated Feature Importance\nTrustworthiness {trust} ({model_acc:.4} mean accuracy)"
        fig.title.text_color = colour

        fig.hbar(
            y=selected_columns,
            right=importance[kept],
            height=0.8,
        )
        # Label the Bokeh axis itself (a matplotlib xlabel call has no effect on
        # this figure).  The bars are the raw model coefficients; no rescaling by
        # the features' standard deviation is applied above.
        fig.xaxis.axis_label = "Coefficient value (L1-penalized logistic regression)"
        return fig

In [None]:
# Re-weight the top-15 feature matrix with the information-weight transform and
# store the result as 32-bit floats.
infoweight = (
    InformationWeightTransformer()
    .fit_transform(features_top15)
    .astype(np.float32)
)
infoweight

In [None]:
# Preview the metadata with every column cast to string; the same cast is applied
# before the categorical summary of each process is built further down.
metadata_top15.astype('str')

In [None]:
# Inspect the dtypes of the string-cast metadata. The frame is recomputed here
# instead of being read from the IPython output cache (`_27`), which only exists
# when the cells happen to be executed with that exact execution count.
metadata_top15.astype('str').dtypes

In [None]:
%%time
# Attach to every metadata row (1) the per-process categorical summary of its
# string-cast descriptor columns, named "event_summary", and (2) its label.
# Two helper columns are then added: the summary joined with HTML line breaks,
# and a constant frequency of 1.
metadata_summary_top15 = (
    metadata_top15
    .merge(
        CategoricalColumnTransformer(
            object_column_name='process_id',
            descriptor_column_name=list(metadata_top15.columns[2:]),
            include_column_name=True
        )
        .fit_transform(metadata_top15.astype('str'))
        .rename("event_summary")
        .reset_index(),
        on="process_id",
        how="left",
    )
    .merge(labels_top15, on="process_id", how="left")
)
metadata_summary_top15["event_summary_string"] = metadata_summary_top15["event_summary"].apply(
    lambda tokens: "<br>".join(tokens)
)
metadata_summary_top15["freq"] = 1
metadata_summary_top15

In [None]:
# Shorten every token value to the part after its last backslash, keeping the
# token kind, then preview the first 25 entries.
paths_short = {
    column: (kind, value.rsplit("\\", 1)[-1])
    for column, (kind, value) in col2token.items()
}
assert len(paths_short) == features.shape[1]
for k, v in it.islice(paths_short.items(), 25):
    print(k, ":", v)

In [None]:
%%time
# Compress the information-weighted features to 1024 dimensions. The default
# TruncatedSVD solver is randomized, so the seed is pinned (to the same value
# used for the label layers) to make a Restart & Run All reproducible.
infoweight_compressed = TruncatedSVD(n_components=1024, random_state=42).fit_transform(infoweight)
infoweight_compressed.shape

In [None]:
%%time
# Build the label layers that are later attached to the plot with
# plot.add_cluster_labels. Inputs, in order: the SVD-compressed information-weighted
# features, the 2-D map coordinates, the sparse top-15 features, and a mapping from
# column index to shortened token value (the token kind is dropped).
# NOTE(review): the role of each positional argument is inferred from the variable
# names; confirm against the thisnotthat SparseMetadataLabelLayers signature.
layer_metadata = tnt.SparseMetadataLabelLayers(
    infoweight_compressed,
    processes_top15,
    features_top15,
    {i: value for i, (_, value) in paths_short.items()},
    cluster_map_representation=False,
    random_state=42
)
layer_metadata

In [None]:
# Markdown template for the information pane. The placeholders are named after
# columns of metadata_summary_top15 (label, process_id, event_summary_string).
template_info_process = """
# {label}

## {process_id}

---

{event_summary_string}

"""

In [None]:
# Main scatter plot of the top-15 processes, coloured by label, annotated with the
# cluster label layers built earlier.
plot = tnt.BokehPlotPane(
    processes_top15,
    labels=labels_top15["label"],
    width=600,
    height=600,
    show_legend=False,
    tools="pan,wheel_zoom,lasso_select,tap,reset"
)
plot.add_cluster_labels(layer_metadata, max_text_size=24)

# Label editor, linked to the plot.
editor = tnt.LabelEditorWidget(plot.labels, selectable_legend=True)
editor.link_to_plot(plot)

# This is one of the simplest search widgets. See the thisnotthat documentation on
# Read the Docs for more powerful and flexible search options.
search = tnt.KeywordSearchWidget(labels_top15["label"])
search.link_to_plot(plot)

# Information pane rendering the Markdown template from the summary metadata.
info_pane = tnt.InformationPane(metadata_summary_top15, markdown_template=template_info_process, width=600)
info_pane.link_to_plot(plot)

# Summary of label value counts, linked to the plot.
value_summarizer = tnt.summary.dataframe.ValueCountsSummarizer(labels_top15["label"])
value_summary_plot = tnt.summary.dataframe.DataSummaryPane(value_summarizer)
value_summary_plot.link_to_plot(plot)

# Time-series summary built from the 'timestamp' column, counting the constant
# 'freq' column (set to 1 for every row).
time_summarizer = tnt.summary.plot.TimeSeriesSummarizer(
    metadata_summary_top15,
    time_column='timestamp',
    count_column='freq'
)
time_summary_plot = tnt.summary.plot.PlotSummaryPane(time_summarizer)
time_summary_plot.link_to_plot(plot)

# Plot control widget fed with the per-kind metadata columns.
# NOTE(review): metadata_top15 keeps the original row index while the plot data is
# positional; confirm PlotControlWidget does not rely on index alignment.
control_df = metadata_top15["THREAD,FLOW,PROCESS,FILE,REGISTRY,TASK,MODULE,USER_SESSION,SERVICE,SHELL,HOST".split(',')]
control = tnt.PlotControlWidget(raw_dataframe=control_df)
control.link_to_plot(plot)

# Table of the heaviest features (with their support), using the
# SparseSupportSummarizer defined in this notebook.
support_summarizer = SparseSupportSummarizer(features_top15, paths_short, max_features=16)
support_summary_df = tnt.summary.dataframe.DataSummaryPane(support_summarizer, width=600, sizing_mode=None)
support_summary_df.link_to_plot(plot)

# Bar plot of logistic-regression feature importances, using the
# SparseFeatureImportanceSummarizer defined in this notebook.
feature_summarizer = SparseFeatureImportanceSummarizer(features_top15, paths_short, max_features=8)
feature_summary_plot = tnt.summary.plot.PlotSummaryPane(feature_summarizer, width=800, sizing_mode="stretch_both")
feature_summary_plot.link_to_plot(plot)

# Lay out the widgets of interest with Panel's Row, Column and Tabs containers.
pn.Column(
    pn.Row(plot, pn.Column(pn.Row(editor, pn.Column(search, control)))),
    pn.Tabs(
        ("Chronology", pn.Row(time_summary_plot, value_summary_plot)),
        ("Feature importance", pn.Row(feature_summary_plot, support_summary_df)),
        ("Details", info_pane))
)