# Hunting without rules

* Let's try another [SSLoad infection example](https://www.malware-traffic-analysis.net/2024/04/18/index.html);
* This time the rule engine is disabled;
* We might not have a signature for the attack;
* Alerts are only ~10% of the data Suricata provides;

In [None]:
import pandas as pd
import json

In [None]:
# The EVE log holds one JSON document per line: parse every non-blank line and let
# json_normalize flatten nested objects into dotted column names (e.g. "dns.rrname").
# The encoding is pinned so that decoding does not depend on the platform locale.
with open("./data/03/eve.json", "r", encoding="utf-8") as handle:
    DF = pd.json_normalize([json.loads(line) for line in handle if line.strip()])

## Initial view

In [None]:
# Render the flattened event table for a first look at the available columns.
DF

## Aggregate overview

In [None]:
# For every event type: smallest and largest timestamp plus the timestamp count,
# ordered by the smallest timestamp of each type.
DF.groupby("event_type").agg(
    {"timestamp": ["min", "max", "count"]}
).sort_values(by=[("timestamp", "min")])

## Investigate common indicators

* DNS Resource Record;
* TLS Server Name Indication (SNI);
* HTTP Hostname;
* Transferred file names;
* HTTP request URLs.

In [None]:
# Distinct, non-null values of the "dns.rrname" column in sorted order.
sorted(DF["dns.rrname"].dropna().unique())

In [None]:
# Distinct, non-null values of the "tls.sni" column in sorted order.
sorted(DF["tls.sni"].dropna().unique())

In [None]:
# Distinct, non-null values of the "http.hostname" column in sorted order.
sorted(DF["http.hostname"].dropna().unique())

In [None]:
# Distinct, non-null values of the "fileinfo.filename" column in sorted order.
sorted(DF["fileinfo.filename"].dropna().unique())

In [None]:
# Distinct, non-null values of the "http.url" column in sorted order.
sorted(DF["http.url"].dropna().unique())

## Filter flows that exhibit interesting values

* Drill down on interesting values;
* The goal is to extract the relevant `flow_id` values for the interesting flows;
* Why? It is not enough to say *"I saw bad stuff"*;
* What happened?
* Who is impacted?

In [None]:
# Boolean row masks, one per indicator column; a row is flagged when its value
# appears in the hand-picked list for that column.

# DNS names that were looked up.
IDX_DNS = DF["dns.rrname"].isin(
    [
        "6a1dec63ee4eff7cb2935f0df790f4df.azr.footprintdns.com",
        "t.me",
        "x1.c.lencr.org",
        "api.ipify.org",
    ]
)
# The same names as seen in the TLS SNI field.
IDX_TLS = DF["tls.sni"].isin(
    [
        "6a1dec63ee4eff7cb2935f0df790f4df.azr.footprintdns.com",
        "t.me",
        "x1.c.lencr.org",
        "api.ipify.org",
    ]
)
# HTTP hosts, including servers addressed directly by IP.
IDX_HOSTNAME = DF["http.hostname"].isin(
    [
        "212.18.104.28",
        "85.239.53.219",
        "x1.c.lencr.org",
    ]
)
# HTTP request paths.
IDX_URL = DF["http.url"].isin(
    [
        "/8080.dll",
        "/api/1f4b0ad7-0502-68bf-b0aa-fe57ec45c025/result",
        "/api/1f4b0ad7-0502-68bf-b0aa-fe57ec45c025/tasks",
    ]
)
# Names of transferred files.
IDX_FILES = DF["fileinfo.filename"].isin(
    [
        "/8080.dll",
        "crypted_dll.bin",
    ]
)

In [None]:
# Lift the column display limit so wide frames are rendered without truncation.
pd.options.display.max_columns = None

In [None]:
# Keep every row flagged by at least one indicator mask, then discard the
# columns that hold no value at all for the remaining rows.
DF_INTEREST = DF[
    IDX_DNS | IDX_TLS | IDX_HOSTNAME | IDX_URL | IDX_FILES
].dropna(axis="columns", how="all")

## Investigate related flows

In [None]:
import ipywidgets as widgets

In [None]:
def show(limit: int, event_type: list):
    """Return the rows of ``DF_INTEREST``, optionally filtered by event type, in sorted order.

    Written as a callback for ``widgets.interact``; it also adjusts the global
    pandas display options so that ``limit`` rows and all columns are rendered.

    Args:
        limit: Value applied to both ``display.max_rows`` and ``display.min_rows``.
        event_type: Event types to keep. ``None`` or an empty selection keeps
            every row.

    Returns:
        The selected rows sorted ascending by ``timestamp``, ``flow_id`` and
        ``tx_id``, using whichever of those columns exist. If none of them
        exist the rows are returned unsorted.
    """
    pd.set_option("display.max_rows", limit)
    pd.set_option("display.min_rows", limit)
    pd.set_option("display.max_columns", None)

    df = DF_INTEREST
    if event_type is not None and len(event_type) > 0:
        df = df.loc[df["event_type"].isin(event_type)]

    # DF_INTEREST is built with its all-NaN columns dropped, so a sort key such as
    # "tx_id" may be absent; sorting by a missing column would raise KeyError.
    sort_keys = [key for key in ("timestamp", "flow_id", "tx_id") if key in df.columns]
    if not sort_keys:
        return df
    return df.sort_values(by=sort_keys, ascending=True)

In [None]:
# Interactive browser over the flows of interest: the slider feeds `limit`,
# the multi-select feeds `event_type` of show().
widgets.interact(
    show,
    limit=widgets.IntSlider(min=5, max=300, continuous_update=False),
    event_type=widgets.SelectMultiple(
        options=list(DF_INTEREST["event_type"].unique()),
    ),
)

## Aggregations

In [None]:
def agg(limit: int, group: str, aggs_fields: list):
    """Group ``DF_INTEREST`` by one column and summarise the selected columns.

    Written as a callback for ``widgets.interact``; it also adjusts the global
    pandas display options so that ``limit`` rows and all columns are rendered.

    Args:
        limit: Value applied to both ``display.max_rows`` and ``display.min_rows``.
        group: Name of the column to group by.
        aggs_fields: Columns to summarise with ``unique`` and ``nunique``.
            Entries equal to ``group``, equal to ``"timestamp"`` or ``"tx_id"``,
            or not present in ``DF_INTEREST`` are ignored.

    Returns:
        ``None`` when ``group`` is empty or is not a column of ``DF_INTEREST``.
        Otherwise a frame with one row per group holding min/max/count of
        ``timestamp`` plus the per-field summaries. When there is nothing to
        aggregate, a single ``count`` column with the group sizes is returned.
    """
    pd.set_option("display.max_rows", limit)
    pd.set_option("display.min_rows", limit)
    pd.set_option("display.max_columns", None)

    columns = DF_INTEREST.columns
    # The group widget accepts free text, so the value can be empty or a partially
    # typed column name; grouping by an unknown column would raise KeyError.
    if not group or group not in columns:
        return None

    aggs = {}
    # The grouping column cannot also be an aggregation target, so the timestamp
    # summary is skipped when grouping by timestamp itself.
    if group != "timestamp" and "timestamp" in columns:
        aggs["timestamp"] = ["min", "max", "count"]
    for field in aggs_fields or ():
        if field == group or field in ("timestamp", "tx_id"):
            continue
        if field not in columns:
            continue
        aggs[field] = ["unique", "nunique"]

    grouped = DF_INTEREST.groupby(group)
    if not aggs:
        # Nothing left to summarise: fall back to plain group sizes.
        return grouped.size().to_frame("count")
    return grouped.agg(aggs)

In [None]:
# Interactive aggregation over the flows of interest: pick the grouping column in
# the combobox and the columns to summarise in the multi-select.
widgets.interact(
    agg,
    limit=widgets.IntSlider(min=5, max=300, continuous_update=False),
    group=widgets.Combobox(options=list(DF_INTEREST.columns)),
    aggs_fields=widgets.SelectMultiple(
        options=list(DF_INTEREST.columns),
        value=["src_ip", "dest_ip", "event_type"],
        rows=30,
    ),
)