In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sqlite3, re

# caminhos
DATA = Path("data")
REPORTS = Path("reports")
DFIR = Path("reports/dfir")

# carrega transações
tx = pd.read_parquet(DATA/"paysim.parquet")  # ajuste se necessário
print(tx.shape, tx.columns.tolist()[:10])

# opcional: amostra pra iterar mais rápido
tx_small = pd.read_parquet(DATA/"paysim_sample.parquet")


(6, 12) ['ts', 'cpf', 'device_id', 'ip', 'asn', 'city', 'amount', 'currency', 'channel', 'merchant_id']


In [2]:
cmd_path = DFIR/"cmdline.txt"
cmdlines = []
if cmd_path.exists():
    for ln in cmd_path.read_text(errors="ignore").splitlines():
        # formato típico: "PID  Process  Args"
        # vamos tentar quebrar por TABs ou múltiplos espaços
        parts = re.split(r"\s{2,}|\t", ln.strip())
        if len(parts) >= 3 and parts[0].isdigit():
            pid = parts[0]
            proc = parts[1]
            args = "  ".join(parts[2:])
            cmdlines.append((int(pid), proc, args))
cmd_df = pd.DataFrame(cmdlines, columns=["pid","process","args"])
cmd_df.head(8)


Unnamed: 0,pid,process,args


In [5]:
# ===== [FIX D5-4.2b] Padrões mais permissivos e sem grupos capturantes =====

# UNC WebDAV: \\<IP v4>@<porta>\davwwwroot\...
pat_webdav = r'\\\\\d{1,3}(?:\.\d{1,3}){3}@\d+\\davwwwroot(?:\\.*)?'
# rundll32
pat_rundll = r'\brundll32\b'
# powershell oculto
pat_powershell_hidden = r'\bpowershell\.exe\b.*-windowstyle\s+hidden'

# (Debug) veja linhas que contêm "davwwwroot" para confirmar parsing do arquivo
print("Preview de linhas com 'davwwwroot':")
print(cmd_df[cmd_df["args"].str.contains(r'davwwwroot', flags=re.I, na=False)]
      .head(10)
      .to_string(index=False))

# filtro principal: presença de WebDAV
alerts_A = cmd_df[cmd_df["args"].str.contains(pat_webdav, flags=re.I, na=False)].copy()

if alerts_A.empty:
    # fallback: se por algum motivo o regex ainda não pegou, aceita qualquer 'davwwwroot'
    alerts_A = cmd_df[cmd_df["args"].str.contains(r'davwwwroot', flags=re.I, na=False)].copy()

# se mesmo assim vazio, crie DF vazio com colunas esperadas
if alerts_A.empty:
    alerts_A = pd.DataFrame(columns=["pid","process","args","iocs","score","rule"])
else:
    alerts_A["args"] = alerts_A["args"].astype(str)
    alerts_A["iocs"] = "WebDAV"
    alerts_A.loc[alerts_A["args"].str.contains(pat_rundll, flags=re.I, na=False), "iocs"] += "+rundll32"
    alerts_A.loc[alerts_A["args"].str.contains(pat_powershell_hidden, flags=re.I, na=False), "iocs"] += "+ps_hidden"

    # flags
    has_webdav     = alerts_A["args"].str.contains(r'davwwwroot', flags=re.I, na=False)
    has_rundll32   = alerts_A["args"].str.contains(pat_rundll, flags=re.I, na=False)
    has_ps_hidden  = alerts_A["args"].str.contains(pat_powershell_hidden, flags=re.I, na=False)

    alerts_A["score"] = (has_webdav.astype(int) * 2 +
                         has_rundll32.astype(int) * 2 +
                         has_ps_hidden.astype(int) * 1)

alerts_A["rule"] = "A_WebDAV_rundll32"
alerts_A = alerts_A.sort_values(["score"], ascending=False)

print("\nLinhas com alerta (Regra A):", len(alerts_A))
display(alerts_A.head(10))


Preview de linhas com 'davwwwroot':
Empty DataFrame
Columns: [pid, process, args]
Index: []

Linhas com alerta (Regra A): 0


Unnamed: 0,pid,process,args,iocs,score,rule


In [7]:
import os
from pathlib import Path

# 1) conferir diretório atual do notebook
print("cwd:", os.getcwd())

# 2) garantir as pastas
(Path("reports") / "alerts").mkdir(parents=True, exist_ok=True)
Path("data").mkdir(parents=True, exist_ok=True)

# 3) tentar salvar novamente a Regra A
out_A = Path("reports")/"alerts"/"day5_ruleA_webdav_rundll32.csv"
alerts_A.to_csv(out_A, index=False)
print("Salvo em:", out_A)


cwd: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks
Salvo em: reports/alerts/day5_ruleA_webdav_rundll32.csv


In [8]:
out_A = REPORTS/"alerts"/"day5_ruleA_webdav_rundll32.csv"
alerts_A.to_csv(out_A, index=False)
out_A


PosixPath('reports/alerts/day5_ruleA_webdav_rundll32.csv')

In [12]:
import re
netscan_path = DFIR/"netscan.txt"

def is_private_ip(ip):
    try:
        a,b,c,d = map(int, ip.split("."))
        if a == 10: return True
        if a == 192 and b == 168: return True
        if a == 172 and 16 <= b <= 31: return True
    except Exception:
        return False
    return False

rows = []
if netscan_path.exists():
    for ln in netscan_path.read_text(errors="ignore").splitlines():
        # Ex.: "29:0x... TCPv4 192.168.19.150 51035 196.204.4.8 80 ESTABLISHED 1260 svchost.exe 2024-07-15 ..."
        parts = ln.split()
        # achar o índice do token TCP*
        idx = next((i for i,t in enumerate(parts) if t.startswith("TCP")), None)
        if idx is None:
            continue
        try:
            proto  = parts[idx]                  # TCPv4 / TCPv6
            laddr  = parts[idx+1]
            lport  = parts[idx+2]
            raddr  = parts[idx+3]
            rport  = parts[idx+4]
            state  = parts[idx+5]
            # depois de state, normalmente vem PID e processo
            pid_tok = parts[idx+6] if len(parts) > idx+6 else ""
            pid     = int(pid_tok) if pid_tok.isdigit() else None
            process = parts[idx+7] if len(parts) > idx+7 else None
        except Exception:
            # linha irregular, pula
            continue

        # normalizações
        rport_clean = re.sub(r"\D", "", rport or "")
        lport_clean = re.sub(r"\D", "", lport or "")
        process_norm = (process or "").lower()

        rows.append({
            "proto": proto,
            "laddr": laddr, "lport": lport_clean,
            "raddr": raddr, "rport": rport_clean,
            "state": state,
            "pid": pid, "process": process_norm,
            "raw": ln
        })

ns = pd.DataFrame(rows, columns=["proto","laddr","lport","raddr","rport","state","pid","process","raw"])
print("linhas parsadas:", len(ns))
ns.head(8)


linhas parsadas: 0


Unnamed: 0,proto,laddr,lport,raddr,rport,state,pid,process,raw


In [13]:
# permitir 'svchost.exe' com ou sem caminho, e portas 80/8080
cand = ns[
    ns["process"].str.contains(r"\bsvchost\.exe\b", na=False)
].copy()

# somente destinos IPv4 "não privados"
cand = cand[
    cand["raddr"].str.match(r"^\d{1,3}(\.\d{1,3}){3}$", na=False) &
    (~cand["raddr"].map(is_private_ip))
]

# portas HTTP claras (80/8080) – já limpas para dígitos
cand = cand[cand["rport"].isin(["80","8080"])]

# estado estabelecido ajuda a reduzir ruído
svchost_http = cand[cand["state"].str.upper().eq("ESTABLISHED")].copy()

svchost_http["rule"]  = "B_svchost_http_external"
svchost_http["score"] = 2

display(svchost_http[["process","pid","raddr","rport","state"]].drop_duplicates().head(20))
print("hits Regra B:", len(svchost_http))


Unnamed: 0,process,pid,raddr,rport,state


hits Regra B: 0


In [14]:
ns_80 = ns[(ns["rport"].isin(["80","8080"])) & 
           (ns["raddr"].str.match(r"^\d{1,3}(\.\d{1,3}){3}$", na=False)) &
           (~ns["raddr"].map(is_private_ip))]
ns_80[["process","pid","raddr","rport","state"]].drop_duplicates().head(20)
print("total conexões externas http-ish:", len(ns_80))

total conexões externas http-ish: 0


In [15]:
svchost_ext = ns[ ns["process"].str.contains(r"\bsvchost\.exe\b", na=False) &
                  (ns["raddr"].str.match(r"^\d{1,3}(\.\d{1,3}){3}$", na=False)) &
                  (~ns["raddr"].map(is_private_ip)) ]
svchost_ext[["process","pid","raddr","rport","state"]].drop_duplicates().head(20)
print("svchost → externo:", len(svchost_ext))

svchost → externo: 0


In [16]:
out_B = REPORTS/"alerts"/"day5_ruleB_svchost_http_external.csv"
svchost_http.to_csv(out_B, index=False)
out_B

PosixPath('reports/alerts/day5_ruleB_svchost_http_external.csv')

In [17]:
df = tx.copy()
assert "ts" in df.columns and "cpf" in df.columns, "Dataset precisa ter colunas ts e cpf"
df["ts"] = pd.to_datetime(df["ts"])

# bucket de 60min
df["ts_1h"] = df["ts"].dt.floor("60min")

agg = (df.groupby(["cpf","ts_1h"])
         .agg(tx_count_60m=("cpf","size"),
              uniq_devices_60m=("device_id", pd.Series.nunique),
              amount_sum_60m=("amount","sum"))
         .reset_index())

# thresholds simples (ajuste): >= 5 transações na hora OU >=3 devices na hora OU soma >= 50k
C_TX = 5
C_DEV = 3
C_AMT = 50_000

alerts_C = agg[
    (agg["tx_count_60m"] >= C_TX) |
    (agg["uniq_devices_60m"] >= C_DEV) |
    (agg["amount_sum_60m"] >= C_AMT)
].copy()

alerts_C["rule"] = "C_aml_burst_1h"
alerts_C["score"] = (
    (alerts_C["tx_count_60m"] >= C_TX).astype(int) +
    (alerts_C["uniq_devices_60m"] >= C_DEV).astype(int) +
    (alerts_C["amount_sum_60m"] >= C_AMT).astype(int)
)

alerts_C.sort_values(["score","amount_sum_60m","tx_count_60m"], ascending=False).head(10)


Unnamed: 0,cpf,ts_1h,tx_count_60m,uniq_devices_60m,amount_sum_60m,rule,score
0,111,2025-09-01 10:00:00,4,3,1300,C_aml_burst_1h,1


In [18]:
out_C = REPORTS/"alerts"/"day5_ruleC_aml_burst_1h.csv"
alerts_C.to_csv(out_C, index=False)
out_C

PosixPath('reports/alerts/day5_ruleC_aml_burst_1h.csv')

In [19]:
alerts_A2 = alerts_A[["rule","score","pid","process","args"]].copy() if 'alerts_A' in globals() else pd.DataFrame()
alerts_B2 = svchost_http[["rule","score","pid","process","raddr","rport","state"]].copy() if 'svchost_http' in globals() else pd.DataFrame()
alerts_C2 = alerts_C.copy()

# normalizar colunas
alerts_A2["kind"]="dfir_cmdline"; alerts_B2["kind"]="dfir_netscan"; alerts_C2["kind"]="aml_tx"

for col in ["pid","process","args","raddr","rport","state","cpf","ts_1h","tx_count_60m","uniq_devices_60m","amount_sum_60m"]:
    if col not in alerts_A2.columns: alerts_A2[col]=pd.NA
    if col not in alerts_B2.columns: alerts_B2[col]=pd.NA
    if col not in alerts_C2.columns: alerts_C2[col]=pd.NA

alerts_all = pd.concat([alerts_A2, alerts_B2, alerts_C2], ignore_index=True)
alerts_all.to_csv(REPORTS/"alerts"/"day5_alerts_all.csv", index=False)
alerts_all.head(12)


  alerts_all = pd.concat([alerts_A2, alerts_B2, alerts_C2], ignore_index=True)


Unnamed: 0,rule,score,pid,process,args,kind,raddr,rport,state,cpf,ts_1h,tx_count_60m,uniq_devices_60m,amount_sum_60m
0,C_aml_burst_1h,1,,,,aml_tx,,,,111,2025-09-01 10:00:00,4,3,1300


In [20]:
print("cmdline.txt existe?", (DFIR/"cmdline.txt").exists())
print("cmd_df shape:", cmd_df.shape)

# ver se a linha do WebDAV está mesmo no arquivo
cmd_df[cmd_df["args"].str.contains("davwwwroot", na=False)].head(10)


cmdline.txt existe? False
cmd_df shape: (0, 3)


Unnamed: 0,pid,process,args


In [21]:
svchost_http[["process","pid","raddr","rport","state"]].drop_duplicates().head(10)


Unnamed: 0,process,pid,raddr,rport,state


In [22]:
alerts_all.groupby(["rule","kind"]).size().reset_index(name="hits")
alerts_all.head(12)


Unnamed: 0,rule,score,pid,process,args,kind,raddr,rport,state,cpf,ts_1h,tx_count_60m,uniq_devices_60m,amount_sum_60m
0,C_aml_burst_1h,1,,,,aml_tx,,,,111,2025-09-01 10:00:00,4,3,1300


In [24]:
import os, sqlite3, pandas as pd
from pathlib import Path

print("CWD:", os.getcwd())  # confirme que é .../blue-team-aml-portfolio

# Estes três DEVEM existir
for name in ["alerts_all", "tx", "cmd_df"]:
    print(name, "existe?", name in globals())

# Se algum for False, RE-EXECUTE as células:
# 6.1→6.2 (alerts_C), 4.2→4.3 (alerts_A), 5.1→5.3 (svchost_http),
# e depois 7.1 (consolidado alerts_all)
if 'alerts_all' in globals():
    print("alerts_all shape:", alerts_all.shape)
    display(alerts_all.groupby(["rule","kind"]).size().reset_index(name="hits").head())


CWD: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks
alerts_all existe? True
tx existe? True
cmd_df existe? True
alerts_all shape: (1, 14)


Unnamed: 0,rule,kind,hits
0,C_aml_burst_1h,aml_tx,1


In [25]:
from pathlib import Path
import sqlite3, pandas as pd

DB = Path("data")/"aml.db"
print("DB alvo:", DB.resolve())

# Apague e recrie (opcional, mas deixa tudo limpo)
if DB.exists():
    DB.unlink()

conn = sqlite3.connect(DB)

# Gravar tabelas (só grava as que existem no ambiente)
if 'tx' in globals():
    tx.to_sql("transactions", conn, if_exists="replace", index=False)
if 'alerts_all' in globals():
    alerts_all.to_sql("alerts", conn, if_exists="replace", index=False)
if 'cmd_df' in globals():
    cmd_df.to_sql("dfir_cmdline", conn, if_exists="replace", index=False)
if 'ns' in globals():
    ns.to_sql("dfir_netscan", conn, if_exists="replace", index=False)

conn.commit()

# Ver o que entrou de fato
print(pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", conn))


DB alvo: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks/data/aml.db
           name
0        alerts
1  dfir_cmdline
2  dfir_netscan
3  transactions


In [26]:
q = "SELECT rule, kind, COUNT(*) AS hits FROM alerts GROUP BY rule, kind ORDER BY hits DESC;"
print(pd.read_sql(q, conn))
conn.close()


             rule    kind  hits
0  C_aml_burst_1h  aml_tx     1


In [29]:
# (no Jupyter) reconstrua alerts_all se mudou algo e salve CSV
alerts_all.to_csv("reports/alerts/day5_alerts_all.csv", index=False)

In [31]:
from pathlib import Path
import pandas as pd, re

REPORTS = Path("reports"); ALERTS = REPORTS/"alerts"
DFIR = Path("reports")/"dfir"
ALERTS.mkdir(parents=True, exist_ok=True)

# ---------- Regra A (cmdline: WebDAV + rundll32) ----------
cmd_path = DFIR/"cmdline.txt"
cmdlines = []
if cmd_path.exists():
    for ln in cmd_path.read_text(errors="ignore").splitlines():
        parts = re.split(r"\s{2,}|\t", ln.strip())
        if len(parts) >= 3 and parts[0].isdigit():
            pid = int(parts[0]); proc = parts[1]; args = "  ".join(parts[2:])
            cmdlines.append((pid, proc, args))
cmd_df = pd.DataFrame(cmdlines, columns=["pid","process","args"]) if cmdlines else pd.DataFrame(columns=["pid","process","args"])

pat_webdav = r'\\\\(?:\d{1,3}(?:\.\d{1,3}){3}|[A-Za-z0-9\.-]+)@\d+\\davwwwroot\\?'
pat_rundll = r'\brundll32\b'
pat_powershell_hidden = r'\bpowershell\.exe\b.*-windowstyle\s+hidden'

alerts_A = cmd_df[cmd_df["args"].str.contains(pat_webdav, flags=re.I, na=False)].copy()
if not alerts_A.empty:
    alerts_A["iocs"] = "WebDAV"
    alerts_A.loc[alerts_A["args"].str.contains(pat_rundll, flags=re.I, na=False), "iocs"] += "+rundll32"
    alerts_A.loc[alerts_A["args"].str.contains(pat_powershell_hidden, flags=re.I, na=False), "iocs"] += "+ps_hidden"
    alerts_A["score"] = alerts_A["iocs"].apply(lambda s: 2*("WebDAV" in s) + 2*("rundll32" in s) + 1*("ps_hidden" in s))
    alerts_A["rule"] = "A_WebDAV_rundll32"
    alerts_A["kind"] = "dfir_cmdline"
alerts_A.to_csv(ALERTS/"day5_ruleA_webdav_rundll32.csv", index=False)

# ---------- Regra B (netscan: svchost -> HTTP externo) ----------
def is_private_ip(ip):
    try:
        o = list(map(int, ip.split(".")))
        return (o[0]==10) or (o[0]==192 and o[1]==168) or (o[0]==172 and 16<=o[1]<=31)
    except: 
        return False

ns_path = DFIR/"netscan.txt"
rows = []
if ns_path.exists():
    for ln in ns_path.read_text(errors="ignore").splitlines():
        parts = ln.split()
        if len(parts) >= 8 and parts[0].startswith("TCP"):
            proto, laddr, lport, raddr, rport, state = parts[:6]
            pid = parts[6] if parts[6].isdigit() else None
            proc = parts[7] if not (parts[7].isdigit()) else None
            rows.append((proto,laddr,lport,raddr,rport,state,pid,proc,ln))
ns = pd.DataFrame(rows, columns=["proto","laddr","lport","raddr","rport","state","pid","process","raw"]) if rows else pd.DataFrame(columns=["proto","laddr","lport","raddr","rport","state","pid","process","raw"])

svchost_http = ns[
    (ns["process"].str.lower() == "svchost.exe") &
    (ns["rport"].isin(["80","8080"])) &
    (~ns["raddr"].map(is_private_ip))
].copy()
if not svchost_http.empty:
    svchost_http["rule"] = "B_svchost_http_external"
    svchost_http["score"] = 2
    svchost_http["kind"] = "dfir_netscan"
svchost_http.to_csv(ALERTS/"day5_ruleB_svchost_http_external.csv", index=False)

# ---------- Regra C (AML burst 1h) ----------
tx = pd.read_parquet("data/paysim.parquet")
tx["ts"] = pd.to_datetime(tx["ts"])
agg = (tx.assign(ts_1h=tx["ts"].dt.floor("60min"))
         .groupby(["cpf","ts_1h"])
         .agg(tx_count_60m=("cpf","size"),
              uniq_devices_60m=("device_id", pd.Series.nunique),
              amount_sum_60m=("amount","sum"))
         .reset_index())

C_TX, C_DEV, C_AMT = 5, 3, 50_000
alerts_C = agg[
    (agg["tx_count_60m"] >= C_TX) | (agg["uniq_devices_60m"] >= C_DEV) | (agg["amount_sum_60m"] >= C_AMT)
].copy()
alerts_C["rule"] = "C_aml_burst_1h"
alerts_C["score"] = ((alerts_C["tx_count_60m"] >= C_TX).astype(int) +
                     (alerts_C["uniq_devices_60m"] >= C_DEV).astype(int) +
                     (alerts_C["amount_sum_60m"] >= C_AMT).astype(int))
alerts_C["kind"] = "aml_tx"
alerts_C.to_csv(ALERTS/"day5_ruleC_aml_burst_1h.csv", index=False)

# ---------- Consolidado ----------
def ensure_cols(df, cols):
    for c in cols:
        if c not in df.columns: df[c] = pd.NA
    return df

common_cols = ["rule","score","kind","pid","process","args","raddr","rport","state","cpf","ts_1h","tx_count_60m","uniq_devices_60m","amount_sum_60m"]
alerts_A2 = ensure_cols(alerts_A[["rule","score","kind","pid","process","args"]].copy(), common_cols) if not alerts_A.empty else pd.DataFrame(columns=common_cols)
alerts_B2 = ensure_cols(svchost_http[["rule","score","kind","pid","process","raddr","rport","state"]].copy(), common_cols) if not svchost_http.empty else pd.DataFrame(columns=common_cols)
alerts_C2 = ensure_cols(alerts_C[["rule","score","kind","cpf","ts_1h","tx_count_60m","uniq_devices_60m","amount_sum_60m"]].copy(), common_cols) if not alerts_C.empty else pd.DataFrame(columns=common_cols)

alerts_all = pd.concat([alerts_A2, alerts_B2, alerts_C2], ignore_index=True)
(alerts_all).to_csv(ALERTS/"day5_alerts_all.csv", index=False)

# ---------- Mostrar caminhos/exists ----------
print("Gerados:")
for p in [
    "day5_ruleA_webdav_rundll32.csv",
    "day5_ruleB_svchost_http_external.csv",
    "day5_ruleC_aml_burst_1h.csv",
    "day5_alerts_all.csv"
]:
    fp = ALERTS/p
    print("-", fp.resolve(), fp.exists())

alerts_all.head(10)


Gerados:
- /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks/reports/alerts/day5_ruleA_webdav_rundll32.csv True
- /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks/reports/alerts/day5_ruleB_svchost_http_external.csv True
- /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks/reports/alerts/day5_ruleC_aml_burst_1h.csv True
- /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/notebooks/reports/alerts/day5_alerts_all.csv True


  alerts_all = pd.concat([alerts_A2, alerts_B2, alerts_C2], ignore_index=True)


Unnamed: 0,rule,score,kind,pid,process,args,raddr,rport,state,cpf,ts_1h,tx_count_60m,uniq_devices_60m,amount_sum_60m
0,C_aml_burst_1h,1,aml_tx,,,,,,,111,2025-09-01 10:00:00,4,3,1300


In [32]:
from pathlib import Path
import pandas as pd, re, os

# Detecta raiz do repo (sobe até encontrar .git)
ROOT = Path.cwd()
while ROOT != ROOT.parent and not (ROOT/".git").exists():
    ROOT = ROOT.parent

# Pastas corretas no projeto
REPORTS = ROOT/"reports"
ALERTS  = REPORTS/"alerts"
DFIR    = REPORTS/"dfir"
DATA    = ROOT/"data"

ALERTS.mkdir(parents=True, exist_ok=True)

# Debug: mostre onde estamos salvando/lendo
print("ROOT:", ROOT)
print("DFIR:", DFIR)
print("ALERTS:", ALERTS)
print("DATA:", DATA)


ROOT: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio
DFIR: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/reports/dfir
ALERTS: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/reports/alerts
DATA: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/data


In [34]:
# ---------- Regra A (cmdline: WebDAV + rundll32 + powershell hidden) ----------
cmd_path = DFIR / "cmdline.txt"

cmdlines = []
if cmd_path.exists():
    for ln in cmd_path.read_text(errors="ignore").splitlines():
        # tentar quebrar por TABs ou múltiplos espaços
        parts = re.split(r"\s{2,}|\t", ln.strip())
        # esperamos algo como: PID, Process, Args
        if len(parts) >= 3 and parts[0].strip().isdigit():
            pid = int(parts[0].strip())
            proc = parts[1].strip()
            args = "  ".join(parts[2:]).strip()
            cmdlines.append((pid, proc, args))

cmd_df = pd.DataFrame(cmdlines, columns=["pid", "process", "args"]) if cmdlines else pd.DataFrame(columns=["pid","process","args"])

# Indicadores
pat_webdav = r'\\\\(?:\d{1,3}(?:\.\d{1,3}){3}|[A-Za-z0-9\.-]+)@\d+\\davwwwroot\\?'
pat_rundll = r'\brundll32\b'
pat_ps_hidden = r'\bpowershell\.exe\b.*-windowstyle\s+hidden'

alerts_A = cmd_df[cmd_df["args"].str.contains(pat_webdav, flags=re.I, na=False)].copy()

if not alerts_A.empty:
    alerts_A["iocs"] = "WebDAV"
    alerts_A.loc[alerts_A["args"].str.contains(pat_rundll, flags=re.I, na=False), "iocs"] += "+rundll32"
    alerts_A.loc[alerts_A["args"].str.contains(pat_ps_hidden, flags=re.I, na=False), "iocs"] += "+ps_hidden"

    def score_A(iocs: str) -> int:
        s = 0
        if "WebDAV" in iocs: s += 2
        if "rundll32" in iocs: s += 2
        if "ps_hidden" in iocs: s += 1
        return s

    alerts_A["score"] = alerts_A["iocs"].apply(score_A)
    alerts_A["rule"]  = "A_WebDAV_rundll32"
    alerts_A["kind"]  = "dfir_cmdline"

out_A = ALERTS / "day5_ruleA_webdav_rundll32.csv"
alerts_A.to_csv(out_A, index=False)
print("Regra A →", out_A, "linhas:", len(alerts_A))
alerts_A.head(10)


Regra A → /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/reports/alerts/day5_ruleA_webdav_rundll32.csv linhas: 2


Unnamed: 0,pid,process,args,iocs,score,rule,kind
105,3692,powershell.exe,powershell.exe -windowstyle hidden net use \\...,WebDAV+rundll32+ps_hidden,5,A_WebDAV_rundll32,dfir_cmdline
107,2416,net.exe,"""C:\Windows\system32\net.exe"" use \\45.9.74.32...",WebDAV,2,A_WebDAV_rundll32,dfir_cmdline


In [35]:
# ---------- Regra B (netscan: svchost -> HTTP externo 80/8080) ----------
def is_private_ip(ip: str) -> bool:
    try:
        o = list(map(int, ip.split(".")))
        return (o[0] == 10) or (o[0] == 192 and o[1] == 168) or (o[0] == 172 and 16 <= o[1] <= 31)
    except:
        return False

ns_path = DFIR / "netscan.txt"
rows = []

if ns_path.exists():
    for ln in ns_path.read_text(errors="ignore").splitlines():
        parts = ln.split()
        # Esperado: TCPv4 192.168.x.x 51035 196.204.4.8 80 ESTABLISHED 1260 svchost.exe <timestamp...>
        if len(parts) >= 8 and parts[0].startswith("TCP"):
            proto, laddr, lport, raddr, rport, state = parts[:6]
            pid = parts[6] if parts[6].isdigit() else None
            proc = parts[7] if not parts[7].isdigit() else None
            rows.append((proto, laddr, lport, raddr, rport, state, pid, proc, ln))

ns = pd.DataFrame(rows, columns=["proto","laddr","lport","raddr","rport","state","pid","process","raw"]) if rows else pd.DataFrame(columns=["proto","laddr","lport","raddr","rport","state","pid","process","raw"])

svchost_http = ns[
    (ns["process"].str.lower() == "svchost.exe") &
    (ns["rport"].isin(["80","8080"])) &
    (~ns["raddr"].map(is_private_ip))
].copy()

if not svchost_http.empty:
    svchost_http["rule"]  = "B_svchost_http_external"
    svchost_http["score"] = 2
    svchost_http["kind"]  = "dfir_netscan"

out_B = ALERTS / "day5_ruleB_svchost_http_external.csv"
svchost_http.to_csv(out_B, index=False)
print("Regra B →", out_B, "linhas:", len(svchost_http))
svchost_http[["process","pid","raddr","rport","state"]].drop_duplicates().head(10)


Regra B → /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/reports/alerts/day5_ruleB_svchost_http_external.csv linhas: 0


Unnamed: 0,process,pid,raddr,rport,state


In [37]:
# ---------- Regra C (AML burst 1h por CPF) - Loader robusto ----------
from pathlib import Path
import pandas as pd, numpy as np
import glob, re

def smart_read_dataset(DATA: Path):
    # 1) Tentativas diretas
    candidates = [
        DATA / "paysim.parquet",
        DATA / "paysim_sample.parquet",
    ]
    # 2) Qualquer parquet na pasta
    candidates += [Path(p) for p in glob.glob(str(DATA / "*.parquet"))]
    # 3) CSV(s)
    candidates += [Path(p) for p in glob.glob(str(DATA / "transactions.csv"))]
    candidates += [Path(p) for p in glob.glob(str(DATA / "*.csv"))]

    for p in candidates:
        if p.exists():
            if p.suffix.lower() == ".parquet":
                print(f"Lendo PARQUET: {p}")
                df = pd.read_parquet(p)
            else:
                print(f"Lendo CSV: {p}")
                df = pd.read_csv(p)
            return df, p

    # 4) Se nada for encontrado, cria demo
    print("Nenhum arquivo encontrado em data/. Gerando DEMO dataset…")
    rng = pd.date_range("2025-09-01 09:00:00", periods=40, freq="15min")
    demo = pd.DataFrame({
        "ts": np.random.choice(rng, size=200),
        "cpf": np.random.choice(["111","222","333"], size=200, p=[0.5,0.3,0.2]),
        "device_id": np.random.choice(["devA","devB","devC","devD"], size=200),
        "amount": np.random.randint(10, 5000, size=200),
        "ip": np.random.choice(["10.0.0.10","10.0.0.11","192.168.1.12"], size=200),
        "channel": np.random.choice(["web","app","atm"], size=200)
    })
    demo_path = DATA / "demo_paysim.parquet"
    DATA.mkdir(parents=True, exist_ok=True)
    demo.to_parquet(demo_path, index=False)
    print(f"DEMO salvo em: {demo_path}")
    return demo, demo_path

# Carregar dataset
tx, tx_src = smart_read_dataset(DATA)

# Normalizar nomes esperados (ts, cpf, device_id, amount)
colmap = {c.lower(): c for c in tx.columns}
def find_col(options):
    for name in options:
        if name in colmap: return colmap[name]
    return None

ts_col   = find_col(["ts","timestamp","data_hora","datetime"])
cpf_col  = find_col(["cpf","customer_id","client_id","account_id"])
dev_col  = find_col(["device_id","device","dev_id"])
amt_col  = find_col(["amount","valor","value","amt"])

missing = [("ts",ts_col),("cpf",cpf_col),("device_id",dev_col),("amount",amt_col)]
missing = [k for k,v in missing if v is None]
if missing:
    raise ValueError(f"Colunas obrigatórias ausentes no dataset: {missing}. "
                     f"Cabecalho atual: {tx.columns.tolist()}")

tx = tx.rename(columns={ts_col:"ts", cpf_col:"cpf", dev_col:"device_id", amt_col:"amount"})
tx["ts"] = pd.to_datetime(tx["ts"], errors="coerce")
tx = tx.dropna(subset=["ts","cpf","device_id","amount"])

print("Fonte dos dados:", tx_src)
print(tx.shape)
display(tx.head(3))

# ---------- Agregação 60 min ----------
agg = (tx.assign(ts_1h=tx["ts"].dt.floor("60min"))
         .groupby(["cpf","ts_1h"])
         .agg(
             tx_count_60m=("cpf","size"),
             uniq_devices_60m=("device_id", pd.Series.nunique),
             amount_sum_60m=("amount","sum")
         )
         .reset_index())

# Limiar padrão (ajuste conforme seu dataset real)
C_TX, C_DEV, C_AMT = 5, 3, 50_000

alerts_C = agg[
    (agg["tx_count_60m"] >= C_TX) |
    (agg["uniq_devices_60m"] >= C_DEV) |
    (agg["amount_sum_60m"] >= C_AMT)
].copy()

alerts_C["rule"]  = "C_aml_burst_1h"
alerts_C["score"] = (
    (alerts_C["tx_count_60m"] >= C_TX).astype(int) +
    (alerts_C["uniq_devices_60m"] >= C_DEV).astype(int) +
    (alerts_C["amount_sum_60m"] >= C_AMT).astype(int)
)
alerts_C["kind"] = "aml_tx"

out_C = ALERTS / "day5_ruleC_aml_burst_1h.csv"
alerts_C.to_csv(out_C, index=False)
print("Regra C →", out_C, "linhas:", len(alerts_C))
alerts_C.sort_values(["score","amount_sum_60m","tx_count_60m"], ascending=False).head(10)


Lendo PARQUET: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/data/paysim.parquet
Fonte dos dados: /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/data/paysim.parquet
(6, 12)


Unnamed: 0,ts,cpf,device_id,ip,asn,city,amount,currency,channel,merchant_id,beneficiary_id,type
0,2025-09-01 10:00:00,111,devA,200.100.10.1,AS123,Campinas,120,BRL,app,m01,b01,payment
1,2025-09-01 10:02:00,111,devA,200.100.10.1,AS123,Campinas,180,BRL,app,m02,b02,payment
2,2025-09-01 10:03:00,111,devB,177.23.44.9,AS456,São Paulo,300,BRL,web,m03,b03,payment


Regra C → /Users/ricardoalmeida/Projetos/blue-team-aml-portfolio/reports/alerts/day5_ruleC_aml_burst_1h.csv linhas: 1


Unnamed: 0,cpf,ts_1h,tx_count_60m,uniq_devices_60m,amount_sum_60m,rule,score,kind
0,111,2025-09-01 10:00:00,4,3,1300,C_aml_burst_1h,1,aml_tx
