<a href="https://colab.research.google.com/github/Tiamaria278/5.18-Cyber-Arch-Homework-7-/blob/main/Prescriptive_DGA_Detector_Colab_HW9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# From Black Box to Playbook — DGA Detector (H2O + SHAP + Gemini)

This Colab notebook trains a DGA detector with **H2O AutoML**, explains single predictions with **per-row SHAP** (via `predict_contributions`), and converts the explanation into a **prescriptive incident response playbook** using **Google Gemini**.

**Run order:** top-to-bottom. If you want LLM-generated playbooks, set your API key in the **GenAI Setup** cell.

In [1]:
#@title Install dependencies
!pip -q install h2o==3.46.0.7 google-genai tldextract shap
print("✅ Dependencies installed")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.9/265.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Dependencies installed


In [2]:
#@title Imports & helpers
import os, math, random, json, pandas as pd
from pathlib import Path

import h2o
from h2o.automl import H2OAutoML

random.seed(42)

def shannon_entropy(s: str) -> float:
    if not s:
        return 0.0
    p, lns = {}, float(len(s))
    for c in s:
        p[c] = p.get(c, 0) + 1
    return -sum((count/lns) * math.log((count/lns), 2) for count in p.values())

print("✅ Imports ready")

✅ Imports ready


In [3]:
#@title Generate synthetic training data (entropy + length)
import csv

out_path = "/content/dga_dataset_train.csv"
header = ['domain','length','entropy','class']
data = []

legit_roots = ['google','facebook','amazon','github','wikipedia','microsoft']
for _ in range(100):
    d = random.choice(legit_roots) + ".com"
    data.append([d, len(d), shannon_entropy(d), 'legit'])

alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
for _ in range(100):
    L = random.randint(15,25)
    root = "".join(random.choice(alphabet) for _ in range(L))
    d = root + ".com"
    data.append([d, len(d), shannon_entropy(d), 'dga'])

with open(out_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)

df = pd.read_csv(out_path)
print("✅ dga_dataset_train.csv created ->", out_path, "Rows:", len(df))
df.head()

✅ dga_dataset_train.csv created -> /content/dga_dataset_train.csv Rows: 200


Unnamed: 0,domain,length,entropy,class
0,microsoft.com,13,3.026987,legit
1,google.com,10,2.646439,legit
2,google.com,10,2.646439,legit
3,microsoft.com,13,3.026987,legit
4,amazon.com,10,2.721928,legit


In [11]:
!python 1_generate_dga_data.py
!python 1_train_and_export.py
!python 2_analyze_domain.py --domain kq3v9z7j1x5f8g2h.info

python3: can't open file '/content/1_generate_dga_data.py': [Errno 2] No such file or directory
python3: can't open file '/content/1_train_and_export.py': [Errno 2] No such file or directory
python3: can't open file '/content/2_analyze_domain.py': [Errno 2] No such file or directory


In [13]:
aml = H2OAutoML(
    max_models=20,
    max_runtime_secs=120,
    seed=1,
    sort_metric="AUC",
    nfolds=0,                     # no CV -> bigger effective training split
    include_algos=["DRF","XGBoost"]  # tree models w/ SHAP; avoid GBM min_rows path
)


In [14]:
N_LEGIT = 500
N_DGA   = 500


In [15]:
#@title GenAI Setup (Optional) — set your API key for Gemini
# Enter your key here or set it securely via Colab secrets.
# Example: os.environ["GENAI_API_KEY"] = "YOUR_KEY_HERE"
GENAI_MODEL = os.getenv("GENAI_MODEL", "gemini-1.5-flash")

try:
    from google import genai as google_genai  # new SDK
    GENAI_KIND = "new"
except Exception:
    try:
        import google.generativeai as google_genai  # legacy SDK
        GENAI_KIND = "legacy"
    except Exception:
        google_genai = None
        GENAI_KIND = None

def _template_playbook(xai_findings: str) -> str:
    return f'''
### Prescriptive Incident Response Playbook (Template)

**Summary of Findings**
{xai_findings.strip()}

**Immediate (0–15 min)**
1. Block the domain in DNS and web egress.
2. Scope historical DNS/HTTP hits.
3. Isolate endpoints that contacted the domain.

**Containment (15–60 min)**
4. Collect triage artifacts (memory, netstat, autoruns).
5. Rotate credentials/tokens used on affected systems.
6. Add detections for lookalike high-entropy domains.

**Eradication & Recovery (Same day)**
7. Remove persistence/malware; reimage if integrity is uncertain.
8. Restore from known-good backups and verify.

**Lessons & Hardening (Week 1)**
9. Add drift monitors; schedule periodic re-training.
10. Document IOCs and share appropriately.
'''.strip()

def generate_playbook(xai_findings: str) -> str:
    api_key = os.getenv("GENAI_API_KEY") or os.getenv("GOOGLE_API_KEY")
    if not google_genai or not api_key:
        return _template_playbook(xai_findings)

    if GENAI_KIND == "new":
        client = google_genai.Client(api_key=api_key)
        prompt = f'''
You are a senior SOC analyst. Based on the following model explanation, produce a concise, step-by-step
prescriptive incident response playbook tailored to this alert. Organize by phases (Immediate, Contain,
Eradicate, Recover, Lessons). Reference the concrete findings and avoid boilerplate.

XAI Findings:
{xai_findings}
'''.strip()
        resp = client.models.generate_content(model=GENAI_MODEL, contents=prompt)
        return getattr(resp, "output_text", str(resp)).strip()

    google_genai.configure(api_key=api_key)
    model = google_genai.GenerativeModel(GENAI_MODEL)
    prompt = f'''
You are a senior SOC analyst. Based on the following model explanation, produce a concise, step-by-step
prescriptive incident response playbook tailored to this alert. Organize by phases (Immediate, Contain,
Eradicate, Recover, Lessons). Reference the concrete findings and avoid boilerplate.

XAI Findings:
{xai_findings}
'''.strip()
    resp = model.generate_content(prompt)
    return getattr(resp, "text", str(resp)).strip()

print("✅ GenAI configured (template fallback if no key present).")

✅ GenAI configured (template fallback if no key present).


Patch: robust analyze_domain that never fails when probability columns are missing

In [22]:
# Patch: robust analyze_domain that never fails when probability columns are missing
def analyze_domain(domain: str, native_fallback=None):
    import pandas as pd
    import math

    FEATURES = ["length", "entropy"]

    def compute_features(raw_domain: str) -> pd.DataFrame:
        d = raw_domain.strip().lower()
        return pd.DataFrame([{
            "domain": d,
            "length": len(d),           # must match training
            "entropy": shannon_entropy(d)
        }])

    # ensure H2O cluster
    try:
        h2o.cluster()
    except Exception:
        h2o.init()

    # Load MOJO (first .zip in /content/model or specific leader if you prefer)
    zips = sorted(Path("/content/model").glob("*.zip"))
    if not zips:
        raise FileNotFoundError("No MOJO found. Train AutoML and export a MOJO first.")
    mojo_path = zips[0]
    print("Loading MOJO:", mojo_path)
    model = h2o.import_mojo(str(mojo_path))

    # Features and prediction
    pdf = compute_features(domain)
    h2f = h2o.H2OFrame(pdf[FEATURES])
    pred_df = model.predict(h2f).as_data_frame()

    # Label
    if "predict" not in pred_df.columns:
        # extreme edge-case: ensure we still have a label string
        label = str(pred_df.iloc[0, 0])
    else:
        label = str(pred_df.loc[0, "predict"]).strip()

    # Probability (robust): look for ANY numeric columns other than 'predict'
    num_cols = [c for c in pred_df.columns
                if c.lower() != "predict" and pd.api.types.is_numeric_dtype(pred_df[c])]
    # Keep only finite values
    probs = []
    for c in num_cols:
        val = pred_df.loc[0, c]
        try:
            fv = float(val)
            if math.isfinite(fv):
                probs.append(fv)
        except Exception:
            pass
    proba = max(probs) if probs else None  # None when MOJO doesn’t return probabilities

    # SHAP per-row contributions (robust)
    xai_summary = ""
    shap_contribs = {}
    try:
        contrib_df = model.predict_contributions(h2f).as_data_frame()
        shap_contribs = {k: float(v) for k, v in contrib_df.loc[0].items() if k in FEATURES}
        ranked = sorted(shap_contribs.items(), key=lambda kv: abs(kv[1]), reverse=True)
        parts = []
        for fname, val in ranked[:3]:
            direction = "towards DGA" if val >= 0 else "towards legit"
            parts.append(f"- {fname}: {val:+.3f} ({direction}); value={pdf.loc[0, fname]:.3f}")
        prob_txt = "N/A" if proba is None else f"{proba:.1%}"
        xai_summary = (
            f"Alert: Potential {label.upper()} detected.\n"
            f"Domain: '{domain}'\n"
            f"AI Model Explanation (local SHAP): With probability {prob_txt} for '{label}', "
            f"the largest drivers were:\n" + "\n".join(parts)
        )
    except Exception as e:
        # Try native_fallback if provided (e.g., non-MOJO leader still in memory)
        if native_fallback is not None:
            try:
                contrib_df = native_fallback.predict_contributions(h2f).as_data_frame()
                shap_contribs = {k: float(v) for k, v in contrib_df.loc[0].items() if k in FEATURES}
                ranked = sorted(shap_contribs.items(), key=lambda kv: abs(kv[1]), reverse=True)
                parts = []
                for fname, val in ranked[:3]:
                    direction = "towards DGA" if val >= 0 else "towards legit"
                    parts.append(f"- {fname}: {val:+.3f} ({direction}); value={pdf.loc[0, fname]:.3f}")
                prob_txt = "N/A" if proba is None else f"{proba:.1%}"
                xai_summary = (
                    f"Alert: Potential {label.upper()} detected.\n"
                    f"Domain: '{domain}'\n"
                    f"AI Model Explanation (local SHAP via fallback): With probability {prob_txt} for '{label}', "
                    f"the largest drivers were:\n" + "\n".join(parts)
                )
            except Exception as e2:
                xai_summary = (
                    "Local SHAP contributions unavailable for this model type. "
                    f"(MOJO error: {type(e).__name__}: {e}; fallback error: {type(e2).__name__}: {e2})"
                )
        else:
            xai_summary = (
                "Local SHAP contributions unavailable for this model type. "
                f"(Detail: {type(e).__name__}: {e})"
            )

    # GenAI playbook
    playbook = generate_playbook(xai_summary)

    return {
        "domain": domain,
        "prediction": label,
        "probability": proba,
        "features": pdf.iloc[0].to_dict(),
        "shap_contributions": shap_contribs,
        "xai_findings": xai_summary,
        "playbook": playbook
    }


In [23]:
#@title Demo: analyze a sample domain (robust to missing probabilities)
sample_domain = "kq3v9z7j1x5f8g2h.info"  #@param {type:"string"}

# Run the analysis (uses your existing analyze_domain function)
try:
    result = analyze_domain(sample_domain, native_fallback=native_leader)
except Exception as e:
    raise RuntimeError(f"analyze_domain failed: {type(e).__name__}: {e}")

# Display prediction safely even if probability is missing
prob_value = result.get("probability", None)
prob_display = "N/A" if prob_value is None else float(prob_value)

print("=== Prediction ===")
print({
    "domain": result.get("domain", sample_domain),
    "predicted": result.get("prediction"),
    "probability": prob_display
})

print("\n=== XAI Findings ===")
print(result.get("xai_findings", "<none>"))

print("\n=== Prescriptive Playbook ===")
print(result.get("playbook", "<none>"))

# Save output JSON (robust)
from pathlib import Path
import json
out_dir = Path("/content/output")
out_dir.mkdir(parents=True, exist_ok=True)
with open(out_dir / "last_run.json", "w") as f:
    json.dump(result, f, indent=2)

print("\nSaved /content/output/last_run.json")


Loading MOJO: /content/model/GBM_4_AutoML_2_20250819_21743.zip
generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%
contributions progress: |




████████████████████████████████████████████████████████| (done) 100%
=== Prediction ===
{'domain': 'kq3v9z7j1x5f8g2h.info', 'predicted': 'dga', 'probability': 0.9999999999999972}

=== XAI Findings ===
Alert: Potential DGA detected.
Domain: 'kq3v9z7j1x5f8g2h.info'
AI Model Explanation (local SHAP): With probability 100.0% for 'dga', the largest drivers were:
- length: -17.755 (towards legit); value=21.000
- entropy: -15.774 (towards legit); value=4.297

=== Prescriptive Playbook ===
### Prescriptive Incident Response Playbook (Template)

**Summary of Findings**
Alert: Potential DGA detected.
Domain: 'kq3v9z7j1x5f8g2h.info'
AI Model Explanation (local SHAP): With probability 100.0% for 'dga', the largest drivers were:
- length: -17.755 (towards legit); value=21.000
- entropy: -15.774 (towards legit); value=4.297

**Immediate (0–15 min)**
1. Block the domain in DNS and web egress.
2. Scope historical DNS/HTTP hits.
3. Isolate endpoints that contacted the domain.

**Containment (15–60 min




In [24]:
#@title Optional: Shutdown H2O cluster
# h2o.shutdown(prompt=False)
print("Notebook complete. You can shut down H2O above if desired.")

Notebook complete. You can shut down H2O above if desired.
