In [27]:
import os, re, zipfile
import numpy as np
import pandas as pd
from io import BytesIO
from html import unescape
from oletools.olevba import VBA_Parser

# ---------------- I/O ----------------
def load_train_zip(zip_path: str):
    """Read the labelled training samples from a ZIP archive.

    Directory entries and members ending in ``.labels`` are skipped, as is
    any member whose base name does not end in ``.0`` or ``.1``. The digit
    after the final dot is used as the sample's integer label.

    Args:
        zip_path: Location of the training archive (passed to
            ``zipfile.ZipFile``).

    Returns:
        A tuple ``(names, contents, labels)``: the kept member names, their
        raw bytes, and an integer NumPy array of labels, all in archive order.
    """
    kept_names, payloads, targets = [], [], []
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.namelist():
            if member.endswith(("/", ".labels")):
                continue
            file_name = os.path.basename(member)
            if not file_name.endswith((".0", ".1")):
                continue
            # The suffix check above guarantees the last character is the label.
            targets.append(int(file_name[-1]))
            payloads.append(archive.read(member))
            kept_names.append(member)
    return kept_names, payloads, np.asarray(targets, dtype=int)

def load_test_zip(zip_path: str):
    """Read every unlabelled sample from a ZIP archive.

    Directory entries and members ending in ``.labels`` are skipped; all
    other members are returned in archive order.

    Args:
        zip_path: Location of the test archive (passed to
            ``zipfile.ZipFile``).

    Returns:
        A tuple ``(names, contents)`` of member names and their raw bytes.
    """
    with zipfile.ZipFile(zip_path) as archive:
        members = [
            member
            for member in archive.namelist()
            if not member.endswith(("/", ".labels"))
        ]
        payloads = [archive.read(member) for member in members]
    return members, payloads

# Eagerly load the training archive into module-level names when this cell runs.
# NOTE(review): main() loads "docx-train.zip" again into its own locals, so
# these globals look unused in the visible code -- confirm before removing.
names, docx_bytes, labels = load_train_zip("docx-train.zip")

# ---------------- XML / TEXT HANDLING ----------------
# Matches the content of real <w:t> text runs only. The tag name must be
# followed by whitespace (attributes) or directly by '>', so sibling tags that
# merely start with "w:t" (<w:tbl>, <w:tc>, <w:tr>, <w:tab/>, ...) are not
# mistaken for text runs. The look-behind rejects self-closing <w:t .../>.
_TEXT_RUN_RE = re.compile(r"<w:t(?:\s[^>]*)?(?<!/)>(.*?)</w:t>", flags=re.I | re.S)

# A "word" is a run of ASCII/German letters and apostrophes.
_WORD_RE = re.compile(r"[A-Za-zÄÖÜäöüß']+")


def _safe_read_document_xml(docx_bytes: bytes) -> str:
    """Return the decoded content of ``word/document.xml``.

    Returns ``''`` when the member is missing, the archive is corrupt, or the
    bytes are not a DOCX/ZIP at all. Undecodable bytes are dropped.
    """
    try:
        with zipfile.ZipFile(BytesIO(docx_bytes), "r") as z:
            if "word/document.xml" not in z.namelist():
                return ""
            return z.read("word/document.xml").decode("utf-8", errors="ignore")
    except Exception:
        # Deliberate best-effort: any unreadable input counts as "no XML".
        return ""


def _text_from_document_xml(xml: str) -> str:
    """Join the unescaped contents of all ``<w:t>`` runs with single spaces."""
    return " ".join(unescape(run) for run in _TEXT_RUN_RE.findall(xml))


def extract_plain_text_docx(docx_bytes: bytes) -> str:
    """Extract the visible text of a DOCX given as raw bytes.

    Args:
        docx_bytes: Raw content of the (supposed) DOCX file.

    Returns:
        The text runs joined by spaces, or ``''`` if no document XML could
        be read.
    """
    xml = _safe_read_document_xml(docx_bytes)
    if not xml:
        return ""
    return _text_from_document_xml(xml)


def count_words_from_document_xml(xml: str) -> int:
    """Count the words contained in the text runs of a document XML string.

    Args:
        xml: Content of ``word/document.xml``; may be empty.

    Returns:
        Number of letter/apostrophe sequences in the text, 0 for empty input.
    """
    if not xml:
        return 0
    return len(_WORD_RE.findall(_text_from_document_xml(xml)))


def count_words_from_docx_bytes(docx_bytes: bytes) -> int:
    """Count the words in a DOCX given as raw bytes (0 if unreadable)."""
    xml = _safe_read_document_xml(docx_bytes)
    return count_words_from_document_xml(xml)

# ---------------- MACRO ANALYSE ----------------
def _detect_vba(filename: str, data: bytes) -> bool:
    """Run olevba macro detection on *data*, always closing the parser."""
    parser = VBA_Parser(filename, data)
    try:
        return bool(parser.detect_vba_macros())
    finally:
        # Close even when detection raises, so the parser is not leaked.
        parser.close()


def vba_macros(docx_bytes: bytes) -> bool:
    """Report whether the document contains VBA macros.

    Detection is attempted twice: first by handing the whole file to olevba,
    then, as a fallback, by opening the file as a ZIP and checking every
    member whose name ends in ``vbaProject.bin``.

    Args:
        docx_bytes: Raw content of the (supposed) DOCX file.

    Returns:
        True if either attempt finds macros, False otherwise. Parsing errors
        never propagate; they count as "nothing found".
    """
    # 1) Try olevba on the complete file.
    try:
        if _detect_vba("", docx_bytes):
            return True
    except Exception:
        # Deliberate best-effort: fall through to the explicit ZIP scan.
        pass

    # 2) Fallback: look for vbaProject.bin members explicitly.
    try:
        with zipfile.ZipFile(BytesIO(docx_bytes), "r") as z:
            for n in z.namelist():
                if n.lower().endswith("vbaproject.bin") and _detect_vba(n, z.read(n)):
                    return True
    except Exception:
        # Not a ZIP or an unparsable member: treat as "no macros".
        pass

    return False

# ---------------- FEATURE EXTRACTION ----------------
def extract_features(docx_bytes: bytes) -> dict:
    """Compute the numeric feature dictionary for one document.

    Args:
        docx_bytes: Raw content of the (supposed) DOCX file.

    Returns:
        A dict with the keys ``vba_macros_found`` (0/1), ``text_len``
        (characters of extracted text), ``word_count`` and
        ``has_document_xml`` (0/1), in that insertion order.
    """
    # Derive the flag from the XML itself: a valid document.xml without any
    # text runs must still count as present.
    xml = _safe_read_document_xml(docx_bytes)
    text = extract_plain_text_docx(docx_bytes) if xml else ""
    return {
        "vba_macros_found": int(vba_macros(docx_bytes)),
        "text_len": len(text),
        "word_count": len(re.findall(r"[A-Za-zÄÖÜäöüß']+", text)),
        "has_document_xml": int(bool(xml)),  # useful for debugging / ML
    }

def features_to_matrix(docx_list):
    """Turn a sequence of raw documents into a feature matrix.

    Args:
        docx_list: Iterable of raw document bytes.

    Returns:
        A tuple ``(X, cols, df)``: the float matrix, the list of column
        names, and the underlying DataFrame (missing values filled with 0.0).
    """
    records = []
    for blob in docx_list:
        records.append(extract_features(blob))
    frame = pd.DataFrame(records).fillna(0.0)
    return frame.to_numpy(dtype=float), list(frame.columns), frame

In [28]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score


def main():
    """Train an SVM on the training archive and write test predictions.

    Loads ``docx-train.zip``, extracts features, runs a 5-fold grid search
    over an SVC pipeline (scored by balanced accuracy, refit on all training
    data), predicts labels for ``docx-test.zip`` and writes one
    ``<member name>;<label>`` line per test file to ``output.csv``.
    """
    train_zip = "docx-train.zip"
    test_zip = "docx-test.zip"

    print("[+] Lade Trainingsdaten…")
    _, train_blobs, targets = load_train_zip(train_zip)

    print("[+] Extrahiere Features (Train)…")
    train_matrix, columns, _ = features_to_matrix(train_blobs)
    print(f"[+] X_train: {train_matrix.shape}, Features: {columns}")

    print("[+] GridSearchCV (CV, refit auf ALLEN Trainingsdaten)…")
    estimator = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(probability=False, random_state=42)),
    ])
    search_space = {
        "svm__kernel": ["rbf", "linear"],
        "svm__C": [1, 10, 100, 1000],
        "svm__gamma": ["scale", "auto"],
    }
    search = GridSearchCV(
        estimator,
        search_space,
        scoring="balanced_accuracy",
        cv=5,
        n_jobs=-1,
        refit=True,
        verbose=1,
    )
    search.fit(train_matrix, targets)
    best_model = search.best_estimator_
    print("[+] Beste Parameter:", search.best_params_)

    print("[+] Lade Testdaten…")
    test_members, test_blobs = load_test_zip(test_zip)

    print("[+] Extrahiere Features (Test)…")
    test_matrix = features_to_matrix(test_blobs)[0]

    print("[+] Erzeuge Vorhersagen…")
    predicted = best_model.predict(test_matrix)

    print("[+] Schreibe output.csv …")
    with open("output.csv", "w", encoding="utf-8") as out:
        out.writelines(
            f"{member};{int(label)}\n"
            for member, label in zip(test_members, predicted)
        )

    print("[+] Fertig: output.csv")


# Run the full train/predict pipeline only when executed as a script or cell,
# not when this module is imported.
if __name__ == "__main__":
    main()

[+] Lade Trainingsdaten…
[+] Extrahiere Features (Train)…
[+] X_train: (6301, 4), Features: ['vba_macros_found', 'text_len', 'word_count', 'has_document_xml']
[+] GridSearchCV (CV, refit auf ALLEN Trainingsdaten)…
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[+] Beste Parameter: {'svm__C': 1, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
[+] Lade Testdaten…
[+] Extrahiere Features (Test)…
[+] Erzeuge Vorhersagen…
[+] Schreibe output.csv …
[+] Fertig: output.csv
