In [5]:
import os, re, math, zipfile
from collections import Counter
from urllib.parse import urlparse, urljoin
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from bs4 import XMLParsedAsHTMLWarning
import warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [6]:
def load_train_zip(zip_path: str):
    """Read the labelled training pages out of a zip archive.

    A member is used only when its base name ends in ``.0`` or ``.1``; the
    digit after the last dot becomes the label.  Directory entries and
    members whose name ends in ``.labels`` are skipped.

    Each page is decoded as UTF-8 (undecodable bytes are dropped) and
    lower-cased.

    Returns:
        A ``(pages, labels)`` tuple: ``pages`` is a list of str in archive
        order and ``labels`` is an integer numpy array of the same length.

    NOTE(review): because the text is lower-cased here, case-based features
    computed later (e.g. ``uppercase_ratio`` in ``extract_features``) no
    longer see the page's original capitalisation -- confirm this is intended.
    """
    label_suffixes = (".0", ".1")
    pages = []
    targets = []
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.namelist():
            # Directory entries and the label index carry no page content.
            if member.endswith(("/", ".labels")):
                continue
            filename = os.path.basename(member)
            if not filename.endswith(label_suffixes):
                continue
            # The class label is encoded as the file extension.
            targets.append(int(filename.rsplit(".", 1)[1]))
            payload = archive.read(member)
            pages.append(payload.decode("utf-8", errors="ignore").lower())
    return pages, np.asarray(targets, dtype=int)

def load_test_zip(zip_path: str):
    """Read the unlabelled test pages out of a zip archive.

    Every member is read except directory entries and members whose name
    ends in ``.labels``.  Each page is decoded as UTF-8 (undecodable bytes
    are dropped) and lower-cased.

    Returns:
        A ``(files, pages)`` tuple of equally long lists: the archive member
        names (full path inside the zip) and the corresponding page text,
        both in archive order.
    """
    member_names = []
    pages = []
    with zipfile.ZipFile(zip_path) as archive:
        wanted = [
            member
            for member in archive.namelist()
            if not member.endswith(("/", ".labels"))
        ]
        for member in wanted:
            payload = archive.read(member)
            pages.append(payload.decode("utf-8", errors="ignore").lower())
            member_names.append(member)
    return member_names, pages

In [7]:
import re, math
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd

# Lower-case keywords looked up in the concatenated anchor text by
# extract_features.
# NOTE(review): that lookup is a substring test, so e.g. "win" is also found
# inside "window"/"winner" and "now" inside "know" -- confirm this is intended.
SPAM_WORDS = {
    "free","win","winner","urgent","limited","verify","account","login",
    "prize","bonus","crypto","airdrop","gift","click","now","offer","claim"
}
# E-mail-like token: local part, "@", dotted domain, final label of 2+ letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Phone-like token: optional "+", a digit, at least 6 digits/whitespace/()/./-
# characters, and a closing digit (8+ characters in total).
PHONE_RE = re.compile(r"\+?\d[\d\s().-]{6,}\d")
# 13-19 digits, each optionally followed by spaces or hyphens.  There is no
# checksum test, so any sufficiently long digit run counts as a hit.
CARD_RE  = re.compile(r"\b(?:\d[ -]*?){13,19}\b")
# Names of JavaScript constructs treated as obfuscation indicators
# (case-insensitive).
# NOTE(review): the trailing \b also applies to the `Function\s*\(` branch, so
# that branch only matches when a word character directly follows "(";
# `Function("...")` and `Function()` are not matched.  With re.I it also
# matches ordinary `function(x` expressions.
OBF_JS   = re.compile(r"\b(eval|atob|unescape|Function\s*\(|document\.write)\b", re.I)

def prepare_html(raw):
    """Normalise one raw page into a clean ``str`` ready for HTML parsing.

    Args:
        raw: Page content as ``str``, ``bytes`` or ``bytearray``.  Binary
            input is decoded as UTF-8, falling back to ISO-8859-1 when the
            bytes are not valid UTF-8.

    Returns:
        The text with every run of ASCII control characters (everything
        below 0x20 except tab, line feed and carriage return) replaced by a
        single space.

    Raises:
        TypeError: if ``raw`` is neither text nor bytes-like (raised by
            ``re.sub``).
    """
    if isinstance(raw, (bytes, bytearray)):
        try:
            raw = raw.decode("utf-8")
        except UnicodeDecodeError:
            # ISO-8859-1 maps every byte to a code point, so this fallback
            # cannot fail and needs no error handler.
            raw = raw.decode("iso-8859-1")
    # Sanitise decoded bytes as well as text: previously the bytes branch
    # returned early and kept control characters that the str branch removed.
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]+', ' ', raw)

# Column order of the feature matrix produced by features_to_matrix.
# Every name here must be a key of the dict returned by extract_features;
# changing the order changes the meaning of the columns a fitted model sees.
FEATURE_ORDER = [
    "html_len","text_len","word_count","text_density","link_count",
    "anchor_spamword_hits","form_count","password_field_count",
    "creditcard_pattern_hits","script_count","iframe_count",
    "obfuscated_js_hits","meta_refresh","email_hits","phone_hits",
    "exclamation_count","uppercase_ratio"
]

def extract_features(html: str) -> dict:
    """Compute the hand-crafted indicator features for one HTML page.

    The page is parsed with BeautifulSoup's ``html.parser``; ``text`` below is
    whatever ``soup.get_text(" ", strip=True)`` returns for it.

    Args:
        html: Page markup as a string.

    Returns:
        A dict with one numeric entry per name in ``FEATURE_ORDER``:

        * ``html_len`` / ``text_len`` -- character counts of markup and text.
        * ``word_count`` -- number of ``\\w+`` tokens in the lower-cased text.
        * ``text_density`` -- ``text_len / html_len`` (0 for empty markup),
          rounded to 4 decimals.
        * ``link_count`` / ``form_count`` / ``script_count`` /
          ``iframe_count`` -- number of ``<a>``, ``<form>``, ``<script>`` and
          ``<iframe>`` elements.
        * ``anchor_spamword_hits`` -- how many distinct ``SPAM_WORDS`` occur
          as a substring of the concatenated, lower-cased anchor text
          (range 0..len(SPAM_WORDS)).
        * ``password_field_count`` -- elements matching
          ``input[type="password"]``.
        * ``creditcard_pattern_hits`` / ``email_hits`` / ``phone_hits`` --
          number of ``CARD_RE`` / ``EMAIL_RE`` / ``PHONE_RE`` matches in text.
        * ``obfuscated_js_hits`` -- number of ``<script>`` elements whose text
          matches ``OBF_JS`` at least once (not the total match count).
        * ``meta_refresh`` -- 1 if a ``<meta http-equiv="refresh">`` element
          exists (case-insensitive attribute value), else 0.
        * ``exclamation_count`` -- number of ``!`` characters in text.
        * ``uppercase_ratio`` -- share of characters in text for which
          ``str.isupper()`` is true, rounded to 4 decimals.
    """
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(" ", strip=True)
    words = re.findall(r"\w+", text.lower())

    html_len = len(html)
    text_len = len(text)
    # max(..., 1) guards the division for empty input.
    text_density = text_len / max(html_len, 1)

    anchors = soup.find_all("a")
    link_count = len(anchors)
    # Build the anchor text once.  It does not depend on the spam word being
    # tested, so re-joining it for every word only repeated the same work.
    anchor_text = " ".join(a.get_text(" ", strip=True).lower() for a in anchors)
    # NOTE(review): substring test, so "win" also hits "window" -- kept as is
    # to leave feature values unchanged.
    anchor_spamword_hits = sum(word in anchor_text for word in SPAM_WORDS)

    forms = soup.find_all("form")
    passwords = soup.select('input[type="password"]')
    scripts = soup.find_all("script")
    iframes = soup.find_all("iframe")
    # One hit per script element that contains any OBF_JS match.
    obfus_js_hits = sum(bool(OBF_JS.search(s.get_text() or "")) for s in scripts)
    meta_refresh = bool(soup.find("meta", attrs={"http-equiv": re.compile("^refresh$", re.I)}))

    uppercase_chars = sum(1 for c in text if c.isupper())

    feats = {
        "html_len": html_len,
        "text_len": text_len,
        "word_count": len(words),
        "text_density": round(text_density, 4),
        "link_count": link_count,
        "anchor_spamword_hits": anchor_spamword_hits,
        "form_count": len(forms),
        "password_field_count": len(passwords),
        "creditcard_pattern_hits": len(CARD_RE.findall(text)),
        "script_count": len(scripts),
        "iframe_count": len(iframes),
        "obfuscated_js_hits": obfus_js_hits,
        "meta_refresh": int(meta_refresh),
        "email_hits": len(EMAIL_RE.findall(text)),
        "phone_hits": len(PHONE_RE.findall(text)),
        "exclamation_count": text.count("!"),
        "uppercase_ratio": round(uppercase_chars / max(len(text), 1), 4),
    }
    return feats

def features_to_matrix(html_list):
    """Turn an iterable of raw pages into a dense float feature matrix.

    Each page is passed through ``prepare_html`` and ``extract_features``;
    the resulting dicts are arranged into columns following ``FEATURE_ORDER``
    and missing values are replaced by 0.0.

    Args:
        html_list: Iterable of raw pages (anything ``prepare_html`` accepts).

    Returns:
        A float numpy array of shape ``(n_pages, len(FEATURE_ORDER))``.  An
        empty input yields a ``(0, len(FEATURE_ORDER))`` array.
    """
    rows = [extract_features(prepare_html(h)) for h in html_list]
    if not rows:
        # A DataFrame built from no rows has no columns, so selecting
        # FEATURE_ORDER from it would raise KeyError.
        return np.empty((0, len(FEATURE_ORDER)), dtype=float)
    X = pd.DataFrame(rows)[FEATURE_ORDER].fillna(0.0).to_numpy(dtype=float)
    return X

In [8]:
if __name__ == "__main__":
    # Archive locations, relative to the notebook's working directory.
    train_zip = "webspam-train.zip"
    test_zip  = "webspam-test.zip"

    # Load the labelled pages and turn them into the numeric feature matrix.
    train_websites, y = load_train_zip(train_zip)
    X = features_to_matrix(train_websites)

    # Hold out 20% (stratified by label) for an estimate that the grid
    # search never sees.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Scale features, then fit an SVM; hyper-parameters are chosen by 5-fold
    # cross-validated balanced accuracy on the training split.
    #
    # The grid is split per kernel because `gamma` is only used by the RBF
    # kernel.  Crossing it with the linear kernel fitted every linear
    # configuration twice with identical results (8 candidates, now 6).
    # If the linear kernel wins, best_params_ no longer contains a
    # (meaningless) "svm__gamma" entry.
    grid = GridSearchCV(
        Pipeline([("scaler", StandardScaler()), ("svm", SVC(probability=False, random_state=42))]),
        [
            {
                "svm__kernel": ["rbf"],
                "svm__C": [100, 1000],
                "svm__gamma": ["scale", "auto"],
            },
            {
                "svm__kernel": ["linear"],
                "svm__C": [100, 1000],
            },
        ],
        scoring="balanced_accuracy",
        cv=5,
        n_jobs=-1,
        refit=True,
    )
    grid.fit(X_train, y_train)

    # Score the refit best model on the held-out split.
    y_pred_val = grid.predict(X_val)
    bal_acc = balanced_accuracy_score(y_val, y_pred_val)
    print(f"Balanced Accuracy (Validation): {bal_acc:.4f}")
    print("Beste Parameter:", grid.best_params_)

    # refit=True means best_estimator_ was trained on X_train only; the
    # validation rows are not used for the final model.
    pipe = grid.best_estimator_

    # Featurise the unlabelled pages exactly like the training pages.
    test_files, test_html = load_test_zip(test_zip)
    test_websites = features_to_matrix(test_html)

    predictions = pipe.predict(test_websites)

    # One "<archive member name>;<label>" line per test page.
    with open("predictions.csv", "w", encoding="utf-8") as f:
        for path, pred in zip(test_files, predictions):
            f.write(f"{path};{int(pred)}\n")

Balanced Accuracy (Validation): 0.9913
Beste Parameter: {'svm__C': 1000, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
