In [None]:
!nvidia-smi


In [None]:

import re
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.calibration import CalibratedClassifierCV

# SBERT
from sentence_transformers import SentenceTransformer



@dataclass
class Config:
    """Settings for the SBERT + logistic-regression experiment run by main()."""

    # Path of the input dataset (Excel or CSV) loaded by main().
    data_path: str = "Dataset.xlsx"
    # Directory that receives metrics_sbert.csv; created by main() if missing.
    out_dir: str = "results"
    # Random seed passed to the splits, the sampling steps and the classifier.
    seed: int = 42

    # Inclusive bounds on the whitespace-token count; main() drops rows outside them.
    min_tokens: int = 30
    max_tokens: int = 1500

    # Minimum number of AI samples a source needs to get its own hold-out run.
    holdout_min_samples: int = 300
    # When True, hold-out training sets are downsampled to equal class sizes.
    train_balance: bool = True

    # SBERT
    # Model name handed to SentenceTransformer.
    sbert_model: str = "all-MiniLM-L6-v2"
    # Batch size used when encoding texts.
    batch_size: int = 64
    # When True, the logistic regression is wrapped in CalibratedClassifierCV.
    use_calibration: bool = True

# Module-level configuration instance read by main().
cfg = Config()



def basic_clean(text: str) -> str:
    """Normalize line endings and squeeze redundant whitespace in ``text``.

    Runs of spaces/tabs become a single space, three or more consecutive
    newlines become exactly two, and the result is stripped at both ends.
    Any non-string input yields an empty string.
    """
    if isinstance(text, str):
        unified = text.replace("\r\n", "\n").replace("\r", "\n")
        single_spaced = re.sub(r"[ \t]+", " ", unified)
        squeezed = re.sub(r"\n{3,}", "\n\n", single_spaced)
        return squeezed.strip()
    return ""


def standardize_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a cleaned ``label_name`` and a binary ``label``.

    ``label_name`` is cast to str, stripped and lower-cased. ``label`` is 0 when
    the name contains "human", 1 when it contains one of the AI markers, and a
    ValueError is raised for anything else. The input frame is not modified.
    """
    ai_markers = ("ai", "gpt", "generated", "llm", "machine")

    def map_label(x: str) -> int:
        # "human" is tested first, so it wins over any AI marker in the same name.
        if "human" in x:
            return 0
        if any(marker in x for marker in ai_markers):
            return 1
        raise ValueError(f"Unrecognized label_name: {x}")

    out = df.copy()
    normalized = out["label_name"].astype(str).str.strip().str.lower()
    out["label_name"] = normalized
    out["label"] = normalized.map(map_label)
    return out


def describe_dataset(df: pd.DataFrame) -> None:
    """Print a short exploratory summary of ``df`` to stdout.

    Expects the columns ``text``, ``source``, ``label_name``, ``label`` and
    ``n_tokens``. Nothing is returned and ``df`` is not modified.
    """
    preview_columns = ["text", "source", "label_name", "label"]

    print("\n=== Dataset head ===")
    first_rows = df.head(2)
    print(first_rows[preview_columns])

    print("\n=== Label counts ===")
    label_counts = df["label_name"].value_counts(dropna=False)
    print(label_counts)

    print("\n=== Source counts (top 15) ===")
    source_counts = df["source"].value_counts(dropna=False)
    print(source_counts.head(15))

    print("\n=== Label by source (top sources) ===")
    # Restrict the cross-tab to the 12 most frequent sources.
    top_sources = df["source"].value_counts().head(12).index
    in_top = df["source"].isin(top_sources)
    per_source = df[in_top].groupby("source")["label"].value_counts()
    print(per_source.unstack(fill_value=0))

    print("\n=== Length summary (tokens) ===")
    token_stats = df["n_tokens"].describe()
    print(token_stats)


def make_holdout_ai_split(df: pd.DataFrame, holdout_ai_source: str, seed: int, train_balance: bool = True):
    """Build a train/test split that holds out one AI source.

    The test set contains every AI row (``label == 1``) whose ``source`` equals
    ``holdout_ai_source`` plus the same number of randomly sampled human rows
    (``label == 0``), so both classes are present. The training set is made of
    the remaining AI rows and the remaining human rows.

    Args:
        df: Frame with at least ``label`` and ``source`` columns.
        holdout_ai_source: Source value of the AI rows to hold out.
        seed: Random state used for every sampling/shuffling step.
        train_balance: When True, both training classes are downsampled to the
            size of the smaller one.

    Returns:
        ``(train_df, test_df)``, each shuffled; row labels of the input index
        are not preserved.

    Raises:
        ValueError: If no AI row has the requested source, or if there are
            fewer human rows than held-out AI rows.
    """
    ai = df[df["label"] == 1]
    # Give the human pool a unique positional index so that dropping the sampled
    # test rows removes exactly those rows, even if df's own index has duplicates.
    hu = df[df["label"] == 0].reset_index(drop=True)

    test_ai = ai[ai["source"] == holdout_ai_source]
    if len(test_ai) == 0:
        raise ValueError(f"No AI samples for source={holdout_ai_source}")
    if len(hu) < len(test_ai):
        raise ValueError(
            f"Not enough human samples ({len(hu)}) to match {len(test_ai)} "
            f"AI samples for source={holdout_ai_source}"
        )

    test_hu = hu.sample(n=len(test_ai), random_state=seed)
    test_df = pd.concat([test_ai, test_hu], ignore_index=True).sample(frac=1.0, random_state=seed)

    train_ai = ai[ai["source"] != holdout_ai_source]
    train_hu = hu.drop(index=test_hu.index)

    if train_balance:
        # Downsample both classes to the size of the smaller one.
        n = min(len(train_ai), len(train_hu))
        train_ai = train_ai.sample(n=n, random_state=seed)
        train_hu = train_hu.sample(n=n, random_state=seed)

    train_df = pd.concat([train_ai, train_hu], ignore_index=True).sample(frac=1.0, random_state=seed)
    return train_df, test_df


# SBERT + classifier

def encode_sbert(texts, model: SentenceTransformer, batch_size: int):
    """Encode ``texts`` with ``model`` and return whatever ``model.encode`` yields.

    The texts are materialized into a list first. Encoding is requested with a
    progress bar, NumPy output and normalized embeddings.
    """
    sentences = list(texts)
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    return model.encode(sentences, **encode_options)


def fit_predict_proba_lr(X_train, y_train, X_test, seed: int, use_calibration: bool):
    """Fit a logistic regression and score ``X_test``.

    With ``use_calibration`` the classifier is wrapped in a 3-fold sigmoid
    ``CalibratedClassifierCV`` before fitting; otherwise it is fitted directly.

    Returns:
        ``(pred, proba)`` where ``proba`` is the predicted probability of the
        positive class (column 1) and ``pred`` is ``proba`` thresholded at 0.5.
    """
    base_lr = LogisticRegression(max_iter=2000, solver="liblinear", random_state=seed)
    estimator = CalibratedClassifierCV(base_lr, method="sigmoid", cv=3) if use_calibration else base_lr
    estimator.fit(X_train, y_train)

    proba = estimator.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)
    return pred, proba


def brier_score(y_true, y_prob):
    """Return the mean squared difference between ``y_prob`` and ``y_true`` as a float."""
    targets = np.asarray(y_true).astype(float)
    probabilities = np.asarray(y_prob).astype(float)
    squared_error = np.square(probabilities - targets)
    return float(squared_error.mean())


def evaluate(y_true, y_pred, y_prob):
    """Compute F1, Brier score and ROC AUC for one set of predictions.

    Returns a dict with the keys ``f1``, ``brier`` and ``auc``. ``auc`` is NaN
    unless ``y_true`` contains exactly two distinct values.
    """
    f1 = float(f1_score(y_true, y_pred))
    brier = brier_score(y_true, y_prob)

    if len(np.unique(y_true)) != 2:
        auc = np.nan
    else:
        auc = float(roc_auc_score(y_true, y_prob))

    return {"f1": f1, "brier": brier, "auc": auc}


def run_setting_sbert(df_train, df_test, setting: str, model: SentenceTransformer, cfg: Config):
    """Encode, train and score one train/test split.

    Both frames need ``text`` and ``label`` columns. The texts are embedded with
    ``model``, a logistic regression (optionally calibrated, per ``cfg``) is
    fitted on the training embeddings, and the test predictions are evaluated.

    Returns:
        A dict with the metrics from ``evaluate`` plus ``setting``, ``features``,
        ``model``, ``n_train`` and ``n_test``.
    """
    labels_train = df_train["label"].values
    labels_test = df_test["label"].values

    print(f"\n--- Encoding SBERT for setting: {setting} ---")
    emb_train = encode_sbert(df_train["text"], model, cfg.batch_size)
    emb_test = encode_sbert(df_test["text"], model, cfg.batch_size)

    print(f"--- Training LR (calibration={cfg.use_calibration}) ---")
    pred, proba = fit_predict_proba_lr(emb_train, labels_train, emb_test, cfg.seed, cfg.use_calibration)

    if cfg.use_calibration:
        model_name = "LR_calibrated"
    else:
        model_name = "LR"

    # Metrics first, then the descriptive fields, matching the output column order.
    row = dict(evaluate(labels_test, pred, proba))
    row.update(
        setting=setting,
        features=f"sbert:{cfg.sbert_model}",
        model=model_name,
        n_train=int(len(df_train)),
        n_test=int(len(df_test)),
    )
    return row



def main():
    """Run the full SBERT experiment described by the module-level ``cfg``.

    Steps:
      1. Load the dataset (Excel or CSV, chosen by file suffix).
      2. Clean the text, normalize sources and labels, and keep only rows whose
         whitespace-token count lies within ``cfg.min_tokens``..``cfg.max_tokens``.
      3. Evaluate a stratified random 80/20 split.
      4. Evaluate one hold-out split per AI source that has at least
         ``cfg.holdout_min_samples`` samples.
      5. Write all metric rows to ``<cfg.out_dir>/metrics_sbert.csv`` and print them.
    """
    # Imported here because torch is only needed to pick the encoding device.
    import torch

    Path(cfg.out_dir).mkdir(parents=True, exist_ok=True)

    # Compare the suffix case-insensitively so e.g. "Dataset.XLSX" is still
    # read as a spreadsheet instead of being handed to the CSV parser.
    excel_suffixes = {".xlsx", ".xlsm", ".xls"}
    if Path(cfg.data_path).suffix.lower() in excel_suffixes:
        df = pd.read_excel(cfg.data_path)
    else:
        df = pd.read_csv(cfg.data_path)

    df["text"] = df["text"].apply(basic_clean)
    df["source"] = df["source"].astype(str).str.strip().str.lower()
    # basic_clean returns "" for non-string cells, so this also drops those rows.
    df = df[df["text"].str.len() > 0].copy()
    df = standardize_labels(df)

    df["n_tokens"] = df["text"].str.split().apply(len)
    df = df[(df["n_tokens"] >= cfg.min_tokens) & (df["n_tokens"] <= cfg.max_tokens)].copy()

    describe_dataset(df)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("\n=== SBERT device ===")
    print(device)
    sbert = SentenceTransformer(cfg.sbert_model, device=device)

    results = []

    # Baseline: stratified random split.
    tr, te = train_test_split(df, test_size=0.2, random_state=cfg.seed, stratify=df["label"])
    results.append(run_setting_sbert(tr, te, "random_80_20", sbert, cfg))

    # AI sources with enough samples to serve as a hold-out test set.
    ai_sources = (
        df[df["label"] == 1]["source"]
        .value_counts()
        .loc[lambda s: s >= cfg.holdout_min_samples]
        .index
        .tolist()
    )

    for src in ai_sources:
        tr, te = make_holdout_ai_split(df, src, cfg.seed, train_balance=cfg.train_balance)
        results.append(run_setting_sbert(tr, te, f"holdout_ai:{src}", sbert, cfg))

    res_df = pd.DataFrame(results)
    out_path = Path(cfg.out_dir) / "metrics_sbert.csv"
    res_df.to_csv(out_path, index=False)

    print("\n=== Results (SBERT) ===")
    print(res_df.sort_values(["setting"]).to_string(index=False))
    print(f"\nSaved: {out_path}")


# Run the pipeline only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Render the SBERT metrics CSV as a table image.
SBERT_METRICS_PATH = "results/metrics_sbert.csv"

df = pd.read_csv(SBERT_METRICS_PATH)

# Keep only the calibrated logistic-regression rows computed on SBERT features.
df_sbert = df[(df["model"] == "LR_calibrated") & (df["features"].str.contains("sbert", na=False))].copy()

# The metrics file holds "LR" rows instead when calibration was turned off.
# Stop with a clear message rather than letting the table call fail on an
# empty cell matrix.
if df_sbert.empty:
    raise ValueError(
        f"No rows with model == 'LR_calibrated' and SBERT features in {SBERT_METRICS_PATH}; "
        "nothing to render."
    )


def pretty_setting(s):
    """Turn a raw ``setting`` value into a human-readable row label."""
    if s == "random_80_20":
        return "Random 80/20"
    if isinstance(s, str) and s.startswith("holdout_ai:"):
        # Split once so a source name that itself contains the prefix stays intact.
        return "Hold-out " + s.split("holdout_ai:", 1)[1]
    return str(s)


df_sbert["Setting"] = df_sbert["setting"].map(pretty_setting)

# Only the columns actually present in the CSV are shown.
cols = ["Setting", "accuracy", "precision", "recall", "f1", "auc", "brier"]
existing = [c for c in cols if c in df_sbert.columns]
table_df = df_sbert[existing].sort_values("Setting")

# Round every metric column to three decimals.
for c in table_df.columns:
    if c != "Setting":
        table_df[c] = table_df[c].astype(float).round(3)

# Figure height grows with the number of rows.
fig, ax = plt.subplots(figsize=(12, 0.6 + 0.35 * len(table_df)))
ax.axis("off")

tbl = ax.table(
    cellText=table_df.values,
    colLabels=table_df.columns,
    loc="center",
    cellLoc="center",
)

tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
tbl.scale(1, 1.4)

plt.tight_layout()
out_path = "sbert_results_table.png"
plt.savefig(out_path, dpi=200, bbox_inches="tight")
plt.show()

print(f"Saved: {out_path}")
