In [1]:
# Imports, configuration, and constants
import os
from collections import defaultdict
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
import pandas as pd

# Reproducibility: a single seed drives every sampler in this notebook.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Input file paths.
# The defaults are the original author's local copies of the RAID CSVs. Set the
# RAID_TRAIN_CSV / RAID_TEST_CSV environment variables to run the notebook on
# another machine without editing this cell.
TRAIN_CSV = os.environ.get("RAID_TRAIN_CSV", r"C:\Users\marco\Desktop\Thesis\data\raw\raid_train.csv")
TEST_CSV  = os.environ.get("RAID_TEST_CSV", r"C:\Users\marco\Desktop\Thesis\data\raw\raid_test.csv")

# Output files (written relative to the current working directory)
OUT_SMALL  = "raid_sample_small.csv"
OUT_MEDIUM = "raid_sample_medium.csv"
OUT_LARGE  = "raid_sample_large.csv"

# Sampling sizes (total rows per sample, human + AI)
SIZE_SMALL  = 3000
SIZE_MEDIUM = 12000
SIZE_LARGE  = 60000

# Length bin target weights (strict): 25% short, 25% medium, 50% long.
# The weights are expected to sum to 1.
LENGTH_WEIGHTS = {"short": 0.25, "medium": 0.25, "long": 0.50}

# Known AI models in RAID (compared against the lower-cased, stripped `model` column)
AI_MODELS = [
    "chatgpt", "gpt4", "gpt3", "gpt2",
    "llama-chat",
    "mistral", "mistral-chat",
    "mpt", "mpt-chat",
    "cohere", "cohere-chat"
]

# Domains to exclude (compared against the lower-cased, stripped `domain` column)
EXCLUDED_DOMAINS = {"czech", "german"}


In [2]:
# Normalization, diagnostics, and filtering for text-like content in a DataFrame column.

import re, unicodedata, math
from collections import Counter
from typing import Tuple, List

# Invisible characters to delete: zero-width space/joiners, BOM, soft hyphen,
# bidi marks/embeddings/isolates and the word joiner.
HIDDEN_RE = re.compile(r"[\u200B-\u200D\uFEFF\u00AD\u200E\u200F\u202A-\u202E\u2060\u2066-\u2069]")
# Control characters to delete: everything in \x00-\x1F plus DEL, except \t, \n and \r
# (those three are kept here and collapsed as whitespace afterwards).
CTRL_RE   = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
# Any run of whitespace; precompiled because normalize_text is applied once per row.
_WHITESPACE_RE = re.compile(r"\s+")

def normalize_text(s: str) -> str:
    """Return a cleaned, single-line version of ``s``.

    Steps, in order: NFKC normalization, removal of hidden/bidi characters,
    removal of control characters, collapse of every whitespace run to one
    space, and stripping of leading/trailing whitespace.

    Args:
        s: The value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN (pandas' missing-value marker) are treated
            as missing.

    Returns:
        The cleaned text, or ``""`` for a missing value.
    """
    # A float NaN would otherwise be stringified to the literal text "nan".
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return ""
    text = unicodedata.normalize("NFKC", str(s))
    text = HIDDEN_RE.sub("", text)
    text = CTRL_RE.sub("", text)
    # No explicit NBSP replacement is needed: NFKC already maps U+00A0 to a
    # plain space, and the whitespace collapse below would catch it as well.
    return _WHITESPACE_RE.sub(" ", text).strip()

# Word pattern: a run of Unicode letters, optionally followed by an ASCII
# apostrophe and another run of letters (e.g. "don't", "l'uomo").
WORD_RE = re.compile(r"[^\W\d_]+(?:'[^\W\d_]+)?", re.UNICODE)

def text_diagnostics(s: str) -> dict:
    """Compute simple character- and word-level statistics for a string.

    Args:
        s: Text to analyse; any falsy value (``None``, ``""``) is treated as empty.

    Returns:
        A dict with, in this order: ``n_chars``, ``n_tok`` (number of WORD_RE
        matches), ``alpha_ratio``, ``digit_ratio``, ``punct_ratio`` (shares of
        all characters; 0.0 for empty text), ``avg_word_length`` and
        ``std_word_length`` (population standard deviation; NaN when there are
        no words), ``entropy_bits`` (Shannon entropy of the character
        distribution) and ``entropy_norm`` (entropy divided by log2 of the
        number of distinct characters, floored at 2; both NaN for empty text).
    """
    text = s if s else ""
    missing = float("nan")
    total = len(text)

    # Single pass over the characters; "symbols" are neither alphanumeric nor whitespace.
    letters = digits = symbols = 0
    for ch in text:
        if ch.isalpha():
            letters += 1
        if ch.isdigit():
            digits += 1
        if not (ch.isalnum() or ch.isspace()):
            symbols += 1

    # Word-length statistics
    lengths = [len(word) for word in WORD_RE.findall(text)]
    n_words = len(lengths)
    if n_words:
        mean_len = sum(lengths) / n_words
        spread = math.sqrt(sum((ln - mean_len) ** 2 for ln in lengths) / n_words)
    else:
        mean_len = missing
        spread = missing

    # Character entropy and its normalised form
    if total:
        freq = Counter(text)
        shares = [count / total for count in freq.values()]
        entropy = -sum(share * math.log(share, 2) for share in shares)
        entropy_scaled = entropy / math.log(max(2, len(freq)), 2)
    else:
        entropy = missing
        entropy_scaled = missing

    def _share(count: int) -> float:
        # Fraction of all characters; defined as 0.0 for empty text.
        return (count / total) if total else 0.0

    return {
        "n_chars": total,
        "n_tok": n_words,
        "alpha_ratio": _share(letters),
        "digit_ratio": _share(digits),
        "punct_ratio": _share(symbols),
        "avg_word_length": mean_len,
        "std_word_length": spread,
        "entropy_bits": entropy,
        "entropy_norm": entropy_scaled,
    }

def _metric_as_float(value) -> float:
    """Coerce a diagnostic value to float; anything non-numeric becomes NaN."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


def is_text_like(di: dict) -> Tuple[bool, List[str]]:
    """Decide whether a diagnostics dict describes natural-language-like text.

    Args:
        di: A mapping with the keys ``n_chars``, ``n_tok``, ``avg_word_length``,
            ``alpha_ratio``, ``digit_ratio``, ``punct_ratio`` and
            ``entropy_norm`` (the shape produced by ``text_diagnostics``).

    Returns:
        ``(ok, reasons)`` where ``ok`` is True when no rule fired and
        ``reasons`` lists the rule names that rejected the text, in the order
        the rules are checked.

    Notes:
        ``avg_word_length`` and ``entropy_norm`` are coerced with ``float()``
        before checking, so integer or NumPy numeric values are judged on
        their value instead of being rejected for not being a Python ``float``.
        Missing (``None``), non-numeric and NaN values still fail the rule.
    """
    reasons = []
    if di["n_chars"] < 30: reasons.append("too_short_chars")
    if di["n_tok"]   < 5:  reasons.append("too_few_tokens")

    # Average word length must be a real number within [2, 10].
    awl = _metric_as_float(di["avg_word_length"])
    if math.isnan(awl) or awl < 2.0 or awl > 10.0:
        reasons.append("avg_word_length_out_of_range")

    if di["alpha_ratio"] < 0.55: reasons.append("low_alpha_ratio")
    if di["digit_ratio"] > 0.30: reasons.append("high_digit_ratio")
    if di["punct_ratio"] > 0.35: reasons.append("high_punct_ratio")

    # Normalised character entropy must be a real number of at least 0.35.
    en = _metric_as_float(di["entropy_norm"])
    if math.isnan(en) or en < 0.35:
        reasons.append("low_entropy")

    return (len(reasons) == 0, reasons)

def apply_text_gate(df: pd.DataFrame, src_col: str = "generation") -> pd.DataFrame:
    """Clean ``src_col``, attach text diagnostics and keep only text-like rows.

    Args:
        df: Input frame; it is not modified.
        src_col: Name of the column holding the raw text.

    Returns:
        A new frame containing only the rows accepted by ``is_text_like``,
        with a fresh 0..n-1 index. ``src_col`` holds the normalized text, and
        the frame gains one column per ``text_diagnostics`` key plus
        ``is_text_like`` (always True in the result) and ``not_text_reason``
        (always "" in the result). Diagnostic columns that already exist in
        ``df`` are overwritten rather than duplicated. The diagnostic columns
        keep the types ``text_diagnostics`` returns, so the counts are integers.
    """
    out = df.copy()

    # Values are handed to normalize_text as-is (it stringifies non-strings
    # itself) instead of being cast to str first.
    cleaned = out[src_col].map(normalize_text)

    # Build the diagnostics once as a list of dicts. Expanding each dict with
    # `.apply(pd.Series)` and then re-walking the rows with `apply(axis=1)`
    # creates one Series per row, which is prohibitively slow on large frames.
    records = [text_diagnostics(text) for text in cleaned]
    verdicts = [is_text_like(record) for record in records]

    # Take the column names from an empty-string probe so that an empty input
    # frame still gets the full diagnostics schema.
    diag_columns = list(text_diagnostics(""))
    diags = pd.DataFrame(records, columns=diag_columns)
    for col in diag_columns:
        # Positional assignment: avoids any index alignment surprises.
        out[col] = diags[col].to_numpy()

    # Decision + reason; an explicit bool dtype keeps the mask valid even for 0 rows.
    keep = np.array([ok for ok, _ in verdicts], dtype=bool)
    out["is_text_like"] = keep
    out["not_text_reason"] = [";".join(reasons) for _, reasons in verdicts]

    # Filter, then replace the raw text with its cleaned version. The cleaned
    # text goes back into `src_col` (not a hard-coded "generation" column) so
    # the function behaves consistently for any source column.
    filtered = out.loc[keep].copy()
    filtered[src_col] = cleaned.to_numpy()[keep]

    return filtered.reset_index(drop=True)

In [3]:
def safe_load_and_prepare(train_csv: str, test_csv: str) -> pd.DataFrame:
    """Load the RAID CSVs and return a cleaned, text-like-only frame.

    Steps: read both files (a file that cannot be read is reported and
    skipped), concatenate, normalize ``model``/``domain`` to stripped
    lower-case, drop the domains in EXCLUDED_DOMAINS, derive ``is_ai`` and
    ``source_type``, drop rows whose ``generation`` is missing or blank,
    de-duplicate (by ``id`` when present, otherwise by ``generation``), and
    finally run ``apply_text_gate``.

    Args:
        train_csv: Path of the training CSV.
        test_csv: Path of the test CSV.

    Returns:
        The prepared frame. If neither file could be loaded, an empty frame
        with the raw RAID column names is returned.

    Raises:
        ValueError: If data was loaded but has no ``generation`` column.
    """
    frames = []
    for p in [train_csv, test_csv]:
        try:
            frames.append(pd.read_csv(p))
        except Exception as e:
            # Deliberately best-effort: either file may be absent.
            print(f"[INFO] Could not load {p}: {e}")
    if not frames:
        cols = ["id","adv_source_id","source_id","model","decoding","repetition_penalty",
                "attack","domain","generation"]
        return pd.DataFrame(columns=cols)

    df = pd.concat(frames, ignore_index=True)

    # Fail fast: nothing below is useful without the text column.
    if "generation" not in df.columns:
        raise ValueError("The dataset is missing the 'generation' column which is required.")

    # --- normalization of metadata ---
    if "model" in df.columns:
        df["model"] = df["model"].astype(str).str.strip().str.lower()
    else:
        df["model"] = np.nan

    if "domain" in df.columns:
        df["domain"] = df["domain"].astype(str).str.strip().str.lower()
        df = df[~df["domain"].isin(EXCLUDED_DOMAINS)].copy()
    else:
        df["domain"] = ""

    # Define is_ai / source_type.
    # NOTE(review): every model name outside AI_MODELS (including a missing
    # model) is labelled "human" -- confirm that this is intended.
    df["is_ai"] = df["model"].isin(set(AI_MODELS))
    df["source_type"] = np.where(df["is_ai"], "ai", "human")

    # Drop missing text BEFORE casting to str; otherwise NaN becomes the
    # literal string "nan" and passes the non-blank filter below.
    df = df[df["generation"].notna()].copy()
    df["generation"] = df["generation"].astype(str)
    df = df[df["generation"].str.strip().astype(bool)].copy()

    # De-duplicate
    if "id" in df.columns:
        df = df.drop_duplicates(subset=["id"])
    else:
        df = df.drop_duplicates(subset=["generation"])

    df = df.reset_index(drop=True)

    # --- normalize text and gate non-text ---
    # Replaces `generation` with its cleaned text, adds the diagnostics
    # columns and keeps only the text-like rows.
    df = apply_text_gate(df, src_col="generation")

    return df


# Load, clean and gate the RAID data into the notebook-wide `df`, then preview it.
# NOTE(review): the saved output of this cell shows a KeyboardInterrupt, so the
# stored result is from an interrupted run -- re-run the cell before trusting it.
df = safe_load_and_prepare(TRAIN_CSV, TEST_CSV)
print("Rows loaded:", len(df))
df.head(3)


KeyboardInterrupt: 

In [None]:
# Length features and binning utilities (25% short, 25% medium, 50% long)

def compute_length_features(df: pd.DataFrame, text_col: str = "generation") -> pd.DataFrame:
    """Return a copy of ``df`` with two length columns derived from ``text_col``.

    ``n_tokens_ws`` is the number of whitespace-separated tokens and
    ``n_chars`` the number of characters. Missing values count as empty text;
    the text itself is used as-is (no extra cleaning happens here).
    """
    def _token_count(value) -> int:
        return len(str(value).split())

    def _char_count(value) -> int:
        return len(str(value))

    result = df.copy()
    result["n_tokens_ws"] = result[text_col].fillna("").map(_token_count)
    result["n_chars"] = result[text_col].fillna("").map(_char_count)
    return result

def assign_length_bins(df: pd.DataFrame, token_col: str = "n_tokens_ws") -> pd.DataFrame:
    """Return a copy of ``df`` with a categorical ``length_bin`` column.

    The cut points are the 25th and 50th percentiles of ``token_col`` over the
    whole frame: values up to the 25th percentile are 'short', values above it
    and up to the 50th are 'medium', everything else is 'long'. The column
    always carries the categories short/medium/long, also for an empty frame.
    """
    bin_names = ["short", "medium", "long"]
    result = df.copy()

    if not len(result):
        result["length_bin"] = pd.Categorical([], categories=bin_names)
        return result

    cut_short = np.percentile(result[token_col], 25)
    cut_medium = np.percentile(result[token_col], 50)

    # Nested selection: first threshold wins, so 'medium' only applies above cut_short.
    tokens = result[token_col].to_numpy()
    labels = np.where(
        tokens <= cut_short,
        "short",
        np.where(tokens <= cut_medium, "medium", "long"),
    )
    result["length_bin"] = pd.Categorical(labels, categories=bin_names)
    return result

# Apply on the loaded df - this now operates on cleaned text.
# `df` is rebound twice here: first gaining n_tokens_ws / n_chars, then length_bin.
df = compute_length_features(df, "generation")
df = assign_length_bins(df, "n_tokens_ws")
# Show how many rows landed in each bin (including any unassigned rows).
df["length_bin"].value_counts(dropna=False)

length_bin
long      3133501
short     1578114
medium    1576205
Name: count, dtype: int64

In [None]:
# Allocation helpers (largest remainder method) and proportional rounding

def largest_remainder(target_total: int, weights: dict[str, float]) -> dict[str, int]:
    """Split ``target_total`` into integer counts proportional to ``weights``.

    Uses the Largest Remainder Method: every key first receives the floor of
    its exact share, then the units still missing are handed out one by one
    to the keys with the largest fractional remainders. Ties are broken in
    favour of the key that appears first in ``weights``.

    Args:
        target_total: Number of units to distribute. Values <= 0 yield an
            all-zero allocation.
        weights: Non-negative weight per key. They normally sum to 1; if they
            do not, they are rescaled so that the result still sums to
            ``target_total``. If all weights are 0 the units are split evenly.

    Returns:
        ``{key: count}`` in the key order of ``weights``, summing to
        ``target_total`` (or ``{}`` when ``weights`` is empty).

    Raises:
        ValueError: If any weight is negative or not finite.
    """
    if not weights:
        return {}
    keys = list(weights.keys())
    if target_total <= 0:
        return {k: 0 for k in keys}

    w = np.array([weights[k] for k in keys], dtype=float)
    if not np.all(np.isfinite(w)) or np.any(w < 0):
        raise ValueError("weights must be finite and non-negative")

    weight_sum = w.sum()
    if weight_sum <= 0:
        # No preference expressed: fall back to an even split.
        w = np.full(len(keys), 1.0 / len(keys))
    elif not np.isclose(weight_sum, 1.0, rtol=0.0, atol=1e-9):
        # Weights that do not sum to 1 would leave more missing units than
        # there are keys (or over-allocate); rescale them instead. Weights
        # that already sum to 1 are left untouched so results do not shift.
        w = w / weight_sum

    raw = w * target_total
    floors = np.floor(raw).astype(int)
    remainder = int(target_total - floors.sum())
    fracs = raw - floors
    # Stable sort => deterministic, first-key-wins tie-breaking.
    order = np.argsort(-fracs, kind="stable")
    alloc = floors.copy()
    for i in range(max(0, remainder)):
        # Modulo keeps the index valid even if rounding leaves extra units.
        alloc[order[i % len(order)]] += 1
    return {k: int(v) for k, v in zip(keys, alloc)}

def proportional_weights(counts: dict[str, int]) -> dict[str, float]:
    """Turn counts into weights that are proportional to them.

    When the counts sum to zero or less, every key gets the same weight
    (``1 / number_of_keys``); an empty input gives an empty dict.
    """
    grand_total = sum(counts.values())
    if grand_total <= 0:
        if not counts:
            return {}
        even_share = 1.0 / len(counts)
        return dict.fromkeys(counts, even_share)
    weights = {}
    for key, count in counts.items():
        weights[key] = count / grand_total
    return weights

def proportional_allocation(target_total: int, available_counts: dict[str, int]) -> dict[str, int]:
    """Distribute ``target_total`` over keys in proportion to what each has available.

    No key ever receives more than its available count. Whenever a cap cuts a
    key's share, the units that could not be placed are re-distributed in the
    next round, proportionally to what is still left per key. The loop stops
    once the target is met or nothing is left to hand out, so the result may
    sum to less than ``target_total``.
    """
    granted = dict.fromkeys(available_counts, 0)
    pool = available_counts.copy()
    outstanding = target_total

    while outstanding > 0 and any(left > 0 for left in pool.values()):
        # Propose a split of what is still missing, weighted by remaining stock.
        proposal = largest_remainder(outstanding, proportional_weights(pool))
        for key, wanted in proposal.items():
            amount = min(wanted, pool[key])  # cap by availability
            granted[key] += amount
            pool[key] -= amount
        outstanding = target_total - sum(granted.values())

    return granted

# Quick self-test: 10 units over 25/25/50 cannot be split exactly (2.5 / 2.5 / 5),
# so one of the two tied keys receives the leftover unit.
print("Largest remainder test:", largest_remainder(10, {"a":0.25,"b":0.25,"c":0.5}))


Largest remainder test: {'a': 3, 'b': 2, 'c': 5}


In [None]:
#  Core stratified sampler

def stratified_sample(
    df: pd.DataFrame,
    total_size: int,
    length_weights: dict[str, float] = LENGTH_WEIGHTS,
    ai_models_even: bool = True,
    random_state: int = RANDOM_SEED
) -> Tuple[pd.DataFrame, dict]:
    """
    Draw a stratified sample of about ``total_size`` rows from ``df``.

    Stratification, from outermost to innermost level:
      - source: ``total_size // 2`` human rows, the rest AI (AI gets the extra
        row when ``total_size`` is odd)
      - length bin: each source total is split per ``length_weights``
        (e.g., 25/25/50) with ``largest_remainder``
      - AI model (only when ``ai_models_even`` is True): each AI length-bin
        target is split evenly across the AI_MODELS present in that bin
      - domain: each innermost target is split with ``proportional_allocation``,
        i.e. proportionally to the rows available per domain and never above
        availability

    Args:
        df: Population; needs the columns source_type, length_bin, domain,
            generation and model. Rows with a missing length_bin are ignored.
        total_size: Desired number of sampled rows.
        length_weights: Target share per length bin.
        ai_models_even: Whether AI rows are balanced across models.
        random_state: Seed for the local random generator.

    Returns:
        ``(sample_df, diagnostics)``. ``sample_df`` keeps the index labels of
        ``df`` and is ordered by ascending index. ``diagnostics`` holds
        target_total, picked_total, shortfall, length_targets,
        by_source_length and by_model_length. An empty ``df`` returns an empty
        copy and a dict with only a ``note``.

    Raises:
        ValueError: If a required column is missing.

    Notes:
        Nothing in this function re-assigns rows that a stratum could not
        supply; such gaps appear as a positive ``shortfall``.
    """
    if len(df) == 0:
        return df.copy(), {"note": "Empty dataframe; run locally with data available."}

    required_cols = {"source_type", "length_bin", "domain", "generation", "model"}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Rows without a length bin cannot be placed in any stratum.
    work = df.dropna(subset=["length_bin"]).copy()

    # Split desired totals
    half = total_size // 2
    remainder = total_size - 2 * half  # 1 for odd totals, else 0; given to the AI side
    target_by_source = {"human": half, "ai": half + remainder}

    # Within each source_type, allocate to length bins
    length_targets = {s: largest_remainder(target_by_source[s], length_weights) for s in target_by_source}

    picked_indices = []
    # by_domain is filled in below but is not part of the returned diagnostics.
    debug_info = {"by_source_length": {}, "by_model_length": {}, "by_domain": {}}
    # Local generator: it only supplies the integer seeds for the individual
    # .sample() calls, so the order of those calls determines the result.
    rng_local = np.random.default_rng(random_state)

    for source in ["human", "ai"]:
        for lb, lb_target in length_targets[source].items():
            subset = work[(work["source_type"] == source) & (work["length_bin"] == lb)]

            if source == "ai" and ai_models_even:
                # Even per AI model among those present in this length bin
                present_models = sorted(set(subset["model"]) & set(AI_MODELS))
                if not present_models:
                    # No AI rows in this bin: the whole bin target is left unfilled.
                    continue
                per_model = largest_remainder(lb_target, {m: 1/len(present_models) for m in present_models})
                debug_info["by_model_length"].setdefault(lb, {})[source] = per_model

                for m, m_target in per_model.items():
                    sub_m = subset[subset["model"] == m]
                    # Domain proportional allocation within this slice
                    domain_counts = sub_m["domain"].value_counts().to_dict()
                    alloc_dom = proportional_allocation(m_target, domain_counts)

                    for dom, take in alloc_dom.items():
                        take = int(take)
                        if take <= 0:
                            continue
                        candidates = sub_m[sub_m["domain"] == dom]
                        if len(candidates) == 0:
                            continue
                        # Defensive cap; the allocation is already capped by availability.
                        take = min(take, len(candidates))
                        chosen = candidates.sample(
                            n=take,
                            random_state=rng_local.integers(0, 1_000_000)
                        ).index.tolist()
                        picked_indices.extend(chosen)
                        debug_info["by_domain"].setdefault((source, lb, m), {})[dom] = take
            else:
                # HUMAN or AI without even split: directly allocate by domain
                domain_counts = subset["domain"].value_counts().to_dict()
                alloc_dom = proportional_allocation(lb_target, domain_counts)
                # Only this branch records by_source_length.
                debug_info["by_source_length"].setdefault(source, {})[lb] = alloc_dom

                for dom, take in alloc_dom.items():
                    take = int(take)
                    if take <= 0:
                        continue
                    candidates = subset[subset["domain"] == dom]
                    if len(candidates) == 0:
                        continue
                    # Defensive cap; the allocation is already capped by availability.
                    take = min(take, len(candidates))
                    chosen = candidates.sample(
                        n=take,
                        random_state=rng_local.integers(0, 1_000_000)
                    ).index.tolist()
                    picked_indices.extend(chosen)

    # Select by index label, de-duplicated and in ascending order.
    # NOTE(review): this assumes the index labels of `df` are unique -- confirm at the call site.
    sample_df = work.loc[sorted(set(picked_indices))].copy()

    # Positive when some stratum could not supply its target.
    shortfall = total_size - len(sample_df)
    diagnostics = {
        "target_total": total_size,
        "picked_total": len(sample_df),
        "shortfall": shortfall,
        "length_targets": length_targets,
        "by_source_length": debug_info.get("by_source_length", {}),
        "by_model_length": debug_info.get("by_model_length", {}),
    }
    return sample_df, diagnostics


In [None]:
#  Orchestration to produce small/medium/large samples and save

def run_all_samples(df_full: pd.DataFrame):
    """Build the small, medium and large stratified samples and write them to CSV.

    Length features and bins are added first when they are missing (or when
    any length_bin is missing). All three samples are drawn before any file is
    written. Each sample uses its own seed (RANDOM_SEED + 1, + 2, + 3).

    Returns:
        Three ``(sample_df, diagnostics)`` pairs, in the order small, medium, large.
    """
    prepared = df_full.copy()
    if "n_tokens_ws" not in prepared.columns:
        prepared = compute_length_features(prepared, "generation")
    bins_incomplete = "length_bin" not in prepared.columns or prepared["length_bin"].isna().any()
    if bins_incomplete:
        prepared = assign_length_bins(prepared, "n_tokens_ws")

    # (sample size, seed, output path) for small / medium / large
    plan = (
        (SIZE_SMALL, RANDOM_SEED + 1, OUT_SMALL),
        (SIZE_MEDIUM, RANDOM_SEED + 2, OUT_MEDIUM),
        (SIZE_LARGE, RANDOM_SEED + 3, OUT_LARGE),
    )

    # Draw everything first ...
    results = [
        stratified_sample(prepared, size, LENGTH_WEIGHTS, ai_models_even=True, random_state=seed)
        for size, seed, _ in plan
    ]

    # ... then save, so no file is written if any draw fails.
    for (sample, _), (_, _, out_path) in zip(results, plan):
        sample.to_csv(out_path, index=False)

    return tuple(results)

# Execute if data present: draws the three samples, writes the three CSVs and
# keeps the frames/diagnostics as notebook-level variables for the checks below.
if len(df) > 0:
    (small_df, small_diag), (med_df, med_diag), (large_df, large_diag) = run_all_samples(df)
    # Report the output file and the number of rows actually sampled for each size.
    print("Saved:")
    print(OUT_SMALL, len(small_df))
    print(OUT_MEDIUM, len(med_df))
    print(OUT_LARGE, len(large_df))
else:
    # Nothing was loaded, so small_df / med_df / large_df are NOT defined after this cell.
    print("[INFO] No data loaded. Ensure TRAIN_CSV/TEST_CSV paths are correct and re-run.")


Saved:
raid_sample_small.csv 3000
raid_sample_medium.csv 12000
raid_sample_large.csv 60000


In [None]:
# Validation utilities: strict balance and ratio checks

def check_balance(df_sample: pd.DataFrame, expected_total: int) -> pd.DataFrame:
    """
    Build a table comparing actual vs target counts of a sample.

    Dimensions reported, in this order:
    - total
    - source_type (human vs ai)
    - length bins overall
    - length bins within each source_type
    - AI models evenness (among the AI_MODELS present in the sample)

    Source targets mirror the sampler's split: human gets
    ``expected_total // 2`` and AI gets the rest, so an odd total is compared
    against the counts the sampler actually aims for. "human" and "ai" are
    always reported, with actual 0 when one of them is absent from the sample;
    any other source_type label found in the sample is reported with target 0.

    Args:
        df_sample: Sample with the columns source_type, length_bin and model.
        expected_total: Size the sample was supposed to have.

    Returns:
        A frame with the columns dimension, category, actual and target
        (without rows when ``df_sample`` is empty).
    """
    columns = ["dimension","category","actual","target"]
    if len(df_sample) == 0:
        return pd.DataFrame(columns=columns)

    human_target = expected_total // 2
    source_targets = {"human": human_target, "ai": expected_total - human_target}

    # Always include both expected sources so that a missing one is visible.
    observed = set(df_sample["source_type"].dropna().unique())
    sources = sorted(observed | set(source_targets))

    rows = []

    # Totals
    rows.append(("total", "all", len(df_sample), expected_total))

    # Human vs AI
    for k in sources:
        actual = int((df_sample["source_type"] == k).sum())
        rows.append(("source_type", k, actual, source_targets.get(k, 0)))

    # Length bins overall
    for lb, w in LENGTH_WEIGHTS.items():
        target = expected_total * w
        rows.append(("length_bin", lb, int((df_sample["length_bin"]==lb).sum()), target))

    # Length bins within source
    for k in sources:
        grp = df_sample[df_sample["source_type"] == k]
        for lb, w in LENGTH_WEIGHTS.items():
            target = source_targets.get(k, 0) * w
            rows.append((f"length_bin|{k}", lb, int((grp["length_bin"]==lb).sum()), target))

    # AI models evenness (only within AI)
    ai_df = df_sample[df_sample["source_type"]=="ai"]
    if len(ai_df) > 0:
        models_present = sorted(set(ai_df["model"]) & set(AI_MODELS))
        if models_present:
            per_model_target = source_targets["ai"] / len(models_present)
            for m in models_present:
                rows.append(("ai_model", m, int((ai_df["model"]==m).sum()), per_model_target))

    return pd.DataFrame(rows, columns=columns)

# Usage hint only: nothing is validated in this cell.
print("Validator ready. Example usage after sampling:")
print("check_balance(small_df, SIZE_SMALL)")


Validator ready. Example usage after sampling:
check_balance(small_df, SIZE_SMALL)


In [None]:
# Actual vs target counts for the small sample (requires the sampling cell to have run).
check_balance(small_df, SIZE_SMALL)

Unnamed: 0,dimension,category,actual,target
0,total,all,3000,3000.0
1,source_type,ai,1500,1500.0
2,source_type,human,1500,1500.0
3,length_bin,short,750,750.0
4,length_bin,medium,750,750.0
5,length_bin,long,1500,1500.0
6,length_bin|ai,short,375,375.0
7,length_bin|ai,medium,375,375.0
8,length_bin|ai,long,750,750.0
9,length_bin|human,short,375,375.0


In [None]:
# Actual vs target counts for the medium sample (requires the sampling cell to have run).
check_balance(med_df, SIZE_MEDIUM)

Unnamed: 0,dimension,category,actual,target
0,total,all,12000,12000.0
1,source_type,ai,6000,6000.0
2,source_type,human,6000,6000.0
3,length_bin,short,3000,3000.0
4,length_bin,medium,3000,3000.0
5,length_bin,long,6000,6000.0
6,length_bin|ai,short,1500,1500.0
7,length_bin|ai,medium,1500,1500.0
8,length_bin|ai,long,3000,3000.0
9,length_bin|human,short,1500,1500.0


In [None]:
# Actual vs target counts for the large sample (requires the sampling cell to have run).
check_balance(large_df, SIZE_LARGE)

Unnamed: 0,dimension,category,actual,target
0,total,all,60000,60000.0
1,source_type,ai,30000,30000.0
2,source_type,human,30000,30000.0
3,length_bin,short,15000,15000.0
4,length_bin,medium,15000,15000.0
5,length_bin,long,30000,30000.0
6,length_bin|ai,short,7500,7500.0
7,length_bin|ai,medium,7500,7500.0
8,length_bin|ai,long,15000,15000.0
9,length_bin|human,short,7500,7500.0
