In [1]:
#Configuration and dataset selection

import sys
import math
import re
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import spacy
from spacy.language import Language
from spacy.tokens import Doc

import nltk
from nltk.corpus import cmudict

from g2p_en import G2p


# Reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Paths to the three samples
SAMPLE_PATHS = {
    "small":  "raid_sample_small.csv",
    "medium": "raid_sample_medium.csv",
    "large":  "raid_sample_large.csv",
}

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Set this to one of: "small", "medium", "large"
SELECTED_DATASET = "small"
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

TEXT_COL = "generation"

df = pd.read_csv(SAMPLE_PATHS[SELECTED_DATASET])
print(f"Loaded {SELECTED_DATASET} dataset with {len(df)} rows.")
df.head(3)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\cmudict.zip.


Loaded small dataset with 3000 rows.


Unnamed: 0,id,adv_source_id,source_id,model,decoding,repetition_penalty,attack,domain,title,prompt,...,digit_ratio,punct_ratio,avg_word_length,std_word_length,entropy_bits,entropy_norm,is_text_like,not_text_reason,n_tokens_ws,length_bin
0,404002f2-1cdb-4021-9215-976801dbf032,0333dff5-5fb3-4c06-a892-3ae170dfae75,0333dff5-5fb3-4c06-a892-3ae170dfae75,human,,,whitespace,reddit,Anxiety over naptime and transitioning to daycare,,...,0.003208,0.022454,4.008163,2.136164,4.252406,0.793722,True,,248,long
1,eedc3d68-f646-425b-bb9c-9ce469d1a348,07c99297-5b54-4240-987f-91cb2e6bbe45,07c99297-5b54-4240-987f-91cb2e6bbe45,human,,,insert_paragraphs,poetry,Love And War,,...,0.0,0.050992,3.852518,1.865006,4.475145,0.767226,True,,270,long
2,146c7064-c894-4290-b79e-daa517c3dec7,b5faf769-2be4-4428-9cde-6a6fd3fdc80a,b5faf769-2be4-4428-9cde-6a6fd3fdc80a,human,,,perplexity_misspelling,reddit,Socialists have more interest in controlling y...,,...,0.00406,0.032476,4.210145,2.178461,4.284333,0.794525,True,,132,short


In [2]:
# Initialize spaCy, CMUdict (NLTK), and g2p_en fallback

# Ensure CMUdict is available
nltk.download('cmudict', quiet=True)
nltk.download('averaged_perceptron_tagger_eng')
CMU = cmudict.dict()  # key: lowercase word, value: list of pronunciations (list of ARPAbet tokens)

# Load spaCy English model (use 'en_core_web_sm' unless you already have md/lg installed)
try:
    nlp: Language = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Ensure sentence boundaries are available (parser usually handles this; add sentencizer if needed)
if "sentencizer" not in nlp.pipe_names and "parser" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# g2p_en for OOV coverage
G2P = G2p()

# ARPAbet vowel bases (stress digits removed when checking)
ARPA_VOWELS = {
    "AA", "AE", "AH", "AO", "AW", "AY",
    "EH", "ER", "EY",
    "IH", "IY",
    "OW", "OY",
    "UH", "UW"
}

# Cache syllable counts for speed
_SYLL_CACHE: Dict[str, int] = {}

def cmu_syllables(word: str) -> int | None:
    """
    Returns syllable count using CMUdict if available; else None.
    Policy: use the first pronunciation variant.
    """
    w = word.lower()
    if w not in CMU:
        return None
    phones = CMU[w][0]
    count = 0
    for ph in phones:
        base = re.sub(r"\d", "", ph)
        if base in ARPA_VOWELS:
            count += 1
    return max(count, 1)  # at least one for non-empty alphabetic words

def g2p_syllables(word: str) -> int:
    """
    Returns syllable count using neural g2p_en; counts vowel phonemes.
    """
    w = word.lower()
    if w in _SYLL_CACHE:
        return _SYLL_CACHE[w]
    phones = G2P(w)
    count = 0
    for ph in phones:
        base = re.sub(r"\d", "", ph)
        if base in ARPA_VOWELS:
            count += 1
    # Guard: ensure >=1 for alphabetic tokens
    if count == 0 and re.search(r"[A-Za-z]", w):
        count = 1
    _SYLL_CACHE[w] = count
    return count

def syllables_hybrid(word: str) -> int:
    """
    Hybrid policy: try CMUdict first; if OOV, fall back to g2p_en.
    """
    c = cmu_syllables(word)
    if c is not None:
        return c
    return g2p_syllables(word)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


In [3]:
# Feature computation utilities using spaCy + CMUdict with g2p_en fallback

def _word_like(tok) -> bool:
    """
    Select lexical tokens (alphabetic, not space).
    spaCy's tok.is_alpha ensures letter-only tokens; change if you want alphanumerics.
    """
    return tok.is_alpha and not tok.is_space

def _alnum_char_count(token_text: str) -> int:
    """Count alphanumeric characters for ARI; excludes whitespace and punctuation."""
    return sum(ch.isalnum() for ch in token_text)

def features_from_doc(doc: Doc) -> Dict[str, float]:
    """
    Computes:
      - avg_word_length
      - type_token_ratio
      - stopword_ratio
      - punctuation_ratio       (punct chars / non-space chars)
      - avg_sentence_length     (words per sentence)
      - sentence_length_std     (std of sentence word counts)
      - flesch_reading_ease
      - gunning_fog
      - smog_index
      - automated_readability_index
    """
    # Sentences
    sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
    n_sents = max(len(sents), 1)

    # Token groups
    word_toks = [t for t in doc if _word_like(t)]
    punct_toks = [t for t in doc if t.is_punct]
    nonspace_toks = [t for t in doc if not t.is_space]

    W = len(word_toks)

    # Characters for ARI and punctuation ratio
    chars_alnum = sum(_alnum_char_count(t.text) for t in nonspace_toks)
    punct_chars = sum(len(t.text) for t in punct_toks)
    nonspace_chars = sum(len(t.text) for t in nonspace_toks)

    # Sentence-level word counts
    sent_word_counts = [sum(1 for t in s if _word_like(t)) for s in sents]
    avg_sentence_length = float(np.mean(sent_word_counts)) if sent_word_counts else 0.0
    sentence_length_std  = float(np.std(sent_word_counts, ddof=0)) if len(sent_word_counts) > 1 else 0.0

    # Word-level lengths
    word_lengths = [len(t.text) for t in word_toks]
    avg_word_length = float(np.mean(word_lengths)) if word_lengths else 0.0

    # Type-token ratio (lowercased forms)
    vocab = {t.text.lower() for t in word_toks}
    type_token_ratio = (len(vocab) / W) if W > 0 else 0.0

    # Stopword ratio via spaCy stop flags
    stop_count = sum(1 for t in word_toks if t.is_stop)
    stopword_ratio = (stop_count / W) if W > 0 else 0.0

    # Punctuation ratio over non-space characters
    punctuation_ratio = (punct_chars / nonspace_chars) if nonspace_chars > 0 else 0.0

    # Syllables (hybrid)
    syll_per_word = [syllables_hybrid(t.text) for t in word_toks] if W > 0 else []
    syll_total = int(np.sum(syll_per_word)) if syll_per_word else 0
    polysyllables = int(np.sum([syl >= 3 for syl in syll_per_word])) if syll_per_word else 0
    complex_words = polysyllables  # standard: >= 3 syllables

    # Rates for readability
    words_per_sentence = (W / n_sents) if n_sents > 0 else 0.0
    syllables_per_word = (syll_total / W) if W > 0 else 0.0
    chars_per_word_ari = (chars_alnum / W) if W > 0 else 0.0

    # Readability indices
    # Flesch Reading Ease
    flesch = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

    # Gunning Fog
    fog = 0.4 * (words_per_sentence + 100.0 * (complex_words / W if W > 0 else 0.0))

    # SMOG
    smog = (1.043 * math.sqrt(polysyllables * (30.0 / n_sents)) + 3.1291) if (polysyllables > 0 and n_sents > 0) else 0.0

    # Automated Readability Index
    ari = 4.71 * chars_per_word_ari + 0.5 * words_per_sentence - 21.43

    return {
        "avg_word_length": avg_word_length,
        "type_token_ratio": type_token_ratio,
        "stopword_ratio": stopword_ratio,
        "punctuation_ratio": punctuation_ratio,
        "avg_sentence_length": avg_sentence_length,
        "sentence_length_std": sentence_length_std,
        "flesch_reading_ease": flesch,
        "gunning_fog": fog,
        "smog_index": smog,
        "automated_readability_index": ari,
    }



In [4]:
# N-gram feature extraction utilities

from collections import Counter
from typing import List, Tuple
import math

def extract_ngrams(tokens: List[str], n: int) -> List[Tuple[str, ...]]:
    """Extract n-grams from a list of tokens."""
    if len(tokens) < n:
        return []
    return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

def ngram_diversity(tokens: List[str], n: int) -> float:
    """
    Calculate n-gram diversity (unique n-grams / total n-grams).
    Returns 0 if no n-grams possible.
    """
    ngrams = extract_ngrams(tokens, n)
    if not ngrams:
        return 0.0
    return len(set(ngrams)) / len(ngrams)

def ngram_entropy(tokens: List[str], n: int) -> float:
    """
    Calculate Shannon entropy of n-gram distribution.
    Returns 0 if no n-grams possible.
    """
    ngrams = extract_ngrams(tokens, n)
    if not ngrams:
        return 0.0
    
    counts = Counter(ngrams)
    total = len(ngrams)
    entropy = 0.0
    
    for count in counts.values():
        prob = count / total
        entropy -= prob * math.log2(prob)
    
    return entropy

def calculate_burstiness(tokens: List[str]) -> float:
    """
    Calculate burstiness coefficient based on word frequency distribution.
    Burstiness = (sigma - mu) / (sigma + mu)
    where mu is mean frequency and sigma is standard deviation.
    Returns 0 if only one unique token.
    """
    if not tokens:
        return 0.0
    
    word_counts = Counter(tokens)
    frequencies = list(word_counts.values())
    
    if len(frequencies) < 2:
        return 0.0
    
    mu = np.mean(frequencies)
    sigma = np.std(frequencies, ddof=0)
    
    if mu + sigma == 0:
        return 0.0
    
    return (sigma - mu) / (sigma + mu)

print("N-gram and burstiness utility functions loaded.")

N-gram and burstiness utility functions loaded.


In [5]:
# Character-level feature extraction

import gzip

def character_ngram_features(text: str, n: int = 3) -> Dict[str, float]:
    """
    Extract character n-gram diversity and entropy.
    Returns diversity ratio and entropy for character n-grams.
    """
    if len(text) < n:
        return 0.0, 0.0
    
    char_ngrams = [text[i:i+n] for i in range(len(text) - n + 1)]
    
    if not char_ngrams:
        return 0.0, 0.0
    
    # Diversity
    diversity = len(set(char_ngrams)) / len(char_ngrams)
    
    # Entropy
    counts = Counter(char_ngrams)
    total = len(char_ngrams)
    entropy = 0.0
    for count in counts.values():
        prob = count / total
        entropy -= prob * math.log2(prob)
    
    return diversity, entropy

def compression_ratio(text: str) -> float:
    """
    Calculate gzip compression ratio: compressed_size / original_size.
    Lower values indicate more compressible (potentially more predictable) text.
    """
    if not text:
        return 1.0
    
    original_bytes = text.encode('utf-8')
    compressed_bytes = gzip.compress(original_bytes)
    
    return len(compressed_bytes) / len(original_bytes)

def character_statistics(text: str) -> Dict[str, float]:
    """
    Extract surface-level character statistics.
    """
    if not text:
        return {
            "uppercase_ratio": 0.0,
            "digit_ratio": 0.0,
            "whitespace_ratio": 0.0,
            "unique_char_count": 0.0,
        }
    
    total_chars = len(text)
    
    return {
        "uppercase_ratio": sum(1 for c in text if c.isupper()) / total_chars,
        "digit_ratio": sum(1 for c in text if c.isdigit()) / total_chars,
        "whitespace_ratio": sum(1 for c in text if c.isspace()) / total_chars,
        "unique_char_count": float(len(set(text))),
    }

print("Character-level feature functions loaded.")

Character-level feature functions loaded.


In [6]:
# Dependency tree and syntactic structure features

def get_tree_depth(token) -> int:
    """
    Recursively calculate the depth of the dependency subtree rooted at token.
    """
    if not list(token.children):
        return 1
    return 1 + max(get_tree_depth(child) for child in token.children)

def dependency_tree_features(doc: Doc) -> Dict[str, float]:
    """
    Extract dependency tree structural features.
    """
    word_toks = [t for t in doc if _word_like(t)]
    
    if not word_toks:
        return {
            "avg_tree_depth": 0.0,
            "max_tree_depth": 0.0,
            "avg_dependency_distance": 0.0,
            "left_dependency_ratio": 0.0,
            "right_dependency_ratio": 0.0,
        }
    
    # Find sentence roots and calculate tree depths
    sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
    tree_depths = []
    
    for sent in sents:
        root = [token for token in sent if token.head == token]
        if root:
            tree_depths.append(get_tree_depth(root[0]))
    
    avg_tree_depth = float(np.mean(tree_depths)) if tree_depths else 0.0
    max_tree_depth = float(np.max(tree_depths)) if tree_depths else 0.0
    
    # Dependency distances (how far apart head and dependent are)
    dep_distances = []
    left_deps = 0
    right_deps = 0
    
    for token in word_toks:
        if token.head != token:  # Not root
            distance = abs(token.i - token.head.i)
            dep_distances.append(distance)
            
            # Track direction
            if token.i < token.head.i:
                left_deps += 1
            else:
                right_deps += 1
    
    avg_dep_distance = float(np.mean(dep_distances)) if dep_distances else 0.0
    total_deps = left_deps + right_deps
    
    left_ratio = left_deps / total_deps if total_deps > 0 else 0.0
    right_ratio = right_deps / total_deps if total_deps > 0 else 0.0
    
    return {
        "avg_tree_depth": avg_tree_depth,
        "max_tree_depth": max_tree_depth,
        "avg_dependency_distance": avg_dep_distance,
        "left_dependency_ratio": left_ratio,
        "right_dependency_ratio": right_ratio,
    }

print("Dependency tree feature functions loaded.")

Dependency tree feature functions loaded.


In [7]:
# Vocabulary sophistication and lexical diversity measures

def hapax_legomena_ratio(tokens: List[str]) -> float:
    """
    Calculate ratio of words appearing exactly once (hapax legomena).
    Indicates vocabulary richness.
    """
    if not tokens:
        return 0.0
    
    word_counts = Counter(tokens)
    hapax_count = sum(1 for count in word_counts.values() if count == 1)
    
    return hapax_count / len(tokens)

def yules_k(tokens: List[str]) -> float:
    """
    Calculate Yule's K measure of lexical diversity.
    More robust to text length than TTR.
    K = 10000 * (M2 - M1) / (M1 * M1)
    where M1 = number of tokens, M2 = sum of (frequency^2 * num_words_with_that_frequency)
    """
    if not tokens:
        return 0.0
    
    word_counts = Counter(tokens)
    M1 = len(tokens)
    
    frequency_spectrum = Counter(word_counts.values())
    M2 = sum(freq * freq * count for freq, count in frequency_spectrum.items())
    
    if M1 <= 1:
        return 0.0
    
    K = 10000 * (M2 - M1) / (M1 * M1)
    return K

def mtld(tokens: List[str], threshold: float = 0.72) -> float:
    """
    Calculate MTLD (Measure of Textual Lexical Diversity).
    Counts how many sequential word segments maintain TTR above threshold.
    """
    if len(tokens) < 10:
        return float(len(set(tokens)))
    
    def compute_factor(token_list):
        ttr = 1.0
        word_set = set()
        factor_count = 0.0
        
        for i, token in enumerate(token_list, 1):
            word_set.add(token)
            ttr = len(word_set) / i
            
            if ttr < threshold:
                factor_count += 1
                word_set = set()
        
        if len(word_set) > 0:
            factor_count += (1.0 - ttr) / (1.0 - threshold)
        
        return factor_count
    
    # Calculate forward and backward
    forward = compute_factor(tokens)
    backward = compute_factor(list(reversed(tokens)))
    
    if forward == 0 and backward == 0:
        return float(len(tokens))
    
    factors = [f for f in [forward, backward] if f > 0]
    mean_factor = np.mean(factors)
    
    return len(tokens) / mean_factor if mean_factor > 0 else float(len(tokens))

def vocabulary_sophistication_features(tokens: List[str]) -> Dict[str, float]:
    """
    Aggregate vocabulary sophistication measures.
    """
    return {
        "hapax_legomena_ratio": hapax_legomena_ratio(tokens),
        "yules_k": yules_k(tokens),
        "mtld": mtld(tokens),
    }

print("Vocabulary sophistication functions loaded.")

Vocabulary sophistication functions loaded.


In [8]:
# Punctuation pattern analysis

def punctuation_patterns(doc: Doc) -> Dict[str, float]:
    """
    Detailed punctuation pattern features beyond simple ratio.
    """
    all_tokens = [t for t in doc if not t.is_space]
    punct_tokens = [t for t in doc if t.is_punct]
    
    if not all_tokens:
        return {
            "comma_ratio": 0.0,
            "period_ratio": 0.0,
            "question_ratio": 0.0,
            "exclamation_ratio": 0.0,
            "semicolon_ratio": 0.0,
            "colon_ratio": 0.0,
            "quote_ratio": 0.0,
        }
    
    total = len(all_tokens)
    
    # Count specific punctuation marks
    punct_text = ''.join([t.text for t in punct_tokens])
    
    return {
        "comma_ratio": punct_text.count(',') / total,
        "period_ratio": punct_text.count('.') / total,
        "question_ratio": punct_text.count('?') / total,
        "exclamation_ratio": punct_text.count('!') / total,
        "semicolon_ratio": punct_text.count(';') / total,
        "colon_ratio": punct_text.count(':') / total,
        "quote_ratio": (punct_text.count('"') + punct_text.count("'")) / total,
    }

print("Punctuation pattern functions loaded.")

Punctuation pattern functions loaded.


In [10]:
from textblob import TextBlob

def sentiment_features(text: str, doc: Doc) -> Dict[str, float]:
    """
    Extract sentiment and emotional tone features.
    Uses TextBlob for polarity and subjectivity.
    """
    if not text or not text.strip():
        return {
            "sentiment_polarity": 0.0,
            "sentiment_subjectivity": 0.0,
            "sentiment_polarity_variance": 0.0,
            "positive_word_ratio": 0.0,
            "negative_word_ratio": 0.0,
            "neutral_sentence_ratio": 0.0,
        }
    
    # Overall document sentiment
    blob = TextBlob(text)
    features = {
        "sentiment_polarity": blob.sentiment.polarity,  # -1 (negative) to 1 (positive)
        "sentiment_subjectivity": blob.sentiment.subjectivity,  # 0 (objective) to 1 (subjective)
    }
    
    # Sentence-level sentiment variance
    sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
    sent_polarities = []
    neutral_count = 0
    
    for sent in sents:
        sent_blob = TextBlob(sent.text)
        polarity = sent_blob.sentiment.polarity
        sent_polarities.append(polarity)
        
        # Count neutral sentences (polarity close to 0)
        if abs(polarity) < 0.1:
            neutral_count += 1
    
    features["sentiment_polarity_variance"] = float(np.var(sent_polarities)) if len(sent_polarities) > 1 else 0.0
    features["neutral_sentence_ratio"] = neutral_count / len(sents) if sents else 0.0
    
    # Positive/negative word ratios using spaCy tokens
    word_toks = [t for t in doc if _word_like(t)]
    if word_toks:
        positive_count = 0
        negative_count = 0
        
        for token in word_toks:
            word_blob = TextBlob(token.text.lower())
            polarity = word_blob.sentiment.polarity
            
            if polarity > 0.1:
                positive_count += 1
            elif polarity < -0.1:
                negative_count += 1
        
        features["positive_word_ratio"] = positive_count / len(word_toks)
        features["negative_word_ratio"] = negative_count / len(word_toks)
    else:
        features["positive_word_ratio"] = 0.0
        features["negative_word_ratio"] = 0.0
    
    return features

In [11]:
# Application NLP.pipe based

BATCH_SIZE = 64
# 1 for deterministic ordering in some environments; -1  all available cores
N_PROCESS = -1

texts = df[TEXT_COL].astype(str).tolist()

feature_rows: List[Dict[str, float]] = []
for text, doc in zip(texts, nlp.pipe(texts, batch_size=BATCH_SIZE, n_process=N_PROCESS)):
    doc_features: Dict[str, float] = {}

    tokens = [t.text for t in doc if _word_like(t)]
    doc_features.update(features_from_doc(doc))

    # Word n-gram diversity/entropy and burstiness
    doc_features["unigram_diversity"] = ngram_diversity(tokens, 1)
    doc_features["bigram_diversity"] = ngram_diversity(tokens, 2)
    doc_features["trigram_diversity"] = ngram_diversity(tokens, 3)
    doc_features["bigram_entropy"] = ngram_entropy(tokens, 2)
    doc_features["trigram_entropy"] = ngram_entropy(tokens, 3)
    doc_features["token_burstiness"] = calculate_burstiness(tokens)

    # Character-level features
    trigram_diversity, trigram_entropy = character_ngram_features(text, n=3)
    doc_features["char_trigram_diversity"] = trigram_diversity
    doc_features["char_trigram_entropy"] = trigram_entropy
    doc_features["compression_ratio"] = compression_ratio(text)
    doc_features.update(character_statistics(text))

    # Syntactic, lexical, punctuation, and sentiment features
    doc_features.update(dependency_tree_features(doc))
    doc_features.update(vocabulary_sophistication_features(tokens))
    doc_features.update(punctuation_patterns(doc))
    doc_features.update(sentiment_features(text, doc))

    feature_rows.append(doc_features)

feat_df = pd.DataFrame(feature_rows)
df_with_features = pd.concat([df.reset_index(drop=True), feat_df.reset_index(drop=True)], axis=1)

print("Computed feature columns:")
print(list(feat_df.columns))
df_with_features.head(3)


Computed feature columns:
['avg_word_length', 'type_token_ratio', 'stopword_ratio', 'punctuation_ratio', 'avg_sentence_length', 'sentence_length_std', 'flesch_reading_ease', 'gunning_fog', 'smog_index', 'automated_readability_index', 'unigram_diversity', 'bigram_diversity', 'trigram_diversity', 'bigram_entropy', 'trigram_entropy', 'token_burstiness', 'char_trigram_diversity', 'char_trigram_entropy', 'compression_ratio', 'uppercase_ratio', 'digit_ratio', 'whitespace_ratio', 'unique_char_count', 'avg_tree_depth', 'max_tree_depth', 'avg_dependency_distance', 'left_dependency_ratio', 'right_dependency_ratio', 'hapax_legomena_ratio', 'yules_k', 'mtld', 'comma_ratio', 'period_ratio', 'question_ratio', 'exclamation_ratio', 'semicolon_ratio', 'colon_ratio', 'quote_ratio', 'sentiment_polarity', 'sentiment_subjectivity', 'sentiment_polarity_variance', 'neutral_sentence_ratio', 'positive_word_ratio', 'negative_word_ratio']


Unnamed: 0,id,adv_source_id,source_id,model,decoding,repetition_penalty,attack,domain,title,prompt,...,exclamation_ratio,semicolon_ratio,colon_ratio,quote_ratio,sentiment_polarity,sentiment_subjectivity,sentiment_polarity_variance,neutral_sentence_ratio,positive_word_ratio,negative_word_ratio
0,404002f2-1cdb-4021-9215-976801dbf032,0333dff5-5fb3-4c06-a892-3ae170dfae75,0333dff5-5fb3-4c06-a892-3ae170dfae75,human,,,whitespace,reddit,Anxiety over naptime and transitioning to daycare,,...,0.0,0.0,0.0,0.0,0.025321,0.473184,0.070465,0.4375,0.04898,0.02449
1,eedc3d68-f646-425b-bb9c-9ce469d1a348,07c99297-5b54-4240-987f-91cb2e6bbe45,07c99297-5b54-4240-987f-91cb2e6bbe45,human,,,insert_paragraphs,poetry,Love And War,,...,0.011662,0.011662,0.002915,0.0,0.225272,0.601691,0.049464,0.545455,0.051661,0.01476
2,146c7064-c894-4290-b79e-daa517c3dec7,b5faf769-2be4-4428-9cde-6a6fd3fdc80a,b5faf769-2be4-4428-9cde-6a6fd3fdc80a,human,,,perplexity_misspelling,reddit,Socialists have more interest in controlling y...,,...,0.0,0.0,0.0,0.0,0.378333,0.341667,0.055892,0.7,0.045455,0.0


In [12]:
#  Save enriched dataset and basic descriptive statistics

OUT_WITH_FEATS = f"raid_sample_{SELECTED_DATASET}_with_features_PREPOS.csv"
df_with_features.to_csv(OUT_WITH_FEATS, index=False)
print(f"Saved enriched dataset: {OUT_WITH_FEATS}  (rows: {len(df_with_features)})")

feature_cols = [col for col in df_with_features.columns if col not in df.columns]
print(f"Feature columns saved ({len(feature_cols)} total):")
print(feature_cols)

display(df_with_features[feature_cols].describe(percentiles=[0.1, 0.5, 0.9]).T)


Saved enriched dataset: raid_sample_small_with_features_PREPOS.csv  (rows: 3000)
Feature columns saved (42 total):
['type_token_ratio', 'stopword_ratio', 'punctuation_ratio', 'avg_sentence_length', 'sentence_length_std', 'flesch_reading_ease', 'gunning_fog', 'smog_index', 'automated_readability_index', 'unigram_diversity', 'bigram_diversity', 'trigram_diversity', 'bigram_entropy', 'trigram_entropy', 'token_burstiness', 'char_trigram_diversity', 'char_trigram_entropy', 'compression_ratio', 'uppercase_ratio', 'whitespace_ratio', 'unique_char_count', 'avg_tree_depth', 'max_tree_depth', 'avg_dependency_distance', 'left_dependency_ratio', 'right_dependency_ratio', 'hapax_legomena_ratio', 'yules_k', 'mtld', 'comma_ratio', 'period_ratio', 'question_ratio', 'exclamation_ratio', 'semicolon_ratio', 'colon_ratio', 'quote_ratio', 'sentiment_polarity', 'sentiment_subjectivity', 'sentiment_polarity_variance', 'neutral_sentence_ratio', 'positive_word_ratio', 'negative_word_ratio']


Unnamed: 0,count,mean,std,min,10%,50%,90%,max
type_token_ratio,3000.0,0.584479,0.145419,0.004184,0.445572,0.578947,0.747368,1.0
stopword_ratio,3000.0,0.4395,0.170037,0.0,0.178825,0.474031,0.610283,0.898438
punctuation_ratio,3000.0,0.029414,0.011826,0.0,0.016418,0.028056,0.044707,0.100962
avg_sentence_length,3000.0,22.980522,21.406023,2.333333,12.5,19.538462,30.620513,356.0
sentence_length_std,3000.0,10.242729,10.866627,0.0,4.049981,7.789765,16.690021,173.5
flesch_reading_ease,3000.0,53.26772,30.376332,-268.334775,20.280795,58.412607,82.008107,110.639455
gunning_fog,3000.0,14.38853,9.321022,2.542922,7.591747,12.949603,20.922971,148.242697
smog_index,3000.0,12.103777,4.356894,0.0,7.44753,11.690004,17.122413,66.486521
automated_readability_index,3000.0,12.89804,11.296721,-1.387872,5.709764,11.077639,19.216023,175.471105
unigram_diversity,3000.0,0.607818,0.147812,0.004184,0.469693,0.601504,0.775775,1.0


In [26]:
df = df_with_features.copy()

In [27]:
for col in ["avg_sentence_length","flesch_reading_ease","gunning_fog",
            "automated_readability_index","mtld","yules_k","max_tree_depth"]:
    if col in df.columns:
        print(col, "min:", df[col].min(), "max:", df[col].max())


avg_sentence_length min: 2.3333333333333335 max: 356.0
flesch_reading_ease min: -268.33477528089884 max: 110.63945543603536
gunning_fog min: 2.542922374429224 max: 148.2426966292135
automated_readability_index min: -1.3878716865155596 max: 175.471104815864
mtld min: 1.004201680672269 max: 34890.519999999386
yules_k min: 0.0 max: 9918.69918699187
max_tree_depth min: 4.0 max: 166.0


In [28]:
import numpy as np
import pandas as pd

# -------------------------------
# Helpers
# -------------------------------

NUMERIC_COLS = [
    "avg_sentence_length","sentence_length_std","flesch_reading_ease",
    "gunning_fog","automated_readability_index","mtld","yules_k",
    "bigram_entropy","trigram_entropy","bigram_diversity","trigram_diversity",
    "avg_tree_depth","max_tree_depth","avg_dependency_distance",
    "compression_ratio","uppercase_ratio","unique_char_count","whitespace_ratio"
]

def ensure_numeric(df: pd.DataFrame, cols=NUMERIC_COLS) -> pd.DataFrame:
    """Coerce known feature columns to numeric (if present)."""
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def _q(df, col, q):
    return float(np.nanquantile(df[col].values, q)) if col in df else np.nan

# -------------------------------
# Thresholds and Diagnostics
# -------------------------------

def compute_diagnostic_thresholds(df: pd.DataFrame) -> dict:
    """Data-driven thresholds (quantile-based) + a few hard guards."""
    thr = {}
    thr["asl_hi"]       = max(80.0, _q(df, "avg_sentence_length", 0.99))
    thr["sls_hi"]       = max(80.0, _q(df, "sentence_length_std", 0.99))
    thr["fog_hi"]       = max(60.0, _q(df, "gunning_fog", 0.995))
    thr["ari_hi"]       = max(60.0, _q(df, "automated_readability_index", 0.995))
    thr["fre_lo"]       = min(-50.0, _q(df, "flesch_reading_ease", 0.005))
    thr["mtld_hi"]      = max(500.0, _q(df, "mtld", 0.995))
    thr["yk_hi"]        = max(1000.0, _q(df, "yules_k", 0.995))
    thr["depth_max_hi"] = max(50.0, _q(df, "max_tree_depth", 0.995))
    thr["depth_avg_hi"] = max(25.0, _q(df, "avg_tree_depth", 0.995))
    thr["depdist_hi"]   = max(8.0, _q(df, "avg_dependency_distance", 0.995))
    thr["comp_hi"]      = 1.05
    thr["upper_hi"]     = max(0.15, _q(df, "uppercase_ratio", 0.995))
    thr["uniq_hi"]      = max(80.0, _q(df, "unique_char_count", 0.995))
    thr["ws_lo"]        = min(0.12, _q(df, "whitespace_ratio", 0.005))
    thr["ws_hi"]        = max(0.25, _q(df, "whitespace_ratio", 0.995))
    return thr

def diagnose_feature_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Return diagnostics dataframe with boolean flags and suspected causes list."""
    # Ensure numeric so comparisons fire correctly
    df = ensure_numeric(df)

    thr = compute_diagnostic_thresholds(df)
    D = pd.DataFrame(index=df.index)

    # Flag definitions
    D["seg_len_extreme"] = (
        (df.get("avg_sentence_length", np.nan) > thr["asl_hi"]) |
        (df.get("sentence_length_std", np.nan) > thr["sls_hi"])
    )
    D["readability_outlier"] = (
        (df.get("flesch_reading_ease", np.nan) < thr["fre_lo"]) |
        (df.get("gunning_fog", np.nan) > thr["fog_hi"]) |
        (df.get("automated_readability_index", np.nan) > thr["ari_hi"])
    )
    D["lexical_instability"] = (
        (df.get("mtld", np.nan) > thr["mtld_hi"]) |
        (df.get("yules_k", np.nan) > thr["yk_hi"])
    )
    D["ngram_edge_effects"] = (
        (df.get("bigram_entropy", np.nan) == 0) |
        (df.get("trigram_entropy", np.nan) == 0) |
        ((df.get("trigram_diversity", np.nan) >= 0.99) & (df.get("trigram_entropy", np.nan) < 1.0)) |
        ((df.get("bigram_diversity", np.nan)   >= 0.99) & (df.get("bigram_entropy", np.nan)   < 1.0))
    )
    D["depth_implausible"] = (
        (df.get("max_tree_depth", np.nan) > thr["depth_max_hi"]) |
        (df.get("avg_tree_depth", np.nan) > thr["depth_avg_hi"])
    )
    D["dep_distance_implausible"] = (df.get("avg_dependency_distance", np.nan) > thr["depdist_hi"])
    D["compression_overhead"] = (df.get("compression_ratio", np.nan) > thr["comp_hi"])
    D["markup_noise"] = (
        ((df.get("uppercase_ratio", np.nan) > thr["upper_hi"]) & (df.get("unique_char_count", np.nan) > thr["uniq_hi"])) |
        (df.get("whitespace_ratio", np.nan) < thr["ws_lo"]) |
        (df.get("whitespace_ratio", np.nan) > thr["ws_hi"])
    )

    # Suspected causes column (pre-allocate + .at)
    D["suspected_causes"] = pd.Series([[] for _ in range(len(D))], index=D.index, dtype=object)
    for i in D.index:
        c = []
        if bool(D.at[i, "depth_implausible"]) or bool(D.at[i, "dep_distance_implausible"]):
            c.append("dependency_depth_computation_bug")
        if bool(D.at[i, "seg_len_extreme"]) and bool(D.at[i, "readability_outlier"]):
            c.append("sentence_segmentation_failure")
        if bool(D.at[i, "lexical_instability"]) or bool(D.at[i, "ngram_edge_effects"]):
            c.append("lexical_metric_instability_or_length_effects")
        if bool(D.at[i, "compression_overhead"]) or bool(D.at[i, "markup_noise"]):
            c.append("markup_or_code_noise")
        D.at[i, "suspected_causes"] = c

    # Severity score
    flag_cols = [c for c in D.columns if c != "suspected_causes"]
    for c in flag_cols:
        D[c] = D[c].fillna(False).astype(bool)
    D["severity"] = D[flag_cols].sum(axis=1).astype(int)

    # Attach thresholds for later inspection
    D.attrs["thresholds"] = thr
    return D

def build_diagnostic_report(diag: pd.DataFrame, top_k: int = 15):
    """Summarize counts by cause and return top offending rows by severity."""
    cause_counts = (
        diag["suspected_causes"].explode().value_counts(dropna=True)
        .rename_axis("cause").to_frame("count")
    )
    flag_counts = (
        diag.drop(columns=["suspected_causes"])
            .sum()
            .sort_values(ascending=False)
            .to_frame("count")
    )
    top_offenders = diag.sort_values("severity", ascending=False).head(top_k)
    return {"cause_counts": cause_counts, "flag_counts": flag_counts, "top_offenders": top_offenders}

# -------------------------------
# Runner / Example usage
# -------------------------------

# Assume you already have `df` with your features
# If your features were just computed and may be strings, the coercion inside
# diagnose_feature_outliers will handle them; coercing here is optional:
# df = ensure_numeric(df)

diag = diagnose_feature_outliers(df)
rep  = build_diagnostic_report(diag, top_k=20)

print("Computed thresholds:")
for k, v in diag.attrs.get("thresholds", {}).items():
    print(f"  {k}: {v}")

from IPython.display import display

print("\nCause counts:")
display(rep["cause_counts"])

print("\nFlag counts:")
display(rep["flag_counts"])

print("\nTop offenders (with original feature values):")
cols_to_show = [
    "avg_sentence_length","sentence_length_std","flesch_reading_ease",
    "gunning_fog","automated_readability_index","mtld","yules_k",
    "avg_tree_depth","max_tree_depth","avg_dependency_distance",
    "compression_ratio","uppercase_ratio","unique_char_count",
    "whitespace_ratio","bigram_entropy","trigram_entropy",
    "bigram_diversity","trigram_diversity"
]
existing_cols = [c for c in cols_to_show if c in df.columns]
display(rep["top_offenders"].join(df[existing_cols], how="left"))





Computed thresholds:
  asl_hi: 105.66999999999993
  sls_hi: 80.0
  fog_hi: 69.58179761904763
  ari_hi: 87.17153812500003
  fre_lo: -88.94751031717743
  mtld_hi: 9246.999300000036
  yk_hi: 1515.6763993614138
  depth_max_hi: 50.0
  depth_avg_hi: 25.0
  depdist_hi: 8.0
  comp_hi: 1.05
  upper_hi: 0.15
  uniq_hi: 80.0
  ws_lo: 0.12
  ws_hi: 0.25

Cause counts:


Unnamed: 0_level_0,count
cause,Unnamed: 1_level_1
lexical_metric_instability_or_length_effects,30
dependency_depth_computation_bug,18
sentence_segmentation_failure,18
markup_or_code_noise,16



Flag counts:


Unnamed: 0,count
severity,121
seg_len_extreme,38
lexical_instability,30
readability_outlier,18
depth_implausible,12
markup_noise,10
compression_overhead,6
dep_distance_implausible,6
ngram_edge_effects,1



Top offenders (with original feature values):


Unnamed: 0,seg_len_extreme,readability_outlier,lexical_instability,ngram_edge_effects,depth_implausible,dep_distance_implausible,compression_overhead,markup_noise,suspected_causes,severity,...,max_tree_depth,avg_dependency_distance,compression_ratio,uppercase_ratio,unique_char_count,whitespace_ratio,bigram_entropy,trigram_entropy,bigram_diversity,trigram_diversity
2131,True,True,True,True,True,False,False,True,"[dependency_depth_computation_bug, sentence_se...",6,...,122.0,1.008197,0.031805,0.090976,10.0,0.090237,0.0,0.0,0.008197,0.008264
1731,True,True,True,False,True,False,False,True,"[dependency_depth_computation_bug, sentence_se...",5,...,139.0,1.804035,0.049206,0.057937,15.0,0.27619,2.387079,2.387206,0.022989,0.023055
369,True,True,True,False,False,False,False,True,"[sentence_segmentation_failure, lexical_metric...",4,...,16.0,4.238693,0.497942,0.009053,42.0,0.109739,8.647458,8.643856,1.0,1.0
2007,True,True,True,False,True,False,False,False,"[dependency_depth_computation_bug, sentence_se...",4,...,83.0,3.48,0.033784,0.050369,13.0,0.199631,1.99998,2.0,0.012308,0.012346
2363,True,True,True,False,True,False,False,False,"[dependency_depth_computation_bug, sentence_se...",4,...,100.0,1.874525,0.057904,0.0,14.0,0.241728,2.999926,2.999873,0.030418,0.030534
1381,True,True,True,False,False,False,False,False,"[sentence_segmentation_failure, lexical_metric...",3,...,13.0,6.292994,0.133734,0.000751,29.0,0.157025,3.659705,4.227868,0.216561,0.269231
225,True,True,False,False,True,False,False,False,"[dependency_depth_computation_bug, sentence_se...",3,...,79.0,1.890196,0.064407,0.0,19.0,0.216102,3.321651,3.32166,0.039216,0.03937
1462,True,True,True,False,False,False,False,False,"[sentence_segmentation_failure, lexical_metric...",3,...,12.0,7.874317,0.165308,0.00593,35.0,0.167532,4.316971,5.107096,0.273224,0.346154
1105,True,True,False,False,False,True,False,False,"[dependency_depth_computation_bug, sentence_se...",3,...,17.0,12.525568,0.058041,0.041124,24.0,0.183238,3.612247,3.82964,0.039773,0.048433
1746,True,True,False,False,False,True,False,False,"[dependency_depth_computation_bug, sentence_se...",3,...,8.0,14.422535,0.084525,0.031697,21.0,0.220634,4.226473,4.841327,0.095775,0.129944
