In [1]:
#Configuration and dataset selection

import sys
import math
import re
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import spacy
from spacy.language import Language
from spacy.tokens import Doc

import nltk
from nltk.corpus import cmudict

from g2p_en import G2p


# Reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Paths to the three samples
SAMPLE_PATHS = {
    "small":  "raid_sample_small.csv",
    "medium": "raid_sample_medium.csv",
    "large":  "raid_sample_large.csv",
}

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Set this to one of: "small", "medium", "large"
SELECTED_DATASET = "large"
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

TEXT_COL = "generation"

df = pd.read_csv(SAMPLE_PATHS[SELECTED_DATASET])
print(f"Loaded {SELECTED_DATASET} dataset with {len(df)} rows.")
df.head(3)


Loaded large dataset with 60000 rows.


Unnamed: 0,id,adv_source_id,source_id,model,decoding,repetition_penalty,attack,domain,title,prompt,generation,is_ai,source_type,n_tokens_ws,n_chars,length_bin
0,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,human,,,none,abstracts,Combo Loss: Handling Input and Output Imbalanc...,,Simultaneous segmentation of multiple organs f...,False,human,271,1816,long
1,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,human,,,none,abstracts,A Variational Image Segmentation Model based o...,,Image segmentation is a fundamental research t...,False,human,271,1892,long
2,38d4f731-4259-4770-9669-255b61bf61b2,38d4f731-4259-4770-9669-255b61bf61b2,38d4f731-4259-4770-9669-255b61bf61b2,human,,,none,abstracts,Computing Valid p-values for Image Segmentatio...,,Image segmentation is one of the most fundamen...,False,human,183,1283,medium


In [2]:
# Initialize spaCy, CMUdict (NLTK), and g2p_en fallback

# Ensure CMUdict is available
nltk.download('cmudict', quiet=True)
nltk.download('averaged_perceptron_tagger_eng')
CMU = cmudict.dict()  # key: lowercase word, value: list of pronunciations (list of ARPAbet tokens)

# Load spaCy English model (use 'en_core_web_sm' unless you already have md/lg installed)
try:
    nlp: Language = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Ensure sentence boundaries are available (parser usually handles this; add sentencizer if needed)
if "sentencizer" not in nlp.pipe_names and "parser" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# g2p_en for OOV coverage
G2P = G2p()

# ARPAbet vowel bases (stress digits removed when checking)
ARPA_VOWELS = {
    "AA", "AE", "AH", "AO", "AW", "AY",
    "EH", "ER", "EY",
    "IH", "IY",
    "OW", "OY",
    "UH", "UW"
}

# Cache syllable counts for speed
_SYLL_CACHE: Dict[str, int] = {}

def cmu_syllables(word: str) -> int | None:
    """
    Returns syllable count using CMUdict if available; else None.
    Policy: use the first pronunciation variant.
    """
    w = word.lower()
    if w not in CMU:
        return None
    phones = CMU[w][0]
    count = 0
    for ph in phones:
        base = re.sub(r"\d", "", ph)
        if base in ARPA_VOWELS:
            count += 1
    return max(count, 1)  # at least one for non-empty alphabetic words

def g2p_syllables(word: str) -> int:
    """
    Returns syllable count using neural g2p_en; counts vowel phonemes.
    """
    w = word.lower()
    if w in _SYLL_CACHE:
        return _SYLL_CACHE[w]
    phones = G2P(w)
    count = 0
    for ph in phones:
        base = re.sub(r"\d", "", ph)
        if base in ARPA_VOWELS:
            count += 1
    # Guard: ensure >=1 for alphabetic tokens
    if count == 0 and re.search(r"[A-Za-z]", w):
        count = 1
    _SYLL_CACHE[w] = count
    return count

def syllables_hybrid(word: str) -> int:
    """
    Hybrid policy: try CMUdict first; if OOV, fall back to g2p_en.
    """
    c = cmu_syllables(word)
    if c is not None:
        return c
    return g2p_syllables(word)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Feature computation utilities using spaCy + CMUdict with g2p_en fallback

def _word_like(tok) -> bool:
    """
    Select lexical tokens (alphabetic, not space).
    spaCy's tok.is_alpha ensures letter-only tokens; change if you want alphanumerics.
    """
    return tok.is_alpha and not tok.is_space

def _alnum_char_count(token_text: str) -> int:
    """Count alphanumeric characters for ARI; excludes whitespace and punctuation."""
    return sum(ch.isalnum() for ch in token_text)

def features_from_doc(doc: Doc) -> Dict[str, float]:
    """
    Computes:
      - avg_word_length
      - type_token_ratio
      - stopword_ratio
      - punctuation_ratio       (punct chars / non-space chars)
      - avg_sentence_length     (words per sentence)
      - sentence_length_std     (std of sentence word counts)
      - flesch_reading_ease
      - gunning_fog
      - smog_index
      - automated_readability_index
    """
    # Sentences
    sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
    n_sents = max(len(sents), 1)

    # Token groups
    word_toks = [t for t in doc if _word_like(t)]
    punct_toks = [t for t in doc if t.is_punct]
    nonspace_toks = [t for t in doc if not t.is_space]

    W = len(word_toks)

    # Characters for ARI and punctuation ratio
    chars_alnum = sum(_alnum_char_count(t.text) for t in nonspace_toks)
    punct_chars = sum(len(t.text) for t in punct_toks)
    nonspace_chars = sum(len(t.text) for t in nonspace_toks)

    # Sentence-level word counts
    sent_word_counts = [sum(1 for t in s if _word_like(t)) for s in sents]
    avg_sentence_length = float(np.mean(sent_word_counts)) if sent_word_counts else 0.0
    sentence_length_std  = float(np.std(sent_word_counts, ddof=0)) if len(sent_word_counts) > 1 else 0.0

    # Word-level lengths
    word_lengths = [len(t.text) for t in word_toks]
    avg_word_length = float(np.mean(word_lengths)) if word_lengths else 0.0

    # Type-token ratio (lowercased forms)
    vocab = {t.text.lower() for t in word_toks}
    type_token_ratio = (len(vocab) / W) if W > 0 else 0.0

    # Stopword ratio via spaCy stop flags
    stop_count = sum(1 for t in word_toks if t.is_stop)
    stopword_ratio = (stop_count / W) if W > 0 else 0.0

    # Punctuation ratio over non-space characters
    punctuation_ratio = (punct_chars / nonspace_chars) if nonspace_chars > 0 else 0.0

    # Syllables (hybrid)
    syll_per_word = [syllables_hybrid(t.text) for t in word_toks] if W > 0 else []
    syll_total = int(np.sum(syll_per_word)) if syll_per_word else 0
    polysyllables = int(np.sum([syl >= 3 for syl in syll_per_word])) if syll_per_word else 0
    complex_words = polysyllables  # standard: >= 3 syllables

    # Rates for readability
    words_per_sentence = (W / n_sents) if n_sents > 0 else 0.0
    syllables_per_word = (syll_total / W) if W > 0 else 0.0
    chars_per_word_ari = (chars_alnum / W) if W > 0 else 0.0

    # Readability indices
    # Flesch Reading Ease
    flesch = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

    # Gunning Fog
    fog = 0.4 * (words_per_sentence + 100.0 * (complex_words / W if W > 0 else 0.0))

    # SMOG
    smog = (1.043 * math.sqrt(polysyllables * (30.0 / n_sents)) + 3.1291) if (polysyllables > 0 and n_sents > 0) else 0.0

    # Automated Readability Index
    ari = 4.71 * chars_per_word_ari + 0.5 * words_per_sentence - 21.43

    return {
        "avg_word_length": avg_word_length,
        "type_token_ratio": type_token_ratio,
        "stopword_ratio": stopword_ratio,
        "punctuation_ratio": punctuation_ratio,
        "avg_sentence_length": avg_sentence_length,
        "sentence_length_std": sentence_length_std,
        "flesch_reading_ease": flesch,
        "gunning_fog": fog,
        "smog_index": smog,
        "automated_readability_index": ari,
    }



In [4]:
# Application NLP.pipe based

BATCH_SIZE = 64
# 1 for deterministic ordering in some environments; -1  all available cores
N_PROCESS = -1

texts = df[TEXT_COL].astype(str).tolist()

feature_rows: List[Dict[str, float]] = []
for doc in nlp.pipe(texts, batch_size=BATCH_SIZE, n_process=N_PROCESS):
    feature_rows.append(features_from_doc(doc))

feat_df = pd.DataFrame(feature_rows)
df_with_features = pd.concat([df.reset_index(drop=True), feat_df.reset_index(drop=True)], axis=1)

print("Computed feature columns:")
print(list(feat_df.columns))
df_with_features.head(3)


Computed feature columns:
['avg_word_length', 'type_token_ratio', 'stopword_ratio', 'punctuation_ratio', 'avg_sentence_length', 'sentence_length_std', 'flesch_reading_ease', 'gunning_fog', 'smog_index', 'automated_readability_index']


Unnamed: 0,id,adv_source_id,source_id,model,decoding,repetition_penalty,attack,domain,title,prompt,...,avg_word_length,type_token_ratio,stopword_ratio,punctuation_ratio,avg_sentence_length,sentence_length_std,flesch_reading_ease,gunning_fog,smog_index,automated_readability_index
0,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,human,,,none,abstracts,Combo Loss: Handling Input and Output Imbalanc...,,...,5.434783,0.576087,0.387681,0.023933,27.6,7.337575,21.88187,20.750145,17.916177,18.053152
1,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,human,,,none,abstracts,A Variational Image Segmentation Model based o...,,...,5.691756,0.512545,0.415771,0.020962,19.928571,7.075756,18.317177,18.294009,16.084391,15.342458
2,38d4f731-4259-4770-9669-255b61bf61b2,38d4f731-4259-4770-9669-255b61bf61b2,38d4f731-4259-4770-9669-255b61bf61b2,human,,,none,abstracts,Computing Valid p-values for Image Segmentatio...,,...,5.739362,0.542553,0.446809,0.019982,23.5,9.0,12.8825,21.314894,18.243606,17.352394


In [5]:
# Cell 5: Save enriched dataset and basic descriptive statistics

OUT_WITH_FEATS = f"raid_sample_{SELECTED_DATASET}_with_features_PREPOS.csv"
df_with_features.to_csv(OUT_WITH_FEATS, index=False)
print(f"Saved enriched dataset: {OUT_WITH_FEATS}  (rows: {len(df_with_features)})")

cols_to_describe = [
    "avg_word_length",
    "type_token_ratio",
    "stopword_ratio",
    "punctuation_ratio",
    "avg_sentence_length",
    "sentence_length_std",
    "flesch_reading_ease",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
]
display(df_with_features[cols_to_describe].describe(percentiles=[0.1, 0.5, 0.9]).T)


Saved enriched dataset: raid_sample_large_with_features_PREPOS.csv  (rows: 60000)


Unnamed: 0,count,mean,std,min,10%,50%,90%,max
avg_word_length,60000.0,4.395837,1.548118,0.0,3.586753,4.679701,5.567568,103.444444
type_token_ratio,60000.0,0.543659,0.237435,0.0,0.081917,0.570597,0.807692,1.0
stopword_ratio,60000.0,0.401944,0.205058,0.0,0.0,0.462687,0.604297,1.0
punctuation_ratio,60000.0,0.027598,0.018124,0.0,0.005291,0.026568,0.046552,1.0
avg_sentence_length,60000.0,22.572604,28.120557,0.0,8.666667,19.166667,30.5,510.0
sentence_length_std,60000.0,9.283595,13.115418,0.0,0.0,6.819091,15.881937,507.187233
flesch_reading_ease,60000.0,63.494831,55.253984,-451.815,19.530901,59.35073,94.516103,206.835
gunning_fog,60000.0,13.960141,12.196909,0.0,5.152914,12.755556,20.907391,204.0
smog_index,60000.0,11.330193,5.831059,0.0,4.604125,11.491704,17.122413,81.24973
automated_readability_index,60000.0,11.222168,28.318284,-21.43,2.223114,10.911729,19.190989,4543.06
