In [None]:
# Environment setup: install/upgrade spaCy, its transformer extension and the
# PyTorch packages into the kernel's environment. Versions are not pinned, so
# `--upgrade` pulls whatever is latest at run time.
%pip install -q spacy spacy-transformers torch torchvision torchaudio --upgrade
# Download the English transformer pipeline used by the rest of the notebook.
# NOTE: the command's own output advises restarting the kernel/runtime so the
# freshly installed dependencies are picked up.
!python -m spacy download en_core_web_trf

import spacy

Collecting en-core-web-trf==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load spaCy transformer model (en_core_web_trf)

# Request GPU execution before the pipeline is loaded.
# NOTE(review): per spaCy's documentation `require_gpu` is expected to raise
# when no GPU is available (unlike `prefer_gpu`) -- confirm this hard
# requirement is intended for every environment the notebook runs in.
spacy.require_gpu()

# Load the pipeline; if the package is not installed (OSError), download it
# once through spaCy's CLI helper and retry the load.
try:
    nlp = spacy.load("en_core_web_trf")
except OSError:
    from spacy.cli import download
    download("en_core_web_trf")
    nlp = spacy.load("en_core_web_trf")


# Show the component names of the loaded pipeline as a sanity check.
print("Pipeline:", nlp.pipe_names)

Pipeline: ['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
# Select which sample to process and load it

import pandas as pd

# Available input samples, keyed by size label. "small" and "medium" are
# resolved relative to the working directory; "large" is an absolute path
# (it looks like a mounted Google Drive location -- adjust when running
# elsewhere).
SAMPLE_PATHS = {
    "small":  "raid_sample_small_with_features_PREPOS.csv",
    "medium": "raid_sample_medium_with_features_PREPOS.csv",
    "large":  "/content/drive/MyDrive/Tesi Magistrale/raid_sample_large_with_features_PREPOS.csv",
}

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Choose one: "small", "medium", or "large"
SELECTED_DATASET = "large"
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Name of the column holding the text to be POS-tagged.
TEXT_COL = "generation"

df = pd.read_csv(SAMPLE_PATHS[SELECTED_DATASET])
if TEXT_COL not in df.columns:
    raise ValueError(f"Column '{TEXT_COL}' not found.")

# Missing texts are read by pandas as NaN; casting NaN straight to str yields
# the literal string "nan", which would then be tagged as a real one-token
# document. Replace missing values with "" first so those rows reach the
# extractor's empty-document path (all-zero features) instead.
texts = df[TEXT_COL].fillna("").astype(str).tolist()
print(f"Loaded {SELECTED_DATASET}: {len(df)} rows.")
df.head(3)

Loaded large: 60000 rows.


Unnamed: 0,id,adv_source_id,source_id,model,decoding,repetition_penalty,attack,domain,title,prompt,...,avg_word_length,type_token_ratio,stopword_ratio,punctuation_ratio,avg_sentence_length,sentence_length_std,flesch_reading_ease,gunning_fog,smog_index,automated_readability_index
0,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,human,,,none,abstracts,Combo Loss: Handling Input and Output Imbalanc...,,...,5.434783,0.576087,0.387681,0.023933,27.6,7.337575,21.88187,20.750145,17.916177,18.053152
1,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,9ad3ff6b-b309-4a44-9a1f-ecd14cd04e10,human,,,none,abstracts,A Variational Image Segmentation Model based o...,,...,5.691756,0.512545,0.415771,0.020962,19.928571,7.075756,18.317177,18.294009,16.084391,15.342458
2,38d4f731-4259-4770-9669-255b61bf61b2,38d4f731-4259-4770-9669-255b61bf61b2,38d4f731-4259-4770-9669-255b61bf61b2,human,,,none,abstracts,Computing Valid p-values for Image Segmentatio...,,...,5.739362,0.542553,0.446809,0.019982,23.5,9.0,12.8825,21.314894,18.243606,17.352394


In [None]:
# Tag inventories and helper functions

import math
import numpy as np
from collections import Counter
from typing import Dict, List

# Compact UPOS inventory: 13 tags in a fixed order, used wherever the
# notebook iterates over tags. The extractor maps any tag outside this
# list to "X".
POS_TAGS = [
    "NOUN", "VERB", "ADJ", "ADV",
    "PRON", "DET", "ADP", "AUX",
    "CCONJ", "PART", "NUM", "PUNCT",
    "X",
]
# Same inventory as a set, for membership tests.
POS_SET = set(POS_TAGS)

# Partition of the 13-tag inventory into content and function classes.
# "X" (and therefore everything mapped onto it) sits on the function side.
CONTENT = {
    "NOUN", "VERB", "ADJ", "ADV", "NUM",
}
FUNCTION = {
    "PRON", "DET", "ADP", "AUX",
    "CCONJ", "PART", "PUNCT", "X",
}

def safe_div(a: float, b: float) -> float:
    """
    Divide a by b, returning 0.0 whenever b is not strictly positive.

    The quotient is always returned as a built-in float.
    """
    if b > 0:
        return float(a / b)
    return 0.0

def shannon_entropy_from_counts(counter: Counter) -> float:
    """
    Shannon entropy (in bits) of the distribution given by a count table.

    Counts are normalised by their sum; an empty or all-zero table yields 0.0.
    Probabilities are floored at 1e-12 inside the logarithm so a zero count
    never reaches log2(0).
    """
    n = sum(counter.values())
    if n == 0:
        return 0.0

    probs = [count / n for count in counter.values()]
    h = 0.0
    for prob in probs:
        h -= prob * math.log2(max(prob, 1e-12))
    return float(h)

def row_entropy(transition_counts: Counter) -> Dict[str, float]:
    """
    Conditional entropy H(P(.|A)) in bits for every preceding tag A in POS_TAGS.

    transition_counts maps (A, B) tag pairs to bigram counts. Each row A is
    normalised by the total count of pairs (A, B) whose tags both belong to
    POS_SET. Tags that never occur as a predecessor get 0.0.
    """
    # Denominator of each row: total outgoing count per preceding tag.
    totals = dict.fromkeys(POS_TAGS, 0)
    for (prev_tag, next_tag), count in transition_counts.items():
        if prev_tag in POS_SET and next_tag in POS_SET:
            totals[prev_tag] += count

    entropies = {}
    for prev_tag in POS_TAGS:
        total = totals[prev_tag]
        h = 0.0
        if total != 0:
            for next_tag in POS_TAGS:
                count = transition_counts.get((prev_tag, next_tag), 0)
                if count:
                    prob = count / total
                    h -= prob * math.log2(max(prob, 1e-12))
        entropies[prev_tag] = float(h)
    return entropies

def run_lengths(seq: List[str], target: str) -> List[int]:
    """
    Lengths of the maximal consecutive runs of 'target' in seq, in order of
    appearance (e.g. runs of NOUN or PUNCT). Returns [] when target is absent.
    """
    lengths = []
    streak = 0
    for tag in seq:
        if tag == target:
            streak += 1
            continue
        # A different tag closes any run currently in progress.
        if streak > 0:
            lengths.append(streak)
        streak = 0
    # Flush a run that extends to the end of the sequence.
    if streak > 0:
        lengths.append(streak)
    return lengths



In [None]:
# Core extractor

def pos_compact_features(doc) -> Dict[str, float]:
    """
    Compute a fixed-key dictionary of POS-based features for one document.

    Parameters
    ----------
    doc : a parsed document as yielded by ``nlp.pipe``. It must be iterable
        over tokens exposing ``pos_`` and ``is_space``, and provide
        ``has_annotation("SENT_START")`` and ``sents``.

    Returns
    -------
    Dict[str, float]
        One ``pos_ratio_<TAG>`` entry per tag in POS_TAGS, plus entropy,
        transition, ratio, density, sentence-level and run-length features.
        The key set is the same for every document; a document with no
        non-space tokens gets 0.0 for every key.

    Notes
    -----
    * Whitespace tokens are ignored throughout.
    * Tags outside POS_TAGS (e.g. PROPN) are collapsed into "X" for every
      feature except ``noun_verb_alternation_rate``, which is computed on the
      raw tags so that PROPN counts as a noun there. (Previously its PROPN
      bigrams were looked up in the collapsed transition table, where they
      cannot occur, so they always contributed 0.)
    * Ratios built with safe_div are 0.0 when their denominator is 0.
    """
    # Tokens (exclude spaces)
    toks = [t for t in doc if not t.is_space]
    total_tokens = len(toks)
    if total_tokens == 0:
        # Build zeroed feature vector with all expected keys
        feats = {f"pos_ratio_{tag}": 0.0 for tag in POS_TAGS}
        base_zeros = {
            "upos_entropy": 0.0,
            "pos_transition_entropy": 0.0,
            "pos_row_entropy_weighted": 0.0,
            "self_transition_rate": 0.0,
            "content_to_function_rate": 0.0,
            "function_to_content_rate": 0.0,
            "noun_verb_alternation_rate": 0.0,
            "content_function_ratio": 0.0,
            "noun_verb_ratio": 0.0,
            "adj_adv_ratio": 0.0,
            "verbs_per_100_tok": 0.0,
            "nouns_per_100_tok": 0.0,
            "adj_per_100_tok": 0.0,
            "adv_per_100_tok": 0.0,
            "pron_per_100_tok": 0.0,
            "punct_per_100_tok": 0.0,
            "tokens_per_sentence_mean": 0.0,
            "sentence_length_std": 0.0,
            "mean_nouns_per_sent": 0.0,
            "mean_verbs_per_sent": 0.0,
            "mean_adjs_per_sent": 0.0,
            "mean_advs_per_sent": 0.0,
            "prop_sents_with_verb": 0.0,
            "unique_upos_per_sent_mean": 0.0,
            "max_runlen_NOUN": 0.0,
            "max_runlen_PUNCT": 0.0,
        }
        feats.update(base_zeros)
        return feats

    # Raw UPOS tags, kept so the noun/verb alternation can still see PROPN.
    raw_pos = [t.pos_ for t in toks]
    # UPOS sequence restricted to our 13-tag subset (map others to X)
    pos_seq = [p if p in POS_SET else "X" for p in raw_pos]

    # Unigram ratios (baseline)
    pos_counts = Counter(pos_seq)
    pos_ratios = {f"pos_ratio_{tag}": safe_div(pos_counts.get(tag, 0), total_tokens) for tag in POS_TAGS}

    # Unigram entropy
    upos_entropy = shannon_entropy_from_counts(pos_counts)

    # Bigrams and transition summaries
    transitions = list(zip(pos_seq, pos_seq[1:]))
    trans_counts = Counter(transitions)
    total_transitions = sum(trans_counts.values())

    # Global bigram entropy
    pos_transition_entropy = shannon_entropy_from_counts(trans_counts)

    # Row entropy per A, then weighted mean by P(A).
    # P(A) = share of bigrams whose first tag is A; the row totals are
    # accumulated in a single pass over the bigram table.
    row_ent = row_entropy(trans_counts)                         # dict A -> H(P(.|A))
    first_tag_totals = Counter()
    for (first_tag, _), v in trans_counts.items():
        first_tag_totals[first_tag] += v
    pA = {A: safe_div(first_tag_totals[A], total_transitions) for A in POS_TAGS}
    pos_row_entropy_weighted = sum(pA[A] * row_ent.get(A, 0.0) for A in POS_TAGS)

    # Self-transition rate
    self_transition_rate = safe_div(sum(trans_counts.get((A, A), 0) for A in POS_TAGS), total_transitions)

    # Content<->Function transition rates
    content_to_function = sum(v for (A, B), v in trans_counts.items() if A in CONTENT and B in FUNCTION)
    function_to_content = sum(v for (A, B), v in trans_counts.items() if A in FUNCTION and B in CONTENT)
    content_to_function_rate = safe_div(content_to_function, total_transitions)
    function_to_content_rate = safe_div(function_to_content, total_transitions)

    # Noun<->Verb alternation rate: adjacent (NOUN|PROPN, VERB|AUX) pairs in
    # either order. Counted on the raw tags because PROPN is "X" in pos_seq
    # and so can never appear as a key of trans_counts.
    nounish = {"NOUN", "PROPN"}
    verbish = {"VERB", "AUX"}
    nv_alt = sum(
        1
        for a, b in zip(raw_pos, raw_pos[1:])
        if (a in nounish and b in verbish) or (a in verbish and b in nounish)
    )
    noun_verb_alternation_rate = safe_div(nv_alt, total_transitions)

    # Ratios and densities
    nouns = pos_counts.get("NOUN",0)      # PROPN not in 13-tag subset; kept compact by design
    verbs = pos_counts.get("VERB",0) + pos_counts.get("AUX",0)
    adjs  = pos_counts.get("ADJ",0)
    advs  = pos_counts.get("ADV",0)
    prons = pos_counts.get("PRON",0)
    punct = pos_counts.get("PUNCT",0)

    content_sum  = sum(pos_counts.get(t,0) for t in CONTENT)
    function_sum = sum(pos_counts.get(t,0) for t in FUNCTION)

    content_function_ratio = safe_div(content_sum, function_sum)
    noun_verb_ratio        = safe_div(nouns, verbs)
    adj_adv_ratio          = safe_div(adjs, advs)

    verbs_per_100_tok = 100.0 * safe_div(verbs, total_tokens)
    nouns_per_100_tok = 100.0 * safe_div(nouns, total_tokens)
    adj_per_100_tok   = 100.0 * safe_div(adjs,  total_tokens)
    adv_per_100_tok   = 100.0 * safe_div(advs,  total_tokens)
    pron_per_100_tok  = 100.0 * safe_div(prons, total_tokens)
    punct_per_100_tok = 100.0 * safe_div(punct, total_tokens)

    # Sentence-level aggregates; without sentence boundaries (or with none
    # produced) the whole document is treated as a single sentence.
    sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
    if len(sents) == 0:
        sents = [doc]

    toks_per_sent = []
    nouns_ps, verbs_ps, adjs_ps, advs_ps, verb_presence = [], [], [], [], []
    unique_upos_counts = []

    for s in sents:
        stoks = [t for t in s if not t.is_space]
        toks_per_sent.append(len(stoks))
        upos_s = [t.pos_ if t.pos_ in POS_SET else "X" for t in stoks]
        c_s = Counter(upos_s)
        verbs_in_sent = c_s.get("VERB",0) + c_s.get("AUX",0)
        nouns_ps.append(c_s.get("NOUN",0))
        verbs_ps.append(verbs_in_sent)
        adjs_ps.append(c_s.get("ADJ",0))
        advs_ps.append(c_s.get("ADV",0))
        verb_presence.append(1 if verbs_in_sent > 0 else 0)
        # Every tag in upos_s is already in POS_SET, so the number of
        # distinct tags is simply the number of keys in the counter.
        unique_upos_counts.append(len(c_s))

    tokens_per_sentence_mean = float(np.mean(toks_per_sent)) if toks_per_sent else 0.0
    sentence_length_std      = float(np.std(toks_per_sent, ddof=0)) if len(toks_per_sent)>1 else 0.0
    mean_nouns_per_sent      = float(np.mean(nouns_ps)) if nouns_ps else 0.0
    mean_verbs_per_sent      = float(np.mean(verbs_ps)) if verbs_ps else 0.0
    mean_adjs_per_sent       = float(np.mean(adjs_ps))  if adjs_ps  else 0.0
    mean_advs_per_sent       = float(np.mean(advs_ps))  if advs_ps  else 0.0
    prop_sents_with_verb     = safe_div(sum(verb_presence), len(verb_presence))
    unique_upos_per_sent_mean= float(np.mean(unique_upos_counts)) if unique_upos_counts else 0.0

    # Run-length indicators (kept minimal)
    rl_noun  = run_lengths(pos_seq, "NOUN")
    rl_punct = run_lengths(pos_seq, "PUNCT")
    max_runlen_NOUN  = float(max(rl_noun))  if rl_noun  else 0.0
    max_runlen_PUNCT = float(max(rl_punct)) if rl_punct else 0.0

    # Assemble the output; keys must stay in sync with the zero vector above.
    feats = {}
    feats.update(pos_ratios)
    feats.update({
        "upos_entropy": upos_entropy,
        "pos_transition_entropy": pos_transition_entropy,
        "pos_row_entropy_weighted": pos_row_entropy_weighted,
        "self_transition_rate": self_transition_rate,
        "content_to_function_rate": content_to_function_rate,
        "function_to_content_rate": function_to_content_rate,
        "noun_verb_alternation_rate": noun_verb_alternation_rate,
        "content_function_ratio": content_function_ratio,
        "noun_verb_ratio": noun_verb_ratio,
        "adj_adv_ratio": adj_adv_ratio,
        "verbs_per_100_tok": verbs_per_100_tok,
        "nouns_per_100_tok": nouns_per_100_tok,
        "adj_per_100_tok":   adj_per_100_tok,
        "adv_per_100_tok":   adv_per_100_tok,
        "pron_per_100_tok":  pron_per_100_tok,
        "punct_per_100_tok": punct_per_100_tok,
        "tokens_per_sentence_mean": tokens_per_sentence_mean,
        "sentence_length_std":      sentence_length_std,
        "mean_nouns_per_sent":      mean_nouns_per_sent,
        "mean_verbs_per_sent":      mean_verbs_per_sent,
        "mean_adjs_per_sent":       mean_adjs_per_sent,
        "mean_advs_per_sent":       mean_advs_per_sent,
        "prop_sents_with_verb":     prop_sents_with_verb,
        "unique_upos_per_sent_mean": unique_upos_per_sent_mean,
        "max_runlen_NOUN":  max_runlen_NOUN,
        "max_runlen_PUNCT": max_runlen_PUNCT,
    })
    return feats


In [None]:
#  Apply over the dataset

# Number of texts handed to the pipeline per batch, and number of worker
# processes used by nlp.pipe.
BATCH_SIZE = 28
N_PROCESS = 1
rows = []

# Stream the texts through the pipeline and keep only the per-document
# feature dict (one dict per input text, in input order).
for doc in nlp.pipe(texts, batch_size=BATCH_SIZE, n_process=N_PROCESS):
    rows.append(pos_compact_features(doc))

pos_df = pd.DataFrame(rows).fillna(0.0)

# One feature row per input row is required for the positional concat below.
assert len(pos_df) == len(df), "feature rows do not match input rows"

# The input frame may already contain columns with the same name as a POS
# feature (e.g. a pre-existing "sentence_length_std"). Concatenating as-is
# would create duplicate column labels, which are ambiguous to select and
# are written as duplicate headers to CSV. Prefix the colliding POS columns.
colliding = [c for c in pos_df.columns if c in df.columns]
if colliding:
    pos_df = pos_df.rename(columns={c: f"pos_{c}" for c in colliding})

df_postpos = pd.concat([df.reset_index(drop=True), pos_df.reset_index(drop=True)], axis=1)


In [None]:
# Write the augmented frame to a CSV in the working directory; the file name
# embeds the selected sample size. The index is not written.
OUT_POSTPOS = f"raid_sample_{SELECTED_DATASET}_PostPOS.csv"
df_postpos.to_csv(OUT_POSTPOS, index=False)
print(f"Saved: {OUT_POSTPOS}  (rows: {len(df_postpos)}, cols: {df_postpos.shape[1]})")

# Brief check
# Subset of the POS features to summarise for a quick sanity check.
preview_cols = [
    "pos_ratio_NOUN","pos_ratio_VERB","pos_ratio_ADJ","pos_ratio_ADV",
    "upos_entropy","pos_transition_entropy","pos_row_entropy_weighted",
    "content_function_ratio","noun_verb_ratio","adj_adv_ratio",
    "verbs_per_100_tok","nouns_per_100_tok","punct_per_100_tok",
    "tokens_per_sentence_mean","sentence_length_std","prop_sents_with_verb",
    "unique_upos_per_sent_mean","max_runlen_NOUN","max_runlen_PUNCT"
]
# Keep only the names actually present so the preview does not raise on a
# missing column.
existing = [c for c in preview_cols if c in df_postpos.columns]
# Summary statistics with the 10th/50th/90th percentiles, one feature per row.
display(df_postpos[existing].describe(percentiles=[0.1,0.5,0.9]).T)

Saved: raid_sample_large_PostPOS.csv  (rows: 60000, cols: 65)


Unnamed: 0,count,mean,std,min,10%,50%,90%,max
pos_ratio_NOUN,60000.0,0.174074,0.10192,0.0,0.00295,0.181495,0.292385,1.0
pos_ratio_VERB,60000.0,0.092963,0.051279,0.0,0.0,0.103275,0.148352,0.444444
pos_ratio_ADJ,60000.0,0.059583,0.040468,0.0,0.0,0.060606,0.107914,0.707819
pos_ratio_ADV,60000.0,0.03021,0.028304,0.0,0.0,0.024896,0.066667,0.401929
upos_entropy,60000.0,2.783839,1.039526,0.0,0.856874,3.241742,3.431795,3.595925
pos_transition_entropy,60000.0,4.634109,1.732962,0.0,1.584945,5.351941,5.853708,6.421572
pos_row_entropy_weighted,60000.0,1.851352,0.724222,0.0,0.6469,2.118272,2.466479,2.887667
content_function_ratio,60000.0,0.941733,5.432987,0.0,0.017937,0.697938,1.285781,340.0
noun_verb_ratio,60000.0,1.427636,3.930688,0.0,0.0,1.2,2.428571,479.0
adj_adv_ratio,60000.0,2.495151,3.783412,0.0,0.0,1.642857,5.5,255.0
