 ###  **Importing** + Config

In [None]:
# ─── Configuration ─────────────────────────────────────────────────────────
from pathlib import Path
import random
import numpy as np
import pandas as pd
import sys
import os

notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
sys.path.append(parent_dir)

from utils.baseline_utils import (
    load_data,
    process_binary_labels,
    process_base_ablation_labels,
)

In [None]:
# ─── If processing TRAIN_FILE ─────────────────────────────────────────────────────────

train_df, test_df = load_data()

df = train_df

In [None]:
# ─── If processing TEST_FILE ─────────────────────────────────────────────────────────
train_df, test_df = load_data()

df = test_df


In [None]:

import spacy
import textstat

# Load spaCy model
nlp = spacy.load("en_core_web_md")

### Baseline **Text Features**

In [4]:
# Helper function to extract features
def extract_text_features(text):
    doc = nlp(text)
    
    words = [token.text for token in doc if token.is_alpha]
    stopwords = [token for token in doc if token.is_stop]
    punctuations = [token for token in doc if token.is_punct]
    sentences = list(doc.sents)

    total_words = len(words)
    total_sentences = len(sentences)
    
    features = {
        "avg_word_length": sum(len(word) for word in words) / total_words if total_words else 0,
        "type_token_ratio": len(set(words)) / total_words if total_words else 0,
        "stopword_ratio": len(stopwords) / len(doc) if len(doc) else 0,
        "punctuation_ratio": len(punctuations) / len(doc) if len(doc) else 0,
        "avg_sentence_length": total_words / total_sentences if total_sentences else 0,
        "sentence_length_std": pd.Series([len(sent) for sent in sentences]).std() if total_sentences > 1 else 0,
        
        # Readability scores (textstat works on raw string)
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
    }
    
    return features

In [5]:
# Ensure generation column is string
df["generation"] = df["generation"].astype(str)

# Apply feature extraction
features_df = df["generation"].apply(extract_text_features).apply(pd.Series)

# Merge with original data
df_features = pd.concat([df, features_df], axis=1)

### Baseline **POS-Tags** features extraction

In [8]:
import numpy as np
from collections import Counter
import math

# Define POS tags of interest (coarse-grained)
POS_TAGS = ["NOUN", "VERB", "ADJ", "ADV", "PRON", "DET", "ADP", "AUX", "CCONJ", "PART", "NUM", "PUNCT", "X"]

def extract_pos_features(text):
    doc = nlp(text)
    total_tokens = len(doc)
    
    # POS ratios
    pos_counts = Counter([token.pos_ for token in doc])
    pos_ratios = {f"pos_ratio_{tag}": pos_counts.get(tag, 0) / total_tokens if total_tokens else 0 for tag in POS_TAGS}
    
    # POS bigram transitions
    pos_sequence = [token.pos_ for token in doc]
    transitions = list(zip(pos_sequence, pos_sequence[1:]))
    transition_counts = Counter(transitions)
    
    # Transition entropy
    total_transitions = sum(transition_counts.values())
    entropy = -sum(
        (count / total_transitions) * math.log2(count / total_transitions)
        for count in transition_counts.values()
    ) if total_transitions else 0
    
    pos_ratios["pos_transition_entropy"] = entropy
    
    return pos_ratios

# Apply POS features
pos_df = df["generation"].apply(extract_pos_features).apply(pd.Series)
df_pos = pd.concat([df, pos_df], axis=1)

print("✅ POS features added.")


✅ POS features added.


### **Perplexity**  extraction w/ GPT2

In [9]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from tqdm import tqdm
tqdm.pandas()

# Load pretrained GPT-2
tokenizer = GPT2TokenizerFast.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def compute_perplexity(text, max_length=512):
    try:
        # Tokenize and manually move tensors to the target device
        encodings = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length
        )
        encodings = {k: v.to(device) for k, v in encodings.items()}

        with torch.no_grad():
            outputs = model(**encodings, labels=encodings["input_ids"])
            loss = outputs.loss

        return torch.exp(loss).item()
    except Exception:
        return np.nan




In [10]:
df_pos["gpt2_perplexity"] = df_pos["generation"].progress_apply(compute_perplexity)

  0%|          | 0/2000 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 2000/2000 [18:50<00:00,  1.77it/s]


### **Sentence Variance** + **Repetition** + **Burstiness** (Prototype) extraction

In [11]:
import numpy as np
import math

def sentence_variance_features(text):
    doc = nlp(text)
    sentence_lengths = [len([t for t in sent if not t.is_punct]) for sent in doc.sents]
    
    n = len(sentence_lengths)
    total_words = sum(sentence_lengths)
    
    avg_len = np.mean(sentence_lengths) if n else 0
    std_len = np.std(sentence_lengths) if n else 0

    # Normalize for entropy
    if total_words > 0:
        probs = [l / total_words for l in sentence_lengths if l > 0]
        entropy = -sum(p * math.log2(p) for p in probs)
    else:
        entropy = 0

    return {
        "avg_sentence_length_v2": avg_len,
        "sentence_length_std": std_len,
        "sentence_length_entropy": entropy
    }

sentence_df = df_pos["generation"].apply(sentence_variance_features).apply(pd.Series)
df_pos = pd.concat([df_pos, sentence_df], axis=1)

print("✅ Sentence variance features added.")


✅ Sentence variance features added.


In [12]:
from collections import Counter

def repetition_features(text):
    doc = nlp(text.lower())  # Lowercased for n-gram counting
    words = [token.text for token in doc if token.is_alpha]

    ngram_features = {}
    for n in [2, 3]:
        ngrams = list(zip(*[words[i:] for i in range(n)]))
        counts = Counter(ngrams)
        total = len(ngrams)
        repeats = sum(1 for v in counts.values() if v > 1)

        ngram_features[f"repeated_{n}gram_ratio"] = repeats / total if total else 0
        ngram_features[f"unique_{n}grams"] = len(counts)

    return ngram_features

repetition_df = df_pos["generation"].apply(repetition_features).apply(pd.Series)
df_pos = pd.concat([df_pos, repetition_df], axis=1)

print("✅ Repetition features added.")


✅ Repetition features added.


In [13]:
def lexical_burstiness(text):
    doc = nlp(text.lower())
    words = [token.text for token in doc if token.is_alpha]
    counts = Counter(words)
    
    freqs = np.array(list(counts.values()))
    return {
        "burstiness_token_var": np.var(freqs) if len(freqs) > 1 else 0,
        "burstiness_token_std": np.std(freqs) if len(freqs) > 1 else 0
    }

burstiness_df = df_pos["generation"].apply(lexical_burstiness).apply(pd.Series)
df_pos = pd.concat([df_pos, burstiness_df], axis=1)

print("✅ Burstiness features added.")


✅ Burstiness features added.


In [14]:
# Step 1: Find overlapping columns (excluding the index)
overlap_cols = df_features.columns.intersection(df_pos.columns)

# Step 2: Drop those columns from df_pos
df_pos_cleaned = df_pos.drop(columns=overlap_cols)

# Step 3: Concatenate horizontally
df_all = pd.concat([df_features, df_pos_cleaned], axis=1)

print(f"✅ Combined DataFrame shape: {df_all.shape}")


✅ Combined DataFrame shape: (2000, 45)


In [15]:
output_path = "test_sample_baseline.csv"
df_all.to_csv(output_path, index=False)
print(f"✅ Saved to {output_path}")


✅ Saved to test_sample_baseline.csv
