In [None]:
# Cell 1A: Check GPU availability
"""
Report the installed PyTorch build and whether a CUDA device is visible
"""

import torch

# Header plus the two facts that are always available
print("GPU Check:")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Device details only exist when CUDA is usable; otherwise warn about CPU fallback
if not torch.cuda.is_available():
    print("WARNING: No GPU detected. Will use CPU (slower)")
else:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1e9 to print gigabytes
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

In [None]:
# Cell 1B: Install dependencies
"""
Installation cell - run once at notebook start
"""

!pip install fastcoref -q
!pip install spacy -q
!python -m spacy download en_core_web_trf -q

print("Dependencies installed successfully")

In [None]:
# Cell 2: Import libraries
"""
All necessary imports
"""

import pandas as pd
import numpy as np
from fastcoref import FCoref
import spacy
from typing import Dict, List, Tuple, Optional
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import warnings
import json
import pickle
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

In [None]:
# Cell 3: Initialize models
"""
Load the FastCoref model and the spaCy pipeline used for REG analysis.
The spaCy tagger and parser stay enabled because the dependency analysis
relies on them.
"""

import torch

# Pick the compute device: first CUDA GPU when one is usable, CPU otherwise
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
    print("Using CPU")
else:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

# FastCoref model placed on the selected device
print("\nInitializing FastCoref model...")
coref_model = FCoref(device=device)

# spaCy pipeline with the "ner" and "lemmatizer" components switched off
# (not needed here, saves memory); everything else in the package stays active
print("Loading spaCy with parser and tagger...")
nlp = spacy.load("en_core_web_trf", disable=["ner", "lemmatizer"])

# Summary of what was loaded
print("\nModels ready")
print(f"FastCoref device: {device}")
print(f"spaCy components: {nlp.pipe_names}")
print("Note: Parser enabled for dependency analysis (Tier 2)")

In [None]:
# Cell 4: Load dataset
"""
Load RAID dataset
Adjust DATA_PATH and TEXT_COLUMN as needed
"""

# Load data - UPDATE THIS PATH
DATA_PATH = '/content/raid_sample_medium_PostPOS_CLEAN (1).csv'
# Column that holds the text to analyse
TEXT_COLUMN = 'generation'

df = pd.read_csv(DATA_PATH)

print(f"Dataset loaded: {len(df)} samples")
print(f"Columns: {list(df.columns)}")

# Fail here, with a readable message, rather than deep inside feature extraction
if TEXT_COLUMN not in df.columns:
    raise KeyError(f"Text column '{TEXT_COLUMN}' not found. Available columns: {list(df.columns)}")

# Identify label column: prefer 'is_ai', fall back to 'label'
if 'is_ai' in df.columns:
    label_column = 'is_ai'
elif 'label' in df.columns:
    label_column = 'label'
else:
    print("WARNING: No label column found")
    label_column = None

if label_column:
    print(f"\nLabel column: '{label_column}'")
    print(f"Class distribution:")
    print(df[label_column].value_counts())

print(f"\nFirst sample preview:")
if len(df) > 0:
    # str() keeps the preview working when the first row is NaN or not a string
    print(f"Text: {str(df[TEXT_COLUMN].iloc[0])[:200]}...")
else:
    print("Text: <dataset is empty>")

In [None]:
# Cell 5: Chain extraction from FastCoref
"""
Extract coreference chains with full mention metadata
"""

def get_sentence_index(doc, token_idx: int) -> int:
    """
    Return the position (0-based) of the first sentence in ``doc.sents`` whose
    ``[start, end)`` token range contains ``token_idx``.

    Falls back to 0 when no sentence contains the token.
    """
    matching_positions = (
        position
        for position, sentence in enumerate(doc.sents)
        if sentence.start <= token_idx and token_idx < sentence.end
    )
    return next(matching_positions, 0)


def extract_chains_from_fastcoref(text: str, coref_model, nlp) -> Tuple[List[List[Dict]], List[Dict], object]:
    """
    Extract coreference chains with FastCoref and annotate mentions via spaCy.

    Args:
        text: Raw document text. Anything that is not a string, or whose
            stripped length is below 10 characters, yields an empty result.
        coref_model: Model whose ``predict(texts=[text])`` returns one result
            per text; ``get_clusters(as_strings=False)`` on that result must
            give clusters of ``(char_start, char_end)`` pairs.
        nlp: Callable producing the spaCy ``Doc`` for ``text``; ``doc.sents``
            is read, so sentence boundaries must be available.

    Returns:
        chains: List of chains, each a list of mention dicts sorted by
            ``start_token``.
        all_mentions: Flat list of the same mention dicts, in cluster order.
        doc: The spaCy Doc, or None when the text was skipped, no clusters
            were found, or an error occurred (mentions collected before the
            error are still returned).
    """
    chains = []
    all_mentions = []

    # isinstance guard: NaN / None / numbers from a DataFrame must not crash on .strip()
    if not isinstance(text, str) or len(text.strip()) < 10:
        return chains, all_mentions, None

    try:
        # Get predictions from FastCoref
        preds = coref_model.predict(texts=[text])

        if not preds or len(preds) == 0:
            return chains, all_mentions, None

        # Clusters as character spans
        clusters = preds[0].get_clusters(as_strings=False)

        if not clusters:
            return chains, all_mentions, None

        # Process with spaCy for token and sentence information
        doc = nlp(text)

        # Map every character covered by a token to that token's index
        char_to_token = {}
        for token in doc:
            for char_idx in range(token.idx, token.idx + len(token.text)):
                char_to_token[char_idx] = token.i

        # Map every token to its sentence index once, instead of rescanning
        # doc.sents for each mention. setdefault keeps the first sentence that
        # contains a token.
        token_to_sent = {}
        for sent_idx, sent in enumerate(doc.sents):
            for token_idx in range(sent.start, sent.end):
                token_to_sent.setdefault(token_idx, sent_idx)

        # Convert each cluster to our format
        for cluster in clusters:
            chain_mentions = []

            for char_start, char_end in cluster:
                # char_end is exclusive, so the last covered character is char_end - 1
                start_token_idx = char_to_token.get(char_start)
                end_token_idx = char_to_token.get(char_end - 1)

                # Boundaries that fall on whitespace cannot be aligned to tokens
                if start_token_idx is None or end_token_idx is None:
                    continue

                # An inverted span would be empty; skip it rather than abort the document
                if end_token_idx < start_token_idx:
                    continue

                span = doc[start_token_idx:end_token_idx + 1]

                # span.root is the token itself for single-token spans, so one
                # check covers both the single- and multi-token case
                is_pronoun = span.root.pos_ == "PRON"

                # Sentence of the mention's first token (0 if no sentence contains it)
                sent_idx = token_to_sent.get(start_token_idx, 0)

                token_indices = list(range(start_token_idx, end_token_idx + 1))

                # Mention dictionary with EXTENDED info for REG analysis;
                # 'end_token' / 'span_end' are exclusive token offsets
                mention_dict = {
                    'text': span.text,
                    'start_token': start_token_idx,
                    'end_token': end_token_idx + 1,
                    'start_char': char_start,
                    'end_char': char_end,
                    'sent_idx': sent_idx,
                    'is_pronoun': is_pronoun,
                    'token_count': len(span),
                    'token_indices': token_indices,
                    # Token offsets used later to rebuild the spaCy span
                    'span_start': start_token_idx,
                    'span_end': end_token_idx + 1
                }

                chain_mentions.append(mention_dict)
                all_mentions.append(mention_dict)

            if chain_mentions:
                # Sort mentions by position in document
                chain_mentions.sort(key=lambda m: m['start_token'])
                chains.append(chain_mentions)

    except Exception as e:
        # Best effort: report and hand back whatever was collected, with doc=None
        print(f"Error in coreference extraction: {e}")
        return chains, all_mentions, None

    return chains, all_mentions, doc


print("Chain extraction function defined")

In [None]:
# Cell 6: Test chain extraction
"""
Smoke test: run the extractor on one short example and print what it found
"""

test_text = "We are so happy to see you using our coref package. This package is very fast!"

print("Testing FastCoref extraction...")
print(f"Text: {test_text}")
print("\n" + "=" * 70)

# Run the extractor on the example text
test_chains, test_mentions, test_doc = extract_chains_from_fastcoref(test_text, coref_model, nlp)

print(f"Chains detected: {len(test_chains)}")
print(f"Total mentions: {len(test_mentions)}")

# One header per chain, then one line per mention in that chain
for chain_idx in range(len(test_chains)):
    chain = test_chains[chain_idx]
    print(f"\nChain {chain_idx}:")
    for mention in chain:
        token_range = f"{mention['start_token']}-{mention['end_token']}"
        print(f"  '{mention['text']}' (tokens {token_range}, "
              f"sent {mention['sent_idx']}, pronoun={mention['is_pronoun']}, "
              f"token_count={mention['token_count']})")

In [None]:
# Cell 8: TIER 1 - RMO (Repeat-Mention Overspecification) Features
"""
Tier 1: Analyze repeat mentions for overspecification

3 features:
1. repeat_mention_expansion_rate: How often repeat mentions add tokens
2. avg_tokens_added_on_repeat: Mean additional tokens when expanding
3. repeat_overspecification_ratio: Proportion of expansions that are unnecessary
"""

def calculate_tier1_rmo_features(chains: List[List[Dict]], doc) -> Dict[str, float]:
    """
    Calculate Tier 1 RMO (Repeat-Mention Overspecification) features.

    Core hypothesis: AI adds unnecessary modifiers when re-mentioning entities,
    while humans use minimal expressions for established referents.

    Args:
        chains: Coreference chains; each mention dict must provide
            'token_count', 'is_pronoun' and 'sent_idx'. Mentions are expected
            in document order within a chain.
        doc: Only checked for None (signals that extraction failed).

    Returns:
        Dict with three floats (all 0.0 when there is nothing to measure):
        - repeat_mention_expansion_rate: share of repeat mentions (2nd and
          later in a chain) that are longer than the chain's first mention.
        - avg_tokens_added_on_repeat: mean number of extra tokens over the
          first mention, across those longer mentions.
        - repeat_overspecification_ratio: share of those longer mentions whose
          immediately preceding mention is a non-pronoun at most one sentence
          away.
    """
    features = {
        'repeat_mention_expansion_rate': 0.0,
        'avg_tokens_added_on_repeat': 0.0,
        'repeat_overspecification_ratio': 0.0
    }

    if not chains or doc is None:
        return features

    repeat_count = 0          # number of 2nd+ mentions seen
    expansions = []           # tokens added by each expanded repeat mention
    unnecessary_count = 0     # expansions judged unnecessary by the heuristic

    for chain in chains:
        if len(chain) < 2:
            continue

        # First mention establishes the length baseline for the whole chain
        baseline_tokens = chain[0]['token_count']

        # Walk consecutive pairs by position. Looking the predecessor up with
        # chain.index(mention) would compare dicts by value and return the wrong
        # neighbour when two mentions happen to be equal.
        for prev_mention, mention in zip(chain, chain[1:]):
            repeat_count += 1

            expansion_amount = mention['token_count'] - baseline_tokens
            if expansion_amount <= 0:
                continue
            expansions.append(expansion_amount)

            # Heuristic for "unnecessary" expansion: the previous mention was a
            # full NP (not a pronoun) and is close in discourse (within 1 sentence)
            sent_distance = abs(mention['sent_idx'] - prev_mention['sent_idx'])
            if not prev_mention['is_pronoun'] and sent_distance <= 1:
                unnecessary_count += 1

    if repeat_count:
        # Feature 1: how often do repeat mentions expand?
        features['repeat_mention_expansion_rate'] = len(expansions) / repeat_count

        if expansions:
            # Feature 2: average tokens added when expanding
            features['avg_tokens_added_on_repeat'] = float(np.mean(expansions))
            # Feature 3: proportion of expansions that are unnecessary
            features['repeat_overspecification_ratio'] = unnecessary_count / len(expansions)

    return features


print("Tier 1 RMO feature functions defined")

In [None]:
# Cell 9: TIER 2 - MTA (Modification Type Analysis) Features
"""
Tier 2: Analyze types of modifications used in referring expressions

5 features:
1. adjective_modification_rate: Proportion of mentions with adjectives
2. prepositional_modification_rate: Proportion with PP modifiers
3. relative_clause_rate: Proportion with relative clauses
4. modification_type_entropy: Diversity of modification types
5. avg_modifiers_per_mention: Mean modification complexity
"""

def analyze_mention_modifications(mention: Dict, doc) -> Dict[str, int]:
    """
    Count syntactic modifiers inside a mention using spaCy dependency labels.

    Only tokens whose head lies inside the mention span are counted, i.e.
    tokens that modify something within the mention. The mention's own
    attachment to the rest of the sentence is ignored: a mention that is the
    object of a preposition ("with [the ball]") is not prepositionally
    modified, and a possessor mention ("[John]'s book") is not itself modified.
    A prepositional phrase is counted once, via its 'prep' token; its 'pobj'
    is not counted again.

    Args:
        mention: Mention dict providing 'span_start' (inclusive) and
            'span_end' (exclusive) token offsets into ``doc``.
        doc: spaCy Doc (or any sliceable token sequence whose tokens expose
            ``dep_`` and ``head.i``).

    Returns:
        Dict with a count per modification type:
        'amod', 'prep', 'relcl', 'compound', 'det', 'poss'.
    """
    mods = {
        'amod': 0,        # Adjective modifiers
        'prep': 0,        # Prepositional phrases (one per 'prep' token)
        'relcl': 0,       # Relative clauses
        'compound': 0,    # Compound modifiers
        'det': 0,         # Determiners (for reference)
        'poss': 0,        # Possessive modifiers
    }

    # Dependency label -> counter it feeds
    dep_to_key = {
        'amod': 'amod',
        'prep': 'prep',
        'relcl': 'relcl',
        'compound': 'compound',
        'det': 'det',
        'poss': 'poss',
        'nmod:poss': 'poss',
    }

    # Get span from doc
    start_idx = mention['span_start']
    end_idx = mention['span_end']
    span = doc[start_idx:end_idx]

    for token in span:
        # Skip tokens attached to something outside the mention: their label
        # describes the mention's role in the sentence, not a modifier of it
        if not (start_idx <= token.head.i < end_idx):
            continue

        key = dep_to_key.get(token.dep_)
        if key is not None:
            mods[key] += 1

    return mods


def calculate_tier2_mta_features(chains: List[List[Dict]], all_mentions: List[Dict], doc) -> Dict[str, float]:
    """
    Calculate Tier 2 MTA (Modification Type Analysis) features.

    Core hypothesis: AI and humans differ in HOW they modify expressions
    (e.g. AI may prefer adjective stacking).

    Args:
        chains: Accepted for interface symmetry with the Tier 1 function;
            not read here.
        all_mentions: Flat list of mention dicts; each needs 'is_pronoun' plus
            whatever ``analyze_mention_modifications`` reads.
        doc: spaCy Doc the mentions refer to; None yields all-zero features.

    Returns:
        Dict with five floats, computed over non-pronoun mentions only
        (all 0.0 when there are none):
        - adjective_modification_rate: share of mentions with >= 1 'amod'.
        - prepositional_modification_rate: share with >= 1 'prep'.
        - relative_clause_rate: share with >= 1 'relcl'.
        - modification_type_entropy: Shannon entropy (bits) over how many
          mentions use each of amod/prep/relcl/compound/poss.
        - avg_modifiers_per_mention: mean total modifier count (all types,
          including 'det') per mention.
    """
    features = {
        'adjective_modification_rate': 0.0,
        'prepositional_modification_rate': 0.0,
        'relative_clause_rate': 0.0,
        'modification_type_entropy': 0.0,
        'avg_modifiers_per_mention': 0.0
    }

    if not all_mentions or doc is None:
        return features

    # Analyse every non-pronoun mention exactly once (pronouns carry no
    # complex modification) and reuse the result for all five features
    per_mention_mods = [
        analyze_mention_modifications(mention, doc)
        for mention in all_mentions
        if not mention['is_pronoun']
    ]

    mention_total = len(per_mention_mods)
    if mention_total == 0:
        return features

    def _share_with(mod_type: str) -> float:
        """Fraction of analysed mentions that contain at least one `mod_type`."""
        return sum(1 for mods in per_mention_mods if mods[mod_type] > 0) / mention_total

    # Features 1-3: per-type modification rates
    features['adjective_modification_rate'] = _share_with('amod')
    features['prepositional_modification_rate'] = _share_with('prep')
    features['relative_clause_rate'] = _share_with('relcl')

    # Feature 4: modification type entropy (diversity); 'det' is deliberately
    # not part of the type inventory
    tracked_types = ('amod', 'prep', 'relcl', 'compound', 'poss')
    type_counts = Counter(
        mod_type
        for mods in per_mention_mods
        for mod_type in tracked_types
        if mods[mod_type] > 0
    )
    if type_counts:
        total = sum(type_counts.values())
        probs = [count / total for count in type_counts.values()]
        entropy = -sum(p * np.log2(p) for p in probs if p > 0)
        # "+ 0.0" turns the -0.0 produced by a single type into a plain 0.0
        features['modification_type_entropy'] = float(entropy) + 0.0

    # Feature 5: average modifiers per mention
    features['avg_modifiers_per_mention'] = float(
        np.mean([sum(mods.values()) for mods in per_mention_mods])
    )

    return features


print("Tier 2 MTA feature functions defined")

In [None]:
# Cell 10: Integrated feature extraction function
"""
Extract all coreference features: Tier 1 (RMO) + Tier 2 (MTA)
"""

def extract_all_coref_features(text: str, coref_model, nlp) -> Dict[str, float]:
    """
    Extract all coreference features from a single text.

    Args:
        text: Raw document text. Non-string values and texts whose stripped
            length is below 10 characters yield the all-zero defaults.
        coref_model: Coreference model passed through to
            ``extract_chains_from_fastcoref``.
        nlp: spaCy pipeline passed through to ``extract_chains_from_fastcoref``.

    Returns:
        Dictionary with 8 features:
        - 3 Tier 1 (RMO) features
        - 5 Tier 2 (MTA) features
        Every feature is 0.0 when the text is unusable or no mentions are
        found. If an error occurs, it is printed and the values computed so
        far (defaults otherwise) are returned.
    """
    # Initialize all features with defaults
    features = {
        # Tier 1: RMO
        'repeat_mention_expansion_rate': 0.0,
        'avg_tokens_added_on_repeat': 0.0,
        'repeat_overspecification_ratio': 0.0,
        # Tier 2: MTA
        'adjective_modification_rate': 0.0,
        'prepositional_modification_rate': 0.0,
        'relative_clause_rate': 0.0,
        'modification_type_entropy': 0.0,
        'avg_modifiers_per_mention': 0.0
    }

    # Handle empty, too-short and non-string input (e.g. NaN from a DataFrame)
    if not isinstance(text, str) or len(text.strip()) < 10:
        return features

    try:
        # Extract chains
        chains, all_mentions, doc = extract_chains_from_fastcoref(text, coref_model, nlp)

        if not all_mentions or doc is None:
            return features

        # === TIER 1: RMO FEATURES ===
        features.update(calculate_tier1_rmo_features(chains, doc))

        # === TIER 2: MTA FEATURES ===
        features.update(calculate_tier2_mta_features(chains, all_mentions, doc))

    except Exception as e:
        # Best effort: report and fall through to return what we have
        print(f"Error processing text: {e}")

    return features


print("Integrated feature extraction function defined")

In [None]:
# Cell 12: Batch processing with chain persistence
"""
Process dataframe with CHAIN SAVING for future analysis
"""

def extract_coref_features_batch_with_chains(df: pd.DataFrame,
                                               text_column: str,
                                               coref_model,
                                               nlp,
                                               batch_size: int = 32,
                                               save_chains: bool = True) -> Tuple[pd.DataFrame, List[Dict]]:
    """
    Extract features for an entire dataframe, optionally keeping the chains.

    Each text goes through the FastCoref + spaCy pipeline exactly once; the
    resulting chains feed both the feature computation and (when requested)
    the saved chain records.

    Args:
        df: Input frame; it is copied, not modified.
        text_column: Name of the column holding the texts.
        coref_model: Coreference model handed to ``extract_chains_from_fastcoref``.
        nlp: spaCy pipeline handed to ``extract_chains_from_fastcoref``.
        batch_size: Number of texts per progress-bar step; texts are still
            processed one by one.
        save_chains: When True, collect one chain record per processed text.

    Returns:
        df_result: Copy of ``df`` with the 8 feature columns added (0.0 for
            rows that were skipped or produced no features).
        chains_data: List of dicts with 'idx', 'chains', 'n_chains' and
            'n_mentions' (if save_chains=True), otherwise None.
    """
    print(f"Extracting features from {len(df)} texts...")
    print(f"Text column: '{text_column}'")
    print(f"Batch size: {batch_size}")
    print(f"Save chains: {save_chains}")

    if torch.cuda.is_available():
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")

    # Initialize result dataframe
    df_result = df.copy()

    # Initialize feature columns
    feature_names = [
        # Tier 1
        'repeat_mention_expansion_rate', 'avg_tokens_added_on_repeat',
        'repeat_overspecification_ratio',
        # Tier 2
        'adjective_modification_rate', 'prepositional_modification_rate',
        'relative_clause_rate', 'modification_type_entropy',
        'avg_modifiers_per_mention'
    ]

    for feature_name in feature_names:
        df_result[feature_name] = 0.0

    # Storage for chains
    chains_data = [] if save_chains else None

    # Keep only rows whose text is present and at least 10 characters long
    valid_indices = []
    valid_texts = []

    for idx, text in df[text_column].items():
        if not pd.isna(text) and len(str(text).strip()) >= 10:
            valid_indices.append(idx)
            valid_texts.append(str(text))

    print(f"Valid texts for processing: {len(valid_texts)}/{len(df)}")

    # "successful" = at least one non-zero feature; "failed" covers both
    # errors and texts that simply yielded no coreference features
    successful = 0
    failed = 0

    for batch_start in tqdm(range(0, len(valid_texts), batch_size), desc="Processing batches"):
        batch_end = min(batch_start + batch_size, len(valid_texts))
        batch_texts = valid_texts[batch_start:batch_end]
        batch_indices = valid_indices[batch_start:batch_end]

        for text, idx in zip(batch_texts, batch_indices):
            try:
                # Run the (expensive) coref + spaCy pipeline once per text
                chains, all_mentions, doc = extract_chains_from_fastcoref(text, coref_model, nlp)

                # Derive the features from those chains
                features = dict.fromkeys(feature_names, 0.0)
                if all_mentions and doc is not None:
                    try:
                        features.update(calculate_tier1_rmo_features(chains, doc))
                        features.update(calculate_tier2_mta_features(chains, all_mentions, doc))
                    except Exception as e:
                        # Keep the chains even if feature computation fails
                        print(f"Error processing text: {e}")

                # Store features (non-finite values keep the 0.0 default)
                for feature_name, value in features.items():
                    if np.isfinite(value):
                        df_result.at[idx, feature_name] = value

                # Save chains if requested
                if save_chains:
                    # Drop the duplicate span offsets from the saved records
                    chains_serializable = [
                        [
                            {k: v for k, v in mention.items() if k not in ['span_start', 'span_end']}
                            for mention in chain
                        ]
                        for chain in chains
                    ]

                    chains_data.append({
                        'idx': idx,
                        'chains': chains_serializable,
                        'n_chains': len(chains),
                        'n_mentions': len(all_mentions)
                    })

                if any(value > 0 for value in features.values()):
                    successful += 1
                else:
                    failed += 1

            except Exception as e:
                # Report instead of swallowing silently, then carry on with the next row
                print(f"Error processing row {idx}: {e}")
                failed += 1
                if save_chains:
                    chains_data.append({
                        'idx': idx,
                        'chains': [],
                        'n_chains': 0,
                        'n_mentions': 0
                    })
                continue

        # Periodic GPU memory cleanup (every 10th batch)
        if torch.cuda.is_available() and batch_start % (batch_size * 10) == 0:
            torch.cuda.empty_cache()

    print(f"\nExtraction complete:")
    print(f"Successful: {successful}/{len(valid_texts)}")
    print(f"Failed: {failed}/{len(valid_texts)}")
    print(f"Skipped (invalid): {len(df) - len(valid_texts)}")

    if torch.cuda.is_available():
        print(f"GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

    return df_result, chains_data


print("Batch processing with chain persistence defined")

In [None]:
# Cell 13: Test on first 100 samples
"""
Test pipeline on small subset before running full dataset
"""

print("Testing on first 100 samples...")
df_test = df.head(100).copy()

df_test_processed, chains_test = extract_coref_features_batch_with_chains(
    df=df_test,
    text_column='generation',
    coref_model=coref_model,
    nlp=nlp,
    save_chains=True
)

# The dataset may hold fewer than 100 rows, so report the real sample count
n_test = len(df_test_processed)

# Check results
print("\n" + "=" * 70)
print(f"FEATURE STATISTICS ({n_test} samples)")
print("=" * 70)

# First 3 entries are Tier 1 (RMO), remaining 5 are Tier 2 (MTA)
feature_names = [
    'repeat_mention_expansion_rate', 'avg_tokens_added_on_repeat',
    'repeat_overspecification_ratio', 'adjective_modification_rate',
    'prepositional_modification_rate', 'relative_clause_rate',
    'modification_type_entropy', 'avg_modifiers_per_mention'
]

print("\nTIER 1: RMO FEATURES:")
for feature in feature_names[:3]:
    values = df_test_processed[feature]
    non_zero = (values > 0).sum()
    print(f"  {feature:35s}: mean={values.mean():.4f}, non-zero={non_zero}/{n_test}")

print("\nTIER 2: MTA FEATURES:")
for feature in feature_names[3:]:
    values = df_test_processed[feature]
    non_zero = (values > 0).sum()
    print(f"  {feature:35s}: mean={values.mean():.4f}, non-zero={non_zero}/{n_test}")

# Save test results
print("\n" + "=" * 70)
print("Saving test results...")
df_test_processed.to_csv('test_100_with_tier1_tier2_features.csv', index=False)

with open('test_100_chains.pkl', 'wb') as f:
    pickle.dump(chains_test, f)

print("✓ Saved: test_100_with_tier1_tier2_features.csv")
print("✓ Saved: test_100_chains.pkl")
print("\nIf results look good, proceed with full dataset extraction!")

In [None]:
# Cell 14: Full dataset extraction with chunking
"""
Process full dataset in chunks with checkpoints
ONLY RUN THIS AFTER VERIFYING TEST RESULTS
"""

import os

# CONFIGURATION
chunk_size = 1000
# Set to True to reuse chunk checkpoints written by an earlier, interrupted run
# of this cell on the SAME dataset and chunk_size. Leave False to recompute
# every chunk (original behaviour).
RESUME_FROM_CHECKPOINTS = False

n_chunks = int(np.ceil(len(df) / chunk_size))

print(f"Processing {len(df)} samples in {n_chunks} chunks of {chunk_size}")
print(f"Estimated total time: ~10-12 hours")
print("\nStarting chunked extraction...")
print("=" * 70)

results = []
all_chains_data = []

for chunk_idx in range(n_chunks):
    start_idx = chunk_idx * chunk_size
    end_idx = min((chunk_idx + 1) * chunk_size, len(df))

    print(f"\n{'='*70}")
    print(f"CHUNK {chunk_idx + 1}/{n_chunks}")
    print(f"Samples {start_idx} to {end_idx}")
    print(f"{'='*70}")

    checkpoint_features_path = f'chunk_{chunk_idx+1}_of_{n_chunks}_features.csv'
    checkpoint_chains_path = f'chunk_{chunk_idx+1}_of_{n_chunks}_chains.pkl'

    # Optionally reuse an existing checkpoint instead of recomputing the chunk
    if (RESUME_FROM_CHECKPOINTS
            and os.path.exists(checkpoint_features_path)
            and os.path.exists(checkpoint_chains_path)):
        df_chunk_cached = pd.read_csv(checkpoint_features_path)
        # Only trust a checkpoint whose row count matches this chunk
        if len(df_chunk_cached) == end_idx - start_idx:
            # NOTE: pickle.load is only safe on files this notebook wrote itself
            with open(checkpoint_chains_path, 'rb') as f:
                chains_chunk = pickle.load(f)
            print(f"✓ Reusing checkpoint: {checkpoint_features_path}")
            results.append(df_chunk_cached)
            all_chains_data.extend(chains_chunk)
            continue

    # Extract chunk
    df_chunk = df.iloc[start_idx:end_idx].copy()

    # Process chunk
    df_chunk_processed, chains_chunk = extract_coref_features_batch_with_chains(
        df=df_chunk,
        text_column='generation',
        coref_model=coref_model,
        nlp=nlp,
        save_chains=True
    )

    # Save chunk checkpoint
    df_chunk_processed.to_csv(checkpoint_features_path, index=False)
    with open(checkpoint_chains_path, 'wb') as f:
        pickle.dump(chains_chunk, f)

    print(f"✓ Checkpoint saved: {checkpoint_features_path}")
    print(f"✓ Checkpoint saved: {checkpoint_chains_path}")

    results.append(df_chunk_processed)
    all_chains_data.extend(chains_chunk)

    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"GPU memory cleared: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Combine all chunks
print("\n" + "="*70)
print("COMBINING ALL CHUNKS")
print("="*70)

df_with_features = pd.concat(results, ignore_index=True)

# Save final results
final_features_path = 'data_with_tier1_tier2_features_FINAL.csv'
final_chains_path = 'data_chains_FINAL.pkl'

df_with_features.to_csv(final_features_path, index=False)

with open(final_chains_path, 'wb') as f:
    pickle.dump(all_chains_data, f)

print(f"\n✓ SUCCESS!")
print(f"Total samples processed: {len(df_with_features)}")
print(f"✓ Features saved: {final_features_path}")
print(f"✓ Chains saved: {final_chains_path}")

# Quick summary
print(f"\n" + "=" * 70)
print("FINAL FEATURE SUMMARY")
print("=" * 70)

# Listed here so the summary does not depend on the optional test cell having
# been run (a missing name would otherwise surface only after the long run)
summary_feature_names = [
    # Tier 1: RMO
    'repeat_mention_expansion_rate', 'avg_tokens_added_on_repeat',
    'repeat_overspecification_ratio',
    # Tier 2: MTA
    'adjective_modification_rate', 'prepositional_modification_rate',
    'relative_clause_rate', 'modification_type_entropy',
    'avg_modifiers_per_mention'
]

for feature in summary_feature_names:
    values = df_with_features[feature]
    non_zero = (values > 0).sum()
    print(f"{feature:35s}: mean={values.mean():.4f}, "
          f"non-zero={non_zero}/{len(df_with_features)} "
          f"({100*non_zero/len(df_with_features):.1f}%)")