In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer
import torch
import re
import random
import json
from functools import lru_cache

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
# Local directory where downloaded Hugging Face weights/tokenizers are cached.
cache_dir = "/tmp/huggingface"

# Resolve the device first so the precision can be chosen to match it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on GPU. On CPU, float16 ops are slow or (depending
# on the PyTorch version) unsupported, so fall back to float32 there.
t5_dtype = torch.float16 if device == "cuda" else torch.float32

# T5-large is the mask-filling model used to perturb texts.
t5_model = T5ForConditionalGeneration.from_pretrained("t5-large", torch_dtype=t5_dtype, cache_dir=cache_dir)
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large", cache_dir=cache_dir)
t5_model.to(device)
print(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


cpu


In [3]:
# GPT-2 large is the scoring model: load its weights (bfloat16) and its tokenizer
# from the shared cache directory.
model_name = "openai-community/gpt2-large"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Use EOS as the padding token (batched tokenizer calls below pass padding=True).
tokenizer.pad_token = tokenizer.eos_token

# Last expression of the cell: moves the model and displays its module repr.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [5]:
file_path = "subtaskB_train.jsonl"

# Records whose "model" field is "human" vs. everything else.
data_human = []
data_ai = []

# Stream the JSONL file one record per line instead of reading it all at once.
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank / whitespace-only lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        record = json.loads(line)
        if record.get("model") == "human":
            data_human.append(record)
        else:
            data_ai.append(record)

# Print the first record of each category
print("First human record:", data_human[0] if data_human else "No human data found.")
print("First AI record:", data_ai[0] if data_ai else "No AI data found.")


FileNotFoundError: [Errno 2] No such file or directory: 'subtaskB_train.jsonl'

In [None]:
def batch_mask_text(texts, mask_ratio=0.15, max_words=370):
    """Replace random two-word spans in each text with T5 sentinel tokens.

    Each text is split on whitespace and truncated to ``max_words`` words.
    ``int(len(words) * mask_ratio)`` non-overlapping two-word spans are then
    chosen at random and each is replaced by a single ``<extra_id_N>`` token.
    Sentinels are numbered 0, 1, 2, ... from left to right, which is the order
    T5 expects in its input.

    Args:
        texts: Iterable of input strings.
        mask_ratio: Fraction of the (truncated) word count used as the number
            of spans to mask.
        max_words: Maximum number of words kept from each text.

    Returns:
        A list with one masked string per input text. If fewer non-overlapping
        spans than requested fit in a text, as many as fit are masked.
    """
    masked_texts = []

    for text in texts:
        words = text.split()[:max_words]
        num_masks = int(len(words) * mask_ratio)

        # Candidate span starts. The last word is excluded so that every span
        # covers exactly two words.
        candidates = list(range(len(words) - 1))
        random.shuffle(candidates)

        # Greedily accept starts whose two words are still free, so spans never
        # overlap and no sentinel can be removed by a neighbouring span.
        covered = set()
        span_starts = []
        for start in candidates:
            if len(span_starts) >= num_masks:
                break
            if start in covered or start + 1 in covered:
                continue
            covered.update((start, start + 1))
            span_starts.append(start)
        span_starts.sort()

        # Rebuild the text left to right, numbering sentinels in reading order.
        pieces = []
        cursor = 0
        for sentinel_id, start in enumerate(span_starts):
            pieces.extend(words[cursor:start])
            pieces.append(f"<extra_id_{sentinel_id}>")
            cursor = start + 2
        pieces.extend(words[cursor:])

        masked_texts.append(" ".join(pieces))

    return masked_texts


def batch_replace_masks(texts, batch_size=8):
    """Run the T5 model over masked texts, ``batch_size`` texts at a time.

    For every chunk, generation is sampled (top-p 0.9, ``max_length=150``) and
    stops at the sentinel ``<extra_id_K>``, where K is the largest number of
    ``<extra_id_`` occurrences found in any text of that chunk.

    Args:
        texts: List of masked input strings.
        batch_size: Number of texts sent to the model per call.

    Returns:
        A list of decoded generations (special tokens kept), one per input.
    """
    generated = []

    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]

        # First token id of the sentinel used as the end-of-sequence marker.
        most_masks = max([masked.count("<extra_id_") for masked in chunk])
        stop_id = t5_tokenizer.encode(f"<extra_id_{most_masks}>")[0]

        encoded = t5_tokenizer(chunk, return_tensors="pt", padding=True)

        # Inputs are moved to whatever device the model lives on; no gradients
        # are needed for generation.
        with torch.no_grad():
            sequences = t5_model.generate(
                input_ids=encoded["input_ids"].to(t5_model.device),
                attention_mask=encoded["attention_mask"].to(t5_model.device),
                max_length=150,
                do_sample=True,
                top_p=0.9,
                num_return_sequences=1,
                eos_token_id=stop_id
            )

        # Decode on CPU and keep special tokens so the sentinels remain visible.
        sequences = sequences.detach().cpu()
        generated.extend(t5_tokenizer.batch_decode(sequences, skip_special_tokens=False))

    return generated

def batch_extract_fills(texts):
    """Pull the generated fill strings out of each decoded T5 output.

    ``<pad>`` and ``</s>`` markers are removed first. Then, for every
    ``<extra_id_N>`` sentinel, the text following it up to the next sentinel
    (or the end of the string) is captured and stripped.

    Args:
        texts: Iterable of decoded model outputs.

    Returns:
        A list of lists of fill strings, one inner list per input, in order of
        appearance. A sentinel followed by nothing yields an empty string.
    """
    fill_pattern = re.compile(r"<extra_id_\d+>\s*(.*?)\s*(?=<extra_id_\d+>|$)")

    def _fills_for(decoded):
        # Drop padding / end-of-sequence markers before matching.
        cleaned = decoded.replace("<pad>", "").replace("</s>", "").strip()
        return [piece.strip() for piece in fill_pattern.findall(cleaned)]

    return [_fills_for(decoded) for decoded in texts]

def batch_apply_extracted_fills(masked_texts, extracted_fills):
    """Substitute generated fills back into their sentinel positions.

    The i-th fill of a text replaces the first occurrence of ``<extra_id_i>``
    in the corresponding masked text. Texts are paired with their fill lists
    positionally. A text with no fills is returned unchanged, and sentinels
    without a matching fill are left in place.

    Args:
        masked_texts: Iterable of strings containing ``<extra_id_N>`` tokens.
        extracted_fills: Iterable of fill lists, aligned with ``masked_texts``.

    Returns:
        A list of strings with the fills applied.
    """
    completed = []

    for template, replacements in zip(masked_texts, extracted_fills):
        result = template
        # An empty replacement list simply leaves the template untouched.
        for slot, replacement in enumerate(replacements):
            result = result.replace(f"<extra_id_{slot}>", replacement, 1)
        completed.append(result)

    return completed

def batch_average_log_prob(texts, batch_size=8):
    """Compute the mean per-token log probability of each text under ``model``.

    Texts are tokenized with the module-level ``tokenizer`` (padded and
    truncated), scored by the module-level causal LM ``model`` on ``device``,
    and the next-token cross-entropy is averaged over each text's real
    (non-padding) target positions.

    Args:
        texts: List of strings to score.
        batch_size: Number of texts scored per forward pass.

    Returns:
        A list of floats, one per text: the negative mean cross-entropy over
        the non-padding target tokens, or ``float('-inf')`` when a text has no
        target tokens left after the one-position shift.
    """
    all_log_probs = []

    # Padding is excluded through the attention mask below rather than via
    # ignore_index: this notebook reuses EOS as the pad token, so ignore_index
    # would also zero out genuine EOS tokens while still counting them.
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        # Only the logits are needed; the loss is computed per token below, so
        # no labels are passed to the model.
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits  # [batch, seq_len, vocab]

        # Position t predicts token t+1. Upcast to float32 so the softmax over
        # the vocabulary is not computed in reduced precision.
        shift_logits = logits[..., :-1, :].float()
        shift_labels = input_ids[..., 1:]
        shift_mask = attention_mask[..., 1:].bool()

        loss_per_token = loss_fct(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
        ).view(shift_labels.size())  # [batch, seq_len - 1]

        # Zero the padded positions, then average over the real ones per sample.
        loss_per_token = loss_per_token.masked_fill(~shift_mask, 0.0)
        loss_sums = loss_per_token.sum(dim=1).tolist()
        token_counts = shift_mask.sum(dim=1).tolist()

        for loss_sum, count in zip(loss_sums, token_counts):
            if count > 0:
                all_log_probs.append(-loss_sum / count)  # negative loss as log prob
            else:
                all_log_probs.append(float('-inf'))  # no scorable tokens

    return all_log_probs


# Main optimized processing loop
def optimized_processing(data_human, num_samples=50, iterations=25, batch_size=8, max_words=50):
    """Score original texts and repeated mask-and-refill perturbations of them.

    The first ``num_samples`` records are truncated to ``max_words`` words and
    scored once. Then, ``iterations`` times, every text is masked, refilled by
    the T5 helpers, and scored again.

    Args:
        data_human: Sequence of records, each with a ``"text"`` string. Despite
            the name, the notebook passes both the human and the AI records.
        num_samples: Number of records to process. Clamped to the number of
            records available, so a short dataset no longer raises IndexError.
        iterations: Number of perturbation rounds per text.
        batch_size: Batch size forwarded to the model helpers.
        max_words: Number of leading words kept from each text.

    Returns:
        ``(base_log_probs, log_probs_per_text_transformed)``. The first is one
        score per text; the second is one list per text holding ``iterations``
        perturbed scores (an empty list per text when ``iterations`` is 0).
    """
    num_samples = max(0, min(num_samples, len(data_human)))

    # Process original texts in batches
    original_texts = [" ".join(data_human[j]["text"].split()[:max_words]) for j in range(num_samples)]
    base_log_probs = batch_average_log_prob(original_texts, batch_size)

    # One result list per original text, filled in round by round.
    log_probs_per_text_transformed = [[] for _ in range(num_samples)]

    for _ in range(iterations):
        all_masked_texts = batch_mask_text(original_texts)
        all_raw_fills = batch_replace_masks(all_masked_texts, batch_size)
        all_extracted_fills = batch_extract_fills(all_raw_fills)
        all_perturbed_texts = batch_apply_extracted_fills(all_masked_texts, all_extracted_fills)

        all_log_probs = batch_average_log_prob(all_perturbed_texts, batch_size)

        # Organize results by original text
        for j in range(num_samples):
            log_probs_per_text_transformed[j].append(all_log_probs[j])

    return base_log_probs, log_probs_per_text_transformed

# Memory management utilities
def clear_cuda_cache():
    """Call ``torch.cuda.empty_cache()`` when CUDA is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()


# Memoised tokenization (up to 1024 distinct (text, is_t5) argument pairs)
@lru_cache(maxsize=1024)
def cached_tokenize(text, is_t5=False):
    """Tokenize ``text`` to PyTorch tensors, memoising the result per argument pair.

    Args:
        text: Input to tokenize. It must be hashable because of ``lru_cache``.
        is_t5: When true, use ``t5_tokenizer`` (padding only); otherwise use
            ``tokenizer`` (padding and truncation).

    Returns:
        The tokenizer's output for ``text``. Because of the cache, repeated
        calls with the same arguments return the same object.
    """
    if not is_t5:
        return tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    return t5_tokenizer(text, return_tensors="pt", padding=True)

In [None]:
# Experiment settings shared by both runs.
num_samples = 300
iterations = 100
batch_size = 128

# Run the same pipeline on the human and the AI records; each run's scores are
# written to its own JSON file.
for records, output_path in ((data_human, "results_human.json"), (data_ai, "results_ai.json")):
    log_probs_base, log_probs_transformed = optimized_processing(
        records, num_samples=num_samples, iterations=iterations, batch_size=batch_size
    )

    results = {
        "log_probs_base": log_probs_base,
        "log_probs_transformed": log_probs_transformed,
    }

    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)