In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer
import torch
import re
import random
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
t5_model = T5ForConditionalGeneration.from_pretrained("t5-large", torch_dtype=torch.float16, 
                                             device_map="auto")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large")
device = "mps" if torch.backends.mps.is_available() else "cpu"
t5_model.to(device)
print(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


mps


In [None]:
# Load GPT-2 model and tokenizer
model_name = "openai-community/gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
# Use MPS (Apple Silicon) if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [5]:
def mask_text(text, mask_ratio=0.15, max_words=370):
    words = text.split()

    # Truncate text
    if len(words) > max_words:
        words = words[:max_words]

    num_masks = int(len(words) * mask_ratio)

    # Randomly select spans to mask
    mask_indices = sorted(random.sample(range(len(words) - 1), num_masks))
    for i, idx in enumerate(mask_indices):
        words[idx] = f"<extra_id_{i}>"
        if idx + 1 < len(words):  # Ensure a 2-word span
            words[idx + 1] = ""

    return " ".join(words)

def replace_masks(texts):
    """Generate T5 model outputs for masked texts."""
    n_expected = [text.count("<extra_id_") for text in texts]
    stop_id = t5_tokenizer.encode(f"<extra_id_{max(n_expected)}>")[0]

    tokens = t5_tokenizer(texts, return_tensors="pt", padding=True)

    # Move input tensors to model's device just before passing to model
    with torch.no_grad():
        outputs = t5_model.generate(
            input_ids=tokens["input_ids"].to(t5_model.device),  
            attention_mask=tokens["attention_mask"].to(t5_model.device),  
            max_length=150,
            do_sample=True,
            top_p=0.9,
            num_return_sequences=1,
            eos_token_id=stop_id
        )
        
    outputs = outputs.detach().cpu()  # Move tensors to CPU and detach

    return t5_tokenizer.batch_decode(outputs, skip_special_tokens=False)

def extract_fills(texts):
    """Extract the generated fills from T5's output."""
    extracted_fills = []
    for text in texts:
        text = text.replace("<pad>", "").replace("</s>", "").strip()
        
        # Use regex to extract text inside <extra_id_X> tokens
        fills = re.findall(r"<extra_id_\d+>\s*(.*?)\s*(?=<extra_id_\d+>|$)", text)

        # Clean extracted tokens
        extracted_fills.append([fill.strip() for fill in fills])

    return extracted_fills

def apply_extracted_fills(masked_texts, extracted_fills):
    """Replace mask tokens in the masked texts with generated fills."""
    filled_texts = []
    
    for masked_text, fills in zip(masked_texts, extracted_fills):
        if not fills:
            filled_texts.append(masked_text)
            continue

        # Iterate through expected mask positions and replace them
        for i, fill in enumerate(fills):
            masked_text = masked_text.replace(f"<extra_id_{i}>", fill, 1)

        filled_texts.append(masked_text)

    return filled_texts

In [6]:
def average_log_prob(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    
    # Get log probabilities
    log_probs = -outputs.loss  # Negative loss gives the log probability
    avg_log_prob = log_probs.item()
    
    return avg_log_prob

In [7]:
file_path = "./SubtaskB/subtaskB_train.jsonl"
with open(file_path, "r", encoding="utf-8") as file:
    data_human = [json.loads(line) for line in file if json.loads(line).get("model") == "human"]

# Print first 3 records
print(data_human[0])

{'text': ' It will work with the Forza Motorsport disc in. Simply it will load but this problem will occur if your disc is dirty or damaged. Clean it with water on the back of the disc. There can be scratches because discs are fragile and can break easily. So always clean it with a soft, lint-free cloth.;\n, If you have the game that includes the bonus feature, Xbox Live Arcade, You will get the dashboard menu that has "Forza Motorsport" and "Xbox Live Arcade". Choose Xbox Live Arcade if you want to play some arcade games, If you want to play Forza Motorsport, Simply press "A" on the Forza Motorsport Feature.\n\n\nThe Main Menu features one of those choices after loading or creating a profile on the game.\n\nArcade Race: This is simply the quick race mode. You can race at a circuit, sprint, or a very interesting race. Just simply choose a class and choose a car and then choose a race and the race will load, Also the cars are featured in multiplayer, career, and free run mode.\nCareer M

In [None]:
log_probs_per_text_base = []
log_probs_per_text_transformed = []

for j in range(50):
    original_text = " ".join(data_human[j]["text"].split()[:50])
    #original_text = "It will work with the Forza Motorsport disc in. Simply it will load but this problem will occur if your disc is dirty or damaged. Clean it with water on the back of the disc."
    log_probs = []
    for i in range(25):
        print(f"The original text is: {original_text}")
        masked_text = mask_text(original_text)
        print(f"The masked text is: {masked_text}")
        raw_fills = replace_masks([masked_text])
        print(f"The raw fills are: {raw_fills}")
        extracted_fills = extract_fills(raw_fills)
        print(f"The extracted fills are: {extracted_fills}")
        perturbed_text = apply_extracted_fills([masked_text], extracted_fills)[0]
        print(f"The perturbed text is: {perturbed_text}")
        avg_log_prob = average_log_prob(perturbed_text)
        log_probs.append(avg_log_prob)  # Use `.item()` to extract the float
        #t5_tokenizer.clear_cache()
        #tokenizer.clear_cache()
    avg_log_prob_not = average_log_prob(original_text)
    log_probs_per_text_base.append(avg_log_prob_not)  
    log_probs_per_text_transformed.append(log_probs)
    print(f"Average per-token log probability for base sentence {j + 1}: {avg_log_prob_not:.4f}")
    print(f"Average per-token log probability for transformed sentence {j + 1}: {(sum(log_probs) / len(log_probs)):.4f}, the minimum is {min(log_probs)} and the maximum is {max(log_probs)}")


The original text is: It will work with the Forza Motorsport disc in. Simply it will load but this problem will occur if your disc is dirty or damaged. Clean it with water on the back of the disc. There can be scratches because discs are fragile and can break easily. So always clean
The masked text is: It <extra_id_0>  with the <extra_id_1> <extra_id_2>  in. Simply it will load but this problem <extra_id_3>  if <extra_id_4>  is dirty or damaged. Clean it with <extra_id_5>  the back of the disc. There can be scratches because discs are fragile and <extra_id_6>  easily. So always clean
The raw fills are: ['<pad> <extra_id_0> will load <extra_id_1> first option <extra_id_2> name on the label. Put it back <extra_id_3> can happen <extra_id_4> the disc <extra_id_5> the disc cleaner on <extra_id_6> scratches <extra_id_7>']
The extracted fills are: [['will load', 'first option', 'name on the label. Put it back', 'can happen', 'the disc', 'the disc cleaner on', 'scratches', '']]
The perturbed t

KeyboardInterrupt: 

25 minutes:

Average per-token log probability for base sentence 1: -3.6250\
Average per-token log probability for transformed sentence 1: -4.0000, the minimum is -4.53125 and the maximum is -2.703125\
Average per-token log probability for base sentence 2: -3.7812\
Average per-token log probability for transformed sentence 2: -4.0119, the minimum is -4.59375 and the maximum is -2.765625\
Average per-token log probability for base sentence 3: -2.9219\
Average per-token log probability for transformed sentence 3: -3.4850, the minimum is -3.921875 and the maximum is -3.09375\
Average per-token log probability for base sentence 4: -3.2969\
Average per-token log probability for transformed sentence 4: -3.8494, the minimum is -4.15625 and the maximum is -3.421875\
Average per-token log probability for base sentence 5: -3.2656\
Average per-token log probability for transformed sentence 5: -3.6963, the minimum is -4.28125 and the maximum is -2.921875\
Average per-token log probability for base sentence 6: -3.6094\
Average per-token log probability for transformed sentence 6: -4.0769, the minimum is -4.5625 and the maximum is -3.390625\
Average per-token log probability for base sentence 7: -3.8438\
Average per-token log probability for transformed sentence 7: -4.1931, the minimum is -4.6875 and the maximum is -3.203125\
Average per-token log probability for base sentence 8: -3.2656\
Average per-token log probability for transformed sentence 8: -3.7462, the minimum is -4.0625 and the maximum is -2.859375\
Average per-token log probability for base sentence 9: -3.2656\
Average per-token log probability for transformed sentence 9: -3.6637, the minimum is -3.9375 and the maximum is -2.859375\