In [37]:
# Smoke test: masked-LM example taken from the transformers documentation (kept commented out)
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# model_id = "google-bert/bert-base-cased"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForMaskedLM.from_pretrained(model_id)

# text = "The capital of France is [MASK]."
# inputs = tokenizer(text, return_tensors="pt")
# outputs = model(**inputs)

# # To get predictions for the mask:
# masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
# predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
# predicted_token = tokenizer.decode(predicted_token_id)
# print("Predicted token:", predicted_token)
# # Predicted token:  Paris

Some weights of the model checkpoint at google-bert/bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted token: Paris


In [39]:
# unzip file
import zipfile
import os

# Zipped competition test set and the directory it is unpacked into.
zip_path = "/kaggle/input/competitions/billion-word-imputation/test_v2.txt.zip"
extract_to = "/kaggle/working/"

# Unpack every archive member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

# Report the destination and list what it now contains.
print(f"Extracted to: {extract_to}")
print(os.listdir(extract_to))

Extracted to: /kaggle/working/
['.virtual_documents', 'test_v2.txt']


In [41]:
import os
import torch
import pandas as pd
import numpy as np
import csv
import zipfile
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tqdm.auto import tqdm

In [42]:
# --- CONFIGURATION ---
# Checkpoint identifier handed to the `from_pretrained` loaders.
MODEL_NAME = "google-bert/bert-base-cased"
# Extracted test file to read, and the submission file to write.
INPUT_FILE = "/kaggle/working/test_v2.txt"
OUTPUT_FILE = "submission.csv"

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on: {device}")

Running on: cuda


In [44]:
# Load the tokenizer and the masked-LM model for the configured checkpoint,
# then move the model's weights to the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
model.to(device)
# This notebook only runs inference, so switch to evaluation mode explicitly:
# it disables the model's dropout layers and does not rely on the loader's
# default. `eval()` returns the model, so the cell still displays its repr.
model.eval()

Some weights of the model checkpoint at google-bert/bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [46]:
def solve_sentence(sentence, model, tokenizer, device):
    """
    Restore the missing word of ``sentence`` using a masked language model.

    The mask token is inserted at every gap between the whitespace-separated
    words (including before the first and after the last word). All
    hypotheses are scored in a single forward pass; the gap whose top-1
    prediction at the mask has the highest softmax probability wins, and that
    predicted token is inserted there.

    Args:
        sentence: Text to repair. It is split on whitespace so the dataset's
            own tokenization (e.g. "word ,") is preserved.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object whose ``logits`` attribute is
            indexed as [hypothesis, token position, vocabulary id].
        tokenizer: Provides ``mask_token``, ``mask_token_id``, ``decode`` and
            is callable on a list of strings, returning a mapping with an
            ``"input_ids"`` tensor that supports ``.to(device)``.
        device: Device the encoded batch is moved to before inference.

    Returns:
        The words joined by single spaces with the predicted word inserted.
        ``sentence`` is returned unchanged when it contains no words, or when
        no hypothesis could be scored (e.g. truncation removed every mask).
    """
    # Split by whitespace. This preserves the dataset's specific tokenization (e.g. "word ,")
    words = sentence.strip().split()
    if not words:
        return sentence

    # Generate hypotheses: one candidate per insertion index (0 to len(words)).
    mask_token = tokenizer.mask_token
    candidates_text = [
        " ".join(words[:i] + [mask_token] + words[i:])
        for i in range(len(words) + 1)
    ]

    # Tokenize all hypotheses as a single padded batch.
    # (BERT max length is 512, truncation prevents crashes on rare long sentences)
    encoded_inputs = tokenizer(
        candidates_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    # Inference only: no gradients needed, which saves time and memory.
    with torch.no_grad():
        logits = model(**encoded_inputs).logits

    mask_token_id = tokenizer.mask_token_id
    best_score = -float('inf')
    best_token_id = None
    best_insert_idx = None  # stays None until a hypothesis is actually scored

    # Score each hypothesis by the model's confidence at its [MASK] position.
    for i, input_ids in enumerate(encoded_inputs["input_ids"]):
        # The tokenizer may split words into pieces, so the mask's token
        # index is looked up instead of being assumed equal to the word index.
        mask_positions = (input_ids == mask_token_id).nonzero(as_tuple=True)[0]
        if len(mask_positions) == 0:
            continue  # Truncation might have cut the mask off

        mask_idx = mask_positions[0].item()
        probs = torch.softmax(logits[i, mask_idx, :], dim=0)
        top_prob, top_id = torch.topk(probs, 1)
        score = top_prob.item()

        # Strict '>' keeps the earliest position on ties.
        if score > best_score:
            best_score = score
            best_token_id = top_id.item()
            best_insert_idx = i

    # Nothing could be scored: hand back the input untouched rather than
    # splicing an empty word into the sentence.
    if best_insert_idx is None:
        return sentence

    # Decode only the winning token, then rebuild from the original word list
    # so the dataset's spacing convention is preserved.
    best_word = tokenizer.decode([best_token_id]).strip()
    final_words = words[:best_insert_idx] + [best_word] + words[best_insert_idx:]
    return " ".join(final_words)

In [47]:
# Load the test set; the loop below reads its `id` and `sentence` columns.
df = pd.read_csv(INPUT_FILE, quotechar='"')
print(f"Total rows to process: {len(df)}")

results = []
ids = []

# Walk the two needed columns directly: DataFrame.iterrows() would build a
# Series object for every row, which is pure overhead in a loop this long.
for row_id, original_sent in tqdm(zip(df['id'], df['sentence']), total=len(df), desc="Imputing"):
    try:
        fixed_sent = solve_sentence(original_sent, model, tokenizer, device)
    except Exception as e:
        # Best effort: report the failure and keep the row, falling back to
        # the unmodified sentence so the output stays aligned with the ids.
        print(f"Error on ID {row_id}: {e}")
        fixed_sent = original_sent
    ids.append(row_id)
    results.append(fixed_sent)


print(f"Saving to {OUTPUT_FILE}")

# Pair every id with its reconstructed sentence in a two-column table.
submission = pd.DataFrame({"id": ids, "sentence": results})

# Kaggle Format: id,"sentence"
# csv.QUOTE_NONNUMERIC quotes string fields and leaves numeric fields bare;
# embedded quote characters are escaped by doubling them.
submission.to_csv(
    OUTPUT_FILE,
    sep=',',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
    quotechar='"',
    doublequote=True,
)

print("Saved successfully.")

Total rows to process: 306681


Imputing:   0%|          | 0/306681 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
print("\n--- SANITY CHECK (First 5 Rows) ---")
# Read the file back as UTF-8, the encoding it was written with, instead of
# depending on the platform's default text encoding.
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
    # zip() against range(5) stops after five lines or at end of file,
    # whichever comes first, so a short file does not print blank lines.
    for _, line in zip(range(5), f):
        print(line.strip())
print("-----------------------------------")