In [18]:
import torch
from transformers import DistilBertTokenizer, DistilBertForMaskedLM

# Load the tokenizer and model

from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForMaskedLM



In [19]:

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-cased')


In [40]:
def get_mask_pred(sentence, idx, model, tokenizer):
    # Define the input sentence and the token to be masked


    # Tokenize the sentence and get the index of the token to be masked
    tokens = tokenizer.tokenize(sentence)
    token_index = idx

    # Mask the token by replacing it with the [MASK] token
    tokens[token_index] = '[MASK]'

    # Convert the tokens back to input IDs and add the special [CLS] and [SEP] tokens
    input_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])

    # Convert the input IDs to a PyTorch tensor
    input_tensor = torch.tensor([input_ids])

    # Generate predictions for the masked token
    with torch.no_grad():
        predictions = model(input_tensor)[0]

        # Get the predicted probabilities for the masked token
        mask_index = input_ids.index(tokenizer.mask_token_id)
        probs = predictions[0, mask_index].softmax(dim=0)

        # Get the index of the predicted token with the highest probability
        predicted_index = torch.argmax(probs).item()

    # Convert the predicted token index back to a token and print it
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)

    return probs

In [1]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /home/niallt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /home/niallt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

We're going to use the wikitext (link) dataset with the distilbert-base-cased (link) model checkpoint.

Start by loading the wikitext-2-raw-v1 version of that dataset, and take the 11th example (index 10) of the train split.
We'll tokenize this using the appropriate tokenizer, and we'll mask the sixth token (index 5) the sequence.

When using the distilbert-base-cased checkpoint to unmask that (sixth token, index 5) token, what is the most probable predicted token (please provide the decoded token, and not the ID)?

In [13]:
dataset['train'][11]

{'text': " Troops are divided into five classes : Scouts , Shocktroopers , Engineers , Lancers and Armored Soldier . Troopers can switch classes by changing their assigned weapon . Changing class does not greatly affect the stats gained while in a previous class . With victory in battle , experience points are awarded to the squad , which are distributed into five different attributes shared by the entire squad , a feature differing from early games ' method of distributing to different unit types . \n"}

In [22]:
idx = 10
token_index = 5
sentence = dataset['train'][idx]['text']


In [41]:
mask_probs = get_mask_pred(sentence, 5, model, tokenizer)

mechanic


In [43]:
mask_probs.topk(5)

torch.return_types.topk(
values=tensor([0.1970, 0.1784, 0.1634, 0.0570, 0.0364]),
indices=tensor([19459,  5418,  1449, 11556,  2395]))

### use fill pipeline

In [23]:
# Tokenize the sentence and get the index of the token to be masked
tokens = tokenizer.tokenize(sentence)

# Mask the token by replacing it with the [MASK] token
tokens[token_index] = '[MASK]'

# Convert the tokens back to input IDs and add the special [CLS] and [SEP] tokens
input_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])

# Convert the input IDs to a PyTorch tensor
input_tensor = torch.tensor([input_ids])

In [34]:
reconstructed_sentence = tokenizer.convert_tokens_to_string(tokens)

In [35]:
reconstructed_sentence

'The game\'s battle [MASK], the BliTZ system, is carried over directly from Valkyira Chronicles. During missions, players select each unit using a top @ - @ down perspective of the battlefield map : once a character is selected, the player moves the character around the battlefield in third @ - @ person. A character can only act once per @ - @ turn, but characters can be granted multiple turns at the expense of other characters\'turns. Each character has a field and distance of movement limited by their Action Gauge. Up to nine characters can be assigned to a single mission. During gameplay, characters will call out if something happens to them, such as their health points ( HP ) getting low or being knocked out by enemy attacks. Each character has specific " Potentials ", skills unique to each character. They are divided into " Personal Potential ", which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character, and " Bat

In [9]:

nlp_fill = pipeline('fill-mask', model = model, tokenizer = tokenizer)

In [37]:
nlp_fill(reconstructed_sentence)

[{'score': 0.1970197558403015,
  'token': 19459,
  'token_str': 'mechanic',
  'sequence': 'The game\'s battle mechanic, the BliTZ system, is carried over directly from Valkyira Chronicles. During missions, players select each unit using a top @ - @ down perspective of the battlefield map : once a character is selected, the player moves the character around the battlefield in third @ - @ person. A character can only act once per @ - @ turn, but characters can be granted multiple turns at the expense of other characters\'turns. Each character has a field and distance of movement limited by their Action Gauge. Up to nine characters can be assigned to a single mission. During gameplay, characters will call out if something happens to them, such as their health points ( HP ) getting low or being knocked out by enemy attacks. Each character has specific " Potentials ", skills unique to each character. They are divided into " Personal Potential ", which are innate skills that remain unaltered