In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# Instantiate a tokenizer with a BPE model
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Instantiate a BPE trainer with special tokens
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Set the pre-tokenizer
tokenizer.pre_tokenizer = Whitespace()

# Specify files for training
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]

# Train the tokenizer
tokenizer.train(files, trainer)

# Save the trained tokenizer
tokenizer.save("data/tokenizer-wiki.json")

# Reload the tokenizer from file
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

# Use the tokenizer
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
print(output.ids)
print(output.offsets[9])

# Post-processing
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Encode sentences and sentence pairs
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)

output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
print(output.tokens)
print(output.type_ids)

# Encoding multiple sentences in a batch
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
output = tokenizer.encode_batch(
    [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
)

# Enable padding
tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output[1].tokens)
print(output[1].attention_mask)






Exception: The system cannot find the path specified. (os error 3)

In [4]:
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip


SyntaxError: invalid syntax (3706347959.py, line 1)