In [20]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace, Sequence, Digits, Punctuation
from tokenizers.normalizers import Lowercase
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import RobertaProcessing
from tokenizers.decoders import BPEDecoder
from glob import glob
from tokenizers import ByteLevelBPETokenizer

In [21]:
data_path = "mbti_processed.csv"
tokenizer_model_path = r"models/tokenizer"
if not os.path.isdir(tokenizer_model_path):
    os.makedirs(tokenizer_model_path)

In [22]:
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,user_id,Intro/Extra-vert,Intuition/Sensing,Think/Feel,Judge/Perceive,type,posts
0,0,I,N,F,J,INFJ,<URL>
1,0,I,N,F,J,INFJ,<URL>
2,0,I,N,F,J,INFJ,enfp and intj moments <URL> sportscenter not t...
3,0,I,N,F,J,INFJ,What has been the most life-changing experienc...
4,0,I,N,F,J,INFJ,<URL> <URL> On repeat for most of today.


In [26]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])

tokenizer.train_from_iterator([str(x) for x in data['posts'].values],
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>"])
tokenizer.post_processor = RobertaProcessing(
    cls=("<s>", tokenizer.token_to_id("<s>")),
    sep=("</s>", tokenizer.token_to_id("</s>"))
)






In [27]:
tokenizer.save_model(tokenizer_model_path)

['models/tokenizer/vocab.json', 'models/tokenizer/merges.txt']

In [28]:
from transformers import RobertaTokenizerFast
# Create the tokenizer from a trained one
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_model_path, max_len=512)

file models/tokenizer/config.json not found
file models/tokenizer/config.json not found


In [29]:
tokenizer.encode_plus(
            "Hello, y'all! How are you 😁 ?",
            # add_special_tokens=True,
            max_length=512,
            padding="max_length",
            # return_token_type_ids=True,
            # return_tensors="pt",
            return_attention_mask=False,
            truncation=True,
        )

{'input_ids': [0, 44, 12129, 16, 225, 93, 11, 1046, 5, 225, 44, 4111, 225, 997, 225, 962, 225, 177, 258, 251, 228, 225, 35, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 