In [1]:
import nlp.wordpiece as wp
import pandas as pd
import nltk
from string import punctuation
from collections import defaultdict

In [2]:
# Sample corpus that is handed to the WordPiece tokenizer in the following cell.
text = """
Years ago, the fearsome Pirate King, Gol D. Roger was executed leaving a huge pile of treasure and the famous "One Piece" behind. 
Whoever claims the "One Piece" will be named the new King of the Pirates.
Monkey D. Luffy, a boy who consumed a "Devil Fruit," decides to follow in the footsteps of his idol, the pirate Shanks, and find the One Piece. 
It helps, of course, that his body has the properties of rubber and that he's surrounded by a bevy of skilled fighters and thieves to help him along the way.
Luffy will do anything to get the One Piece and become King of the Pirates!"""
# Corpus size measured in whitespace-separated words.
print(len(text.split()))

109


In [3]:
# Build a WordPiece tokenizer over the sample corpus.
# NOTE(review): maximum_size=500 looks like the cap on vocabulary size / training
# iterations (a later cell loops over wt.max_vocabulary_size) — confirm in nlp.wordpiece.
wt = wp.WordPieceTokenizer(text=text, maximum_size=500)

In [4]:
# Score the candidate token pairs with normalization switched off.
# The recorded output shows integer values, so these look like raw pair
# counts (assumption — confirm against nlp.wordpiece.score_pairs).
example = wt.score_pairs(normalize=False)

# Only the first five entries are shown to keep the output short.
first_pairs = example.head()
print(first_pairs)

t   #h    15
#h  #e    12
#n  #d     8
P   #i     7
#i  #n     7
dtype: int64


In [5]:
# Run the full training procedure using un-normalized pair scores.
# The recorded progress bar stops at 214/500, so training ends before the
# maximum size is reached — presumably because no further update is possible
# (confirm in nlp.wordpiece).
wt.train(normalize=False)

 43%|████▎     | 214/500 [00:00<00:00, 2646.62it/s]


In [6]:
# Flatten the per-entry token sequences of the trained corpus into one list
# and show them separated by " | ".
tokenized = [piece for entry in wt.corpus for piece in entry]
print(" | ".join(tokenized))

Years | ago, | the | fearsome | Pirate | King, | Gol | D. | Roger | was | executed | leaving | a | huge | pile | of | treasure | and | the | famous | "One | Piece" | behind. | Whoever | claims | the | "One | Piece" | will | be | named | the | new | King | of | the | Pirates. | Monkey | D. | Luffy, | a | boy | who | consumed | a | "Devil | Fruit," | decides | to | follow | in | the | footsteps | of | his | idol, | the | pirate | Shanks, | and | find | the | One | Piece. | It | helps, | of | course, | that | his | body | has | the | properties | of | rubber | and | that | he's | surrounded | by | a | bevy | of | skilled | fighters | and | thieves | to | help | him | along | the | way. | Luffy | will | do | anything | to | get | the | One | Piece | and | become | King | of | the | Pirates!


In [7]:
# Learned vocabulary with its per-token counts, displayed from the rarest
# entry to the most frequent one (ascending is the default sort order).
V = wt.fancy_vocabulary
V.sort_values()

Y          1
Luffy,     1
boy        1
who        1
con        1
          ..
#s        22
#n        23
#o        25
#i        29
#e        67
Length: 275, dtype: int64

In [8]:
# Tokenize a sentence made mostly of words that are absent from the corpus.
# In the recorded output "use" becomes [UNK] and the other unseen words fall
# apart into short pieces.
test = "pirates use to assault ships on the sea"
test_tokens = wt.tokenize_text(test)
print(test_tokens)

['pirate', '#s', '[UNK]', 'to', 'a', '#s', '#s', '#a', '#u', '#l', '#t', 's', '#h', '#i', '#p', '#s', 'o', '#n', 'the', 's', '#ea']


In [9]:
# Lower-case variant of a phrase from the corpus. The recorded output maps
# "king" to [UNK]: the corpus only contains the capitalized "King".
test = "king of the pirates"
test_tokens = wt.tokenize_text(test)
print(test_tokens)

['[UNK]', 'of', 'the', 'pirate', '#s']


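## Process explained

The tokenizer is rebuilt from scratch and trained one update at a time; the tokenization of a sample phrase is printed every time it changes, showing how small pieces are progressively merged into longer ones.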

In [10]:
# Same sample corpus as before, this time terminated by a newline.
text = """
Years ago, the fearsome Pirate King, Gol D. Roger was executed leaving a huge pile of treasure and the famous "One Piece" behind. 
Whoever claims the "One Piece" will be named the new King of the Pirates.
Monkey D. Luffy, a boy who consumed a "Devil Fruit," decides to follow in the footsteps of his idol, the pirate Shanks, and find the One Piece. 
It helps, of course, that his body has the properties of rubber and that he's surrounded by a bevy of skilled fighters and thieves to help him along the way.
Luffy will do anything to get the One Piece and become King of the Pirates!
"""
# Phrase whose tokenization is tracked while the tokenizer learns.
example = "consumed a"
normalize = False

# Start from a fresh, untrained tokenizer.
wt = wp.WordPieceTokenizer(text=text, maximum_size=500)

# Last tokenization that was printed; the empty string never equals a token
# list, so the very first iteration always prints.
current = ""

for i in range(wt.max_vocabulary_size):
    # One training step; a falsy result means there is nothing left to do.
    check = wt.update(normalize=normalize)
    tokens = wt.tokenize_text(example)

    has_changed = tokens != current
    if has_changed:
        print("Iterazione {}: {}".format(i, " ".join(tokens)))
        current = tokens

    if not check:
        break

Iterazione 0: c #o #n #s #u #m #e #d a
Iterazione 16: c #o #n #s #u #m #ed a
Iterazione 34: c #o #n #s #u #med a
Iterazione 35: c #on #s #u #med a
Iterazione 58: c #on #su #med a
Iterazione 116: con #su #med a
Iterazione 117: consu #med a
Iterazione 118: consumed a


In [11]:
# Vocabulary size reached by the step-by-step loop; the recorded value (275)
# equals the length of fancy_vocabulary displayed after the earlier wt.train() run.
len(wt.vocabulary)

275

In [12]:
# List every vocabulary entry beginning with "c" together with its count.
for piece, count in wt.fancy_vocabulary.items():
    if not piece.startswith('c'):
        continue
    print(piece, count)

c 3
cl 1
cla 1
clai 1
claim 1
claims 1
con 1
consu 1
consumed 1
cou 1
cours 1
course 1
course, 1


In [13]:
from transformers import BertTokenizer

In [14]:
# Load the pretrained tokenizer of the "bert-base-uncased" checkpoint.
# NOTE(review): from_pretrained needs the checkpoint files in the local cache
# or network access to fetch them — confirm for the target environment.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## BERT Tokenizer

In [15]:
# Sentence of related word forms used to inspect BERT's WordPiece splitting.
# "playng" is not a regular word: the recorded output shows it split into
# "play" + "##ng", while the other forms stay whole.
# Another input worth trying: "mix mixing mixed mixes mixer mixade".
example = "play playng playing plays played planning"

# Split into WordPiece tokens, then map each token to its vocabulary id.
tokens = tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("Testo originale:", example)
print("Token:", tokens)
print("Token IDs:", token_ids)

Testo originale: play playng playing plays played planning
Token: ['play', 'play', '##ng', 'playing', 'plays', 'played', 'planning']
Token IDs: [2377, 2377, 3070, 2652, 3248, 2209, 4041]


In [16]:
from collections import defaultdict

In [19]:
# Count how often each BERT token occurs across a couple of sample sentences.
sentences = ["we used to play boardgames a lot", "people love to play games"]

# Missing tokens start from a count of zero.
vocabulary = defaultdict(int)
for sentence in sentences:
    for token in tokenizer.tokenize(sentence):
        vocabulary[token] += 1

# Series indexed by token, in first-seen order.
V = pd.Series(vocabulary)

In [20]:
# Display the BERT token counts, most frequent first.
V.sort_values(ascending=False)

to         2
play       2
we         1
used       1
board      1
##games    1
a          1
lot        1
people     1
love       1
games      1
dtype: int64