### 1. Hugging Face Tokenizers

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer("This is an example sentence.", truncation = True, padding = True, return_tensors = 'pt')
print(tokens)

{'input_ids': tensor([[ 101, 2023, 2003, 2019, 2742, 6251, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


### 2. Spacy

In [11]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("This is an example sentence.")
tokens = [token.text for token in doc]
print(tokens)

['This', 'is', 'an', 'example', 'sentence', '.']


### 3. NLTK (Natural Language Toolkit)

In [12]:
from nltk.tokenize import word_tokenize

sentence = "This is an example sentence."
tokens = word_tokenize(sentence)
print(tokens)

['This', 'is', 'an', 'example', 'sentence', '.']


### 4. Stanford NLP (Stanza)

In [13]:
import stanza

stanza.download("en")
nlp = stanza.Pipeline(lang = "en", processors = "tokenize")
doc = nlp("This is an example sentence.")
tokens = [word.text for sentence in doc.sentences for word in sentence.words]
print(tokens)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-01 09:35:37 INFO: Downloaded file to C:\Users\Ciya\stanza_resources\resources.json
2025-01-01 09:35:37 INFO: Downloading default packages for language: en (English) ...
2025-01-01 09:35:38 INFO: File exists: C:\Users\Ciya\stanza_resources\en\default.zip
2025-01-01 09:35:41 INFO: Finished downloading models and saved to C:\Users\Ciya\stanza_resources
2025-01-01 09:35:41 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-01 09:35:41 INFO: Downloaded file to C:\Users\Ciya\stanza_resources\resources.json
2025-01-01 09:35:41 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-01-01 09:35:41 INFO: Using device: cpu
2025-01-01 09:35:41 INFO: Loading: tokenize
2025-01-01 09:35:41 INFO: Loading: mwt
2025-01-01 09:35:41 INFO: Done loading processors!


['This', 'is', 'an', 'example', 'sentence', '.']


### 5. SentencePiece

In [9]:
import sentencepiece as spm

# Train a SentencePiece model (if not using a pre-trained model)
spm.SentencePieceTrainer.train(input = './data/data.txt', model_prefix = 'tokenizer', vocab_size = '868')

# Load and tokenize
sp = spm.SentencePieceProcessor(model_file = 'tokenizer.model')
tokens = sp.encode("This is an example setence.", out_type = str)
print(tokens)

['▁Th', 'is', '▁is', '▁an', '▁example', '▁set', 'ence', '.']


### 6. OpenAI Tokenizer

In [15]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokens = tokenizer("This is an example sentence.")
print(tokens)

{'input_ids': [1212, 318, 281, 1672, 6827, 13], 'attention_mask': [1, 1, 1, 1, 1, 1]}
