# Tokenization in Python 

## Brute-force method: splitting on whitespace

In [1]:
# Naive baseline: str.split() with no arguments cuts on runs of whitespace only,
# so punctuation stays glued to its neighbouring word (e.g. 'UTS.', 'L.A.!').
text = "Dr. Smith works at UTS. He teaches AI. He's moving to L.A.! For his whereabouts, visit his homepage at http://www.example.com/."
whitespace_tokens = text.split()
print(whitespace_tokens)

['Dr.', 'Smith', 'works', 'at', 'UTS.', 'He', 'teaches', 'AI.', "He's", 'moving', 'to', 'L.A.!', 'For', 'his', 'whereabouts,', 'visit', 'his', 'homepage', 'at', 'http://www.example.com/.']


## Using NLTK

In [4]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
# Download the 'punkt_tab' resource with NLTK's downloader (a no-op when it is already up to date).
# Presumably this is the Punkt data that word_tokenize/sent_tokenize load — confirm for your NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Vitali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
text = "Dr. Smith works at UTS. He teaches AI. He's moving to L.A.! For his whereabouts, visit his homepage at http://www.example.com/."

# Tokenize the same text at both granularities first, then print the two results.
# Word level: punctuation becomes its own token, unlike a plain whitespace split.
word_tokens = word_tokenize(text)
# Sentence level: one string per detected sentence.
sent_tokens = sent_tokenize(text)

print(word_tokens)
# Recorded output begins:
# ['Dr.', 'Smith', 'works', 'at', 'UTS', '.', 'He', 'teaches', 'AI', '.', 'He', "'s", ...]
# In that run 'Dr.' and 'L.A.' stayed whole, while the URL was split into 'http', ':', '//www.example.com/'.

print(sent_tokens)
# Recorded output begins:
# ['Dr. Smith works at UTS.', 'He teaches AI.', "He's moving to L.A.!", ...]
# In that run the period after 'Dr' did not end a sentence; four sentences were returned in total.

['Dr.', 'Smith', 'works', 'at', 'UTS', '.', 'He', 'teaches', 'AI', '.', 'He', "'s", 'moving', 'to', 'L.A.', '!', 'For', 'his', 'whereabouts', ',', 'visit', 'his', 'homepage', 'at', 'http', ':', '//www.example.com/', '.']
['Dr. Smith works at UTS.', 'He teaches AI.', "He's moving to L.A.!", 'For his whereabouts, visit his homepage at http://www.example.com/.']


## Using SpaCy

In [None]:
# Fetch the "en_core_web_sm" pipeline package, the same name passed to spacy.load() further down.
# NOTE(review): `!python` is resolved by the shell, which may not be the interpreter the kernel runs on —
# confirm the model is installed into the kernel's environment.
!python -m spacy download en_core_web_sm

In [11]:
import spacy

In [13]:
# Load the "en_core_web_sm" pipeline by name; `nlp` is later called on the text to produce `doc`.
nlp = spacy.load("en_core_web_sm")



In [14]:
text = "Dr. Smith works at UTS. He teaches AI. He's moving to L.A.! For his whereabouts, visit his homepage at http://www.example.com/."

# Run the loaded pipeline once; words and sentences are both read off the resulting object.
doc = nlp(text)

# Word tokenization: the surface text of every token, in document order.
word_tokens = [tok.text for tok in doc]
# Sentence tokenization: the text of each span yielded by doc.sents.
sent_tokens = [span.text for span in doc.sents]

print(word_tokens)
# Recorded output begins:
# ['Dr.', 'Smith', 'works', 'at', 'UTS', '.', 'He', 'teaches', 'AI', '.', 'He', "'s", ...]
# In that run the URL and its trailing period came out as a single token: 'http://www.example.com/.'

print(sent_tokens)
# Recorded output begins:
# ['Dr. Smith works at UTS.', 'He teaches AI.', "He's moving to L.A.!", ...]
# Four sentences were returned in that run; 'Dr.' did not trigger a sentence break.

['Dr.', 'Smith', 'works', 'at', 'UTS', '.', 'He', 'teaches', 'AI', '.', 'He', "'s", 'moving', 'to', 'L.A.', '!', 'For', 'his', 'whereabouts', ',', 'visit', 'his', 'homepage', 'at', 'http://www.example.com/.']
['Dr. Smith works at UTS.', 'He teaches AI.', "He's moving to L.A.!", 'For his whereabouts, visit his homepage at http://www.example.com/.']


## Using Hugging Face’s BERT Tokenizer

In [17]:
from transformers import AutoTokenizer

In [19]:
# Load the pre-trained tokenizer published under the "bert-base-german-cased" checkpoint name.
# Presumably a BERT WordPiece tokenizer: the '##'-prefixed pieces in the recorded output are consistent with that.
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

In [21]:
# Tokenize a single long German compound word to show how it is broken into subword pieces.
word = "Rechtsschutzversicherungsgesellschaften"
# tokenize() is given the raw string; the printed result below is a list of string pieces.
tokens = tokenizer.tokenize(word)

# Recorded output: ['Rechtsschutz', '##versicherungs', '##gesellschaften']
# — every piece after the first carries a '##' prefix.
print(tokens)

['Rechtsschutz', '##versicherungs', '##gesellschaften']
