### Tokenization 
Tokenization is a fundamental process in Natural 
Language Processing (NLP) that involves breaking 
down a stream of text into smaller units called 
tokens.

- Corpus --> the whole body of text (here, a paragraph)
- Documents --> the sentences that make up the corpus
- Vocabulary --> the unique words in the corpus
- Words --> all the words present in the corpus

In [1]:
# Sample corpus: one short paragraph made of two sentences. The apostrophe in
# "develop's" is kept exactly as written. The text ends with a newline, so
# print() emits a blank line after it.
corpus = (
    "A short paragraph develop's a single, focused idea through a topic "
    "sentence, supporting sentences, and a concluding sentence. "
    "It typically ranges from one to five sentences, making it concise "
    "and easy for the reader to understand.\n"
)
print(corpus)

A short paragraph develop's a single, focused idea through a topic sentence, supporting sentences, and a concluding sentence. It typically ranges from one to five sentences, making it concise and easy for the reader to understand.



In [2]:
## Tokenization: paragraph --> sentences
# sent_tokenize splits the corpus string into a list of sentence strings,
# stored here as `documents`.
# NOTE(review): sent_tokenize presumably needs NLTK's Punkt sentence data to be
# installed locally — confirm on a fresh environment.
# NOTE(review): the bare `nltk` name is not referenced in the visible cells.
import nltk
from nltk.tokenize import sent_tokenize

documents = sent_tokenize(corpus)
print(documents)

["A short paragraph develop's a single, focused idea through a topic sentence, supporting sentences, and a concluding sentence.", 'It typically ranges from one to five sentences, making it concise and easy for the reader to understand.']


In [3]:
### Paragraph --> words
# word_tokenize splits the whole corpus into word and punctuation tokens.
# In the recorded output "develop's" becomes 'develop' and "'s", and commas
# and full stops are separate tokens.

from nltk.tokenize import word_tokenize
words = word_tokenize(corpus)
print(words)

['A', 'short', 'paragraph', 'develop', "'s", 'a', 'single', ',', 'focused', 'idea', 'through', 'a', 'topic', 'sentence', ',', 'supporting', 'sentences', ',', 'and', 'a', 'concluding', 'sentence', '.', 'It', 'typically', 'ranges', 'from', 'one', 'to', 'five', 'sentences', ',', 'making', 'it', 'concise', 'and', 'easy', 'for', 'the', 'reader', 'to', 'understand', '.']


In [4]:
### Sentences --> words
# Tokenize each sentence on its own and print one token list per sentence.
# map() is lazy, so each sentence is tokenized right before it is printed.

for tokens in map(word_tokenize, documents):
    print(tokens)

['A', 'short', 'paragraph', 'develop', "'s", 'a', 'single', ',', 'focused', 'idea', 'through', 'a', 'topic', 'sentence', ',', 'supporting', 'sentences', ',', 'and', 'a', 'concluding', 'sentence', '.']
['It', 'typically', 'ranges', 'from', 'one', 'to', 'five', 'sentences', ',', 'making', 'it', 'concise', 'and', 'easy', 'for', 'the', 'reader', 'to', 'understand', '.']


In [5]:
### Punctuation as separate tokens: wordpunct_tokenize
# In the recorded output the apostrophe is split off as its own token
# ('develop', "'", 's'), whereas word_tokenize produced 'develop', "'s".
# NOTE(review): this rebinds `words`, replacing the word_tokenize result
# assigned in the earlier cell.

from nltk.tokenize import wordpunct_tokenize
words = wordpunct_tokenize(corpus)
print(words)

['A', 'short', 'paragraph', 'develop', "'", 's', 'a', 'single', ',', 'focused', 'idea', 'through', 'a', 'topic', 'sentence', ',', 'supporting', 'sentences', ',', 'and', 'a', 'concluding', 'sentence', '.', 'It', 'typically', 'ranges', 'from', 'one', 'to', 'five', 'sentences', ',', 'making', 'it', 'concise', 'and', 'easy', 'for', 'the', 'reader', 'to', 'understand', '.']


In [6]:
## Treebank tokenization: a full stop inside the text stays attached to its word
### Uses TreebankWordTokenizer (the tokenizer, not TreebankWordDetokenizer).
# In the recorded output the full stop that ends the first sentence stays
# attached ('sentence.'), while the full stop at the very end of the string
# is still emitted as a separate token ('.').

from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['A',
 'short',
 'paragraph',
 'develop',
 "'s",
 'a',
 'single',
 ',',
 'focused',
 'idea',
 'through',
 'a',
 'topic',
 'sentence',
 ',',
 'supporting',
 'sentences',
 ',',
 'and',
 'a',
 'concluding',
 'sentence.',
 'It',
 'typically',
 'ranges',
 'from',
 'one',
 'to',
 'five',
 'sentences',
 ',',
 'making',
 'it',
 'concise',
 'and',
 'easy',
 'for',
 'the',
 'reader',
 'to',
 'understand',
 '.']