# Tokenization Types

In [1]:
import nltk

In [2]:
# Sample text shared by the tokenizer examples in this notebook.
# The literal starts with a space and ends with a blank line, so the string
# carries leading and trailing whitespace that the tokenizers have to handle.
corpus = """ Before getting started with the examples, we will set the system up with NLTK and other dependent Python libraries.
The pip installer can be used to install NLTK, with an optional installation of numpy, as follows:

"""

In [3]:
# Echo the raw text so its exact spacing and line breaks are visible.
print(corpus)

 Before getting started with the examples, we will set the system up with NLTK and other dependent Python libraries.
The pip installer can be used to install NLTK, with an optional installation of numpy, as follows:




## Keyword Tokenizer
- The Natural Language Toolkit (NLTK) provides a convenient way to tokenize text into keywords or tokens.
- If you have specific keywords in mind that you want to tokenize around, you might need a custom approach using NLTK’s RegexpTokenizer.

In [6]:
from nltk.tokenize import RegexpTokenizer

# RegexpTokenizer keeps whatever its pattern matches: r'\w+' selects each
# maximal run of word characters, so whitespace and punctuation never
# appear in the result.
word_pattern = r'\w+'
keyword_tokenizer = RegexpTokenizer(word_pattern)

# Apply the tokenizer to the sample text and show the resulting tokens
keywords = keyword_tokenizer.tokenize(corpus)
print(keywords)


['Before', 'getting', 'started', 'with', 'the', 'examples', 'we', 'will', 'set', 'the', 'system', 'up', 'with', 'NLTK', 'and', 'other', 'dependent', 'Python', 'libraries', 'The', 'pip', 'installer', 'can', 'be', 'used', 'to', 'install', 'NLTK', 'with', 'an', 'optional', 'installation', 'of', 'numpy', 'as', 'follows']


## Character Tokenizer

- Character tokenization refers to breaking down a string into individual characters, which can be useful in various natural language processing tasks, especially when working with languages where character-level understanding is important, or for tasks like text generation or analyzing text at a granular level.

In [7]:
from nltk.tokenize import SyllableTokenizer

# Character-level tokenization needs no NLTK tokenizer: iterating a Python
# string yields its characters one at a time, so list() is enough. Spaces,
# punctuation and newlines are kept as tokens of their own.
#
# The previous version ran SyllableTokenizer over the whole corpus and then
# flattened the syllables back into characters. That produced the same list
# by a detour, and SyllableTokenizer may emit warnings for characters that
# are outside its sonority hierarchy (for example spaces).
char_tokens = list(corpus)

# Display the character tokens
print(char_tokens)


[' ', 'B', 'e', 'f', 'o', 'r', 'e', ' ', 'g', 'e', 't', 't', 'i', 'n', 'g', ' ', 's', 't', 'a', 'r', 't', 'e', 'd', ' ', 'w', 'i', 't', 'h', ' ', 't', 'h', 'e', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 's', ',', ' ', 'w', 'e', ' ', 'w', 'i', 'l', 'l', ' ', 's', 'e', 't', ' ', 't', 'h', 'e', ' ', 's', 'y', 's', 't', 'e', 'm', ' ', 'u', 'p', ' ', 'w', 'i', 't', 'h', ' ', 'N', 'L', 'T', 'K', ' ', 'a', 'n', 'd', ' ', 'o', 't', 'h', 'e', 'r', ' ', 'd', 'e', 'p', 'e', 'n', 'd', 'e', 'n', 't', ' ', 'P', 'y', 't', 'h', 'o', 'n', ' ', 'l', 'i', 'b', 'r', 'a', 'r', 'i', 'e', 's', '.', '\n', 'T', 'h', 'e', ' ', 'p', 'i', 'p', ' ', 'i', 'n', 's', 't', 'a', 'l', 'l', 'e', 'r', ' ', 'c', 'a', 'n', ' ', 'b', 'e', ' ', 'u', 's', 'e', 'd', ' ', 't', 'o', ' ', 'i', 'n', 's', 't', 'a', 'l', 'l', ' ', 'N', 'L', 'T', 'K', ',', ' ', 'w', 'i', 't', 'h', ' ', 'a', 'n', ' ', 'o', 'p', 't', 'i', 'o', 'n', 'a', 'l', ' ', 'i', 'n', 's', 't', 'a', 'l', 'l', 'a', 't', 'i', 'o', 'n', ' ', 'o', 'f', ' ', 'n', 'u', 'm'

'


## Classic tokenizer
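- A word tokenizer that splits text into tokens (words and punctuation marks) using rules for spaces, punctuation, and other basic delimiters. It expects one sentence at a time, which is why, in the output below, the period inside the text stays attached to its word (`'libraries.'`).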

In [8]:
import nltk
from nltk.tokenize import TreebankWordTokenizer

# Initialize the classic (Penn Treebank style) word tokenizer
tokenizer = TreebankWordTokenizer()

# Tokenize the whole sample text in one call.
# In the recorded output, commas and the final colon become separate tokens,
# while the period in the middle of the text stays attached ('libraries.').
tokens = tokenizer.tokenize(corpus)

# Display the tokens
print(tokens)


['Before', 'getting', 'started', 'with', 'the', 'examples', ',', 'we', 'will', 'set', 'the', 'system', 'up', 'with', 'NLTK', 'and', 'other', 'dependent', 'Python', 'libraries.', 'The', 'pip', 'installer', 'can', 'be', 'used', 'to', 'install', 'NLTK', ',', 'with', 'an', 'optional', 'installation', 'of', 'numpy', ',', 'as', 'follows', ':']


## Lowercase Tokenizer
- A lowercase tokenizer is a variant of a tokenizer that not only splits the text into tokens (words, characters, etc.) but also converts all tokens to lowercase. 
- This can be useful in natural language processing (NLP) tasks where you want to treat words in a case-insensitive manner.

In [10]:
import nltk
from nltk.tokenize import word_tokenize

# Step 1: split the sample text into word and punctuation tokens.
# NOTE(review): word_tokenize presumably needs NLTK's Punkt data to be
# installed already (nltk.download) - confirm on a fresh environment.
raw_tokens = word_tokenize(corpus)

# Step 2: fold every token to lowercase so comparisons are case-insensitive
tokens = [word.lower() for word in raw_tokens]

# Show the lowercased tokens
print(tokens)

['before', 'getting', 'started', 'with', 'the', 'examples', ',', 'we', 'will', 'set', 'the', 'system', 'up', 'with', 'nltk', 'and', 'other', 'dependent', 'python', 'libraries', '.', 'the', 'pip', 'installer', 'can', 'be', 'used', 'to', 'install', 'nltk', ',', 'with', 'an', 'optional', 'installation', 'of', 'numpy', ',', 'as', 'follows', ':']


## Whitespace Tokenizer
- Splits text based on whitespace (spaces, tabs, newlines).

In [14]:
from nltk.tokenize import WhitespaceTokenizer
# Example input with leading spaces and a run of several spaces in the middle
text = "    Hello World, Welcome to new    chapter in NLP"
whitespace_tokenizer = WhitespaceTokenizer()
# In the recorded output the extra spaces produce no empty tokens, and
# punctuation stays attached to its word ('World,').
whitespace_tokens = whitespace_tokenizer.tokenize(text)
print("Whitespace Tokenizer:", whitespace_tokens)

Whitespace Tokenizer: ['Hello', 'World,', 'Welcome', 'to', 'new', 'chapter', 'in', 'NLP']


## PunktSentenceTokenizer
- A sentence tokenizer that doesn't rely on a pre-defined list of abbreviations.

In [20]:
from nltk.tokenize import PunktSentenceTokenizer

# Example paragraph; note that this rebinds `text` from the whitespace example
text = " Hello! All, This is Vamsidhar Reddy. I am working as a data scientist. I have a good experience in Data Science" 

# Built with no training text, so the tokenizer uses its default parameters
punkt_tokenizer = PunktSentenceTokenizer()
# In the recorded output the text is split into four sentences, and the
# leading space is kept on the first one (' Hello!').
punkt_tokens = punkt_tokenizer.tokenize(text)
print("Punkt Sentence Tokenizer:", punkt_tokens)

Punkt Sentence Tokenizer: [' Hello!', 'All, This is Vamsidhar Reddy.', 'I am working as a data scientist.', 'I have a good experience in Data Science']
