### Split

In [5]:
# Sample sentence to break apart into words.
sentence = "The brown fox is quick and he is jumping over the lazy dog"

# Called without a separator, split() cuts on whitespace and returns the
# words as a list.
words = str.split(sentence)

words

['The',
 'brown',
 'fox',
 'is',
 'quick',
 'and',
 'he',
 'is',
 'jumping',
 'over',
 'the',
 'lazy',
 'dog']

### Using List Comprehension

In [8]:
test_list = ['Geeks for Geeks', 'is', 'best computer science portal']

# Split every phrase into its own list of words (one inner list per phrase).
print([*map(str.split, test_list)])

# Copying the nested result element by element leaves it nested, so this
# prints exactly the same list of lists as the line above.
print(list(phrase.split() for phrase in test_list))

[['Geeks', 'for', 'Geeks'], ['is'], ['best', 'computer', 'science', 'portal']]
[['Geeks', 'for', 'Geeks'], ['is'], ['best', 'computer', 'science', 'portal']]


### Sentence Tokenize : Split Paragraph or Corpus into list of Sentences

In [10]:
# sent_tokenize splits a piece of text into a list of sentence strings.
from nltk.tokenize import sent_tokenize

# Two-sentence sample: the first ends in '!' and the second in '.'.
text = "God is Great! I won a lotter."

# Prints the two sentences as separate list items, each keeping its own
# closing punctuation mark.
print(sent_tokenize(text))

['God is Great!', 'I won a lotter.']


### Word Tokenize

In [11]:
# word_tokenize splits text into word-level tokens.
from nltk.tokenize import word_tokenize

# Same two-sentence sample as used for sentence tokenization.
text = "God is Great! I won a lotter."

# Punctuation comes out as tokens of its own ('!' and '.') instead of
# staying attached to the preceding word.
print(word_tokenize(text))

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lotter', '.']


### MWE Tokenizer : Multi-Word Expression
##### Here, certain groups of multiple words are treated as a single entity during tokenization, such as 'United States of America', 'People's Republic of China', 'not only', 'but also'

In [14]:
from nltk.tokenize import MWETokenizer, word_tokenize

# Register three multi-word expressions up front, then add a fourth one
# afterwards with add_mwe().
tokenizer = MWETokenizer(
    [
        ('a', 'little'),
        ('a', 'little', 'bit'),
        ('a', 'lot'),
    ]
)
tokenizer.add_mwe(('in', 'spite', 'of'))

# A second tokenizer that knows a single four-word expression.
tokeniser1 = MWETokenizer([('United', 'States', 'of', 'America')])

# The text is word-tokenized first; MWETokenizer then merges each registered
# expression into one token whose words are joined with '_'.
print(tokenizer.tokenize(word_tokenize('In a little or a little bit or a lot in spite of')))

print(tokeniser1.tokenize(word_tokenize("I Live in United States of America")))

['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
['I', 'Live', 'in', 'United_States_of_America']


###  RegexpTokenizer 
#### Extract the tokens from string by using regular expression 

In [16]:
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of word characters and apostrophes, so contractions such as
# "Let's" stay in one piece and the trailing full stop is dropped.
# The pattern is a raw string: in a plain literal "\w" is an invalid escape
# sequence that Python flags with a warning; the regex itself is unchanged.
tokenizer = RegexpTokenizer(r"[\w']+")

text = "Let's see how it's working."
tokenizer.tokenize(text)

["Let's", 'see', 'how', "it's", 'working']

In [7]:
# Import the RegexpTokenizer class from nltk
from nltk.tokenize import RegexpTokenizer

# With gaps=True the pattern describes the separators between tokens rather
# than the tokens themselves, so this splits the input on runs of whitespace.
# The pattern is a raw string: in a plain literal '\s' is an invalid escape
# sequence that Python flags with a warning; the regex itself is unchanged.
tk = RegexpTokenizer(r'\s+', gaps=True)

# Input string
gfg = "I love Python"

# Tokenize the input
geek = tk.tokenize(gfg)

print(geek)

['I', 'love', 'Python']


###  Whitespace Tokenizer:
#### This Tokenizer splits a string whenever a space, tab, or newline character is present

In [17]:
# WhitespaceTokenizer: splits a string on whitespace (spaces, tabs and
#                      newlines); the separators themselves are dropped.

from nltk.tokenize import WhitespaceTokenizer

# Sample mixing single spaces, a tab, a newline and a blank line ("\n\n").
sentence =  "The 2 QUICK Brown-Foxes\tjumped over\nthe lazy dog's\n\nbone."

# Punctuation stays attached to its word (e.g. "dog's", 'bone.'), and the
# two consecutive newlines do not produce an empty token.
WhitespaceTokenizer().tokenize(sentence)

['The',
 '2',
 'QUICK',
 'Brown-Foxes',
 'jumped',
 'over',
 'the',
 'lazy',
 "dog's",
 'bone.']

In [4]:
from nltk.tokenize import WhitespaceTokenizer

# Sample whose words are separated by a mix of spaces, a newline and a tab.
gfg = "GeeksforGeeks \nis\t for geeks"

# Build the tokenizer and split the sample wherever whitespace occurs.
tk = WhitespaceTokenizer()
geek = tk.tokenize(gfg)

print(geek)

['GeeksforGeeks', 'is', 'for', 'geeks']


### Tweet Tokenizer

##### This is specifically designed for tokenizing tweets. It is capable of dealing with emoticons and other expressions of sentiment.

In [18]:
from nltk.tokenize import TweetTokenizer

# Sample containing a hashtag, emoticons and arrow-like symbol sequences.
sentence = "This is a cooool #dummysmiley:-) :-P <3 and some arrows < > -> <--"

tknzr = TweetTokenizer()

# The hashtag, every emoticon and every arrow come back as single tokens.
tknzr.tokenize(sentence)

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

### WordPunct Tokenizer

##### This splits a text into a list of alphabetic tokens, digit tokens, and non-alphabetic (punctuation) tokens

In [20]:
# WordPunctTokenizer: splits text into tokens made of word characters and
# separate tokens made of punctuation characters.

from nltk.tokenize import WordPunctTokenizer

tokenize = WordPunctTokenizer()
tokenize.tokenize("Let's see how it's working. 12")

# Note how the punctuation is separated from the words: each apostrophe and
# the full stop become tokens of their own, while "12" stays in one piece.

['Let', "'", 's', 'see', 'how', 'it', "'", 's', 'working', '.', '12']

In [6]:
from nltk.tokenize import WordPunctTokenizer

# Sample with tabs/newlines between words and punctuation inside "Rs.36.".
gfg = "The price\t of burger \nin BurgerKing is Rs.36.\n"

# Build the tokenizer and split the sample; whitespace is discarded and
# punctuation is split off from the neighbouring letters and digits.
tk = WordPunctTokenizer()
geek = tk.tokenize(gfg)

print(geek)

['The', 'price', 'of', 'burger', 'in', 'BurgerKing', 'is', 'Rs', '.', '36', '.']


### LineTokenizer : Tokenize based on the '\n'

In [26]:
# LineTokenizer: produces one token per line of the input, cutting at '\n'.

from nltk.tokenize import LineTokenizer

# NOTE: each trailing backslash continues the literal onto the next source
# line, so that line's leading indentation becomes part of the string.
text = "My name is maximum Decimus Meridius. Commander of the Armier of the North, General of the Felix Legions \
        and loyal servant to the true emperor, Mercus Aurelius. \nFather to a murdered son. husband to a murdered \
        wife. \nAnd I will have my vengeance, in this"

# Tokenize once, then show the pieces followed by how many there are.
text_lines = LineTokenizer().tokenize(text)
print(text_lines)
print(len(text_lines))


['My name is maximum Decimus Meridius. Commander of the Armier of the North, General of the Felix Legions         and loyal servant to the true emperor, Mercus Aurelius. ', 'Father to a murdered son. husband to a murdered         wife. ', 'And I will have my vengeance, in this']
3


### Space Tokenizer : Tokenize based on the 'space'

In [27]:
# SpaceTokenizer: splits a string on the single space character only.

from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import LineTokenizer  # not used in this cell

# NOTE: each trailing backslash continues the literal onto the next source
# line, so that line's leading indentation becomes part of the string.
text = "My name is maximum Decimus Meridius. Commander of the Armier of the North, General of the Felix Legions \
        and loyal servant to the true emperor, Mercus Aurelius. \nFather to a murdered son. husband to a murdered \
        wife. \nAnd I will have my vengeance, in this"

# Every extra space in a run yields an empty-string token, and '\n' is not a
# separator here, so it stays attached to the following word ('\nFather').
print(SpaceTokenizer().tokenize(text))


['My', 'name', 'is', 'maximum', 'Decimus', 'Meridius.', 'Commander', 'of', 'the', 'Armier', 'of', 'the', 'North,', 'General', 'of', 'the', 'Felix', 'Legions', '', '', '', '', '', '', '', '', 'and', 'loyal', 'servant', 'to', 'the', 'true', 'emperor,', 'Mercus', 'Aurelius.', '\nFather', 'to', 'a', 'murdered', 'son.', 'husband', 'to', 'a', 'murdered', '', '', '', '', '', '', '', '', 'wife.', '\nAnd', 'I', 'will', 'have', 'my', 'vengeance,', 'in', 'this']


In [2]:
from nltk.tokenize import SpaceTokenizer

# Sample that contains spaces as well as a newline and a tab.
gfg = "Geeksfor Geeks.. .$$&* \nis\t for geeks"

# Build the tokenizer and split the sample on spaces; the newline and tab
# are not separators, so they remain inside the '\nis\t' token.
tk = SpaceTokenizer()
geek = tk.tokenize(gfg)

print(geek)

['Geeksfor', 'Geeks..', '.$$&*', '\nis\t', 'for', 'geeks']


### SExprTokenizer

In [3]:
# SExprTokenizer: uses matching parentheses to decide where tokens begin
# and end.

from nltk.tokenize import SExprTokenizer

# Sample with a nested group, some bare text, and a second group.
gfg = "( a * ( b + c ))ab( a-c )"

# Build the tokenizer and split the sample; each balanced parenthesised
# group is kept whole (nesting included) and the text between groups
# becomes its own token.
tk = SExprTokenizer()
geek = tk.tokenize(gfg)

print(geek)

['( a * ( b + c ))', 'ab', '( a-c )']


### TabTokenizer : 
##### Extract the tokens from string of words on the basis of tabs between them

In [29]:
from nltk.tokenize import TabTokenizer

# Sample that contains tabs as well as spaces and a newline.
gfg = "Geeksfor\tGeeks..\t.$$&* \nis\t for geeks"

# Build the tokenizer and split the sample on tab characters only; spaces
# and the newline are kept inside the resulting tokens.
tk = TabTokenizer()
geek = tk.tokenize(gfg)

print(geek)

['Geeksfor', 'Geeks..', '.$$&* \nis', ' for geeks']
