# NLTK Library

## Sentence Tokenizer

In [12]:
import nltk
nltk.download('punkt')  # Download the necessary resources

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
text = "Hello! How are you doing? I hope everything is going well. My name is Parth Jaju. What is your name? Wow, that's great!"
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)

Hello!
How are you doing?
I hope everything is going well.
My name is Parth Jaju.
What is your name?
Wow, that's great!


## Word Tokenizer

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/flyandlure/datasets/master/titles_and_descriptions.csv')
df.head()

Unnamed: 0,url,title,description
0,https://practicaldatascience.co.uk/data-scienc...,How to create a Python virtual environment for...,Learn how to create a virtual environment for ...
1,https://practicaldatascience.co.uk/data-scienc...,How to engineer date features using Pandas,In time series datasets dates often hold the k...
2,https://practicaldatascience.co.uk/machine-lea...,How to impute missing numeric values in your d...,Cleverly filling in the gaps when numeric data...
3,https://practicaldatascience.co.uk/machine-lea...,How to interpret the confusion matrix,The confusion matrix can tell you more about y...
4,https://practicaldatascience.co.uk/machine-lea...,How to use mean encoding in your machine learn...,Learn how to use the mean encoding technique t...


In [None]:
df['text'] = df['title'] + " " + df['description']
df.head()

Unnamed: 0,url,title,description,text
0,https://practicaldatascience.co.uk/data-scienc...,How to create a Python virtual environment for...,Learn how to create a virtual environment for ...,How to create a Python virtual environment for...
1,https://practicaldatascience.co.uk/data-scienc...,How to engineer date features using Pandas,In time series datasets dates often hold the k...,How to engineer date features using Pandas In ...
2,https://practicaldatascience.co.uk/machine-lea...,How to impute missing numeric values in your d...,Cleverly filling in the gaps when numeric data...,How to impute missing numeric values in your d...
3,https://practicaldatascience.co.uk/machine-lea...,How to interpret the confusion matrix,The confusion matrix can tell you more about y...,How to interpret the confusion matrix The conf...
4,https://practicaldatascience.co.uk/machine-lea...,How to use mean encoding in your machine learn...,Learn how to use the mean encoding technique t...,How to use mean encoding in your machine learn...


In [None]:
df['text'].dropna(inplace=True)
df['text'] = df['text'].astype(str)
df.head()

Unnamed: 0,url,title,description,text
0,https://practicaldatascience.co.uk/data-scienc...,How to create a Python virtual environment for...,Learn how to create a virtual environment for ...,How to create a Python virtual environment for...
1,https://practicaldatascience.co.uk/data-scienc...,How to engineer date features using Pandas,In time series datasets dates often hold the k...,How to engineer date features using Pandas In ...
2,https://practicaldatascience.co.uk/machine-lea...,How to impute missing numeric values in your d...,Cleverly filling in the gaps when numeric data...,How to impute missing numeric values in your d...
3,https://practicaldatascience.co.uk/machine-lea...,How to interpret the confusion matrix,The confusion matrix can tell you more about y...,How to interpret the confusion matrix The conf...
4,https://practicaldatascience.co.uk/machine-lea...,How to use mean encoding in your machine learn...,Learn how to use the mean encoding technique t...,How to use mean encoding in your machine learn...


In [None]:
df['tokenized_sents'] = df.apply(lambda row: word_tokenize(row['text']), axis=1)
df['tokenized_sents']

0      [How, to, create, a, Python, virtual, environm...
1      [How, to, engineer, date, features, using, Pan...
2      [How, to, impute, missing, numeric, values, in...
3      [How, to, interpret, the, confusion, matrix, T...
4      [How, to, use, mean, encoding, in, your, machi...
                             ...                        
217    [Practical, Data, Science, Practical, Data, Sc...
218    [Practical, Data, Science, Practical, Data, Sc...
219    [Practical, Data, Science, Practical, Data, Sc...
220    [Practical, Data, Science, Practical, Data, Sc...
221                                                [nan]
Name: tokenized_sents, Length: 222, dtype: object

In [None]:
df.head()

Unnamed: 0,url,title,description,text,tokenized_sents
0,https://practicaldatascience.co.uk/data-scienc...,How to create a Python virtual environment for...,Learn how to create a virtual environment for ...,How to create a Python virtual environment for...,"[How, to, create, a, Python, virtual, environm..."
1,https://practicaldatascience.co.uk/data-scienc...,How to engineer date features using Pandas,In time series datasets dates often hold the k...,How to engineer date features using Pandas In ...,"[How, to, engineer, date, features, using, Pan..."
2,https://practicaldatascience.co.uk/machine-lea...,How to impute missing numeric values in your d...,Cleverly filling in the gaps when numeric data...,How to impute missing numeric values in your d...,"[How, to, impute, missing, numeric, values, in..."
3,https://practicaldatascience.co.uk/machine-lea...,How to interpret the confusion matrix,The confusion matrix can tell you more about y...,How to interpret the confusion matrix The conf...,"[How, to, interpret, the, confusion, matrix, T..."
4,https://practicaldatascience.co.uk/machine-lea...,How to use mean encoding in your machine learn...,Learn how to use the mean encoding technique t...,How to use mean encoding in your machine learn...,"[How, to, use, mean, encoding, in, your, machi..."


## Whitespace Tokenizer

In [None]:
str = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi id erat eleifend, vehicula massa malesuada, rutrum justo. Mauris id purus erat. Ut et gravida urna. Nam eu metus dignissim, vehicula augue et, euismod urna. Sed convallis varius convallis. Curabitur blandit enim sit amet tempus tempor. Suspendisse ullamcorper eget dui quis euismod. Aenean id turpis diam. Phasellus lacus magna, mollis non mattis vel, pellentesque in erat. Suspendisse neque elit, posuere vitae ligula gravida, tincidunt vestibulum enim. Mauris sit amet condimentum risus, eget ullamcorper lectus.

Vivamus vel magna tempor metus sagittis vehicula. Duis ultrices dapibus neque ut eleifend. Nulla nibh dolor, vehicula suscipit egestas et, finibus vel justo. Donec sed molestie tellus, a condimentum lectus. Ut efficitur quis urna quis mattis. Nam at orci diam. Proin sit amet nisi sed lectus fringilla interdum. Mauris facilisis libero vel velit accumsan, non condimentum ex malesuada. Phasellus eros felis, consectetur vitae dolor sit amet, gravida iaculis eros. Maecenas varius mi augue, nec ornare turpis pulvinar vitae. Cras in magna hendrerit, dapibus sapien dictum, bibendum felis. Aenean maximus nec ligula sit amet congue. Integer purus urna, porttitor ut lorem vitae, imperdiet ornare massa. Duis pellentesque vulputate nibh in congue.

Ut pharetra imperdiet ultricies. Suspendisse purus leo, sollicitudin hendrerit nulla in, ullamcorper finibus mauris. Aliquam tincidunt eros eget enim lobortis gravida. Nullam egestas id ante sed ullamcorper. Aliquam ut gravida nibh. Fusce at tellus suscipit, ultricies elit in, tincidunt ligula. Aenean semper ligula risus, vitae sodales tortor sollicitudin eget. Ut suscipit vel tortor sed ullamcorper. Quisque viverra ex mi, pretium dapibus lorem dignissim ut. Phasellus eget pharetra lectus. Ut ornare erat commodo purus volutpat auctor. Vestibulum rutrum, turpis ac efficitur consequat, metus nunc ornare nulla, ac commodo mauris lectus pretium velit. Nam sagittis nulla nec sem maximus malesuada in sed tortor. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Pellentesque rhoncus arcu lorem, id iaculis lectus varius sit amet. Donec ante ex, consectetur vel eleifend sit amet, sollicitudin vitae dui.

Nam in libero dignissim, iaculis sem ut, posuere nulla. Phasellus sit amet vestibulum nulla, sed egestas arcu. Nunc dignissim ligula vel molestie consectetur. Sed non risus varius justo auctor sollicitudin. Phasellus in orci erat. Praesent feugiat vestibulum lorem, quis gravida mi posuere at. Integer sagittis mattis nibh. Nunc molestie tristique laoreet. Donec eget massa semper risus aliquet ornare. Vivamus id aliquet elit. Nam ornare velit eget venenatis laoreet. Quisque tincidunt libero magna, ac sodales neque interdum in. In malesuada risus vitae laoreet posuere. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam eu metus leo.

Fusce porta elit risus, cursus blandit enim dictum nec. Aliquam eu quam in purus mattis tempor in non quam. Maecenas dapibus nulla vel aliquam semper. Donec id molestie sem. Aenean a lacus vel diam venenatis condimentum ut nec quam. Sed ut metus ac ipsum facilisis venenatis. Sed ut lectus at est hendrerit aliquet. Vestibulum ac leo aliquam, interdum elit quis, porttitor justo. Suspendisse luctus eros non lacinia lacinia. Donec et ligula ut sapien vulputate placerat. Interdum et malesuada fames ac ante ipsum primis in faucibus. Quisque ac nunc eu nisi elementum rutrum. Nulla quis dapibus nibh, sed euismod orci."""

In [None]:
from nltk.tokenize import WhitespaceTokenizer
tk = WhitespaceTokenizer()

# Create a string input


# Use tokenize method
whitespace_tokens = tk.tokenize(str)

print(whitespace_tokens)

['Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.', 'Morbi', 'id', 'erat', 'eleifend,', 'vehicula', 'massa', 'malesuada,', 'rutrum', 'justo.', 'Mauris', 'id', 'purus', 'erat.', 'Ut', 'et', 'gravida', 'urna.', 'Nam', 'eu', 'metus', 'dignissim,', 'vehicula', 'augue', 'et,', 'euismod', 'urna.', 'Sed', 'convallis', 'varius', 'convallis.', 'Curabitur', 'blandit', 'enim', 'sit', 'amet', 'tempus', 'tempor.', 'Suspendisse', 'ullamcorper', 'eget', 'dui', 'quis', 'euismod.', 'Aenean', 'id', 'turpis', 'diam.', 'Phasellus', 'lacus', 'magna,', 'mollis', 'non', 'mattis', 'vel,', 'pellentesque', 'in', 'erat.', 'Suspendisse', 'neque', 'elit,', 'posuere', 'vitae', 'ligula', 'gravida,', 'tincidunt', 'vestibulum', 'enim.', 'Mauris', 'sit', 'amet', 'condimentum', 'risus,', 'eget', 'ullamcorper', 'lectus.', 'Vivamus', 'vel', 'magna', 'tempor', 'metus', 'sagittis', 'vehicula.', 'Duis', 'ultrices', 'dapibus', 'neque', 'ut', 'eleifend.', 'Nulla', 'nibh', 'dolor,', 'vehicula', 'su

## Word_punct Tokenizer

In [None]:
from nltk.tokenize import WordPunctTokenizer

# Create a reference variable for Class WordPunctTokenizer
wptk = WordPunctTokenizer()

# Create a string input
s = '''Good muffins cost ＄3.88\nin New York.  Please buy me
... two of them.\n\nThanks.'''

# Use tokenize method
punct_tks = wptk.tokenize(s)

print(punct_tks)

['Good', 'muffins', 'cost', '＄', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', '...', 'two', 'of', 'them', '.', 'Thanks', '.']


## TreeBankWordTokenizer

In [None]:
from nltk.tokenize import TreebankWordTokenizer
s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks. They'll save and invest more. hi, my name can't hello,'''
tbw_tks = TreebankWordTokenizer().tokenize(s)
print(tbw_tks)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.', 'They', "'ll", 'save', 'and', 'invest', 'more.', 'hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']


## TweetTokenizer

In [None]:
from nltk.tokenize import TweetTokenizer
tweet_tks = TweetTokenizer().tokenize(s)
print(tweet_tks)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', "They'll", 'save', 'and', 'invest', 'more', '.', 'hi', ',', 'my', 'name', "can't", 'hello', ',']


## MWETokenizer

In [None]:
from nltk.tokenize import MWETokenizer

mwe = MWETokenizer([('N', 'Y'),('New', 'York')])
mwe.add_mwe(('save', 'and', 'interest'))

s = "I live in New York. I use save and interest strategy."

mwe_tks = mwe.tokenize(s.split())

print(mwe_tks)

['I', 'live', 'in', 'New', 'York.', 'I', 'use', 'save_and_interest', 'strategy.']


# Split Function

In [None]:
st = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks. They'll save and invest more. hi, my name can't hello,'''
st.split()

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them.',
 'Thanks.',
 "They'll",
 'save',
 'and',
 'invest',
 'more.',
 'hi,',
 'my',
 'name',
 "can't",
 'hello,']

# TextBlob Library

In [None]:
from textblob import TextBlob
st = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks. They'll save and invest more. hi, my name can't hello,'''
blob_object = TextBlob(st)
text_words = blob_object.words
print(text_words)
print(len(text_words))

['Good', 'muffins', 'cost', '3.88', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks', 'They', "'ll", 'save', 'and', 'invest', 'more', 'hi', 'my', 'name', 'ca', "n't", 'hello']
26


# SpaCy Library

In [None]:
import spacy
stri = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks. They'll save and invest more. hi, my name can't hello,'''
nlp = spacy.load("en_core_web_sm")
doc = nlp(stri)
for token in doc:
  print(token, token.idx)

Good 0
muffins 5
cost 13
$ 18
3.88 19

 23
in 24
New 27
York 31
. 35
  37
Please 38
buy 45
me 49

 51
two 52
of 56
them 59
. 63

 64
Thanks 65
. 71
They 73
'll 77
save 81
and 86
invest 90
more 97
. 101
hi 103
, 105
my 107
name 110
ca 115
n't 117
hello 121
, 126


# Gensim Library

In [None]:
from gensim.utils import tokenize

text = '''Associative arrays are particularly useful when you want to store and retrieve data based on descriptive identifiers (keys) rather than numerical indices. They are commonly used for tasks like storing configuration settings, representing database records, managing user profiles, and more.'''
list(tokenize(text))

['Associative',
 'arrays',
 'are',
 'particularly',
 'useful',
 'when',
 'you',
 'want',
 'to',
 'store',
 'and',
 'retrieve',
 'data',
 'based',
 'on',
 'descriptive',
 'identifiers',
 'keys',
 'rather',
 'than',
 'numerical',
 'indices',
 'They',
 'are',
 'commonly',
 'used',
 'for',
 'tasks',
 'like',
 'storing',
 'configuration',
 'settings',
 'representing',
 'database',
 'records',
 'managing',
 'user',
 'profiles',
 'and',
 'more']

# Keras Library

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

ntoken = Tokenizer(num_words = 20)
text = '''Associative arrays are particularly useful when you want to store and retrieve data based on descriptive identifiers (keys) rather than numerical indices. They are commonly used for tasks like storing configuration settings, representing database records, managing user profiles, and more.'''
ntoken.fit_on_texts(text)
list_words = text_to_word_sequence(text)
print(list_words)


['associative', 'arrays', 'are', 'particularly', 'useful', 'when', 'you', 'want', 'to', 'store', 'and', 'retrieve', 'data', 'based', 'on', 'descriptive', 'identifiers', 'keys', 'rather', 'than', 'numerical', 'indices', 'they', 'are', 'commonly', 'used', 'for', 'tasks', 'like', 'storing', 'configuration', 'settings', 'representing', 'database', 'records', 'managing', 'user', 'profiles', 'and', 'more']
