In [62]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.




In [63]:
import nltk

## Tokenization

In [64]:
txt = "I am a student. I am a teacher. I am a doctor."
txt

'I am a student. I am a teacher. I am a doctor.'

In [65]:
txt.split('.')

['I am a student', ' I am a teacher', ' I am a doctor', '']

In [66]:
txt.split(' ')

['I',
 'am',
 'a',
 'student.',
 'I',
 'am',
 'a',
 'teacher.',
 'I',
 'am',
 'a',
 'doctor.']

In [67]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
word_tokenize(txt)

['I',
 'am',
 'a',
 'student',
 '.',
 'I',
 'am',
 'a',
 'teacher',
 '.',
 'I',
 'am',
 'a',
 'doctor',
 '.']

In [69]:
sent_tokenize(txt)

['I am a student.', 'I am a teacher.', 'I am a doctor.']

In [70]:
for word in word_tokenize(txt):
    if (word == '.'):
        print('End of sentence')
        continue
    print(word)
    

I
am
a
student
End of sentence
I
am
a
teacher
End of sentence
I
am
a
doctor
End of sentence


## Stemming And Lemmatization

In [71]:
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer, PorterStemmer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [72]:
stem = PorterStemmer()
lem = WordNetLemmatizer()

## Lemmatization

In [73]:
print(lem.lemmatize('running', pos='v'))
print(lem.lemmatize('running', pos='n'))
print(lem.lemmatize('runs'))

run
running
run


## Stemming

In [74]:
print(stem.stem('running'))
print(stem.stem('running'))
print(stem.stem('runs'))

run
run
run


## StopWords

In [75]:
txt = "This is not a good time to talk."
txt

'This is not a good time to talk.'

In [76]:
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [78]:
txt = word_tokenize(txt)
txt

['This', 'is', 'not', 'a', 'good', 'time', 'to', 'talk', '.']

In [79]:
for word in txt:
    if word in stopwords.words('english'):
        txt.remove(word)
txt

['This', 'not', 'good', 'time', 'talk', '.']

## Corpus and Vocabulary

In [80]:
corpus = "India, officially the Rebulic of India. is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; it's Andaman and NIcobar Islands share a maritime border with Thailand and Indonesia."

In [81]:
corpus

"India, officially the Rebulic of India. is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; it's Andaman and NIcobar Islands share a maritime border with Thailand and Indonesia."

## Stop Word Removal

In [82]:
words = []

for word in word_tokenize(corpus):
    if (word.lower() not in stopwords.words('english')) and (len(word) >= 2):
        words.append(word)

In [83]:
words

['India',
 'officially',
 'Rebulic',
 'India',
 'country',
 'South',
 'Asia',
 'seventh-largest',
 'country',
 'area',
 'second-most',
 'populous',
 'country',
 'populous',
 'democracy',
 'world',
 'Bounded',
 'Indian',
 'Ocean',
 'south',
 'Arabian',
 'Sea',
 'southwest',
 'Bay',
 'Bengal',
 'southeast',
 'shares',
 'land',
 'borders',
 'Pakistan',
 'west',
 'China',
 'Nepal',
 'Bhutan',
 'north',
 'Bangladesh',
 'Myanmar',
 'east',
 'Indian',
 'Ocean',
 'India',
 'vicinity',
 'Sri',
 'Lanka',
 'Maldives',
 "'s",
 'Andaman',
 'NIcobar',
 'Islands',
 'share',
 'maritime',
 'border',
 'Thailand',
 'Indonesia']

In [84]:
vocab = len(set(words))

In [85]:
from nltk.tokenize import sent_tokenize

In [86]:
for sent in sent_tokenize(corpus):
    print(sent)

India, officially the Rebulic of India.
is a country in South Asia.
It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world.
Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.
In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; it's Andaman and NIcobar Islands share a maritime border with Thailand and Indonesia.


## Vocab with Keras

In [87]:
import tensorflow as tf

In [88]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [89]:
tok = Tokenizer()

In [90]:
corp = ['coffee is hot', 'water is cold']

tok.fit_on_texts(corp)

In [91]:
tok.word_index

{'is': 1, 'coffee': 2, 'hot': 3, 'water': 4, 'cold': 5}

In [92]:
tok.texts_to_sequences(corp)

[[2, 1, 3], [4, 1, 5]]

### Learn about adding OVV and Limiting the number of words

In [101]:
tok = Tokenizer(num_words=4, oov_token='black', filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
corp = ['water is hot', 'black coffee is cold!']

tok.fit_on_texts(corp)
tok.word_index

{'black': 5, 'is': 2, 'water': 3, 'hot': 4, 'coffee': 6, 'cold': 7}

In [102]:
tok.texts_to_sequences(corp)

[[3, 2, 5], [5, 5, 2, 5]]