## Getting Started with NLTK and Tokenization

In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ramtej\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
txt="hello geeks,we're hoping u r doinng well"

In [3]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [6]:
word_tokenize(txt)

['hello', 'geeks', ',', 'we', "'re", 'hoping', 'u', 'r', 'doinng', 'well']

In [7]:
sent_tokenize(txt)

["hello geeks,we're hoping u r doinng well"]

In [9]:
for word in word_tokenize(txt):
    if(word!='.'):
        print(word)

hello
geeks
,
we
're
hoping
u
r
doinng
well


In [10]:
for word in word_tokenize(txt):
    if(len(word)>=2):
        print(word)

hello
geeks
we
're
hoping
doinng
well


## Stemming and Lemmatization

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ramtej\AppData\Roaming\nltk_data...


True

In [12]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ramtej\AppData\Roaming\nltk_data...


True

In [13]:
from nltk.stem import WordNetLemmatizer,PorterStemmer

In [22]:
stm=PorterStemmer()
lem=WordNetLemmatizer()

In [23]:
print(stm.stem('change'))
print(stm.stem('changes'))
print(stm.stem('changed'))
print(stm.stem('changer'))

chang
chang
chang
changer


In [24]:
print(lem.lemmatize('change'))
print(lem.lemmatize('changes'))
print(lem.lemmatize('changed'))
print(lem.lemmatize('changer'))

change
change
changed
changer


## Stopwords using NLTK

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ramtej\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.corpus import stopwords

In [28]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [35]:
txt="this is not a good time to talk"

In [38]:
txt=word_tokenize(txt)
stopwords_list = stopwords.words('english')


for word in txt:
    if word.lower() not in stopwords_list:
        print(word)

good
time
talk


## Corpus and Vocabulary

### Vocabulary :- 
#### means number of unique words in a given corpus

#### Removing stopwords and taking only unique words

In [1]:
corpus="It seems like there's an issue with the stopwords variable or how it's being used in your code. The error message indicates that stopwords is of type WordListCorpusReader and it's not iterable."

In [4]:
stopwords_list = stopwords.words('english')

words=[]
for word in word_tokenize(corpus):
    if word.lower() not in stopwords_list and (len(word)>=2):
        words.append(word.lower())
        
vocab=list(set(words))

In [5]:
vocab

['error',
 'message',
 'indicates',
 'type',
 'like',
 'seems',
 'variable',
 'wordlistcorpusreader',
 'iterable',
 'stopwords',
 'used',
 "'s",
 'code',
 'issue']

## Vocabulary with Keras

In [60]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [62]:
tok=Tokenizer()

In [63]:
corp=['coffe is hot','water is cold']

In [64]:
tok.fit_on_texts(corp)

In [65]:
tok.word_index

{'is': 1, 'coffe': 2, 'hot': 3, 'water': 4, 'cold': 5}

In [66]:
tok.texts_to_sequences(corp)

[[2, 1, 3], [4, 1, 5]]

In [67]:
tok=Tokenizer(oov_token='black')
tok.fit_on_texts(corp)
tok.texts_to_sequences(['water is hot','black coffee if cold'])

[[5, 2, 4], [1, 1, 1, 6]]

In [70]:
tok=Tokenizer(num_words=4)
tok.fit_on_texts(corp)
tok.word_index

{'is': 1, 'coffe': 2, 'hot': 3, 'water': 4, 'cold': 5}

In [71]:
tok.texts_to_sequences(['water is hot','black coffee if cold'])

[[1, 3], []]