#### Installing the NLTK Library by 
``` !pip install nltk ```

In [1]:
import nltk

# download both punkt and punkt_tab (needed in latest versions)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
# Download the required POS tagger model if not already present
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package ma

True

In [2]:
corpus = """ Hello Welcome, to Vansh NLP Tutorials.
Please do watch the entire couse! to become expert in NLP """

In [3]:
# Tokenization
from nltk.tokenize import sent_tokenize
documents = sent_tokenize(corpus)

In [4]:
type(documents)

list

In [5]:
for idx,sent in enumerate(documents,start=1):
    print(idx,sent)


1  Hello Welcome, to Vansh NLP Tutorials.
2 Please do watch the entire couse!
3 to become expert in NLP


In [6]:
## Paragraph--> words
## Sentence--> words
from nltk import word_tokenize
word = word_tokenize(corpus)
print(word)

['Hello', 'Welcome', ',', 'to', 'Vansh', 'NLP', 'Tutorials', '.', 'Please', 'do', 'watch', 'the', 'entire', 'couse', '!', 'to', 'become', 'expert', 'in', 'NLP']


In [7]:
for idx, sent in enumerate(documents,start=1):
    print(f"{idx}. words are{word_tokenize(sent)}")

1. words are['Hello', 'Welcome', ',', 'to', 'Vansh', 'NLP', 'Tutorials', '.']
2. words are['Please', 'do', 'watch', 'the', 'entire', 'couse', '!']
3. words are['to', 'become', 'expert', 'in', 'NLP']


In [8]:
# To include . with the last word this TreeBankWordTokenizer library is used
from nltk.tokenize import TreebankWordTokenizer
tok = TreebankWordTokenizer()
tok.tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'to',
 'Vansh',
 'NLP',
 'Tutorials.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'couse',
 '!',
 'to',
 'become',
 'expert',
 'in',
 'NLP']

## Stemming

#### 1. PorterStemmer

In [9]:
words = ["eating","eat","eaten","writing","write","wrote","learn","learning","learned"]

In [10]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()
for word in words:
    print(word+ "...>"+ stemming.stem(word))

eating...>eat
eat...>eat
eaten...>eaten
writing...>write
write...>write
wrote...>wrote
learn...>learn
learning...>learn
learned...>learn


#### 2. RegexpStemmer

In [11]:
from nltk.stem import RegexpStemmer
reg_stem = RegexpStemmer('ing$|s$|e$|able$',min=4)
print(reg_stem.stem("eating"))
print(reg_stem.stem("dates"))
print(reg_stem.stem("admirable"))



eat
date
admir


#### 3. Snowball Stemmer

In [12]:
from nltk.stem import SnowballStemmer
snb = SnowballStemmer(language='english')
for word in words:
    print(word +'--->'+ snb.stem(word))


eating--->eat
eat--->eat
eaten--->eaten
writing--->write
write--->write
wrote--->wrote
learn--->learn
learning--->learn
learned--->learn


## LEMMATIZATION

### 1. Wordnet Lemmatizer

In [13]:
from nltk.stem import WordNetLemmatizer
lmt = WordNetLemmatizer()
for word in words:
    print(word+"--->"+lmt.lemmatize(word=word,pos='v'))


eating--->eat
eat--->eat
eaten--->eat
writing--->write
write--->write
wrote--->write
learn--->learn
learning--->learn
learned--->learn


In [14]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# List of English words that are not useful in paragraph
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [16]:
paragraph = "Natural Language Processing (NLP) is a field of" \
" artificial intelligence that focuses on enabling computers to understand," \
" interpret, and generate human language. It combines linguistics, computer science," \
" and machine learning to process large amounts of natural language data. Common" \
" applications of NLP include text classification, sentiment analysis, machine" \
" translation, chatbots, and speech recognition. By bridging the gap between human " \
"communication and computer understanding, NLP plays a crucial role in making technology more " \
"interactive and intelligent."

In [17]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [18]:
sentences = nltk.sent_tokenize(paragraph)

In [19]:
## Apply Stopword then apply stemming
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words("english"))]
    sentences[i] = ' '.join(words) # Converting all the words in sentences

In [20]:
sentences

['natur languag process ( nlp ) field artifici intellig focus enabl comput understand , interpret , gener human languag .',
 'it combin linguist , comput scienc , machin learn process larg amount natur languag data .',
 'common applic nlp includ text classif , sentiment analysi , machin translat , chatbot , speech recognit .',
 'by bridg gap human commun comput understand , nlp play crucial role make technolog interact intellig .']

In [21]:
## Apply Stopword then apply snowball 
from nltk.stem import SnowballStemmer
snowballstemmer = SnowballStemmer('english')
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [snowballstemmer.stem(word) for word in words if word not in set(stopwords.words("english"))]
    sentences[i] = ' '.join(words) # Converting all the words in sentences

sentences

['natur languag process ( nlp ) field artifici intellig focus enabl comput understand , interpret , gener human languag .',
 'combin linguist , comput scienc , machin learn process larg amount natur languag data .',
 'common applic nlp includ text classif , sentiment analysi , machin translat , chatbot , speech recognit .',
 'bridg gap human commun comput understand , nlp play crucial role make technolog interact intellig .']

In [22]:
## Apply Stopword then apply lemmatization
from nltk.stem import WordNetLemmatizer
wordlem = WordNetLemmatizer()
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [wordlem.lemmatize(word,pos="n") for word in words if word not in set(stopwords.words("english"))]
    sentences[i] = ' '.join(words) # Converting all the words in sentences

sentences

['natur languag process ( nlp ) field artifici intellig focus enabl comput understand , interpret , gener human languag .',
 'combin linguist , comput scienc , machin learn process larg amount natur languag data .',
 'common applic nlp includ text classif , sentiment analysi , machin translat , chatbot , speech recognit .',
 'bridg gap human commun comput understand , nlp play crucial role make technolog interact intellig .']

In [23]:
words2 = nltk.word_tokenize(paragraph)
print(words)

['bridg', 'gap', 'human', 'commun', 'comput', 'understand', ',', 'nlp', 'play', 'crucial', 'role', 'make', 'technolog', 'interact', 'intellig', '.']


In [24]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [25]:
tag_elements = nltk.pos_tag(words2, tagset=None, lang='eng')
nltk.ne_chunk(tag_elements).draw()