In [1]:
# Uncomment to install the pinned NLTK release into this kernel's environment:
# %pip install nltk==3.5

In [2]:
import nltk
# Fetch the 'punkt' tokenizer data into the local nltk_data directory (nothing is
# re-downloaded when it is already up to date). sent_tokenize / word_tokenize in the
# following cells rely on this resource.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Three short sentences that serve as the running example for tokenization.
example_string = (
    "Working on Text preprocessing. "
    "Natural Language Processing is amazing. "
    "Enjoy the learning."
)

In [4]:
# Split the example text into sentences; as the cell's last expression, the resulting
# list is displayed.
sent_tokenize(example_string)

['Working on Text preprocessing.',
 'Natural Language Processing is amazing.',
 'Enjoy the learning.']

In [5]:
# Split the same text into word-level tokens; punctuation marks come out as tokens
# of their own.
word_tokenize(example_string)

['Working',
 'on',
 'Text',
 'preprocessing',
 '.',
 'Natural',
 'Language',
 'Processing',
 'is',
 'amazing',
 '.',
 'Enjoy',
 'the',
 'learning',
 '.']

In [6]:
nltk.download("stopwords")
from nltk.corpus import stopwords

# Tokenize example_string by word and keep the resulting list in words_in_list.
words_in_list = word_tokenize(example_string)

# Build the set of English stop words once; a set keeps the membership test in the
# filter below cheap.
stop_words = set(stopwords.words("english"))

# Keep only the tokens that are not stop words. Each token is lower-cased for the
# comparison only, so the original casing is preserved in filtered_list.
# Punctuation tokens such as "." are not stop words and therefore remain.
filtered_list = [word for word in words_in_list if word.lower() not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Show the tokens that survived stop-word filtering.
print(filtered_list)

['Working', 'Text', 'preprocessing', '.', 'Natural', 'Language', 'Processing', 'amazing', '.', 'Enjoy', 'learning', '.']


In [8]:
from nltk.stem import PorterStemmer

# Run the Porter stemmer over every token left after stop-word removal.
# Stems are truncated forms (e.g. 'natur', 'languag'), not necessarily dictionary
# words, and come back lower-cased.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_list))
print(stemmed_words)

['work', 'text', 'preprocess', '.', 'natur', 'languag', 'process', 'amaz', '.', 'enjoy', 'learn', '.']


In [9]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

string_for_lemmatizing = "The friends of DeSoto love scarves."
# These are the raw (unfiltered) tokens of a different sentence, so they get their
# own name. Storing them in filtered_list would silently overwrite the
# stop-word-filtered tokens of example_string and change what the earlier
# filtering/stemming cells produce when they are re-run.
words_to_lemmatize = word_tokenize(string_for_lemmatizing)

lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a pos argument, so every token is handled with the
# lemmatizer's default part of speech.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words_to_lemmatize]

print(lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']


In [10]:
# With no pos argument (the lemmatizer defaults to noun) the superlative comes back
# unchanged.
lemmatizer.lemmatize("worst")

'worst'

In [11]:
# Passing pos='a' (adjective) lets the lemmatizer map the superlative to its base
# form.
lemmatizer.lemmatize("worst", pos='a')

'bad'

In [12]:
import nltk
# Download the 'tagsets' resource; presumably this is the data nltk.help reads its
# tag descriptions from — confirm against the NLTK docs.
nltk.download('tagsets')

# Print the definition and example words for a single Penn Treebank POS tag.
nltk.help.upenn_tagset('NN')  # Replace 'NN' with any tag

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [13]:
import nltk
nltk.download('punkt')
# Model data for the POS tagger; presumably what pos_tag loads by default — confirm.
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk import pos_tag

text = "Machine learning models are improving every day."

# Tokenize first: pos_tag is given the token list, not the raw string.
text_words = word_tokenize(text)
# tags is a list of (token, tag) pairs; leaving it as the last expression displays it.
tags = pos_tag(text_words)
tags

[nltk_data] Downloading package punkt to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Machine', 'NN'),
 ('learning', 'NN'),
 ('models', 'NNS'),
 ('are', 'VBP'),
 ('improving', 'VBG'),
 ('every', 'DT'),
 ('day', 'NN'),
 ('.', '.')]

In [14]:
# nltk, word_tokenize and pos_tag are already imported, and the 'punkt' and
# 'averaged_perceptron_tagger' resources already downloaded, by the preceding cells,
# so that setup is not repeated here.
sentence = "The quick brown fox jumps over the lazy dog"
words = word_tokenize(sentence)
# NOTE: this rebinds `tags`; the chunking cells further down parse the
# (token, tag) pairs of this sentence.
tags = pos_tag(words)
tags

[nltk_data] Downloading package punkt to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'VBZ'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [15]:
# Chunk rule for noun phrases (NP): an optional determiner (<DT>?), any number of
# adjectives (<JJ>*), then one noun tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [16]:
# Chunk rule for noun phrases (NP): an optional determiner, any number of
# adjectives, then one noun tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
# Parse the (token, tag) pairs in `tags` into a tree whose NP subtrees are the
# matched chunks.
tree = cp.parse(tags)
# Render the tree as text in the cell output. tree.draw() would instead open a
# separate Tk window, which blocks the kernel until it is closed, needs a display,
# and leaves nothing in the saved notebook.
tree.pretty_print()

In [17]:
# Chinking: first chunk every token into one NP, then cut the verbs (tags starting
# with VB) back out, so the chunks are the stretches between verbs.
grammar = r"""
  NP: {<.*>+}         # Chunk everything
      }<VB.*>{        # Chink (exclude) verbs
"""
cp = nltk.RegexpParser(grammar)
tree = cp.parse(tags)
# Render the tree as text in the cell output. tree.draw() would instead open a
# separate Tk window, which blocks the kernel until it is closed, needs a display,
# and leaves nothing in the saved notebook.
tree.pretty_print()

In [18]:
# NOTE(review): this cell repeats the chinking example of the previous cell (only
# the comment inside the grammar differs); consider keeping just one of them.
# Chinking: chunk every token into one NP, then cut the verbs (tags starting with
# VB) back out.
grammar = r"""
  NP: {<.*>+}         # Chunk everything
      }<VB.*>{        # Then exclude verbs
"""

cp = nltk.RegexpParser(grammar)
tree = cp.parse(tags)
# Render the tree as text in the cell output. tree.draw() would instead open a
# separate Tk window, which blocks the kernel until it is closed, needs a display,
# and leaves nothing in the saved notebook.
tree.pretty_print()