In [1]:
import nltk

In [2]:
# Fetch the 'punkt' tokenizer data (tokenizers/punkt.zip, per the log below)
# before any tokenization is attempted.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Sample text used by the tokenization, stemming, lemmatization and POS-tagging
# cells. The literal starts with a space and contains hard line breaks; both
# survive into the sentence-tokenizer output.
# NOTE(review): "sentence structure these just a few" looks like it lost a
# dash / "are" when the text was copied — confirm against the source text.
paragraph =""" Human language is filled with ambiguities that make it incredibly difficult to write software that accurately determines the intended meaning of text or voice data. Homonyms, homophones, sarcasm, idioms, metaphors,
grammar and usage exceptions, variations in sentence structure these just a few of the irregularities of human language that
take humans years to learn, but that programmers must teach natural language-driven applications to recognize and understand
accurately from the start, if those applications are going to be useful."""

In [4]:
# Split the paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(paragraph)

In [5]:
# Inspect the split: two sentences, with the leading space and the embedded
# newlines of the original literal preserved.
sentences

[' Human language is filled with ambiguities that make it incredibly difficult to write software that accurately determines the intended meaning of text or voice data.',
 'Homonyms, homophones, sarcasm, idioms, metaphors,\ngrammar and usage exceptions, variations in sentence structure these just a few of the irregularities of human language that\ntake humans years to learn, but that programmers must teach natural language-driven applications to recognize and understand \naccurately from the start, if those applications are going to be useful.']

In [6]:
# Tokenize the whole paragraph at word level (not per sentence).
words = nltk.word_tokenize(paragraph)

In [7]:
# Inspect the tokens; punctuation such as ',' and '.' comes out as separate tokens.
words

['Human',
 'language',
 'is',
 'filled',
 'with',
 'ambiguities',
 'that',
 'make',
 'it',
 'incredibly',
 'difficult',
 'to',
 'write',
 'software',
 'that',
 'accurately',
 'determines',
 'the',
 'intended',
 'meaning',
 'of',
 'text',
 'or',
 'voice',
 'data',
 '.',
 'Homonyms',
 ',',
 'homophones',
 ',',
 'sarcasm',
 ',',
 'idioms',
 ',',
 'metaphors',
 ',',
 'grammar',
 'and',
 'usage',
 'exceptions',
 ',',
 'variations',
 'in',
 'sentence',
 'structure',
 'these',
 'just',
 'a',
 'few',
 'of',
 'the',
 'irregularities',
 'of',
 'human',
 'language',
 'that',
 'take',
 'humans',
 'years',
 'to',
 'learn',
 ',',
 'but',
 'that',
 'programmers',
 'must',
 'teach',
 'natural',
 'language-driven',
 'applications',
 'to',
 'recognize',
 'and',
 'understand',
 'accurately',
 'from',
 'the',
 'start',
 ',',
 'if',
 'those',
 'applications',
 'are',
 'going',
 'to',
 'be',
 'useful',
 '.']

Stemming

In [8]:
from nltk.stem import PorterStemmer

In [9]:
# Fetch the stop-word corpus (corpora/stopwords.zip, per the log below) that
# nltk.corpus.stopwords reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
from nltk.corpus import stopwords

In [11]:
# List the file ids in the stop-word corpus; each one names a language
# ('english' is the one used below).
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [12]:
# One Porter stemmer instance, reused for every token in the stemming loop.
stemmer = PorterStemmer()

In [13]:
# Normalise every sentence in place: tokenize it, drop English stop words,
# stem what is left with the Porter stemmer, and re-join with single spaces.
#
# The stop-word set is built once up front. Previously it was rebuilt inside
# the comprehension's filter, i.e. once for every token of every sentence.
# NOTE(review): the membership test is an exact, case-sensitive match, so a
# capitalised stop word (e.g. a sentence-initial "The") would presumably not be
# filtered — confirm whether tokens should be lower-cased first.
english_stopwords = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
  # `words` stays a notebook-level name on purpose: the next cell displays it.
  # After the loop it holds the stemmed tokens of the last sentence only.
  words = [
    stemmer.stem(word)
    for word in nltk.word_tokenize(sentence)
    if word not in english_stopwords
  ]
  # Overwrites the raw sentence, so re-running this cell would stem
  # already-stemmed text; re-create `sentences` first if that is not wanted.
  sentences[i] = ' '.join(words)

In [14]:
# `words` is the variable left over from the stemming loop, so this shows the
# stemmed tokens of the LAST sentence only; the per-sentence results are the
# joined strings stored in `sentences`.
words

['homonym',
 ',',
 'homophon',
 ',',
 'sarcasm',
 ',',
 'idiom',
 ',',
 'metaphor',
 ',',
 'grammar',
 'usag',
 'except',
 ',',
 'variat',
 'sentenc',
 'structur',
 'irregular',
 'human',
 'languag',
 'take',
 'human',
 'year',
 'learn',
 ',',
 'programm',
 'must',
 'teach',
 'natur',
 'language-driven',
 'applic',
 'recogn',
 'understand',
 'accur',
 'start',
 ',',
 'applic',
 'go',
 'use',
 '.']

Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
from nltk.corpus import stopwords

In [17]:
# Re-split the original paragraph: the stemming loop overwrote the entries of
# `sentences` with their stemmed text, so lemmatization needs a fresh copy.
sentences = nltk.sent_tokenize(paragraph)

In [18]:
# One lemmatizer instance, reused for every token in the lemmatization loop.
lemmatizer = WordNetLemmatizer()

In [19]:
# Fetch the WordNet data; presumably required by WordNetLemmatizer before the
# first lemmatize() call — confirm against the NLTK docs.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
# Normalise every sentence in place: tokenize it, drop English stop words,
# lemmatize what is left, and re-join with single spaces.
#
# The stop-word set is built once up front. Previously it was rebuilt inside
# the comprehension's filter, i.e. once for every token of every sentence.
# NOTE(review): lemmatize() is called without a part-of-speech hint and on the
# original casing; in the output below 'going' and the capitalised 'Homonyms'
# come back unchanged. Presumably passing a POS tag / lower-casing would
# reduce them — confirm before changing, since it alters the results.
english_stopwords = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
  # `words` stays a notebook-level name on purpose: the next cell displays it.
  # After the loop it holds the lemmatized tokens of the last sentence only.
  words = [
    lemmatizer.lemmatize(word)
    for word in nltk.word_tokenize(sentence)
    if word not in english_stopwords
  ]
  # Overwrites the raw sentence with its normalised form.
  sentences[i] = ' '.join(words)

In [21]:
# `words` is the variable left over from the lemmatization loop, so this shows
# the lemmatized tokens of the LAST sentence only; the per-sentence results
# are the joined strings stored in `sentences`.
words

['Homonyms',
 ',',
 'homophone',
 ',',
 'sarcasm',
 ',',
 'idiom',
 ',',
 'metaphor',
 ',',
 'grammar',
 'usage',
 'exception',
 ',',
 'variation',
 'sentence',
 'structure',
 'irregularity',
 'human',
 'language',
 'take',
 'human',
 'year',
 'learn',
 ',',
 'programmer',
 'must',
 'teach',
 'natural',
 'language-driven',
 'application',
 'recognize',
 'understand',
 'accurately',
 'start',
 ',',
 'application',
 'going',
 'useful',
 '.']

In [22]:
# NOTE: despite its name, `word_tokenizer` is not a tokenizer object — it holds
# the token list returned by nltk.word_tokenize for the whole paragraph, which
# is later passed to nltk.pos_tag.
word_tokenizer = nltk.word_tokenize(paragraph)

In [23]:
# Fetch the averaged-perceptron tagger model
# (taggers/averaged_perceptron_tagger.zip, per the log below) ahead of the
# nltk.pos_tag call.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [24]:
# Tag every paragraph token with its part of speech; the result is a list of
# (token, tag) pairs in token order.
pos = nltk.pos_tag(word_tokenizer)

In [25]:
# Inspect the (token, tag) pairs, e.g. ('language', 'NN').
pos

[('Human', 'JJ'),
 ('language', 'NN'),
 ('is', 'VBZ'),
 ('filled', 'VBN'),
 ('with', 'IN'),
 ('ambiguities', 'NNS'),
 ('that', 'WDT'),
 ('make', 'VBP'),
 ('it', 'PRP'),
 ('incredibly', 'RB'),
 ('difficult', 'JJ'),
 ('to', 'TO'),
 ('write', 'VB'),
 ('software', 'NN'),
 ('that', 'WDT'),
 ('accurately', 'RB'),
 ('determines', 'VBZ'),
 ('the', 'DT'),
 ('intended', 'JJ'),
 ('meaning', 'NN'),
 ('of', 'IN'),
 ('text', 'NN'),
 ('or', 'CC'),
 ('voice', 'NN'),
 ('data', 'NNS'),
 ('.', '.'),
 ('Homonyms', 'NNP'),
 (',', ','),
 ('homophones', 'NNS'),
 (',', ','),
 ('sarcasm', 'NN'),
 (',', ','),
 ('idioms', 'NNS'),
 (',', ','),
 ('metaphors', 'NNS'),
 (',', ','),
 ('grammar', 'NN'),
 ('and', 'CC'),
 ('usage', 'JJ'),
 ('exceptions', 'NNS'),
 (',', ','),
 ('variations', 'NNS'),
 ('in', 'IN'),
 ('sentence', 'NN'),
 ('structure', 'NN'),
 ('these', 'DT'),
 ('just', 'RB'),
 ('a', 'DT'),
 ('few', 'JJ'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('irregularities', 'NNS'),
 ('of', 'IN'),
 ('human', 'JJ'),
 ('languag

In [26]:
# Print, one per line and in paragraph order, every token whose tag is 'NNS'.
for token, tag in pos:
  if tag != 'NNS':
    continue
  print(token)

ambiguities
data
homophones
idioms
metaphors
exceptions
variations
irregularities
humans
years
programmers
applications
applications


In [27]:
# Fetch the tag-set help data (help/tagsets.zip, per the log below) ahead of
# the nltk.help.upenn_tagset call.
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [28]:
# Print the definition and example words for the 'VBG' tag.
nltk.help.upenn_tagset('VBG')

VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...
