In [4]:
import nltk

In [5]:
import warnings

# Silence every warning category for the rest of the session so that library
# deprecation notices do not clutter the cell outputs below.
warnings.filterwarnings(action='ignore')


In [7]:
from nltk import ne_chunk, pos_tag, word_tokenize

# Sample biography text used as input for NLTK's built-in named-entity chunker.
barack = """Barack Hussein Obama (born August 4, 1961) is an American politician 
who served as the 44th President of the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the first African American to assume the presidency 
and previously served as a United States Senator from Illinois (2005–2008)."""

# Pipeline: raw text -> tokens -> (token, POS tag) pairs -> named-entity tree.
tokenised_barack = word_tokenize(barack)
pos_list = pos_tag(tokenised_barack)
barack_ne_tree = ne_chunk(pos_list)
print(barack_ne_tree)


(S
  (PERSON Barack/NNP)
  (PERSON Hussein/NNP Obama/NNP)
  (/(
  born/VBN
  August/NNP
  4/CD
  ,/,
  1961/CD
  )/)
  is/VBZ
  an/DT
  (GPE American/JJ)
  politician/NN
  who/WP
  served/VBD
  as/IN
  the/DT
  44th/CD
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  from/IN
  January/NNP
  20/CD
  ,/,
  2009/CD
  ,/,
  to/TO
  January/NNP
  20/CD
  ,/,
  2017/CD
  ./.
  A/DT
  member/NN
  of/IN
  the/DT
  (ORGANIZATION Democratic/NNP Party/NNP)
  ,/,
  he/PRP
  was/VBD
  the/DT
  first/JJ
  (ORGANIZATION African/JJ American/NNP)
  to/TO
  assume/VB
  the/DT
  presidency/NN
  and/CC
  previously/RB
  served/VBD
  as/IN
  a/DT
  (GPE United/NNP States/NNPS)
  Senator/NNP
  from/IN
  (GPE Illinois/NNP)
  (/(
  2005–2008/CD
  )/)
  ./.)


In [8]:
from nltk import RegexpParser, pos_tag, word_tokenize

# NOTE(review): unlike the text in the NE-chunking cell, this copy reads
# "Obama II born ..." with no "(" before "born", so the closing ")" shows up
# as an unmatched token in the parse. Left as-is to keep the recorded output
# valid -- confirm whether the omission is intentional.
barack = """Barack Hussein Obama II born August 4, 1961) is an American politician
who served as the 44th President of 
the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the 
first African American to assume the presidency and previously
served as a United States Senator from Illinois (2005–2008)."""

# Hand-written chunk grammar over POS tags. The rules are listed from most
# specific to most general: Place (NNP followed by one or more NNPS),
# Date (NNP CD , CD), then Person (any run of NNP).
grammar = r"""Place: {<NNP><NNPS>+}
           Date: {<NNP><CD><,><CD>}
           Person: {<NNP>+}
           """

# Tokenise and POS-tag the text, then chunk the tagged tokens with the grammar.
tokenised_barack = word_tokenize(barack)
pos_list = pos_tag(tokenised_barack)
regParser = RegexpParser(grammar)
reg_lines = regParser.parse(pos_list)
print(reg_lines)


(S
  (Person Barack/NNP Hussein/NNP Obama/NNP II/NNP)
  born/VBD
  (Date August/NNP 4/CD ,/, 1961/CD)
  )/)
  is/VBZ
  an/DT
  American/JJ
  politician/NN
  who/WP
  served/VBD
  as/IN
  the/DT
  44th/CD
  (Person President/NNP)
  of/IN
  the/DT
  (Place United/NNP States/NNPS)
  from/IN
  (Date January/NNP 20/CD ,/, 2009/CD)
  ,/,
  to/TO
  (Date January/NNP 20/CD ,/, 2017/CD)
  ./.
  A/DT
  member/NN
  of/IN
  the/DT
  (Person Democratic/NNP Party/NNP)
  ,/,
  he/PRP
  was/VBD
  the/DT
  first/JJ
  African/JJ
  (Person American/NNP)
  to/TO
  assume/VB
  the/DT
  presidency/NN
  and/CC
  previously/RB
  served/VBD
  as/IN
  a/DT
  (Place United/NNP States/NNPS)
  (Person Senator/NNP)
  from/IN
  (Person Illinois/NNP)
  (/(
  2005–2008/CD
  )/)
  ./.)


### POS tagger

In [9]:
# 'race' appears twice in this sentence; inspect the tag each occurrence gets.
sent1 = "The race officials refused to permit the team to race today"
sent1_tokens = word_tokenize(sent1)
print(pos_tag(sent1_tokens))


[('The', 'DT'), ('race', 'NN'), ('officials', 'NNS'), ('refused', 'VBD'), ('to', 'TO'), ('permit', 'VB'), ('the', 'DT'), ('team', 'NN'), ('to', 'TO'), ('race', 'NN'), ('today', 'NN')]


###### However, if you look at the sentence and apply your knowledge of English grammar, you will see that the word 'race' is used as a noun in its first occurrence and as a verb in its second, yet the tagger labels both occurrences as NN.

In [10]:
# 'water' appears twice in this sentence; inspect the tag each occurrence gets.
sent2 = "That gentleman wants some water to water the plants"
sent2_tokens = word_tokenize(sent2)
print(pos_tag(sent2_tokens))

[('That', 'DT'), ('gentleman', 'NN'), ('wants', 'VBZ'), ('some', 'DT'), ('water', 'NN'), ('to', 'TO'), ('water', 'NN'), ('the', 'DT'), ('plants', 'NNS')]


###### These tagging mistakes arise primarily from how the tagger classifies words and from the kind of data it was trained on. Observe that the POS tagger classifies the sentence below correctly, which shows that it does not always get such words wrong.

In [11]:
# 'refuse' and 'permit' each appear twice in this sentence.
refuse_permit_sentence = "They refuse to permit us to obtain the refuse permit"
text = word_tokenize(refuse_permit_sentence)
print(nltk.pos_tag(text))

[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]


In [12]:
# Bring in the DefaultTagger class and the predefined Brown corpus.
from nltk import DefaultTagger
from nltk.corpus import brown

# Count every tag in the Brown corpus and pick the most frequent one.
tags = [tag for _, tag in brown.tagged_words()]
tag_counts = nltk.FreqDist(tags)
most_common_tag = tag_counts.max()
print(most_common_tag)

# Tag the tokens with a DefaultTagger built from that single most common tag.
# `tokenised_barack` is the token list produced by an earlier cell.
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenised_barack)
print(def_tagged_barack)



NN
[('Barack', 'NN'), ('Hussein', 'NN'), ('Obama', 'NN'), ('II', 'NN'), ('born', 'NN'), ('August', 'NN'), ('4', 'NN'), (',', 'NN'), ('1961', 'NN'), (')', 'NN'), ('is', 'NN'), ('an', 'NN'), ('American', 'NN'), ('politician', 'NN'), ('who', 'NN'), ('served', 'NN'), ('as', 'NN'), ('the', 'NN'), ('44th', 'NN'), ('President', 'NN'), ('of', 'NN'), ('the', 'NN'), ('United', 'NN'), ('States', 'NN'), ('from', 'NN'), ('January', 'NN'), ('20', 'NN'), (',', 'NN'), ('2009', 'NN'), (',', 'NN'), ('to', 'NN'), ('January', 'NN'), ('20', 'NN'), (',', 'NN'), ('2017', 'NN'), ('.', 'NN'), ('A', 'NN'), ('member', 'NN'), ('of', 'NN'), ('the', 'NN'), ('Democratic', 'NN'), ('Party', 'NN'), (',', 'NN'), ('he', 'NN'), ('was', 'NN'), ('the', 'NN'), ('first', 'NN'), ('African', 'NN'), ('American', 'NN'), ('to', 'NN'), ('assume', 'NN'), ('the', 'NN'), ('presidency', 'NN'), ('and', 'NN'), ('previously', 'NN'), ('served', 'NN'), ('as', 'NN'), ('a', 'NN'), ('United', 'NN'), ('States', 'NN'), ('Senator', 'NN'), ('from'

In [13]:
from nltk import word_tokenize

# Tag one sentence with the stock tagger; its output becomes the training data.
sent1 = "the quick brown fox jumps over the lazy dog"
sent1_tokens = word_tokenize(sent1)
training_tags = pos_tag(sent1_tokens)
print(training_tags)

# Train an NgramTagger on that single tagged sentence.
# With n=2 the tagger is a bigram tagger.
training_corpus = [training_tags]
ngram_tagger = nltk.NgramTagger(n=2, train=training_corpus)


[('the', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


In [14]:
# Apply the bigram tagger to a sentence that reuses most of the training
# vocabulary in a different order.
sent2 = "the lazy dog was jumped over by the quick brown fox"
sent2_tokens = word_tokenize(sent2)
sent2_tags = ngram_tagger.tag(sent2_tokens)
print(sent2_tags)


[('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('was', None), ('jumped', None), ('over', None), ('by', None), ('the', None), ('quick', None), ('brown', None), ('fox', None)]


In [15]:
# List the consecutive pairs (n=2) of (word, tag) tuples in the tagged training sentence.
sent1_tagged = pos_tag(word_tokenize(sent1))
sent1_tag_bigrams = list(nltk.ngrams(sent1_tagged, n=2))
print(sent1_tag_bigrams)

[(('the', 'DT'), ('quick', 'JJ')), (('quick', 'JJ'), ('brown', 'NN')), (('brown', 'NN'), ('fox', 'NN')), (('fox', 'NN'), ('jumps', 'VBZ')), (('jumps', 'VBZ'), ('over', 'IN')), (('over', 'IN'), ('the', 'DT')), (('the', 'DT'), ('lazy', 'JJ')), (('lazy', 'JJ'), ('dog', 'NN'))]


In [16]:
# List the consecutive word pairs (n=2) of the second sentence.
sent2_word_bigrams = list(nltk.ngrams(word_tokenize(sent2), n=2))
print(sent2_word_bigrams)

[('the', 'lazy'), ('lazy', 'dog'), ('dog', 'was'), ('was', 'jumped'), ('jumped', 'over'), ('over', 'by'), ('by', 'the'), ('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]


In [17]:
# --- Texts -----------------------------------------------------------------
# Two biographies provide the training data; the third is the text to tag.
barack = """Barack Hussein Obama II born August 4, 1961) is an American politician
who served as the 44th President of 
the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the 
first African American to assume the presidency and previously
served as a United States Senator from Illinois (2005–2008)."""
bush = """George Walker Bush (born July 6, 1946) is an American politician who served as the 43rd President
 of the United States from 2001 to 2009.
He had previously served as the 46th Governor of Texas from 1995 to 2000.
Bush was born New Haven, Connecticut, and grew up in Texas. 
After graduating from Yale University in 1968 and Harvard Business School in 1975, he worked in the oil industry.
Bush married Laura Welch in 1977 and unsuccessfully ran for the U.S. House of Representatives shortly thereafter. 
He later co-owned the Texas Rangers baseball team before defeating Ann Richards in the 1994 Texas gubernatorial election. 
Bush was elected President of the United States in 2000 when he defeated Democratic incumbent 
Vice President Al Gore after a close and controversial win that involved a stopped recount in Florida. 
He became the fourth person to be elected president while receiving fewer popular votes than his opponent.
Bush is a member of a prominent political family and is the eldest son of Barbara and George H. W. Bush, 
the 41st President of the United States. 
He is only the second president to assume the nation's highest office after his father, following the footsteps
 of John Adams and his son, John Quincy Adams.
His brother, Jeb Bush, a former Governor of Florida, was a candidate for the Republican presidential nomination
 in the 2016 presidential election. 
His paternal grandfather, Prescott Bush, was a U.S. Senator from Connecticut."""
trump = """Donald John Trump (born June 14, 1946) is the 45th and current President of the United States.
Before entering politics, he was a businessman and television personality. 
Trump was born and raised in the New York City borough of Queens, and received an economics degree from the
 Wharton School of the University of Pennsylvania. 
He took charge of his family's real estate business in 1971, renamed it The Trump Organization, and expanded 
it from Queens and Brooklyn into Manhattan. 
The company built or renovated skyscrapers, hotels, casinos, and golf courses. 
Trump later started various side ventures, including licensing his name for real estate and consumer products.
He managed the company until his 2017 inauguration. 
He co-authored several books, including The Art of the Deal. He owned the Miss Universe and Miss USA beauty 
pageants from 1996 to 2015, and he produced and hosted the reality television show The Apprentice from 2003 to 2015.
Forbes estimates his net worth to be $3.1 billion."""

# --- Training data -----------------------------------------------------------
# Tags from the stock tagger on the two training texts serve as the
# training corpus, one tagged "sentence" per biography.
pos_tag_barack = pos_tag(word_tokenize(barack))
pos_tag_bush = pos_tag(word_tokenize(bush))
training_texts = [pos_tag_barack, pos_tag_bush]

# --- Train and apply -----------------------------------------------------------
# No backoff tagger is configured here, so tokens the unigram tagger cannot
# tag come back with a tag of None.
unigram_tag = nltk.UnigramTagger(train=training_texts)
trump_tokens = word_tokenize(trump)
trump_tags = unigram_tag.tag(trump_tokens)
print(trump_tags)


[('Donald', None), ('John', 'NNP'), ('Trump', None), ('(', '('), ('born', 'VBN'), ('June', None), ('14', None), (',', ','), ('1946', 'CD'), (')', ')'), ('is', 'VBZ'), ('the', 'DT'), ('45th', None), ('and', 'CC'), ('current', None), ('President', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('.', '.'), ('Before', None), ('entering', None), ('politics', None), (',', ','), ('he', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('businessman', None), ('and', 'CC'), ('television', None), ('personality', None), ('.', '.'), ('Trump', None), ('was', 'VBD'), ('born', 'VBN'), ('and', 'CC'), ('raised', None), ('in', 'IN'), ('the', 'DT'), ('New', 'NNP'), ('York', None), ('City', None), ('borough', None), ('of', 'IN'), ('Queens', None), (',', ','), ('and', 'CC'), ('received', None), ('an', 'DT'), ('economics', None), ('degree', None), ('from', 'IN'), ('the', 'DT'), ('Wharton', None), ('School', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('University', 'NNP'), ('of', 'IN'), ('Pennsylva

In [18]:
# Backoff chain, tried in this order for each token:
#   1. UnigramTagger  - tag learned from the two tagged training texts
#   2. RegexpTagger   - first matching spelling pattern below
#   3. DefaultTagger  - everything else is called a noun ('NN')
default_tagger = DefaultTagger('NN')

# RegexpTagger tests each pattern against the start of the token (re.match
# semantics) and takes the first hit, so every pattern must carry its own end
# anchor and the order of the list matters.
patterns = [
    (r'.*\'s$', 'NN$'),                   # possessives: family's
    (r'.*es$', 'VBZ'),                    # 3rd-person singular verbs: estimates
    # The dot is escaped so that only a literal decimal point is accepted;
    # an unescaped '.' would also tag strings such as '3x1' or '1-2' as CD.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),     # integers and decimals: 14, 3.1
    # Anchored at the end so that only the word 'and' itself (any casing) is
    # tagged CC, not every unknown word that merely starts with 'and'.
    (r'[Aa][Nn][Dd]$', 'CC'),
    (r'.*ed$', 'VBD'),                    # simple past: raised
    (r',$', ','),                         # a lone comma, not a comma prefix
    (r'.*ould$', 'MD'),                   # modals: would, could
    (r'.*ing$', 'VBG'),                   # gerunds: entering
    (r'.*s$', 'NNS'),                     # plural nouns: politics
]
regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)

# Same training data as the plain unigram tagger, but unknown words now fall
# through to the regexp tagger (and then the default tagger), so no token is
# left with a tag of None.
unigram_tag = nltk.UnigramTagger(train=[pos_tag_barack, pos_tag_bush], backoff=regexp_tagger)
trump_tags = unigram_tag.tag(word_tokenize(trump))
print(trump_tags)


[('Donald', 'NN'), ('John', 'NNP'), ('Trump', 'NN'), ('(', '('), ('born', 'VBN'), ('June', 'NN'), ('14', 'CD'), (',', ','), ('1946', 'CD'), (')', ')'), ('is', 'VBZ'), ('the', 'DT'), ('45th', 'NN'), ('and', 'CC'), ('current', 'NN'), ('President', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('.', '.'), ('Before', 'NN'), ('entering', 'VBG'), ('politics', 'NNS'), (',', ','), ('he', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('businessman', 'NN'), ('and', 'CC'), ('television', 'NN'), ('personality', 'NN'), ('.', '.'), ('Trump', 'NN'), ('was', 'VBD'), ('born', 'VBN'), ('and', 'CC'), ('raised', 'VBD'), ('in', 'IN'), ('the', 'DT'), ('New', 'NNP'), ('York', 'NN'), ('City', 'NN'), ('borough', 'NN'), ('of', 'IN'), ('Queens', 'NNS'), (',', ','), ('and', 'CC'), ('received', 'VBD'), ('an', 'DT'), ('economics', 'NNS'), ('degree', 'NN'), ('from', 'IN'), ('the', 'DT'), ('Wharton', 'NN'), ('School', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('University', 'NNP'), ('of', 'IN'), ('Pen