# Tokenizing words & Sentences

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
example_text='Hello there, how are you doing today? the weather is great and pthon is awesome. The sky is pink and not blue'
print(sent_tokenize(example_text))
print(word_tokenize(example_text))

['Hello there, how are you doing today?', 'the weather is great and pthon is awesome.', 'The sky is pink and not blue']
['Hello', 'there', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'the', 'weather', 'is', 'great', 'and', 'pthon', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pink', 'and', 'not', 'blue']


In [3]:
for i in word_tokenize(example_text):
    print(i)

Hello
there
,
how
are
you
doing
today
?
the
weather
is
great
and
pthon
is
awesome
.
The
sky
is
pink
and
not
blue


# Stopwords

In [4]:
from nltk.corpus import stopwords

In [5]:
example_sentence = "This is an example showing off stop word filteration"
stop_words=set(stopwords.words('english'))

In [6]:
print(stop_words)

{'have', 'mustn', 'its', 'am', 'further', 'don', "you'll", 'has', "aren't", 'your', 'then', 'here', 'it', "mustn't", 'now', 're', 'ourselves', 'this', 'can', 'ain', 'into', "hadn't", 'a', 'hers', 'up', 'how', 'wasn', 'not', 'under', 'for', 'or', 'there', 's', "wouldn't", 'below', 'too', "doesn't", 'theirs', 'on', 'what', 'did', 'by', 'few', 'during', 'all', 'because', 'been', 'does', 'which', 'own', 'himself', 'after', 'be', 'who', 'same', 'so', 'doesn', 'y', 'do', 'over', "shouldn't", 'he', "that'll", 'her', 'once', 'those', 'than', "shan't", 'she', 'before', 'had', 'wouldn', 'him', 'no', 'itself', 'couldn', 'between', "isn't", 'through', "mightn't", 'where', 'about', "didn't", 'they', 'our', 'that', 'why', 'was', 'only', 'each', 'down', 'an', 'herself', 'are', 'shan', 'yourselves', "haven't", "don't", 'yours', 'out', "hasn't", 'isn', "wasn't", 'nor', 'hadn', 'won', 'both', 'yourself', 'against', "she's", 'you', 'm', 'doing', 'of', 'haven', 'weren', "won't", "couldn't", 'as', 'at', 'd

In [7]:
words=word_tokenize(example_sentence)

In [8]:
filtered_sentence=[]
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
filtered_sentence

['This', 'example', 'showing', 'stop', 'word', 'filteration']

In [9]:
#using this using list comprehension:
filtered_sentence2=[w for w in words if not w in stop_words]

In [10]:
filtered_sentence2

['This', 'example', 'showing', 'stop', 'word', 'filteration']

# Stemming 

In [11]:
from nltk.stem import PorterStemmer

In [12]:
ps= PorterStemmer()

In [13]:
Example=['python','pythoner','pythoning','pythoned','pythonly']

In [14]:
for w in Example:
    print(ps.stem(w))

python
python
python
python
pythonli


In [15]:
new_text= 'It is very important to be pythonly whule you are pythoning with python. ALl pythoners have pythoned poorly at least once'

In [16]:
words=word_tokenize(new_text)

In [17]:
for w in words:
    print(ps.stem(w))

It
is
veri
import
to
be
pythonli
whule
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc


# Part of Speech Tagging

In [18]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer # is an unsupervised machine learning sentence tokenizer
#it comes trained but can be retrained too


In [19]:
sample_text=state_union.raw('2006-GWBush.txt')
train_text=state_union.raw('2005-GWBush.txt')

In [20]:
custom_sent_tokenier=PunktSentenceTokenizer(train_text) #trainging punk on train

In [21]:
tokenized=custom_sent_tokenier.tokenize(sample_text)

In [22]:
def process_content():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

In [23]:
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

[('Dictatorships', 'NNP'), ('shelter', 'NN'), ('terrorists', 'NNS'), (',', ','), ('and', 'CC'), ('feed', 'VB'), ('resentment', 'NN'), ('and', 'CC'), ('radicalism', 'NN'), (',', ','), ('and', 'CC'), ('seek', 'JJ'), ('weapons', 'NNS'), ('of', 'IN'), ('mass', 'NN'), ('destruction', 'NN'), ('.', '.')]
[('Democracies', 'NNS'), ('replace', 'VB'), ('resentment', 'NN'), ('with', 'IN'), ('hope', 'NN'), (',', ','), ('respect', 'VB'), ('the', 'DT'), ('rights', 'NNS'), ('of', 'IN'), ('their', 'PRP$'), ('citizens', 'NNS'), ('and', 'CC'), ('their', 'PRP$'), ('neighbors', 'NNS'), (',', ','), ('and', 'CC'), ('join', 'VB'), ('the', 'DT'), ('fight', 'NN'), ('against', 'IN'), ('terror', 'NN'), ('.', '.')]
[('Every', 'DT'), ('step', 'NN'), ('toward', 'IN'), ('freedom', 'NN'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('makes', 'VBZ'), ('our', 'PRP$'), ('country', 'NN'), ('safer', 'NN'), ('--', ':'), ('so', 'IN'), ('we', 'PRP'), ('will', 'MD'), ('act', 'VB'), ('boldly', 'RB'), ('in', 'IN'), ('freedom',

[('By', 'IN'), ('allowing', 'VBG'), ('radical', 'JJ'), ('Islam', 'NNP'), ('to', 'TO'), ('work', 'VB'), ('its', 'PRP$'), ('will', 'MD'), ('--', ':'), ('by', 'IN'), ('leaving', 'VBG'), ('an', 'DT'), ('assaulted', 'JJ'), ('world', 'NN'), ('to', 'TO'), ('fend', 'VB'), ('for', 'IN'), ('itself', 'PRP'), ('--', ':'), ('we', 'PRP'), ('would', 'MD'), ('signal', 'VB'), ('to', 'TO'), ('all', 'PDT'), ('that', 'IN'), ('we', 'PRP'), ('no', 'DT'), ('longer', 'RBR'), ('believe', 'VBP'), ('in', 'IN'), ('our', 'PRP$'), ('own', 'JJ'), ('ideals', 'NNS'), (',', ','), ('or', 'CC'), ('even', 'RB'), ('in', 'IN'), ('our', 'PRP$'), ('own', 'JJ'), ('courage', 'NN'), ('.', '.')]
[('But', 'CC'), ('our', 'PRP$'), ('enemies', 'NNS'), ('and', 'CC'), ('our', 'PRP$'), ('friends', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('certain', 'JJ'), (':', ':'), ('The', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('will', 'MD'), ('not', 'RB'), ('retreat', 'VB'), ('from', 'IN'), ('the', 'DT'), ('world', 'NN'), (',', ','), ('and', 'CC

[('With', 'IN'), ('so', 'RB'), ('much', 'JJ'), ('in', 'IN'), ('the', 'DT'), ('balance', 'NN'), (',', ','), ('those', 'DT'), ('of', 'IN'), ('us', 'PRP'), ('in', 'IN'), ('public', 'JJ'), ('office', 'NN'), ('have', 'VBP'), ('a', 'DT'), ('duty', 'NN'), ('to', 'TO'), ('speak', 'VB'), ('with', 'IN'), ('candor', 'NN'), ('.', '.')]
[('A', 'DT'), ('sudden', 'JJ'), ('withdrawal', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('forces', 'NNS'), ('from', 'IN'), ('Iraq', 'NNP'), ('would', 'MD'), ('abandon', 'VB'), ('our', 'PRP$'), ('Iraqi', 'NNP'), ('allies', 'NNS'), ('to', 'TO'), ('death', 'NN'), ('and', 'CC'), ('prison', 'NN'), (',', ','), ('would', 'MD'), ('put', 'VB'), ('men', 'NNS'), ('like', 'IN'), ('bin', 'NN'), ('Laden', 'NNP'), ('and', 'CC'), ('Zarqawi', 'NNP'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('a', 'DT'), ('strategic', 'JJ'), ('country', 'NN'), (',', ','), ('and', 'CC'), ('show', 'VBP'), ('that', 'IN'), ('a', 'DT'), ('pledge', 'NN'), ('from', 'IN'), ('America', 'NNP'), ('means', 'VB

[('For', 'IN'), ('people', 'NNS'), ('everywhere', 'RB'), (',', ','), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('is', 'VBZ'), ('a', 'DT'), ('partner', 'NN'), ('for', 'IN'), ('a', 'DT'), ('better', 'JJR'), ('life', 'NN'), ('.', '.')]
[('Short-changing', 'VBG'), ('these', 'DT'), ('efforts', 'NNS'), ('would', 'MD'), ('increase', 'VB'), ('the', 'DT'), ('suffering', 'NN'), ('and', 'CC'), ('chaos', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('world', 'NN'), (',', ','), ('undercut', 'JJ'), ('our', 'PRP$'), ('long-term', 'JJ'), ('security', 'NN'), (',', ','), ('and', 'CC'), ('dull', 'VB'), ('the', 'DT'), ('conscience', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('country', 'NN'), ('.', '.')]
[('I', 'PRP'), ('urge', 'VBP'), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), ('to', 'TO'), ('serve', 'VB'), ('the', 'DT'), ('interests', 'NNS'), ('of', 'IN'), ('America', 'NNP'), ('by', 'IN'), ('showing', 'VBG'), ('the', 'DT'), ('compassion', 'NN'), ('of', 'IN'), ('America', 'NNP'), ('.', '.')]

[('Others', 'NNS'), ('say', 'VBP'), ('that', 'IN'), ('the', 'DT'), ('government', 'NN'), ('needs', 'VBZ'), ('to', 'TO'), ('take', 'VB'), ('a', 'DT'), ('larger', 'JJR'), ('role', 'NN'), ('in', 'IN'), ('directing', 'VBG'), ('the', 'DT'), ('economy', 'NN'), (',', ','), ('centralizing', 'VBG'), ('more', 'JJR'), ('power', 'NN'), ('in', 'IN'), ('Washington', 'NNP'), ('and', 'CC'), ('increasing', 'VBG'), ('taxes', 'NNS'), ('.', '.')]
[('We', 'PRP'), ('hear', 'VBP'), ('claims', 'NNS'), ('that', 'IN'), ('immigrants', 'NNS'), ('are', 'VBP'), ('somehow', 'RB'), ('bad', 'JJ'), ('for', 'IN'), ('the', 'DT'), ('economy', 'NN'), ('--', ':'), ('even', 'RB'), ('though', 'IN'), ('this', 'DT'), ('economy', 'NN'), ('could', 'MD'), ('not', 'RB'), ('function', 'VB'), ('without', 'IN'), ('them', 'PRP'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('All', 'PDT'), ('these', 'DT'), ('are', 'VBP'), ('forms', 'NNS'), ('of', 'IN'), ('economic', 'JJ'), ('retreat', 'NN'), (',', ','), ('and'

[('Congress', 'NNP'), ('did', 'VBD'), ('not', 'RB'), ('act', 'VB'), ('last', 'JJ'), ('year', 'NN'), ('on', 'IN'), ('my', 'PRP$'), ('proposal', 'NN'), ('to', 'TO'), ('save', 'VB'), ('Social', 'NNP'), ('Security', 'NNP'), ('--', ':'), ('(', '('), ('applause', 'NN'), (')', ')'), ('--', ':'), ('yet', 'RB'), ('the', 'DT'), ('rising', 'VBG'), ('cost', 'NN'), ('of', 'IN'), ('entitlements', 'NNS'), ('is', 'VBZ'), ('a', 'DT'), ('problem', 'NN'), ('that', 'WDT'), ('is', 'VBZ'), ('not', 'RB'), ('going', 'VBG'), ('away', 'RB'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('And', 'CC'), ('every', 'DT'), ('year', 'NN'), ('we', 'PRP'), ('fail', 'VBP'), ('to', 'TO'), ('act', 'VB'), (',', ','), ('the', 'DT'), ('situation', 'NN'), ('gets', 'VBZ'), ('worse', 'JJR'), ('.', '.')]
[('So', 'RB'), ('tonight', 'JJ'), (',', ','), ('I', 'PRP'), ('ask', 'VBP'), ('you', 'PRP'), ('to', 'TO'), ('join', 'VB'), ('me', 'PRP'), ('in', 'IN'), ('creating', 'VBG'), ('a', 'DT'), ('commission', 'NN

[('We', 'PRP'), ('will', 'MD'), ('increase', 'VB'), ('our', 'PRP$'), ('research', 'NN'), ('in', 'IN'), ('better', 'JJR'), ('batteries', 'NNS'), ('for', 'IN'), ('hybrid', 'JJ'), ('and', 'CC'), ('electric', 'JJ'), ('cars', 'NNS'), (',', ','), ('and', 'CC'), ('in', 'IN'), ('pollution-free', 'JJ'), ('cars', 'NNS'), ('that', 'WDT'), ('run', 'VBP'), ('on', 'IN'), ('hydrogen', 'NN'), ('.', '.')]
[('We', 'PRP'), ("'ll", 'MD'), ('also', 'RB'), ('fund', 'VB'), ('additional', 'JJ'), ('research', 'NN'), ('in', 'IN'), ('cutting-edge', 'JJ'), ('methods', 'NNS'), ('of', 'IN'), ('producing', 'VBG'), ('ethanol', 'NN'), (',', ','), ('not', 'RB'), ('just', 'RB'), ('from', 'IN'), ('corn', 'NN'), (',', ','), ('but', 'CC'), ('from', 'IN'), ('wood', 'NN'), ('chips', 'NNS'), ('and', 'CC'), ('stalks', 'NNS'), (',', ','), ('or', 'CC'), ('switch', 'VB'), ('grass', 'NN'), ('.', '.')]
[('Our', 'PRP$'), ('goal', 'NN'), ('is', 'VBZ'), ('to', 'TO'), ('make', 'VB'), ('this', 'DT'), ('new', 'JJ'), ('kind', 'NN'), ('of'

[('Welfare', 'NN'), ('cases', 'NNS'), ('have', 'VBP'), ('dropped', 'VBN'), ('by', 'IN'), ('more', 'JJR'), ('than', 'IN'), ('half', 'NN'), ('over', 'IN'), ('the', 'DT'), ('past', 'JJ'), ('decade', 'NN'), ('.', '.')]
[('Drug', 'NN'), ('use', 'NN'), ('among', 'IN'), ('youth', 'NN'), ('is', 'VBZ'), ('down', 'RB'), ('19', 'CD'), ('percent', 'NN'), ('since', 'IN'), ('2001', 'CD'), ('.', '.')]
[('There', 'EX'), ('are', 'VBP'), ('fewer', 'JJR'), ('abortions', 'NNS'), ('in', 'IN'), ('America', 'NNP'), ('than', 'IN'), ('at', 'IN'), ('any', 'DT'), ('point', 'NN'), ('in', 'IN'), ('the', 'DT'), ('last', 'JJ'), ('three', 'CD'), ('decades', 'NNS'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('number', 'NN'), ('of', 'IN'), ('children', 'NNS'), ('born', 'VBN'), ('to', 'TO'), ('teenage', 'VB'), ('mothers', 'NNS'), ('has', 'VBZ'), ('been', 'VBN'), ('falling', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('dozen', 'NN'), ('years', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('row', 'NN'), ('.', '.')]
[('(', '('), ('Applause

[('Lincoln', 'NNP'), ('could', 'MD'), ('have', 'VB'), ('accepted', 'VBN'), ('peace', 'NN'), ('at', 'IN'), ('the', 'DT'), ('cost', 'NN'), ('of', 'IN'), ('disunity', 'NN'), ('and', 'CC'), ('continued', 'JJ'), ('slavery', 'NN'), ('.', '.')]
[('Martin', 'NNP'), ('Luther', 'NNP'), ('King', 'NNP'), ('could', 'MD'), ('have', 'VB'), ('stopped', 'VBN'), ('at', 'IN'), ('Birmingham', 'NNP'), ('or', 'CC'), ('at', 'IN'), ('Selma', 'NNP'), (',', ','), ('and', 'CC'), ('achieved', 'VBD'), ('only', 'RB'), ('half', 'PDT'), ('a', 'DT'), ('victory', 'NN'), ('over', 'IN'), ('segregation', 'NN'), ('.', '.')]
[('The', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('could', 'MD'), ('have', 'VB'), ('accepted', 'VBN'), ('the', 'DT'), ('permanent', 'JJ'), ('division', 'NN'), ('of', 'IN'), ('Europe', 'NNP'), (',', ','), ('and', 'CC'), ('been', 'VBN'), ('complicit', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('oppression', 'NN'), ('of', 'IN'), ('others', 'NNS'), ('.', '.')]
[('Today', 'NN'), (',', ','), ('having', 'VBG'

# Chunking

Text chunking, also referred to as shallow parsing, is a task that follows Part-Of-Speech Tagging and that adds more structure to the sentence. The result is a grouping of the words in “chunks”. Here’s a quick example:

In [24]:
#def process_content():
#    try:
#        for i in tokenized:
#            words=nltk.word_tokenize(i)
#            tagged=nltk.pos_tag(words)
#            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" #its a Regex
#            chunkParser= nltk.RegexpParser(chunkGram)
#            chunked = chunkParser.parse(tagged)
#            #print(chunked)
#            chunked.draw()
#            #print(tagged)
#    except Exception as e:
#        print(str(e))
"""Need to learn about it more"""

'Need to learn about it more'

In [25]:
"""process_content()"""

'process_content()'

# Chinking

Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

In [26]:
#def process_content():
#    try:
#        for i in tokenized:
#            words=nltk.word_tokenize(i)
#            tagged=nltk.pos_tag(words)
#            chunkGram= r"""Chunk: {<.*>+}
#                                    }<VB.?|IN|DT>+{"""
#            #Explaining the regex:
#            #{<.*>} means get everything 0 or more repetation
#            #}<VB.?|IN|DT>+{ means keeping out verb, proposition, determinant 1 or more
#            chunkParser= nltk.RegexpParser(chunkGram)
#            chunked = chunkParser.parse(tagged)
#            print(chunked)
#            chunked.draw()
#            #print(tagged)
#    except Exception as e:
#        print(str(e))

In [27]:
#process_content()

# Named-entity recognition

In [28]:
def process_content():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged)
            print(namedEnt)
            #namedEnt.draw()
            
    except Exception as e:
        print(str(e))

In [29]:
process_content()

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (ORGANIZATION Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (GPE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  

(S
  And/CC
  one/CD
  of/IN
  the/DT
  main/JJ
  sources/NNS
  of/IN
  reaction/NN
  and/CC
  opposition/NN
  is/VBZ
  radical/JJ
  Islam/NNP
  --/:
  the/DT
  perversion/NN
  by/IN
  a/DT
  few/JJ
  of/IN
  a/DT
  noble/JJ
  faith/NN
  into/IN
  an/DT
  ideology/NN
  of/IN
  terror/NN
  and/CC
  death/NN
  ./.)
(S
  Terrorists/NNS
  like/IN
  bin/NN
  Laden/NNP
  are/VBP
  serious/JJ
  about/IN
  mass/NN
  murder/NN
  --/:
  and/CC
  all/DT
  of/IN
  us/PRP
  must/MD
  take/VB
  their/PRP$
  declared/JJ
  intentions/NNS
  seriously/RB
  ./.)
(S
  They/PRP
  seek/VBP
  to/TO
  impose/VB
  a/DT
  heartless/NN
  system/NN
  of/IN
  totalitarian/JJ
  control/NN
  throughout/IN
  the/DT
  (GPE Middle/NNP East/NNP)
  ,/,
  and/CC
  arm/NN
  themselves/PRP
  with/IN
  weapons/NNS
  of/IN
  mass/NN
  murder/NN
  ./.)
(S
  Their/PRP$
  aim/NN
  is/VBZ
  to/TO
  seize/VB
  power/NN
  in/IN
  (GPE Iraq/NNP)
  ,/,
  and/CC
  use/VB
  it/PRP
  as/IN
  a/DT
  safe/JJ
  haven/NN
  to/TO
  launch/VB

  ./.)
(S
  Yet/RB
  ,/,
  there/EX
  is/VBZ
  a/DT
  difference/NN
  between/IN
  responsible/JJ
  criticism/NN
  that/WDT
  aims/VBZ
  for/IN
  success/NN
  ,/,
  and/CC
  defeatism/NN
  that/WDT
  refuses/VBZ
  to/TO
  acknowledge/VB
  anything/NN
  but/CC
  failure/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  (PERSON Hindsight/NNP)
  alone/RB
  is/VBZ
  not/RB
  wisdom/JJ
  ,/,
  and/CC
  second-guessing/NN
  is/VBZ
  not/RB
  a/DT
  strategy/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  With/IN
  so/RB
  much/JJ
  in/IN
  the/DT
  balance/NN
  ,/,
  those/DT
  of/IN
  us/PRP
  in/IN
  public/JJ
  office/NN
  have/VBP
  a/DT
  duty/NN
  to/TO
  speak/VB
  with/IN
  candor/NN
  ./.)
(S
  A/DT
  sudden/JJ
  withdrawal/NN
  of/IN
  our/PRP$
  forces/NNS
  from/IN
  (GPE Iraq/NNP)
  would/MD
  abandon/VB
  our/PRP$
  (GPE Iraqi/NNP)
  allies/NNS
  to/TO
  death/NN
  and/CC
  prison/NN
  ,/,
  would/MD
  put/VB
  men/NNS
  like/IN
  bin/NN
  (PERSON Laden/NNP)

  ./.)
(S
  And/CC
  our/PRP$
  nation/NN
  hopes/VBZ
  one/CD
  day/NN
  to/TO
  be/VB
  the/DT
  closest/JJS
  of/IN
  friends/NNS
  with/IN
  a/DT
  free/JJ
  and/CC
  democratic/JJ
  (GPE Iran/NNP)
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  To/TO
  overcome/VB
  dangers/NNS
  in/IN
  our/PRP$
  world/NN
  ,/,
  we/PRP
  must/MD
  also/RB
  take/VB
  the/DT
  offensive/JJ
  by/IN
  encouraging/VBG
  economic/JJ
  progress/NN
  ,/,
  and/CC
  fighting/VBG
  disease/NN
  ,/,
  and/CC
  spreading/VBG
  hope/NN
  in/IN
  hopeless/JJ
  lands/NNS
  ./.)
(S
  (GPE Isolationism/NNP)
  would/MD
  not/RB
  only/RB
  tie/VB
  our/PRP$
  hands/NNS
  in/IN
  fighting/VBG
  enemies/NNS
  ,/,
  it/PRP
  would/MD
  keep/VB
  us/PRP
  from/IN
  helping/VBG
  our/PRP$
  friends/NNS
  in/IN
  desperate/JJ
  need/NN
  ./.)
(S
  We/PRP
  show/VBP
  compassion/JJ
  abroad/RB
  because/IN
  (GSP Americans/NNPS)
  believe/VBP
  in/IN
  the/DT
  God-given/NNP
  dignity/NN
  and/CC
  worth/NN
  

(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Even/RB
  in/IN
  the/DT
  face/NN
  of/IN
  higher/JJR
  energy/NN
  prices/NNS
  and/CC
  natural/JJ
  disasters/NNS
  ,/,
  the/DT
  (GPE American/JJ)
  people/NNS
  have/VBP
  turned/VBN
  in/IN
  an/DT
  economic/JJ
  performance/NN
  that/WDT
  is/VBZ
  the/DT
  envy/NN
  of/IN
  the/DT
  world/NN
  ./.)
(S
  The/DT
  (GPE American/JJ)
  economy/NN
  is/VBZ
  preeminent/JJ
  ,/,
  but/CC
  we/PRP
  can/MD
  not/RB
  afford/VB
  to/TO
  be/VB
  complacent/JJ
  ./.)
(S
  In/IN
  a/DT
  dynamic/JJ
  world/NN
  economy/NN
  ,/,
  we/PRP
  are/VBP
  seeing/VBG
  new/JJ
  competitors/NNS
  ,/,
  like/IN
  (GPE China/NNP)
  and/CC
  (GPE India/NNP)
  ,/,
  and/CC
  this/DT
  creates/VBZ
  uncertainty/NN
  ,/,
  which/WDT
  makes/VBZ
  it/PRP
  easier/JJR
  to/TO
  feed/VB
  people/NNS
  's/POS
  fears/NNS
  ./.)
(S
  So/IN
  we/PRP
  're/VBP
  seeing/VBG
  some/DT
  old/JJ
  temptations/NNS
  return/NN
  ./.)
(S
  Protectionists/NNS
  wan

(S
  And/CC
  we/PRP
  must/MD
  have/VB
  a/DT
  rational/JJ
  ,/,
  humane/JJ
  guest/JJS
  worker/NN
  program/NN
  that/WDT
  rejects/VBZ
  amnesty/JJ
  ,/,
  allows/VBZ
  temporary/JJ
  jobs/NNS
  for/IN
  people/NNS
  who/WP
  seek/VBP
  them/PRP
  legally/RB
  ,/,
  and/CC
  reduces/NNS
  smuggling/VBG
  and/CC
  crime/NN
  at/IN
  the/DT
  border/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Keeping/VBG
  (GPE America/NNP)
  competitive/JJ
  requires/VBZ
  affordable/JJ
  health/NN
  care/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Our/PRP$
  government/NN
  has/VBZ
  a/DT
  responsibility/NN
  to/TO
  provide/VB
  health/NN
  care/NN
  for/IN
  the/DT
  poor/JJ
  and/CC
  the/DT
  elderly/JJ
  ,/,
  and/CC
  we/PRP
  are/VBP
  meeting/VBG
  that/IN
  responsibility/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  For/IN
  all/DT
  Americans/NNPS
  --/:
  for/IN
  all/DT
  Americans/NNPS
  ,/,
  we/PRP
  must/MD
  confront/VB
  the/DT
  ris

(S
  Tonight/NNP
  I/PRP
  propose/VBP
  to/TO
  train/VB
  70,000/CD
  high/JJ
  school/NN
  teachers/NNS
  to/TO
  lead/VB
  advanced-placement/JJ
  courses/NNS
  in/IN
  math/NN
  and/CC
  science/NN
  ,/,
  bring/VBG
  30,000/CD
  math/NN
  and/CC
  science/NN
  professionals/NNS
  to/TO
  teach/VB
  in/IN
  classrooms/NNS
  ,/,
  and/CC
  give/VB
  early/JJ
  help/NN
  to/TO
  students/NNS
  who/WP
  struggle/VBP
  with/IN
  math/NN
  ,/,
  so/IN
  they/PRP
  have/VBP
  a/DT
  better/JJR
  chance/NN
  at/IN
  good/JJ
  ,/,
  high-wage/JJ
  jobs/NNS
  ./.)
(S
  If/IN
  we/PRP
  ensure/VB
  that/IN
  (GPE America/NNP)
  's/POS
  children/NNS
  succeed/VB
  in/IN
  life/NN
  ,/,
  they/PRP
  will/MD
  ensure/VB
  that/IN
  (GPE America/NNP)
  succeeds/VBZ
  in/IN
  the/DT
  world/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Preparing/VBG
  our/PRP$
  nation/NN
  to/TO
  compete/VB
  in/IN
  the/DT
  world/NN
  is/VBZ
  a/DT
  goal/NN
  that/IN
  all/DT
  of/IN
  us/PRP


(S
  This/DT
  year/NN
  we/PRP
  will/MD
  add/VB
  resources/NNS
  to/TO
  encourage/VB
  young/JJ
  people/NNS
  to/TO
  stay/VB
  in/IN
  school/NN
  ,/,
  so/RB
  more/JJR
  of/IN
  (GPE America/NNP)
  's/POS
  youth/NN
  can/MD
  raise/VB
  their/PRP$
  sights/NNS
  and/CC
  achieve/VBP
  their/PRP$
  dreams/NNS
  ./.)
(S
  A/DT
  hopeful/JJ
  society/NN
  comes/VBZ
  to/TO
  the/DT
  aid/NN
  of/IN
  fellow/JJ
  citizens/NNS
  in/IN
  times/NNS
  of/IN
  suffering/NN
  and/CC
  emergency/NN
  --/:
  and/CC
  stays/NNS
  at/IN
  it/PRP
  until/IN
  they/PRP
  're/VBP
  back/RB
  on/IN
  their/PRP$
  feet/NNS
  ./.)
(S
  So/RB
  far/RB
  the/DT
  federal/JJ
  government/NN
  has/VBZ
  committed/VBN
  $/$
  85/CD
  billion/CD
  to/TO
  the/DT
  people/NNS
  of/IN
  the/DT
  (ORGANIZATION Gulf/NNP Coast/NNP)
  and/CC
  (GSP New/NNP Orleans/NNP)
  ./.)
(S
  We/PRP
  're/VBP
  removing/VBG
  debris/NN
  and/CC
  repairing/NN
  highways/NNS
  and/CC
  rebuilding/VBG
  stronger/JJR
  le

# Lemmatization

In [30]:
from nltk.stem import WordNetLemmatizer

In [31]:
lemmatizer= WordNetLemmatizer()

In [32]:
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("cat"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", pos="a")) #a stands for adjective
print(lemmatizer.lemmatize("run",'v')) #v stands for verb
print(lemmatizer.lemmatize("best", pos='a'))

cat
cactus
goose
cat
rock
python
better
good
run
best


# NLTK Corpora

In [33]:
#Example
from nltk.corpus import gutenberg

In [34]:
sample = gutenberg.raw('bible-kjv.txt')

In [35]:
tok =sent_tokenize(sample)

In [36]:
print(tok[5:15])

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

In [37]:
#let's try Indian Corpus
from nltk.corpus import indian

In [38]:
indian.fileids()

['bangla.pos', 'hindi.pos', 'marathi.pos', 'telugu.pos']

In [39]:
for f in indian.fileids():
    print(f," ", len(indian.raw(f)))

bangla.pos   118283
hindi.pos   103087
marathi.pos   229853
telugu.pos   138273


In [40]:
for f in indian.fileids():
    print(f," ", len(indian.words(f)))

bangla.pos   10281
hindi.pos   9408
marathi.pos   19066
telugu.pos   9999


In [41]:
for f in indian.fileids():
    print(f," ", len(indian.sents(f)))

bangla.pos   896
hindi.pos   540
marathi.pos   1197
telugu.pos   994


In [42]:
hindi_send=indian.sents('hindi.pos')

In [43]:
for i in hindi_send:
    print(i)

['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक']
['संयुक्त', 'राष्ट्र', '।']
['इराक', 'के', 'विदेश', 'मंत्री', 'ने', 'अमरीका', 'के', 'उस', 'प्रस्ताव', 'का', 'मजाक', 'उड़ाया', 'है', ',', 'जिसमें', 'अमरीका', 'ने', 'संयुक्त', 'राष्ट्र', 'के', 'प्रतिबंधों', 'को', 'इराकी', 'नागरिकों', 'के', 'लिए', 'कम', 'हानिकारक', 'बनाने', 'के', 'लिए', 'कहा', 'है', '।']
['विदेश', 'मंत्री', 'का', 'कहना', 'है', 'कि', 'चूंकि', 'बगदाद', 'संयुक्त', 'राष्ट्र', 'की', 'मांगों', 'का', 'पालन', 'करते', 'हुए', 'अपने', 'भारी', 'विनाशकारी', 'हथियारों', 'को', 'नष्ट', 'कर', 'रहा', 'है', '।']
['लिहाजा', 'प्रतिबंधों', 'को', 'पूर्ण', 'रूप', 'से', 'उठा', 'दिया', 'जाना', 'चाहिए', '।']
['विदेश', 'मंत्री', 'मोहम्मद', 'सईद', 'का', 'कहना', 'है', 'कि', 'वे', 'इसे', "'सुव्यवस्थित", "प्रतिबंध'", 'कह', 'कर', 'आम', 'राय', 'और', 'सुरक्षा', 'परिषद', 'को', 'छल', 'रहे', 'हैं', '।']
['बेनजीर', 'की', 'सुनवाई', 'स्थगित']
['कराची', '।']
['पाकिस्तान', 'की', 'पूर्व', 'प्रधानमंत्री', 'बेनजीर', 'भुट्टो', 'पर', 'लगे', 'भ्रष्टाचार', 'के', 'आरोपों', 'के', '

['लड़कियों', 'को', 'हथियार', 'चलाना', ',', 'जासूसी', 'करना', 'और', 'मानव', 'बम', 'बन', 'कर', 'इस्लाम', 'के', 'नाम', 'पर', 'शहीद', 'होने', 'की', 'ट्रेनिंग', 'खुलेआम', 'दी', 'जाती', 'है', '।']
['आने', 'वाले', 'समय', 'में', 'इस', 'कारण', 'यदि', 'साक्षरता', 'का', 'प्रतिशत', 'और', 'भी', 'घट', 'जाए', 'तो', 'आश्चर्य', 'की', 'बात', 'नहीं', '।']
['पाकिस्तान', 'मानवअधिकार', 'आयोग', 'ने', 'बतलाया', 'है', 'कि', 'सैनिक', 'शासन', 'के', 'दौरान', 'धार्मिक', 'कटटरता', 'बढ़ी', 'है', '।']
['फौजी', 'सरकार', 'अपनी', 'रक्षा', 'के', 'लिये', 'इस्लाम', 'के', 'नाम', 'का', 'उपयोग', 'करती', 'है', 'और', 'यह', 'बतलाने', 'की', 'कोशिश', 'करती', 'है', 'कि', 'लोकतंत्र', 'से', 'इस्लाम', 'का', 'कोई', 'लेना', 'देना', 'नहीं', '।']
['यहां', 'तक', 'कहा', 'जाता', 'है', 'कि', 'उदाहरण', 'के', 'लिये', 'इजिप्ट', 'और', 'सऊदी', 'की', 'सरकार', 'है', '।']
['दोनों', 'ही', 'प्राचीनतम', 'इस्लामी', 'देश', 'हैं', '।']
['क्या', 'वहां', 'लोकतंत्र', 'है', '?']
['यदि', 'नहीं', 'तो', 'फिर', 'पाकिस्तान', 'में', 'क्या', 'आवश्यकता', 'है', '।']
['इस

['यह', 'खबर', 'विश्वसनीय', 'सूत्रों', 'से', 'बुधवार', 'को', 'मिली', '।']
['ट्रेवर', 'ऑस्ट्र्रेलिया', 'के', 'दो', 'महान', 'क्रिकेटरों', 'इयान', 'और', 'ग्रेग', 'चैपल', 'के', 'भाई', 'हैं', '।']
['ट्रेवर', 'की', 'चर्चा', '१९८१', 'में', 'जोरों', 'से', 'हुई', 'जब', 'उन्हें', 'उनके', 'भाई', 'ग्रेग', 'ने', 'एक', 'महत्त्वपूर्ण', 'एकदिवसीय', 'मैच', 'में', 'न्यू', 'जीलैंड', 'को', 'जीत', 'से', 'वंचित', 'करने', 'के', 'लिए', 'अंतिम', 'गेंद', 'अंडरआर्म', 'डालने', 'का', 'आदेश', 'दिया', 'था', '।']
['उस', 'अंतिम', 'गंेद', 'को', 'छक्का', 'के', 'लिए', 'मैदान', 'से', 'बाहर', 'कर', 'न्यू', 'जीलैंड', 'की', 'टीम', 'मैच', 'जीत', 'जा', 'सकती', 'थी', '।']
['सूत्र', 'के', 'अनुसार', 'टीम', 'के', 'फिजियो', 'जॉन', 'ग्लास्टर', 'होंगे', '।']
['ट्रेवर', 'और', 'जॉन', 'जल्दी', 'ही', 'अपने', 'कार्यभार', 'संभाल', 'लेंगे', '।']
['अनुबंध', 'के', 'तहत', 'हर', 'वर्ष', 'बांगलादेश', 'के', 'दो', 'क्रिकेटरों', 'को', 'ऑस्ट्रेलियाई', 'क्रिकेट', 'अकादमी', 'में', 'प्रशिक्षण', 'दिया', 'जायेगा', '।']
['ऑस्ट्रेलिया', 'के', 'क्रिकेट', 'वि

['हैरिस', '१४', 'और', 'ओरम', '१६', 'रन', 'बनाकर', 'नाबाद', 'रहे', '।', 'शतक', 'बनाने', 'वाले', 'नाथन', 'एस्टल', 'को', 'मैन', 'ऑफ', 'द', 'मैच', 'घोषित', 'किया', 'गया', '।']
['न्यू', 'जीलैंड', 'की', 'पारी', 'की', 'शुरुआत', 'जबरदस्त', 'रही', '।', 'फ्लेमिंग', 'और', 'एस्टल', 'की', 'सलामी', 'जोड़ी', 'ने', 'मिलकर', 'पहले', 'विकेट', 'के', 'लिए', '१९३', 'रनों', 'की', 'भागीदारी', 'की', '।']
['वकार', 'की', 'गेंद', 'पर', 'बोल्ड', 'होने', 'से', 'पूर्व', 'फ्लेमिंग', 'ने', '९१', 'गेंदों', 'का', 'सामना', 'करते', 'हुए', 'पांच', 'चौकों', 'और', 'दो', 'छक्के', 'की', 'मदद', 'से', '६०', 'रनों', 'की', 'पारी', 'खेली', '।']
['दूसरा', 'विकेट', 'रोजर', 'टुस', 'का', 'गिरा', 'जिन्हें', 'वसीम', 'अकरम', 'ने', 'शाहिद', 'आफरिदी', 'के', 'हाथों', '१७', 'रन', 'पर', 'लपकवाया', '।']
['तीसरा', 'विकेट', 'नाथन', 'एस्टल', 'का', 'गिरा', 'जिन्होंने', '११९', 'रनों', 'की', 'पारी', 'खेली', '।']
['उन्होंने', '११६', 'गेंदों', 'का', 'सामना', 'कर', '२१', 'चौके', 'लगाए', '।']
['उन्हें', 'वकार', 'ने', 'सलीम', 'ईलाही', 'के', 'हाथों', 'कैच'

['बजट', 'सत्र', 'शुक्र', 'से']
['पटना', '।']
['बिहार', 'विधानसभा', 'का', 'बजट', 'सत्र', 'शुक्रवार', 'से', 'शुरू', 'हो', 'रहा', 'है', '।']
['इसमें', 'पंचायत', 'चुनाव', 'में', 'आरक्षण', 'का', 'मुद्दा', 'मुख्य', 'रहेगा', '।']
['इस', 'मुद्दे', 'पर', 'सभी', 'विपक्षी', 'दलों', 'ने', 'अपनी', 'कमर', 'कस', 'ली', 'है', '।']
['इस', 'आरक्षण', 'को', 'लेकर', 'अधिसूचना', 'पहले', 'ही', 'जारी', 'कर', 'दी', 'गई', 'थी', '।']
['इस', 'मुद्दे', 'को', 'लेकर', 'सदन', 'में', 'शोर-शराबा', 'हो', 'सकता', 'है', '।']
['विपक्ष', 'के', 'नेता', 'सुशील', 'कुमार', 'मोदी', 'ने', 'कहा', 'कि', 'उनकी', 'पार्टी', 'पंचायत', 'चुनाव', 'में', 'आरक्षण', 'के', 'मुद्दे', 'के', 'अलावा', 'राज्य', 'सरकार', 'और', 'बिहार', 'कैडर', 'के', 'आईएएस', 'अधिकारियों', 'के', 'बीच', 'चल', 'रहे', 'संघर्ष', 'के', 'मुद्दे', 'को', 'भी', 'उठाएगी', '।']
['मोदी', 'ने', 'कहा', 'कि', 'भविष्य', 'की', 'रणनीति', 'की', 'योजना', 'भारतीय', 'जनता', 'पार्टी', 'लेजिस्लेटर', 'पार्टी', 'के', 'बैठक', 'में', 'ली', 'जाएगी', '।']
['बिहार', 'के', 'राज्यपाल', 'वी', '.', 'स

['वित्तमंत्री', 'वाला', 'के', 'पुत्र', 'पर', 'आरोप', 'है', 'कि', 'उन्होंने', 'रेसकोर्स', 'काम्पलेक्स', 'में', 'बहुमंजिली', 'इमारत', 'का', 'घटिया', 'निर्माण', 'किया', 'है', '।']
['वाला', 'पर', 'भारतीय', 'पैनल', 'कोड', '४०६', ',', '४२०', 'तथा', '११४', 'के', 'तहत', 'मामला', 'दर्ज', 'किया', 'गया', 'है', '।']
['भवन', 'निर्माण', 'में', 'लापरवाही', 'बरतने', 'की', 'शिकायत', 'रेसकोर्स', 'काम्पलेक्स', 'के', 'निवासियों', 'ने', 'पुलिस', 'को', 'दर्ज', 'कराई', 'थी', '।']
['पिछले', 'दिनों', '२६', 'जनवरी', 'को', 'आए', 'भयानक', 'भूकंप', 'में', 'वाला', 'द्वारा', 'बनाई', 'गई', 'बहुमंजिला', 'इमारत', 'में', 'दरारें', 'पड़', 'गई', 'थीं', '।']
['इसके', 'अलावा', 'तीन', 'अन्य', 'बिल्डरों', 'दिलखुश', 'सेठ', ',', 'अतुल', 'सेठ', 'व', 'नाजाभाई', 'पटेल', 'पर', 'भी', 'मामला', 'दर्ज', 'किया', 'गया', 'है', '।']
['इस', 'बीच', 'अहमदाबाद', 'के', "'गिरिराज", "एवेन्यू'", 'बिल्डिंग', 'के', '२', 'निर्माताओं', 'को', 'पुलिस', 'ने', 'गिरफ्तार', 'कर', 'लिया', 'है', '।']
['२६', 'जनवरी', 'को', 'इस', 'भवन', 'के', 'गिरने', 'से', '१०'

['मैकमिलन', 'ने', '८३', 'गेंदों', 'में', 'छह', 'चौकों', 'और', 'एक', 'छक्के', 'की', 'मदद', 'से', '६४', 'रनों', 'की', 'पारी', 'खेली', '।']
['न्यू', 'जीलैंड', 'के', '१५२', 'रन', 'के', 'योग', 'पर', 'मैकमिलन', 'को', 'अब्दुर', 'रज्जाक', 'ने', 'यूसुफ', 'योहाना', 'के', 'हाथों', 'लपकवाया', '।']
['छठा', 'विकेट', 'भी', '१५२', 'रन', 'पर', 'ही', 'गिरा', 'जब', 'क्रिस', 'हैरिस', 'को', 'अब्दुर', 'रज्जाक', 'ने', '११', 'रन', 'पर', 'बोल्ड', 'कर', 'दिया', '।']
['विट्टोरी', ',', 'टफी', ',', 'फ्रैक्लीन', 'और', 'मार्टिन', 'तेज', 'बल्लेबाजी', 'नही', 'कर', 'पाये', 'और', 'न्यू', 'जीलैंड', 'की', 'पारी', '२१५', 'रनों', 'पर', 'सिमट', 'गई', '।']
['३९', 'गेंदों', 'में', 'दो', 'चौकों', 'और', 'एक', 'छक्के', 'की', 'मदद', 'से', '३४', 'रन', 'बनाने', 'वाले', 'परोरे', 'अंत', 'तक', 'आउट', 'नहीं', 'हुए', '।']
['अकरम', 'और', 'सकलैन', 'ने', 'तीन-तीन', ',', 'रज्जाक', 'ने', 'दो', 'और', 'युनूस', 'तथा', 'महमूद', 'ने', 'एक-एक', 'विकेट', 'लिये', '।']
['इससे', 'पहले', 'पाकिस्तान', 'ने', 'निर्धारित', '५०', 'ओवर', 'में', 'नौ', 'विकेट',

In [44]:
hindi_words=indian.words('hindi.pos')

In [45]:
print(hindi_words[0:11])

['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', 'राष्ट्र', '।', 'इराक', 'के', 'विदेश']


In [46]:
first10=hindi_words[0:10]

In [47]:
for i in first10:
    print(i)

पूर्ण
प्रतिबंध
हटाओ
:
इराक
संयुक्त
राष्ट्र
।
इराक
के


# WordNet

With wordnet you can look for synonym, antonym and more

In [48]:
from nltk.corpus import wordnet

In [49]:
syns = wordnet.synsets("program")  # synsets() for synonym 
print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [50]:
print(syns[0].lemmas()[0].name()) #just the word

plan


In [51]:
print(syns[0].definition()) # the definition

a series of steps to be carried out or goals to be accomplished


In [52]:
# Example
print(syns[0].examples()) #gives a sereis of sentence that used the word

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [53]:
synonyms = []
antonyms = [] 

In [54]:
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))

{'adept', 'just', 'safe', 'respectable', 'practiced', 'secure', 'ripe', 'well', 'undecomposed', 'unspoilt', 'expert', 'in_force', 'skilful', 'commodity', 'trade_good', 'good', 'dependable', 'soundly', 'skillful', 'thoroughly', 'sound', 'beneficial', 'salutary', 'proficient', 'honorable', 'serious', 'effective', 'upright', 'honest', 'dear', 'goodness', 'unspoiled', 'estimable', 'near', 'in_effect', 'full', 'right'}
{'badness', 'bad', 'ill', 'evilness', 'evil'}


Symantics similarity

In [55]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))  # this function compares the word and tells on scale of 0-1 how much they are similar

0.9090909090909091


In [56]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")
print(w1.wup_similarity(w2))
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2))

0.6956521739130435
0.32


# Text Classification

In [57]:
# here we are trying to make our own text classifier and get whether the given
# this is +ve or -ve, similar to sentiment analysis

In [58]:
import random #to shuffle up the dataset, not always necessary
from nltk.corpus import movie_reviews

In [59]:
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

In [60]:
random.shuffle(documents)

In [61]:
print(documents[1])

(['humanities', 'quest', 'for', 'knowledge', 'never', 'ends', '.', 'so', 'a', 'team', 'of', 'scientists', 'and', 'film', '-', 'makers', 'travel', 'to', 'the', 'amazon', 'to', 'search', 'for', 'a', 'legendary', 'indian', 'tribe', '.', 'the', 'party', 'consists', 'of', 'anthropologist', 'steven', 'cale', '(', 'eric', 'stoltz', ')', 'and', 'the', 'camera', 'team', 'consisting', 'of', 'terri', 'flores', '(', 'jennifer', 'lopez', ')', ',', 'danny', 'rich', '(', 'ice', 'cube', ')', ',', 'gary', 'dixon', '(', 'owen', 'wilson', ')', ',', 'denise', 'kahlberg', '(', 'kari', 'wuhrer', ')', 'and', 'warren', 'westridge', '(', 'jonathan', 'hyde', ')', '.', 'early', 'on', 'their', 'journey', 'they', 'meet', 'paul', 'sarone', '(', 'jon', 'voight', ')', 'whose', 'boat', 'is', 'stuck', 'on', 'the', 'shore', '.', 'they', 'agree', 'to', 'give', 'him', 'a', 'ride', 'to', 'the', 'next', 'village', '.', 'he', 'claims', 'to', 'know', 'the', 'area', 'well', 'and', 'can', 'be', 'useful', 'locating', 'the', 'nat

What we will do is that we will take every word from every review, and from those we will take the most popular words that were used. Then from those popular words, we will see which appeared in +ve or -ve text and then we will search for those words, and then which ever has more +ve and -ve words we classify that

In [62]:
all_words=[]
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [63]:
# lets see how many time the word stupid appeared
print(all_words['stupid'])

253


In [64]:
word_features = list(all_words.keys())[:3000]

In [65]:
def find_features(document):
    word = set(document)
    features = {}
    for w in word_features:
        features[w]= (w in words) #if the word in the document is present in the word_feature, true otherwise false
    return features

In [66]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))



NameError: name 'document' is not defined

In [69]:
featureset=[(find_features(rev), category) for (rev, category) in documents]

# Naive Bayes
to see what is -ve and +ve sentiment

In [70]:
training_set= featureset[:1900]
testing_set= featureset[1900:]

In [71]:
classifer= nltk.NaiveBayesClassifier.train(training_set)

In [75]:
print("Accuacry is:", (nltk.classify.accuracy(classifer, testing_set))*100)
print(classifer.show_most_informative_features(25))

Accuacry is: 47.0
Most Informative Features
                     pad = False             neg : pos    =      1.0 : 1.0
                   leads = False             neg : pos    =      1.0 : 1.0
               cleveland = False             neg : pos    =      1.0 : 1.0
                    jump = False             neg : pos    =      1.0 : 1.0
               infertile = False             neg : pos    =      1.0 : 1.0
                 effects = False             neg : pos    =      1.0 : 1.0
                   crazy = False             neg : pos    =      1.0 : 1.0
                     end = False             neg : pos    =      1.0 : 1.0
                    note = False             neg : pos    =      1.0 : 1.0
               mcdormand = False             neg : pos    =      1.0 : 1.0
                  career = False             neg : pos    =      1.0 : 1.0
                    poor = False             neg : pos    =      1.0 : 1.0
                disaster = False             neg : pos  