In [1]:
import nltk
from nltk.tokenize import word_tokenize

We will see how we can remove content from chunks; this is the essence of chinking. Chinking is similar to chunking, except that we identify the sequences of tokens we would like to remove from the chunks identified by a chunk tagger.
First we need to tokenize input text and then label it with part of speech tags before we get into chinking and chunking.

In [2]:
# Sample text for the chinking demo. The embedded newline and the four-space
# indent of the second line are part of the string value.
sent = (
    "We are going to chink this sentence to remove all nouns. All of the other\n"
    "    words will be there. Except for the nouns"
)

We are going to chink this sentence to remove all nouns. Next, we will use word tokenization and part-of-speech tagging to prepare the sentence for chunking.

In [3]:
# Convert the example string into a list of (word, tag) tuples: first split
# it into word tokens, then label each token with its part-of-speech tag.
tokens = word_tokenize(sent)
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('chink', 'VB'), ('this', 'DT'), ('sentence', 'NN'), ('to', 'TO'), ('remove', 'VB'), ('all', 'DT'), ('nouns', 'NNS'), ('.', '.'), ('All', 'DT'), ('of', 'IN'), ('the', 'DT'), ('other', 'JJ'), ('words', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('there', 'RB'), ('.', '.'), ('Except', 'IN'), ('for', 'IN'), ('the', 'DT'), ('nouns', 'NNS')]


In [4]:
# Now we have to chunk our sentence using a chunking rule.
chunkRule = r"Chunk: {<.*>+}"
# This rule creates a single chunk containing every token of the text.
# We will pass this rule to nltk.RegexpParser.

In [5]:
# Build a parser from the chunk grammar defined above.
chunkParser = nltk.RegexpParser(chunkRule)
# The parser works on POS-tagged tokens, so tokenize and tag the text first.
tokens = word_tokenize(sent)
tagged_tokens = nltk.pos_tag(tokens)
chunkSent = chunkParser.parse(tagged_tokens)

In [6]:
chunkSent.draw() # draw the parse result so we can see a tree visualization of our chunked sentence

Now we will implement chinking to exclude nouns from our chunked sentence. Let's modify our chunking rule. Chunking rules are written inside `{` and `}`, whereas chinking rules are written inside the inverted braces `}` and `{`.

In [7]:
# Two rules for the "Chunk" stage:
#   {<.*>+}    chunk rule: put every token into one chunk
#   }<NN.*>+{  chink rule: carve runs of nouns back out of that chunk
# <NN.*> matches every tag that starts with NN (NN, NNS, NNP, NNPS), so the
# individual noun tags do not need to be listed as separate alternatives.
chunkRule = r"""Chunk: {<.*>+}
                        }<NN.*>+{"""

In [8]:
# Apply the chinking rule: rebuild the parser from the updated grammar, then
# run it over the tokenized, POS-tagged text.
chunkParser = nltk.RegexpParser(chunkRule)
tokens = word_tokenize(sent)
tagged_tokens = nltk.pos_tag(tokens)
chunkSent = chunkParser.parse(tagged_tokens)

In [9]:
chunkSent.draw() # in our tree diagram, we can see our sentence is divided into several chunks rather than one,
# because the nouns are excluded according to our chinking rule.

In [10]:
# One more example of chunking and chinking rules. This time we are excluding
# the verbs instead of the nouns.
import nltk
from nltk.tokenize import word_tokenize

# Sample sentence; "remove" must be spelled correctly, since a misspelled
# word may not be tagged as a verb and the demo would not show it chinked.
sent = "We are going to chink this sentence to remove all verbs"

# Chunk every token, then chink runs of verbs. <VB.*> matches every tag that
# starts with VB (VB, VBD, VBG, VBN, VBP, VBZ), so the individual verb tags
# do not need to be listed as separate alternatives.
chunkRule = r"""Chunk: {<.*>+}
                        }<VB.*>+{"""
chunkParser = nltk.RegexpParser(chunkRule)

# Apply word tokenization, part-of-speech tagging and regular-expression
# parsing to the sample sentence.
chunkSent = chunkParser.parse(nltk.pos_tag(word_tokenize(sent)))
chunkSent.draw() # create the tree diagram