## Chinking

In [1]:
import nltk
from nltk.corpus import stopwords,state_union
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer

In [3]:
## train and test text

# Raw text of the 2005 address from the state_union corpus; passed to the
# sentence tokenizer below as its training text.
train_text = state_union.raw("2005-GWBush.txt")
# Raw text of the 2006 address from the same corpus; this is the text that
# gets split into sentences and analysed.
test_text = state_union.raw("2006-GWBush.txt")

# Punkt sentence tokenizer constructed with train_text, so its sentence
# boundary model is derived from that text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [5]:
# Split the test text into sentences with the custom tokenizer; the result is
# sliced and word-tokenized sentence by sentence further down.
tokenized = custom_sent_tokenizer.tokenize(test_text)

In [15]:
def pos_tagging_chunking_chinking():
    """POS-tag, chunk and chink the sentences in ``tokenized[13:15]``.

    Each selected sentence is split into words and POS-tagged, the tagged
    list is printed, and the tree produced by the chunk/chink grammar is
    shown by calling its ``draw()`` method.

    The grammar first chunks every token (``{<.*>+}``) and then chinks,
    i.e. removes from those chunks, every match of
    ``<VB.?>|<IN><DT><TO>+``.

    Any exception is caught and printed rather than raised, so a failure on
    one sentence ends the run early.
    """
    try:
        # The grammar and parser do not depend on the sentence being
        # processed, so build them once instead of on every iteration.
        chunk_grammar = r"""Chunk : {<.*>+}  
                               }<VB.?>|<IN><DT><TO>+{"""
        chunk_parser = nltk.RegexpParser(chunk_grammar)

        for sentence in tokenized[13:15]:
            pos_tagged = nltk.pos_tag(word_tokenize(sentence))
            print(pos_tagged)
            chunk_parser.parse(pos_tagged).draw()
    except Exception as e:
        # Best-effort notebook demo: report the problem instead of crashing.
        print(e)


pos_tagging_chunking_chinking()

[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('In', 'IN'), ('this', 'DT'), ('decisive', 'JJ'), ('year', 'NN'), (',', ','), ('you', 'PRP'), ('and', 'CC'), ('I', 'PRP'), ('will', 'MD'), ('make', 'VB'), ('choices', 'NNS'), ('that', 'WDT'), ('determine', 'VBP'), ('both', 'DT'), ('the', 'DT'), ('future', 'NN'), ('and', 'CC'), ('the', 'DT'), ('character', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('country', 'NN'), ('.', '.')]


*   `Chunk :` — This is the name of the chunk, i.e. the chunk label (the text before the colon). It's an arbitrary label used to identify the chunks produced by the rules that follow it.
*   `{<.*>+}`: This is the chunking rule. It specifies a pattern for capturing any sequence of one or more words, regardless of their part-of-speech tags. Inside the angle brackets, `.*` matches any part-of-speech tag, and the `+` indicates one or more occurrences.
*   `}<VB.?>|<IN><DT><TO>+{`: This is the chinking rule (note the inverted braces `}...{`). It defines the token sequences that should be removed from the chunks created by the previous pattern. It specifies two alternative patterns separated by the `|` symbol:
    * `<VB.?>`: This matches any verb tag: `VB` optionally followed by one more character (for example `VBD`, `VBP` or `VBZ`).
    * `<IN><DT><TO>+`: This matches a preposition (IN), followed by a determiner (DT), followed by one or more infinitive markers (TO). The `+` applies only to `<TO>`, not to the whole sequence.

    
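In summary, the grammar first chunks every sequence of one or more words and then chinks (removes) from those chunks any verb, as well as any preposition–determiner–infinitive-marker sequence. The chinked tokens are not discarded from the sentence: they simply end up outside the chunks, so a chunk that contained a verb is split into the pieces on either side of it. For example, in the second sentence printed above, `make` (VB) and `determine` (VBP) are excluded and the remaining words form the chunks. This pattern aims to identify and extract chunks that represent meaningful phrases or groupings in the text, excluding certain patterns that are typically not wanted inside chunks.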