# **Chunking and Chinking**

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.chunk import RegexpParser

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Read the content of the file
with open('/content/sample.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

In [4]:
# Tokenize the text into sentences
sentences = sent_tokenize(text_data)

tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
pos_tagged_sentences = [pos_tag(sentence) for sentence in tokenized_sentences]

In [5]:
# Define the chunking grammar
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # Noun Phrase
    PP: {<IN><NP>}          # Prepositional Phrase
    VP: {<VB.*><NP|PP>*}    # Verb Phrase
"""

In [6]:
chunk_parser = RegexpParser(grammar)

# Perform chunking
chunked_sentences = [chunk_parser.parse(sentence) for sentence in pos_tagged_sentences]

# Chinking: remove everything that is not a noun phrase from NP
chinked_sentences = []
for sentence in chunked_sentences:
    for subtree in sentence.subtrees():
        if subtree.label() == 'NP':
            chinked_sentence = RegexpParser("NP: {<.*>+}").parse(subtree.leaves())
            chinked_sentences.append(chinked_sentence)

In [8]:
for i, sentence in enumerate(chunked_sentences):
    print(f"\nChunked Sentence {i + 1}:\n{sentence}")


Chunked Sentence 1:
(S (NP The/DT cat/NN) (VP chased/VBD (NP the/DT mouse/NN)) ./.)

Chunked Sentence 2:
(S
  (NP The/DT dog/NN)
  (VP barked/VBD)
  loudly/RB
  (PP at/IN (NP the/DT mailman/NN))
  ./.)

Chunked Sentence 3:
(S
  Birds/NNS
  (VP are/VBP)
  (VP flying/VBG)
  high/JJ
  (PP in/IN (NP the/DT sky/NN))
  ./.)

Chunked Sentence 4:
(S
  John/NNP
  (VP is/VBZ)
  (VP walking/VBG)
  his/PRP$
  (NP dog/NN)
  (PP in/IN (NP the/DT park/NN))
  ./.)

Chunked Sentence 5:
(S
  (NP The/DT sun/NN)
  (VP is/VBZ)
  (VP shining/VBG)
  brightly/RB
  (NP today/NN)
  ./.)

Chunked Sentence 6:
(S
  She/PRP
  (VP is/VBZ)
  (VP
    reading/VBG
    (NP a/DT book/NN)
    (PP about/IN (NP artificial/JJ intelligence/NN)))
  and/CC
  (NP machine/NN)
  (NP learning/NN)
  ./.)

Chunked Sentence 7:
(S
  It/PRP
  (VP is/VBZ (NP a/DT beautiful/JJ day/NN))
  outside/IN
  with/IN
  clear/JJ
  blue/JJ
  skies/NNS
  ./.)

Chunked Sentence 8:
(S
  (NP The/DT river/NN)
  (VP flows/VBZ)
  calmly/RB
  (PP through/IN

In [9]:
for i, sentence in enumerate(chinked_sentences):
    print(f"\nChinked Sentence {i + 1}:\n{sentence}")


Chinked Sentence 1:
(S (NP The/DT cat/NN))

Chinked Sentence 2:
(S (NP the/DT mouse/NN))

Chinked Sentence 3:
(S (NP The/DT dog/NN))

Chinked Sentence 4:
(S (NP the/DT mailman/NN))

Chinked Sentence 5:
(S (NP the/DT sky/NN))

Chinked Sentence 6:
(S (NP dog/NN))

Chinked Sentence 7:
(S (NP the/DT park/NN))

Chinked Sentence 8:
(S (NP The/DT sun/NN))

Chinked Sentence 9:
(S (NP today/NN))

Chinked Sentence 10:
(S (NP a/DT book/NN))

Chinked Sentence 11:
(S (NP artificial/JJ intelligence/NN))

Chinked Sentence 12:
(S (NP machine/NN))

Chinked Sentence 13:
(S (NP learning/NN))

Chinked Sentence 14:
(S (NP a/DT beautiful/JJ day/NN))

Chinked Sentence 15:
(S (NP The/DT river/NN))

Chinked Sentence 16:
(S (NP the/DT valley/NN))

Chinked Sentence 17:
(S (NP the/DT distance/NN))

Chinked Sentence 18:
(S (NP the/DT sound/NN))

Chinked Sentence 19:
(S (NP the/DT shore/NN))

Chinked Sentence 20:
(S (NP A/DT gentle/JJ breeze/NN))

Chinked Sentence 21:
(S (NP jump/NN))

Chinked Sentence 22:
(S (NP 

In [12]:
with open('sample.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

sentences = sent_tokenize(text_data)

tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
pos_tagged_sentences = [pos_tag(sentence) for sentence in tokenized_sentences]

# Chunking grammar
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # Noun Phrase
    PP: {<IN><NP>}          # Prepositional Phrase
    VP: {<VB.*><NP|PP>*}    # Verb Phrase
"""

chunk_parser = RegexpParser(grammar)

chunked_sentences = [chunk_parser.parse(sentence) for sentence in pos_tagged_sentences]

# Chinking grammar
chink_grammar = r"""
    NP: {<.*>+}          # Capture all inside NP
    }<VB.*>|<IN>{        # Remove verbs (VB) and prepositions (IN) from NP
"""

chinked_sentences = [RegexpParser(chink_grammar).parse(sentence) for sentence in chunked_sentences]

for i, (chunked, chinked) in enumerate(zip(chunked_sentences, chinked_sentences)):
    print(f"\nChunked Sentence {i + 1}:\n{chunked}")
    print(f"Chinked Sentence {i + 1}:\n{chinked}")


Chunked Sentence 1:
(S (NP The/DT cat/NN) (VP chased/VBD (NP the/DT mouse/NN)) ./.)
Chinked Sentence 1:
(S (NP (NP The/DT cat/NN) (VP chased/VBD (NP the/DT mouse/NN)) ./.))

Chunked Sentence 2:
(S
  (NP The/DT dog/NN)
  (VP barked/VBD)
  loudly/RB
  (PP at/IN (NP the/DT mailman/NN))
  ./.)
Chinked Sentence 2:
(S
  (NP
    (NP The/DT dog/NN)
    (VP barked/VBD)
    loudly/RB
    (PP at/IN (NP the/DT mailman/NN))
    ./.))

Chunked Sentence 3:
(S
  Birds/NNS
  (VP are/VBP)
  (VP flying/VBG)
  high/JJ
  (PP in/IN (NP the/DT sky/NN))
  ./.)
Chinked Sentence 3:
(S
  (NP
    Birds/NNS
    (VP are/VBP)
    (VP flying/VBG)
    high/JJ
    (PP in/IN (NP the/DT sky/NN))
    ./.))

Chunked Sentence 4:
(S
  John/NNP
  (VP is/VBZ)
  (VP walking/VBG)
  his/PRP$
  (NP dog/NN)
  (PP in/IN (NP the/DT park/NN))
  ./.)
Chinked Sentence 4:
(S
  (NP
    John/NNP
    (VP is/VBZ)
    (VP walking/VBG)
    his/PRP$
    (NP dog/NN)
    (PP in/IN (NP the/DT park/NN))
    ./.))

Chunked Sentence 5:
(S
  (NP The/