## Chunking using Shallow parsing

In [8]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [9]:
sentence = "The quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)

# chunk grammar for NP (Noun Phrase), VP (Verb Phrase), and PP (Prepositional Phrase)
chunk_grammar = r"""
    NP: {<DT>?<JJ>*<NN.*>}     # Determiner (optional) + Adjective (0 or more) + Noun
    VP: {<VB.*><NP|PP>*}       # Verb + (Optional NP or PP)
    PP: {<IN><NP>}             # Preposition + NP
"""
# Create chunk parser
chunk_parser = nltk.RegexpParser(chunk_grammar)
chunks = chunk_parser.parse(pos_tags)
print(chunks)

(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  (VP jumps/VBZ)
  (PP over/IN (NP the/DT lazy/JJ dog/NN)))


## Named entities in the document using Using Rule & Dictionary-Based Approaches

In [10]:
import re
import nltk

text = "Elon Musk is the CEO of Tesla Inc. He was born on 1971-06-28 in South Africa."

# Dictionary
companies = {"Tesla", "Google", "Microsoft", "Amazon"}
persons = {"Elon Musk", "Bill Gates", "Jeff Bezos"}
locations = {"South Africa", "United States", "India"}

# Rule-based entity extraction
entities = {}

# Recognizing dates (YYYY-MM-DD format)
date_pattern = r'\b\d{4}-\d{2}-\d{2}\b'
entities["DATE"] = re.findall(date_pattern, text)

entities["PERSON"] = [name for name in persons if name in text]

entities["ORG"] = [name for name in companies if name in text]

entities["GPE"] = [name for name in locations if name in text]

print(entities)


{'DATE': ['1971-06-28'], 'PERSON': ['Elon Musk'], 'ORG': ['Tesla'], 'GPE': ['South Africa']}


## Relations between the entities from the given sentence using rules

In [11]:
import re
def extract_custom_relations(text):

    pattern = r"(\w+) (was born in|is located in|works at|is a type of|and other) (\w+)"
    matches = re.findall(pattern, text)

    return [(match[0], match[1], match[2]) for match in matches]

text = "Musk was born in South Africa. Microsoft is located in the USA.\
         Sundar Pichai works at Google.Tulip is a type of flower.Eagles and other birds can fly"

print(extract_custom_relations(text))


[('Musk', 'was born in', 'South'), ('Microsoft', 'is located in', 'the'), ('Pichai', 'works at', 'Google'), ('Tulip', 'is a type of', 'flower'), ('Eagles', 'and other', 'birds')]
