<a href="https://colab.research.google.com/github/VyomaD/kdm_tech/blob/main/ICP_lab2/source/ICP_LAB2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
# Installing all nltk libraries and Spacy, neuralcoref
# Install nltk and then Spacy and then restart the runtime and then continue
# !pip install nltk
# !pip install spacy==2.1.0
# !python -m spacy download en_core_web_sm
# !python -m spacy download en
# !pip install neuralcoref
import nltk
import spacy
import neuralcoref
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag, word_tokenize, ChartParser
from nltk import sent_tokenize
from nltk.draw.tree import draw_trees
from nltk.stem import WordNetLemmatizer

nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<spacy.lang.en.English at 0x7f39d618d898>

In [48]:
# Lemmatize the sentence using WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [74]:
# defining a function called as process_sentence with sent as parameter as input
def process_sentence(sent):
    r"""Processing a sentence to extract POS, lemma and NER for each word in 
    the sentence
    
    Args:
        sent: input sentence as a string
    
    Return:
        Returns a tuple of dictionaries for POS, lemma and NER
    """

    # initializing nltk word tokenizer library inside a variable word_tokens
    word_tokens = nltk.word_tokenize(sent)

    # finding pos of each word
    # used pos_tags to find all pos inside words
    pos_dict = dict(pos_tag(word_tokens))

    # finding lemma of each word
    # creating empty dict
    lemma_dict = {}
    # iterating each key and converting first letter of pos into lower case
    for word in word_tokens:
        # replacing 'j' to 'a' as lemmatizer accepts 'a' as adjective and 
        # nltk gives 'j' as adjective
        w_tag = pos_dict[word][0].lower().replace('j', 'a')
  
        # finding for each letter 
        # if first letter is present inside word tag then perform operation
        w_tag = w_tag if w_tag in ['a', 'r', 'n', 'v'] else None
        # if it is not in word tag then return the word itself
        if not w_tag:
            lemma_dict[word] = word
        else:
          # return lemma words
            lemma_dict[word] = lemmatizer.lemmatize(word, w_tag)

    # finding ner of each word
    doc = nlp(sent)
    # creating ner group for and defining key value based on text and label 
    ner_groups = [(X.text, X.label_) for X in doc.ents]
    # iterating each word and finding presence of Named entity relationship
    ner_dict = {}
    for word in word_tokens:
        for group in ner_groups:
            if word in group[0]:
            #if the word is present in groups then return named entity value
                ner_dict[word] = group[1]
        if word not in ner_dict.keys():
            #if not present then return 0
            ner_dict[word] = 0

    # returning all the values in dictonary (pos, lemma and named entity pair)
    return (pos_dict, lemma_dict, ner_dict)

In [None]:

# main program
# delivering the sentence using sent tokenize
text = "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. He has been the paramount leader of China, the most prominent political leader in the country, since 2012. The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."
print(text)

sent_tokens = nltk.sent_tokenize(text)

# print format table
print("{:12s}\t{:12s}\t{:6s}\t{}".format("Word", "Lemma", "POS", "NER"))
# iterating each sentence with indexing 
for i, sent in enumerate(sent_tokens):
    print("[Sentence {}]".format(i+1))
    pos_dict, lemma_dict, ner_dict = process_sentence(sent)
# retrieving key value from dict
    keys = pos_dict.keys()
    for key in keys:
        # break
        print("{:12s}\t{:12s}\t{:6s}\t{}".format(key, lemma_dict[key],
              pos_dict[key], ner_dict[key]))
    print("")



In [79]:
# parsing
# Chunk the pattern for parsing noun phrase using Regexp
# This rule says that an NP chunk should be formed whenever the chunker 
# finds an optional determiner (DT) followed by any number of adjectives (JJ) 
#and then a noun (NN).

# visualising three different types of grammar rules for parsin on three 
# different sentences to form a syntactic tree

pattern1 = 'NP: {<DT>?<JJ>*<NN>}'
pattern2 = 'NP: {<NN.?>*<VBD.?>*<JJ.?>*<CC>?}'
pattern3 = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """

# assigning pattern 1 in parser
cpharse1 = nltk.RegexpParser(pattern1)
# assigning pattern 2 in parser
cphrase2 = nltk.RegexpParser(pattern2)
# assigning pattern 3 in parser
cpharse3 = nltk.RegexpParser(pattern3)

print(f"Parsing sentence 1 using grammar rule: {pattern1}")
# get the first sentence
firstsent = sent_tokens[0]
firstsentparsed = process_sentence(firstsent)[0]
firstsentparsed = list(firstsentparsed.items())
#parsing processed sentence using regexp patterns
firstcs = cpharse1.parse(firstsentparsed)
print(firstcs)
print("")

print(f"Parsing sentence 2 using grammar rule: {pattern2}")
# get the second sentence
secondsent = sent_tokens[1]
secondsentparsed = process_sentence(secondsent)[0]
secondsentparsed = list(secondsentparsed.items())
#parsing processed sentence using regexp patterns
secondcs = cphrase2.parse(secondsentparsed)
print(secondcs)
print("")

print(f"Parsing sentence 3 using grammar rule: {pattern3}")
# get the third sentence
thirdsent = sent_tokens[2]
thirdsentparsed = process_sentence(thirdsent)[0]
thirdsentparsed = list(thirdsentparsed.items())
#parsing processed sentence using regexp patterns
thirdcs = cpharse3.parse(thirdsentparsed)
print(thirdcs)
print("")

Parsing sentence 1 using grammar rule: NP: {<DT>?<JJ>*<NN>}
(S
  (NP Xi/NN)
  Jinping/NNP
  is/VBZ
  a/DT
  Chinese/NNP
  (NP politician/NN)
  who/WP
  has/VBZ
  served/VBN
  as/IN
  General/NNP
  Secretary/NNP
  of/IN
  the/DT
  Communist/NNP
  Party/NNP
  (/(
  CCP/NNP
  )/)
  and/CC
  Chairman/NNP
  Central/NNP
  Military/NNP
  Commission/NNP
  CMC/NNP
  since/IN
  2012/CD
  ,/,
  President/NNP
  People/NNP
  's/POS
  Republic/NNP
  China/NNP
  PRC/NNP
  2013/CD
  ./.)

Parsing sentence 2 using grammar rule: NP: {<NN.?>*<VBD.?>*<JJ.?>*<CC>?}
(S
  He/PRP
  has/VBZ
  been/VBN
  the/DT
  (NP paramount/JJ)
  (NP leader/NN)
  of/IN
  (NP China/NNP)
  ,/,
  most/RBS
  (NP prominent/JJ political/JJ)
  in/IN
  (NP country/NN)
  since/IN
  2012/CD
  ./.)

Parsing sentence 3 using grammar rule: 
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLA

In [86]:
# co-coref resolution
co_coref_tokens = nlp(text)

def coref_mentions(co_coref_tokens):
    r"""Extracting all mentions in the given text
    
    Args:
        co_coref_tokens: input text parsed
    
    Return:
        Returns the list of all coreferences from the sentence
    """

    print('\nAll the "mentions" in the given text:')
    for cluster in co_coref_tokens._.coref_clusters:
        print(cluster.mentions)

if co_coref_tokens._.has_coref:
    print("Given text: " + text)
    coref_mentions(co_coref_tokens)


Given text: Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. He has been the paramount leader of China, the most prominent political leader in the country, since 2012. The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary.

All the "mentions" in the given text:
[Xi Jinping, He, he, his, he]
[China, China, the country]
