
## Split a sentence using Constituency Parsing

In [1]:
!pip install --quiet allennlp
!pip install --quiet spacy
!pip install --quiet allennlp-models
!python -m spacy download en_core_web_sm

2024-04-16 16:46:32.001098: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 16:46:32.001152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 16:46:32.002480: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-16 16:46:32.010448: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.3.0
  Downloadin

In [2]:
import spacy
# Load spaCy's small English pipeline (the model downloaded in the install cell).
# NOTE(review): `nlp` is not referenced by any other cell visible here — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [3]:
from allennlp_models.pretrained import load_predictor
# Load the constituency-parser predictor; its `predict(sentence=...)` method is
# what the notebook calls to parse the example sentence.
predictor = load_predictor("structured-prediction-constituency-parser")



In [4]:
# Example sentence to split.
# NOTE(review): "wman" looks like a typo for "woman"; the recorded outputs reflect it.
text = "The old wman was sitting under a tree and sipping coffee."
# Drop trailing punctuation so the sentence ends on its last word.
text = text.rstrip("?:!.,;")
print(text)

# Run the constituency parser on the cleaned sentence.
# NOTE(review): printing the whole prediction dict is very verbose (it includes
# 'class_probabilities'); only the 'trees' entry is read by the next cell.
parser_output = predictor.predict(sentence=text)
print(parser_output)



The old wman was sitting under a tree and sipping coffee
{'class_probabilities': [[1.0, 4.2595849070181657e-10, 4.588931858340523e-15, 9.611629288969973e-15, 2.7223042613558346e-14, 9.185464888705752e-12, 1.6180245619897005e-13, 5.42407308304349e-12, 1.4101919632025783e-10, 2.4530889472518247e-12, 1.6918654723182235e-14, 3.968561590461661e-13, 1.562567408797122e-09, 7.0752079325484e-14, 2.5363188979610207e-11, 1.1523346296268833e-12, 2.181196067133854e-14, 6.128580282149798e-12, 4.509520794976352e-12, 4.3098136821503874e-13, 4.7031688439613806e-12, 2.148125566314718e-10, 1.1713987557726568e-09, 6.504242673968652e-13, 9.976786378377428e-14, 2.2270913954160199e-13, 2.6008067227634157e-12, 3.602696483154255e-12, 7.289600780641114e-14, 9.3219453423804e-13, 1.652045514277667e-11, 5.7820206955661035e-11, 3.091282091416403e-11, 1.0312491962802262e-12, 1.8750315970011977e-12, 1.795099138779399e-11, 8.609427732363117e-13, 1.5143600349404318e-12, 5.155272909951325e-12, 1.4156485012017939e-11, 1.

In [6]:
# Bracketed parse of the sentence as returned by the parser,
# e.g. "(S (NP ...) (VP ...))".
tree_string = parser_output['trees']
print(tree_string)

(S (NP (DT The) (JJ old) (NN wman)) (VP (VBD was) (VP (VP (VBG sitting) (PP (IN under) (NP (DT a) (NN tree)))) (CC and) (VP (VBG sipping) (NP (NN coffee))))))


In [10]:
# NOTE(review): `tokenize` is imported but not used in the cells visible here.
from nltk import tokenize
from nltk.tree import Tree

# Build an nltk Tree from the bracketed parse string so it can be traversed
# and drawn as ASCII art.
tree = Tree.fromstring(tree_string)
# print(tree)
tree.pretty_print()

                           S                                          
      _____________________|________                                   
     |                              VP                                
     |         _____________________|_______                           
     |        |                             VP                        
     |        |                  ___________|________________          
     |        |                 VP               |           |        
     |        |      ___________|___             |           |         
     |        |     |               PP           |           VP       
     |        |     |       ________|___         |      _____|____     
     NP       |     |      |            NP       |     |          NP  
  ___|___     |     |      |         ___|___     |     |          |    
 DT  JJ  NN  VBD   VBG     IN       DT      NN   CC   VBG         NN  
 |   |   |    |     |      |        |       |    |     |          |    

#### Acronyms  
S : Sentence  
NP  : Noun Phrase  
VP  : Verb Phrase  
PP  : Prepositional Phrase  
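Det : Determiner (tagged `DT` in the tree above)  
N   : Noun (tagged `NN` in the tree above)  
V   : Verb (`VBD` past tense, `VBG` gerund / present participle)  
P   : Preposition (tagged `IN` in the tree above)  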
VBD : Past Tense Verb  
JJ  : Adjective

In [11]:
# split at right most nounphrase or verbphrase

def get_flattened(t):
    """Return the words covered by a parse (sub)tree as one string.

    Args:
        t: A tree object exposing ``leaves()`` (e.g. ``nltk.tree.Tree``),
            or ``None``.

    Returns:
        The leaves of ``t`` joined by single spaces, or ``None`` when ``t``
        is ``None``.
    """
    if t is None:
        return None
    # Ask the tree itself for its leaves instead of calling ``leaves()`` on
    # every child: children of a pre-terminal such as ``(NN coffee)`` are
    # plain strings and have no ``leaves()`` method.
    return " ".join(t.leaves())

def get_right_most_VP_or_NP(parse_tree, last_NP=None, last_VP=None):
    """Find the deepest NP and VP on the right-most branch of a parse tree.

    Starting at ``parse_tree``, repeatedly descend into the last child,
    remembering the most recent child labelled ``"NP"`` and the most recent
    child labelled ``"VP"``. The descent stops at a node that covers exactly
    one leaf.

    Args:
        parse_tree: Tree object supporting ``leaves()``, ``len()``, indexing
            and, on its subtrees, ``label()`` (e.g. ``nltk.tree.Tree``).
        last_NP: NP subtree to return if no NP is found on the way down.
        last_VP: VP subtree to return if no VP is found on the way down.

    Returns:
        Tuple ``(last_NP, last_VP)``; either element is ``None`` (or the
        value passed in) when no such phrase lies on the right-most branch.
    """
    node = parse_tree
    while len(node.leaves()) != 1:
        # An empty tree has no last child to descend into.
        if len(node) == 0:
            break
        last_subtree = node[-1]
        # A bare string is a leaf: it has no label and nothing below it.
        if isinstance(last_subtree, str):
            break
        label = last_subtree.label()
        if label == "NP":
            last_NP = last_subtree
        elif label == "VP":
            last_VP = last_subtree
        node = last_subtree
    return last_NP, last_VP


# Locate the deepest NP and VP on the right-most branch of the parsed sentence.
last_nounphrase, last_verbphrase =  get_right_most_VP_or_NP(tree)
# Flatten each subtree to plain text (None when the phrase was not found).
last_nounphrase_flattened = get_flattened(last_nounphrase)
last_verbphrase_flattened = get_flattened(last_verbphrase)

# Show the subtrees first, then their flattened text.
print ("Original Sentence ",text)
print ("last_nounphrase ",last_nounphrase )
print ("last_verbphrase ",last_verbphrase)
print ("\n ")
print ("last_nounphrase ",last_nounphrase_flattened )
print ("last_verbphrase ",last_verbphrase_flattened)

Original Sentence  The old wman was sitting under a tree and sipping coffee
last_nounphrase  (NP (NN coffee))
last_verbphrase  (VP (VBG sipping) (NP (NN coffee)))

 
last_nounphrase  coffee
last_verbphrase  sipping coffee


In [13]:
import re

# sub_string - sipping coffee
# main_string - The old woman was sitting under a tree and sipping coffee
# compare like below
# Theoldwomanwassittingunderatreeandsippingcoffee  || sippingcoffee
# oldwomanwassittingunderatreeandsippingcoffee || sippingcoffee
# womanwassittingunderatreeandsippingcoffee || sippingcoffee
# ...............
# andsippingcoffee || sippingcoffee
# sippingcoffee || sippingcoffee
def get_termination_portion(main_string, sub_string):
    """Return the part of ``main_string`` that precedes a trailing ``sub_string``.

    Whitespace-separated words of ``main_string`` are dropped from the front
    one at a time; as soon as the remaining words, concatenated without
    spaces, equal ``sub_string`` with its spaces removed, the dropped words
    are returned joined by single spaces.

    Args:
        main_string: The full sentence.
        sub_string: The ending phrase to cut off.

    Returns:
        The leading portion of ``main_string`` (possibly ``""``), or ``None``
        when no word-aligned suffix matches ``sub_string``.
    """
    target = "".join(sub_string.split(" "))
    words = main_string.split()
    # First word index from which the remaining words spell out the target.
    cut = next(
        (start for start in range(len(words)) if "".join(words[start:]) == target),
        None,
    )
    return None if cut is None else " ".join(words[:cut])

# Use the longer of the two right-most phrases as the ending phrase.
# Either flattened phrase is None when the right-most branch of the parse has
# no NP (or no VP), so drop missing ones instead of letting max() call
# len(None). Ties still favour the noun phrase, as before.
candidate_phrases = [
    phrase
    for phrase in (last_nounphrase_flattened, last_verbphrase_flattened)
    if phrase is not None
]
longest_phrase_to_use = max(candidate_phrases, key=len, default="")
print ("Ending phrase: ", longest_phrase_to_use)

# Turn the parser's bracket tokens back into literal parentheses so the phrase
# can be matched against the original text.
longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)


# Cut the ending phrase off the sentence; the result is None when the phrase
# is not a word-aligned suffix of the text (including when no phrase was found).
split_sentence = get_termination_portion(text, longest_phrase_to_use)
print ("Original sentence : ",text)
print ("Original sentence after splitting at ending phrase: ",split_sentence)

Ending phrase:  sipping coffee
Original sentence :  The old wman was sitting under a tree and sipping coffee
Original sentence after splitting at ending phrase:  The old wman was sitting under a tree and
