
## 1) Split a sentence using Constituency Parsing

In [1]:
!pip install --quiet allennlp
!pip install --quiet spacy
!pip install --quiet allennlp-models
!python -m spacy download en_core_web_sm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
from allennlp_models.pretrained import load_predictor
predictor = load_predictor("structured-prediction-constituency-parser")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Output()

Output()

Output()

In [4]:
text = "The old woman was sitting under a tree and sipping coffee."
text = text.rstrip("?:!.,;")
print(text)

parser_output = predictor.predict(sentence=text)
print(parser_output)



The old woman was sitting under a tree and sipping coffee
{'class_probabilities': [[1.0, 2.2363871865493934e-10, 2.1188825637097862e-15, 6.99162890465386e-15, 5.056980242136621e-15, 3.163912274534164e-12, 6.6677606903702e-14, 4.1598599564984795e-12, 6.378007844087819e-11, 2.7916581806647356e-12, 6.635696766371657e-15, 2.251650878552433e-13, 1.8319451511317197e-09, 5.1168562388483746e-14, 1.8848825278361403e-11, 1.3322669790288844e-12, 7.69998011206953e-15, 6.265783031311756e-12, 3.3667081709293223e-12, 2.0015332978257777e-13, 3.2308040791295678e-12, 1.2262434123666566e-10, 9.285546220816343e-10, 3.8637046036529843e-13, 8.740616330759135e-14, 1.0080940260077248e-13, 7.664719011389598e-13, 3.5253334539769865e-12, 2.2906093458097652e-14, 6.031399338302101e-13, 1.765338222825541e-11, 4.3048474507312307e-11, 2.1693001561740033e-11, 4.3421248042442573e-13, 9.326534770176531e-13, 1.3978192735242256e-11, 4.353593765875352e-13, 8.127009265210261e-13, 3.139919314137929e-12, 7.455675833656361e-12

In [5]:
tree_string = parser_output['trees']
print(tree_string)

(S (NP (DT The) (JJ old) (NN woman)) (VP (VBD was) (VP (VP (VBG sitting) (PP (IN under) (NP (DT a) (NN tree)))) (CC and) (VP (VBG sipping) (NP (NN coffee))))))


In [6]:
from nltk import tokenize
from nltk.tree import Tree

tree = Tree.fromstring(tree_string)
# print(tree)
tree.pretty_print()

                            S                                          
      ______________________|________                                   
     |                               VP                                
     |          _____________________|_______                           
     |         |                             VP                        
     |         |                  ___________|________________          
     |         |                 VP               |           |        
     |         |      ___________|___             |           |         
     |         |     |               PP           |           VP       
     |         |     |       ________|___         |      _____|____     
     NP        |     |      |            NP       |     |          NP  
  ___|____     |     |      |         ___|___     |     |          |    
 DT  JJ   NN  VBD   VBG     IN       DT      NN   CC   VBG         NN  
 |   |    |    |     |      |        |       |    |     | 

#### Acronyms  
S : Sentence  
NP  : Noun Phase  
VP  : Verb Phase  
PP  : Prepositional Phase  
Det : Determiner  
N   : Noun
V   : Verb  
P   : Preposition  
VBD : Past Tense Verb  
JJ  : Adjective

In [7]:
# split at right most nounphrase or verbphrase

def get_flattened(t):
    sent_str_final = None
    if t is not None:
        sent_str = [" ".join(x.leaves()) for x in list(t)]
        sent_str_final = [" ".join(sent_str)]
        sent_str_final = sent_str_final[0]
    return sent_str_final

def get_right_most_VP_or_NP(parse_tree,last_NP = None,last_VP = None):
    if len(parse_tree.leaves()) == 1:
        return last_NP,last_VP
    last_subtree = parse_tree[-1]
    if last_subtree.label() == "NP":
        last_NP = last_subtree
    elif last_subtree.label() == "VP":
        last_VP = last_subtree

    return get_right_most_VP_or_NP(last_subtree,last_NP,last_VP)


last_nounphrase, last_verbphrase =  get_right_most_VP_or_NP(tree)
last_nounphrase_flattened = get_flattened(last_nounphrase)
last_verbphrase_flattened = get_flattened(last_verbphrase)

print ("Original Sentence ",text)
print ("last_nounphrase ",last_nounphrase )
print ("last_verbphrase ",last_verbphrase)
print ("\n ")
print ("last_nounphrase ",last_nounphrase_flattened )
print ("last_verbphrase ",last_verbphrase_flattened)

Original Sentence  The old woman was sitting under a tree and sipping coffee
last_nounphrase  (NP (NN coffee))
last_verbphrase  (VP (VBG sipping) (NP (NN coffee)))

 
last_nounphrase  coffee
last_verbphrase  sipping coffee


In [8]:
import re

# sub_string - sipping coffee
# main_string - The old woman was sitting under a tree and sipping coffee
# compare like below
# Theoldwomanwassittingunderatreeandsippingcoffee  || sippingcoffee
# oldwomanwassittingunderatreeandsippingcoffee || sippingcoffee
# womanwassittingunderatreeandsippingcoffee || sippingcoffee
# ...............
# andsippingcoffee || sippingcoffee
# sippingcoffee || sippingcoffee
def get_termination_portion(main_string, sub_string):
    combined_sub_string = sub_string.replace(" ", "")
    main_string_list = main_string.split()
    last_index = len(main_string_list)
    for i in range(last_index):
        check_string_list = main_string_list[i:]
        check_string = "".join(check_string_list)
        check_string = check_string.replace(" ", "")
        if check_string == combined_sub_string:
            return " ".join(main_string_list[:i])

    return None

longest_phrase_to_use = max(last_nounphrase_flattened, last_verbphrase_flattened,key = len)
print ("Ending phrase: ", longest_phrase_to_use)

longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)


split_sentence = get_termination_portion(text, longest_phrase_to_use)
print ("Original sentence : ",text)
print ("Original sentence after splitting at ending phrase: ",split_sentence)

Ending phrase:  sipping coffee
Original sentence :  The old woman was sitting under a tree and sipping coffee
Original sentence after splitting at ending phrase:  The old woman was sitting under a tree and


In [9]:
# split at the first noun phrase or verb phrase

text2 = "They had no ice cream left at home, nor did they have money to go to the store."
text2 = text2.rstrip('?:!.,;')
print (text2)
parser_output2 = predictor.predict(sentence=text2)
tree_string2 = parser_output2["trees"]

tree2 = Tree.fromstring(tree_string2)
print (tree2.pretty_print())


They had no ice cream left at home, nor did they have money to go to the store
                                              S                                                    
       _______________________________________|_______                                              
      |                                   |   |       SQ                                           
      |                                   |   |    ___|__________                                   
      |                                   |   |   |   |          VP                                
      |                                   |   |   |   |     _____|________                          
      |                                   |   |   |   |    |              NP                       
      |                                   |   |   |   |    |      ________|___                      
      S                                   |   |   |   |    |     |           SBAR                  
  ____|___       

In [10]:
# SBAR stands for Subordinate Clause.
#  Penn Tree bank overview - http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf

def get_first_VP_and_NP_and_sentence(parse_tree, first_NP=None, first_VP=None, first_sent=None):
    if len(parse_tree.leaves()) == 1:
        return get_flattened(first_NP), get_flattened(first_VP), get_flattened(first_sent)
    last_subtree = parse_tree[-1]

    if last_subtree.label() == "NP" and not first_NP:
        first_NP = last_subtree
    elif last_subtree.label() == "VP" and not first_VP:
        first_VP = last_subtree
    elif last_subtree.label() == "S" and not first_sent:
        first_sent = last_subtree

    return get_first_VP_and_NP_and_sentence(last_subtree, first_NP, first_VP, first_sent)


first_nounphrase, first_verbphrase, first_sentence = get_first_VP_and_NP_and_sentence(tree2)

print("first_nounphrase: ",first_nounphrase)
print ("first_verbphrase: ",first_verbphrase)
print ("first_sentence: ",first_sentence)

first_nounphrase:  money to go to the store
first_verbphrase:  have money to go to the store
first_sentence:  to go to the store


In [11]:
longest_phrase_to_use = max(first_nounphrase, first_verbphrase,key = len)
print ("Ending phrase: ", longest_phrase_to_use)

longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)


split_sentence = get_termination_portion(text2, longest_phrase_to_use)
print ("Original sentence : ",text2)
print ("Original sentence after splitting at ending phrase: ",split_sentence)

Ending phrase:  have money to go to the store
Original sentence :  They had no ice cream left at home, nor did they have money to go to the store
Original sentence after splitting at ending phrase:  They had no ice cream left at home, nor did they


****

## 2) Generate alternate endings to a split sentence using OpenAI GPT2

In [5]:
# !pip install --quiet transformers
!pip install --quiet sacremoses
!pip install --quiet tokenizers
!pip install --quiet sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install --quiet transformers

In [8]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import nltk
from nltk import tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
GPT2tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
GPT2model = TFGPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=GPT2tokenizer.eos_token_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [6]:
partial_sentence = "The old woman was sitting under a tree and"
input_ids = GPT2tokenizer.encode(partial_sentence, return_tensors='tf')
print(input_ids)
max_length = len(partial_sentence.split()) + 40

tf.Tensor([[ 464 1468 2415  373 5586  739  257 5509  290]], shape=(1, 9), dtype=int32)


In [7]:
sample_outputs = GPT2model.generate(
    input_ids,
    do_sample=True,
    max_length=max_length,
    top_p=0.80,
    top_k=30,
    repetition_penalty=10.0,
    num_return_sequences=10
)

In [12]:
generated_sentences=[]

for index, sample_output in enumerate(sample_outputs):
  decoded_sentence = GPT2tokenizer.decode(sample_output, skip_special_tokens=True)

  final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
  generated_sentences.append(final_sentence)

  print(index, ": ", final_sentence)

0 :  The old woman was sitting under a tree and she had to run up the steps with her hands behind.
1 :  The old woman was sitting under a tree and her feet were covered with the leaves of dead trees.
2 :  The old woman was sitting under a tree and the young man stood in front of her.
3 :  The old woman was sitting under a tree and watching the sun rise from its shadow.
4 :  The old woman was sitting under a tree and said: "He'll kill you."
5 :  The old woman was sitting under a tree and looked down at me.
6 :  The old woman was sitting under a tree and the other two had their noses pulled out.
7 :  The old woman was sitting under a tree and holding her son.
8 :  The old woman was sitting under a tree and looking out of the window at me, when I said something like this: "Well then," she replied.
9 :  The old woman was sitting under a tree and the new one with her arms raised in front of it.


****

## 3) Filter sentences with BERT

In [13]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [21]:
from sentence_transformers import SentenceTransformer, util
import scipy

In [15]:
BERT_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [46]:
possible_false_sentences = [
    "The old woman was sitting under a tree and her feet were covered with the leaves of dead trees.",
    "The old woman was sitting under a tree and the young man stood in front of her.",
    "The old woman was sitting under a tree and watching the sun rise from its shadow.",
    "The old woman was sitting under a tree and said: 'He'll kill you.'",
    "The old woman was sitting under a tree and looked down at me.",
    "The old woman was sitting under a tree and the other two had their noses pulled out.",
    "The old woman was sitting under a tree and holding her son.",
    "The old woman was sitting under a tree and looking out of the window at me, when I said something like this: 'Well then,' she replied.",
    "The old woman was sitting under a tree and the new one with her arms raised in front of it.",
    "The old woman was sitting under a tree and drinking tea."] # added sentence

In [47]:
original_sentence = "The old woman was sitting under a tree and sipping coffee."

In [48]:
false_sentences_embeddings = BERT_model.encode(possible_false_sentences)
original_sentence_embedding = BERT_model.encode([original_sentence])

In [49]:
distances = scipy.spatial.distance.cdist(original_sentence_embedding, false_sentences_embeddings, 'cosine')[0]
print(distances)

[0.48423187 0.6217012  0.40564722 0.54273074 0.38066874 0.50376231
 0.71157684 0.48313672 0.50177847 0.23803733]


In [50]:
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])
print(results)

[(9, 0.23803732780951703), (4, 0.3806687360049903), (2, 0.4056472166589572), (7, 0.48313671923085), (0, 0.4842318749832535), (8, 0.5017784660237569), (5, 0.5037623118228942), (3, 0.5427307360716267), (1, 0.6217012047087931), (6, 0.7115768391746398)]


In [51]:
dissimilar_sentences = []

for idx, distance in results:
  dissimilar_sentences.append(possible_false_sentences[idx])
  print(possible_false_sentences[idx])

The old woman was sitting under a tree and drinking tea.
The old woman was sitting under a tree and looked down at me.
The old woman was sitting under a tree and watching the sun rise from its shadow.
The old woman was sitting under a tree and looking out of the window at me, when I said something like this: 'Well then,' she replied.
The old woman was sitting under a tree and her feet were covered with the leaves of dead trees.
The old woman was sitting under a tree and the new one with her arms raised in front of it.
The old woman was sitting under a tree and the other two had their noses pulled out.
The old woman was sitting under a tree and said: 'He'll kill you.'
The old woman was sitting under a tree and the young man stood in front of her.
The old woman was sitting under a tree and holding her son.


In [52]:
false_sentences_list = reversed(dissimilar_sentences)
for sentence in false_sentences_list:
  print(sentence)

The old woman was sitting under a tree and holding her son.
The old woman was sitting under a tree and the young man stood in front of her.
The old woman was sitting under a tree and said: 'He'll kill you.'
The old woman was sitting under a tree and the other two had their noses pulled out.
The old woman was sitting under a tree and the new one with her arms raised in front of it.
The old woman was sitting under a tree and her feet were covered with the leaves of dead trees.
The old woman was sitting under a tree and looking out of the window at me, when I said something like this: 'Well then,' she replied.
The old woman was sitting under a tree and watching the sun rise from its shadow.
The old woman was sitting under a tree and looked down at me.
The old woman was sitting under a tree and drinking tea.
