In [95]:
import ast
import os
import pickle
import re
import time
from collections import OrderedDict
from functools import partial
from itertools import islice
from random import shuffle

import numpy as np
import torch
from IPython.display import clear_output
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

In [2]:
# Load the 'sentence-transformers/gtr-t5-xl' sentence-embedding model.
# `model.encode` is called further down to embed each verb/noun phrase.
model = SentenceTransformer('sentence-transformers/gtr-t5-xl')

In [46]:
# Training instructions keyed by data_id. Per the original author's note each value is a
# dict with the keys "spell_correction_txt", "embedding", "phrase_chunking_sentence" and
# "phrase_chunking_tag"; the visible cells only read "spell_correction_txt".
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
training_instruction_path = "train_data/training_instruction_sen_emb_phrase_chunk.pkl"
with open(training_instruction_path, 'rb') as f:
    raw_instruction_dict = pickle.load(f)

## Analyse verb phrase set 

In [3]:
# Load the pickled collection of action (verb) phrase chunks
# (the progress bar of the embedding cell reports 1104 entries).
# NOTE(review): pickle.load is only safe on trusted files.
with open('train_data/action_phrase_chunk_set.pkl', 'rb') as f:
    verb_phrase_set = pickle.load(f)

In [4]:
# Eyeball the loaded verb phrases (the saved output below is cut off partway through).
verb_phrase_set

{'remain up',
 'disappear',
 'stand to',
 'grabbing',
 'just waiting',
 'waiting and standing on',
 'walk',
 'to capture',
 'view',
 'jump before',
 'slowly make',
 'slightly climb down',
 'jump across',
 'disappear take',
 'turn',
 'keep walking',
 'wait between',
 'to catch',
 'little move',
 'to evade',
 'step down',
 'switches',
 'stops',
 'starting',
 'stuck',
 'try to go',
 'face',
 'try to move',
 'going',
 'return to',
 'avoiding',
 'twice going',
 'talk',
 'keep going',
 'to give',
 'getting bitten by',
 'try to collect',
 'jump towards',
 'climbing up',
 'go slightly left',
 'climb into',
 'go left jumping over',
 'go jump across',
 'stay out',
 'is moving',
 'stay on',
 'to avoid getting hit',
 'again take',
 'to pass',
 'jumping up',
 'jump going',
 'jump to rope',
 'playing',
 'go left move to ladder',
 'go past',
 'to move across',
 'are',
 'jump to grab',
 'jumping to',
 'slipped',
 'walk from',
 'left to ledge',
 'move under',
 'break',
 'continue to walk to',
 'running

### Use the sentence-embedding model to find the 30 most similar phrases for each phrase

In [5]:
# Embed each verb phrase individually. The dict maps phrase -> result of `model.encode`
# and follows the iteration order of `verb_phrase_set`.
verb_phrase_emb_dict = {
    phrase: model.encode(phrase)
    for phrase in tqdm(verb_phrase_set)
}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for p in tqdm(verb_phrase_set):


  0%|          | 0/1104 [00:00<?, ?it/s]

In [65]:
from IPython.display import HTML as html_print

def cstr(s, color='black'):
    """Wrap ``s`` in a ``<text>`` tag whose inline style sets the text colour.

    Args:
        s: Content to wrap; it is interpolated with default string formatting.
        color: Colour value inserted verbatim after ``color:`` (defaults to black).

    Returns:
        The markup string ``<text style=color:COLOR>S</text>``. Nothing is escaped,
        so ``s`` may itself be markup produced by a nested ``cstr`` call.
    """
    return f"<text style=color:{color}>{s}</text>"

# Smoke test of the colouring helper: render "foo abc bar" with the middle word in yellow.
left, word, right = 'foo' , 'abc' , 'bar'
html_print(cstr(' '.join([left, cstr(word, color='yellow'), right]), color='black') )

In [69]:
def calculate_cosine_distance(emb_one, emb_two):
    """Return the cosine distance between two embedding vectors.

    Thin wrapper around ``scipy.spatial.distance.cosine`` so the arguments can be
    bound by keyword (e.g. with ``functools.partial``).

    Args:
        emb_one: First 1-D vector.
        emb_two: Second 1-D vector of the same length.

    Returns:
        ``1 - cos(angle)`` between the vectors: 0 for the same direction,
        larger values for less similar vectors.
    """
    distance = cosine(emb_one, emb_two)
    return distance

def find_sentence_three(phrase, training_dict):
    """Pick up to three example sentences that contain ``phrase``.

    Args:
        phrase: Text to look for; plain substring containment is used, so
            ``'walk'`` also matches a sentence containing ``'walking'``.
        training_dict: Mapping whose values are dicts holding the sentence
            under the ``'spell_correction_txt'`` key.

    Returns:
        A list of at most three matching sentences, in random order.
    """
    matches = [
        record['spell_correction_txt']
        for record in training_dict.values()
        if phrase in record['spell_correction_txt']
    ]
    # Shuffle before truncating so the three examples are a random pick.
    shuffle(matches)
    return matches[:3]
    

In [29]:
# For every verb phrase, rank all verb phrases by cosine distance to its embedding and
# keep the 30 closest other phrases (closest first).
verb_phrase_close_phase_dict = dict()
for key, val in tqdm(verb_phrase_emb_dict.items()):
    # Bind the query embedding once; only the candidate embedding varies per comparison.
    calculate_distance_partial = partial(calculate_cosine_distance, emb_one=val)
    sorted_arr = sorted(
        verb_phrase_emb_dict,
        key=lambda k: calculate_distance_partial(emb_two=verb_phrase_emb_dict[k]),
    )
    # Exclude the query phrase by name instead of assuming it sorted to index 0:
    # if another phrase has an identical embedding it can tie and sort first, in which
    # case slicing [1:31] would keep the query itself and drop a real neighbour.
    # (The name `top_ten_arr` is historical; it holds up to 30 phrases.)
    top_ten_arr = [p for p in sorted_arr if p != key][:30]
    verb_phrase_close_phase_dict[key] = top_ten_arr

  0%|          | 0/1104 [00:00<?, ?it/s]

In [96]:
# Manual annotation pass over the verb phrases. For each phrase this prints up to three
# example sentences and its nearest phrases (five per row), then reads a Python list
# literal of hard negatives from the annotator, e.g. ['walk', 'run'] or [].
hard_negative_verb_phrase_dict = dict()
for ind, (key, val) in enumerate(verb_phrase_close_phase_dict.items()):
    print(f"{ind+1}/{len(verb_phrase_close_phase_dict)} Verb Key:", key)
    print("Example:")
    egsent_arr = find_sentence_three(key, raw_instruction_dict)
    for sent in egsent_arr:
        print("\t", sent)
    print("Similar Phrase:")
    # Step over however many neighbours there are rather than a hard-coded 30.
    for i in range(0, len(val), 5):
        print('\t', val[i:i+5])
    print("")
    # now enter hard negative manually
    while True:
        x = input('input hard negative verb arr')
        try:
            # ast.literal_eval only accepts literals, so typed text is parsed as data and
            # never executed as code (unlike eval). strip() keeps leading whitespace from
            # being rejected on older Python versions.
            eval_x = ast.literal_eval(x.strip())
            if not isinstance(eval_x, list):
                raise RuntimeError('not a list!')
            break  # valid list entered: leave the re-prompt loop
        except Exception as e:
            # Malformed input (SyntaxError/ValueError) or a non-list: report and re-prompt.
            print(e)
            print("Sorry Try again")
    hard_negative_verb_phrase_dict[key] = eval_x
    # refresh the cell output before showing the next phrase
    clear_output(wait=False)
    time.sleep(0.2)


In [98]:
# Sanity check: number of annotated verb phrases (1104 in the recorded run).
len(hard_negative_verb_phrase_dict)

1104

In [97]:
# Persist the manually entered verb hard negatives (phrase -> list typed by the annotator).
with open('train_data/action_phrase_hard_negative_dict.pkl', 'wb') as f:
    pickle.dump(hard_negative_verb_phrase_dict, f)

In [32]:
# Scratch check: an unterminated list literal raises SyntaxError (an Exception subclass),
# so a try/except Exception around the parsing step can catch it and re-prompt.
eval("['hello'")

SyntaxError: unexpected EOF while parsing (<string>, line 1)

## Analyse Noun Phrase Set

In [71]:
# Load the pickled collection of noun phrase chunks
# (the progress bar of the embedding cell reports 1579 entries).
# NOTE(review): pickle.load is only safe on trusted files.
with open('train_data/noun_phrase_chunk_set.pkl', 'rb') as f:
    noun_phrase_set = pickle.load(f)

In [72]:
# Eyeball the loaded noun phrases (the saved output below is cut off partway through).
noun_phrase_set

{'two jumps',
 'the skull to the right',
 'left and walk',
 'the second one',
 'the hanging purple rod',
 'a small step towards the barrier',
 'exit ladder',
 'mid level',
 'the endow',
 'the column',
 'the snake other side',
 'the next side',
 'the ladder in the blue world',
 'the surroundings',
 'moving horizontal line to the right',
 'another step',
 'the blue lines',
 'the electrical field',
 'the far right edge',
 'the rope to the floor',
 'a few more',
 'the solid brick',
 'playing',
 'down side',
 'you to the ground',
 'the half way point on the ladder',
 'big ladder',
 'a bit at the ladder',
 'the left snake',
 'left side rope',
 'step left',
 'the ladder in the middle',
 'this jump',
 'green creature to the right',
 'three times to the edge',
 '5 rungs of the blue ladder',
 'the edge of the ledge',
 'the left platform',
 'fast search',
 'a different area',
 'the hammer power',
 'head',
 'this person',
 'green creature',
 'the roller',
 'the flame',
 'the right until the end',


In [73]:
# Embed each noun phrase individually. The dict maps phrase -> result of `model.encode`
# and follows the iteration order of `noun_phrase_set`.
noun_phrase_emb_dict = {
    phrase: model.encode(phrase)
    for phrase in tqdm(noun_phrase_set)
}

  0%|          | 0/1579 [00:00<?, ?it/s]

In [76]:
# For every noun phrase, rank all noun phrases by cosine distance to its embedding and
# keep the 30 closest other phrases (closest first).
noun_phrase_close_phase_dict = dict()
for key, val in tqdm(noun_phrase_emb_dict.items()):
    # Bind the query embedding once; only the candidate embedding varies per comparison.
    calculate_distance_partial = partial(calculate_cosine_distance, emb_one=val)
    sorted_arr = sorted(
        noun_phrase_emb_dict,
        key=lambda k: calculate_distance_partial(emb_two=noun_phrase_emb_dict[k]),
    )
    # Exclude the query phrase by name instead of assuming it sorted to index 0:
    # if another phrase has an identical embedding it can tie and sort first, in which
    # case slicing [1:31] would keep the query itself and drop a real neighbour.
    # (The name `top_ten_arr` is historical; it holds up to 30 phrases.)
    top_ten_arr = [p for p in sorted_arr if p != key][:30]
    noun_phrase_close_phase_dict[key] = top_ten_arr

  0%|          | 0/1579 [00:00<?, ?it/s]

In [99]:
# Manual annotation pass over the noun phrases. For each phrase this prints up to three
# example sentences and its nearest phrases (three per row), then reads a Python list
# literal of hard negatives from the annotator, e.g. ['the rope', 'the ladder'] or [].
hard_negative_noun_phrase_dict = dict()

for ind, (key, val) in enumerate(noun_phrase_close_phase_dict.items()):
    print(f"{ind+1}/{len(noun_phrase_close_phase_dict)} Noun Key:", key)
    print("Example:")
    egsent_arr = find_sentence_three(key, raw_instruction_dict)
    for sent in egsent_arr:
        print("\t", sent)
    print("Similar Phrase:")
    # Step over however many neighbours there are rather than a hard-coded 30.
    for i in range(0, len(val), 3):
        print('\t', val[i:i+3])
    print("")
    # now enter hard negative manually
    while True:
        x = input('input hard negative noun arr')
        try:
            # ast.literal_eval only accepts literals, so typed text is parsed as data and
            # never executed as code (unlike eval). strip() keeps leading whitespace from
            # being rejected on older Python versions.
            eval_x = ast.literal_eval(x.strip())
            if not isinstance(eval_x, list):
                raise RuntimeError('not a list!')
            break  # valid list entered: leave the re-prompt loop
        except Exception as e:
            # Malformed input (SyntaxError/ValueError) or a non-list: report and re-prompt.
            print(e)
            print("Sorry Try again")
    hard_negative_noun_phrase_dict[key] = eval_x
    # refresh the cell output before showing the next phrase
    clear_output(wait=False)
    time.sleep(0.2)

In [100]:
# Persist the manually entered noun hard negatives (phrase -> list typed by the annotator).
with open('train_data/noun_phrase_hard_negative_dict.pkl', 'wb') as f:
    pickle.dump(hard_negative_noun_phrase_dict, f)

## Notes when replacing the phrase

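1. Pad the phrase with a space on each side when searching and replacing (e.g., `" walk towards "`), so that matches inside longer words are not replaced.
2. A phrase may have no hard negatives at all; its list can be empty.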

### Some phrases contain wrong words; correct them first before replacing
1. 'claim' -> 'climb'
2. 'club' -> 'climb'
3. 'done' -> 'do not'
4. 'clip' -> 'climb'
5. 'clumping' -> 'climbing'
6. 'clime' -> 'climb'
7. 'hope' -> 'hop'
8. 'toe' -> 'to'
9. 'endow' -> 'end'
10. 'latter' -> 'ladder'
11. 'bride' -> 'bridge'
12. 'leaser' -> 'laser'
13. 'later' -> 'ladder'
14. 'leaf' -> 'left'
15. 'snack' -> 'snake'
16. 'rob' -> 'rope'
17. 'don' -> 'do not'
18. 'article' -> 'verticle'
19. 'life edge' -> 'left edge'
20. 'done' -> 'do not'
21. 'ans' -> 'and'
22. 'skelton' -> 'skeleton'
23. 'skilled' -> 'skull'