## Prepare Referece files using TFIDF for retrieving attributes


In [1]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_file(path):
    with open(path) as fp:
        lines = fp.read().splitlines()
    return lines

In [3]:
def clean_text(text):
    return text.replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","").replace("<END>","").strip()


In [4]:
train0_org = read_file("/zhangpai25/wyc/drg/drg_data/hlm/processed_files_with_bert_with_best_head/sentiment_train_0.txt") # Training data of negative sentiment
train1_org = read_file("/zhangpai25/wyc/drg/drg_data/hlm/processed_files_with_bert_with_best_head/sentiment_train_1.txt") # Training data of positive sentiment
ref0_processed = read_file("/zhangpai25/wyc/drg/drg_data/hlm/processed_files_with_bert_with_best_head/reference_0.txt") # Reference data for delete_generate model
ref1_processed = read_file("/zhangpai25/wyc/drg/drg_data/hlm/processed_files_with_bert_with_best_head/reference_1.txt") # Reference data for delete_generate model
ref0_org = read_file("/zhangpai25/wyc/drg/drg_data/hlm/reference_0.txt") # Original Refrence_0 data
ref1_org = read_file("/zhangpai25/wyc/drg/drg_data/hlm/reference_1.txt") # Original Refrence_1 data
train0_processed = read_file("/zhangpai25/wyc/drg/drg_data/hlm/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0_all_attrs.txt") # training data with content and attributes seperation
train1_processed = read_file("/zhangpai25/wyc/drg/drg_data/hlm/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1_all_attrs.txt") # training data with content and attributes seperation

In [5]:
# Get the Original Reference Sentence
ref0_org = [x.split("\t")[0] for x in ref0_org]
ref1_org = [x.split("\t")[0] for x in ref1_org]

In [6]:
# Get the Content of the Reference Sentences
ref0_con = [clean_text(x) for x in ref0_processed]
ref1_con = [clean_text(x) for x in ref1_processed]

In [8]:
ref0_org[:4], ref0_con[:4]

([' 我这么大年纪了,连这点子事还耽着呢! ',
  ' 这么着罢,就叫雪姑娘去罢.雪雁使得吗. ',
  ' 这雪雁也使得,姑娘就叫他雪雁去罢,姑娘快叫他雪姑娘跟了去罢!',
  ' 老太太和二奶奶办的是头一宗,平姑娘和姑娘办的是第二宗. '],
 ['我 这 么 大 年 纪 连 这 点 子 还 耽 着 呢 !',
  '这 么 着 罢 就 叫 雪 姑 娘 罢 . 雁 使 得 吗 .',
  '这 雪 雁 也 使 得 姑 娘 就 叫 他 雪 雁 去 姑 娘 快 叫 他 雪 姑 娘 跟 了 去 罢 !',
  '老 太 太 和 二 奶 奶 办 是 头 宗 平 姑 娘 和 姑 娘 办 的 是 第 二 宗 .'])

In [9]:
def get_train_content(text):
    return text.split("<START>")[0].split("<CON_START>")[1].strip()

In [10]:
def get_train_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

In [11]:
get_train_attrs(train0_processed[0])

['汝父年将半百,再无续室之意,且汝多病,年又极小,上无亲母教养,下无姊妹兄弟扶持,今依傍外祖母及舅氏姊妹去,正好减我顾盼之忧,何反云不往?']

In [12]:
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> 汝父年将半百,再无续室之意,且汝多病,年又极小,上无亲母教养,下无姊妹兄弟扶持,今依傍外祖母及舅氏姊妹去,正好减我顾盼之忧,何反云不往? <CON_START> 汝 父 年 将 半 百 , 再 无 续 室 之 意 , 且 汝 多 病 年 又 极 小 上 无 亲 母 教 养 下 无 姊 妹 兄 弟 扶 持 , 今 依 傍 外 祖 母 及 舅 氏 姊 妹 去 , 正 好 减 我 顾 盼 之 忧 , 何 反 云 不 <START> 汝父年将半百,再无续室之意,且汝多病,年又极小,上无亲母教养,下无姊妹兄弟扶持,今依傍外祖母及舅氏姊妹去,正好减我顾盼之忧,何反云不往? <END>',
  '<ATTR_WORDS> 敕造宁国府 <CON_START> 敕 造 宁 <START> 敕造宁国府 <END>',
  '<ATTR_WORDS> 刚才老太太还念呢,可巧就来了 <CON_START> 刚 才 老 太 太 还 念 可 巧 就 来 . <START> 刚才老太太还念呢,可巧就来了 . <END>',
  '<ATTR_WORDS> 这是你大舅母, 这是你二舅母,这是你先珠大哥的媳妇珠大嫂子. <CON_START> 这 是 你 大 舅 母 这 是 你 二 舅 母 这 是 你 先 珠 大 哥 媳 妇 珠 大 嫂 子 . <START> 这是你大舅母, 这是你二舅母,这是你先珠大哥的媳妇珠大嫂子. <END>'],
 ['<ATTR_WORDS> 私募大佬抛弃了谁？ <CON_START> 大 佬 抛 弃 了 谁 <START> 私募大佬抛弃了谁？ <END>',
  '<ATTR_WORDS> 前上海上港球员孙祥的爱妻，晒出自己近日的自拍照 <CON_START> 前 上 海 上 港 球 孙 祥 的 爱 妻 晒 出 自 己 近 日 的 拍 照 <START> 前上海上港球员孙祥的爱妻，晒出自己近日的自拍照 <END>',
  '<ATTR_WORDS> 首台奇瑞瑞虎8到店，围观人群挤不动，网友：不到10万要卖疯 <CON_START> 首 台 奇 瑞 瑞 虎 8 到 店 围 观 人 群 挤 不 动 网 友 不 到 10 万 要 卖 疯 <START> 首台奇瑞瑞虎8到店，围观人群挤不动，网友：不到10万要卖疯 <END>',
  '<ATTR

In [13]:
# get content
train0_con = [get_train_content(x) for x in train0_processed]
train1_con = [get_train_content(x) for x in train1_processed]

In [14]:
train0_con[:4], train1_con[:4]

(['汝 父 年 将 半 百 , 再 无 续 室 之 意 , 且 汝 多 病 年 又 极 小 上 无 亲 母 教 养 下 无 姊 妹 兄 弟 扶 持 , 今 依 傍 外 祖 母 及 舅 氏 姊 妹 去 , 正 好 减 我 顾 盼 之 忧 , 何 反 云 不',
  '敕 造 宁',
  '刚 才 老 太 太 还 念 可 巧 就 来 .',
  '这 是 你 大 舅 母 这 是 你 二 舅 母 这 是 你 先 珠 大 哥 媳 妇 珠 大 嫂 子 .'],
 ['大 佬 抛 弃 了 谁',
  '前 上 海 上 港 球 孙 祥 的 爱 妻 晒 出 自 己 近 日 的 拍 照',
  '首 台 奇 瑞 瑞 虎 8 到 店 围 观 人 群 挤 不 动 网 友 不 到 10 万 要 卖 疯',
  '王 羲 之 集 字 《 圣 教 序 》 很 多 字 东 倒 西 歪 如 何 理 解 其 中 的 美 感'])

In [15]:
# Fatch attributes from the training data
attrs_neg = [get_train_attrs(x) for x in train0_processed]
attrs_pos = [get_train_attrs(x) for x in train1_processed]

In [16]:
# Get TFIDF vectors for Training and Reference
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)

#### AnnoyIndex is used to store the TFIDF vectors of training set and retrieve nearest neighbours of the reference content 

In [17]:
from annoy import AnnoyIndex

ModuleNotFoundError: No module named 'annoy'

In [43]:
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])

In [26]:
# We have randomly selected training samples to control the memory usage
neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000, replace=False)
pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000, replace=False)

In [32]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(neg_idxs)):
    np_array = conts_neg_vecs[neg_idxs[i]].toarray()[0]
    train0_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [06:20<00:00, 131.31it/s]


In [34]:
train0_tree.build(50)
train0_tree.save('tfidf_train0.ann')

True

In [41]:
ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])

(['it is a and just takes a second .',
  'this is definitely the with the num _ extend',
  'in num _ num days in bulk packaging exactly like .'],
 'sneak caught impossible')

In [42]:
with open("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_ref_vecs.shape[0]):
        x = conts_from_pos_ref_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref1_con[i]
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(out_str)
        out_fp.write(out_str)

<ATTR_WORDS> usb charger interface num_num num_extend <CON_START> it is a and just takes a second . <START>

<ATTR_WORDS> blended would work <CON_START> this is definitely the with the num _ extend <START>

<ATTR_WORDS> why bought <CON_START> in num _ num days in bulk packaging exactly like . <START>

<ATTR_WORDS> impeller does mulch <CON_START> i bought three of these to in our home . <START>

<ATTR_WORDS> case volume keys <CON_START> i this and was happy with my . <START>

<ATTR_WORDS> linksys router num_num <CON_START> went looking them and you had them . <START>

<ATTR_WORDS> color rubs hands <CON_START> with medium its to num _ num hours . <START>

<ATTR_WORDS> headband flimsy use <CON_START> i don t want the of an otterbox . <START>

<ATTR_WORDS> clear gbs style <CON_START> i beef bolognese in the oven and it out wonderfully . <START>

<ATTR_WORDS> better tighter more <CON_START> , which i did not view as all that . <START>

<ATTR_WORDS> band long <CON_START> is , with passwords 

<ATTR_WORDS> product for <CON_START> these small cookie perfectly two inch cookies . <START>

<ATTR_WORDS> motorola num_extend radios with <CON_START> to on and i have no problems . <START>

<ATTR_WORDS> would rave review <CON_START> do your homework and you will end up pretty . <START>

<ATTR_WORDS> returning too short <CON_START> i can t believe how this guy is . <START>

<ATTR_WORDS> shot num_num low bullets <CON_START> i the oster , it everything it claim to do . <START>

<ATTR_WORDS> ticking num_num time left <CON_START> the came out and fluffy wished i had had earlier . <START>

<ATTR_WORDS> controller work with <CON_START> the finer the setting , the it . <START>

<ATTR_WORDS> corded hand fit <CON_START> i think this case is a very phone . <START>

<ATTR_WORDS> nice shoe slip <CON_START> i this with afresh to my cook top . <START>

<ATTR_WORDS> sunburn burned <CON_START> these wine stoppers are very inexpensive and certainly the . <START>

<ATTR_WORDS> coffee instant coffees <CO

<ATTR_WORDS> irrelevant smartphone <CON_START> it easily when you are in the num _ extend <START>

<ATTR_WORDS> game played xbox <CON_START> nothing to say about this at all . <START>

<ATTR_WORDS> keep order again <CON_START> i would this to my friends . <START>

<ATTR_WORDS> eager to try <CON_START> this , wasn t near as as some were saying . <START>

<ATTR_WORDS> love brands <CON_START> the cord is extra heavy to . <START>

<ATTR_WORDS> job <CON_START> for the , weight and price it is really it . <START>

<ATTR_WORDS> not set <CON_START> i have been out ! this thing is so . <START>

<ATTR_WORDS> used new batteries <CON_START> also than going to the store . <START>

<ATTR_WORDS> using using pantene <CON_START> like that rotisserrie , you set it and forget it . <START>

<ATTR_WORDS> received band num_extend birthday <CON_START> this exactly what it is designed to do . <START>

<ATTR_WORDS> no adjust problem <CON_START> i feel like i have a . <START>

<ATTR_WORDS> tiring small valves n

<ATTR_WORDS> device ssid router <CON_START> my pampered chef garlic with this one . <START>

<ATTR_WORDS> plus might cheaper <CON_START> this is , the sound quality is truly amazing . <START>

<ATTR_WORDS> avoid from after <CON_START> the tiny green light on one is very when fully . <START>

<ATTR_WORDS> did mix with <CON_START> i had it a long time now and i it . <START>

<ATTR_WORDS> which emailed refund <CON_START> no on my when cooking up a steak or falafel . <START>

<ATTR_WORDS> taste baked dry <CON_START> wow im with this device ! how it is . <START>

<ATTR_WORDS> could adjusted view <CON_START> the glass cuisinart pour water everywhere in a wet . <START>

<ATTR_WORDS> tastes horrible give <CON_START> i not have what i felt were unrealistic of this . <START>

<ATTR_WORDS> card went problems <CON_START> i think this do a bird than num _ num pounds . <START>

<ATTR_WORDS> rapid whitening kits <CON_START> it s very and up very little on the pot rack . <START>

<ATTR_WORDS> watch ti

<ATTR_WORDS> turned phone would <CON_START> we often hot dogs and hamburgers in it as well . <START>

<ATTR_WORDS> product fine packaging <CON_START> i have , after num _ num months . <START>

<ATTR_WORDS> everything uppers fake <CON_START> i think the is reading carpet fibers its up as . <START>

<ATTR_WORDS> annoying coffee months <CON_START> the got a tad after running it a cycle . <START>

<ATTR_WORDS> price to high <CON_START> if you want get the cdn num _ extend count down big . <START>

<ATTR_WORDS> lock mounting lock <CON_START> much to my , the invisishield was nothing but . <START>

<ATTR_WORDS> optics looking plastic <CON_START> another thing i really is that it does not have . <START>

<ATTR_WORDS> pass reciever td <CON_START> it s very to use and very . <START>

<ATTR_WORDS> return shipping more <CON_START> now i just to buy a catch for the tops . <START>

<ATTR_WORDS> cats touch stuff <CON_START> the voice from the earpiece says on . <START>

<ATTR_WORDS> writing sloppy <

<ATTR_WORDS> luckily back num_extend did <CON_START> cutting make your pies look as as they taste . <START>

<ATTR_WORDS> of vitamin immune <CON_START> it right the job with no trouble . <START>

<ATTR_WORDS> work for peel <CON_START> , solid build quality and blocks out the . <START>

<ATTR_WORDS> gm mode ticket <CON_START> the roomba just it s and doesn t up on it . <START>

<ATTR_WORDS> would nothing would <CON_START> , and the battery life is pretty . <START>

<ATTR_WORDS> restaurant love both <CON_START> i was to see it was available . <START>

<ATTR_WORDS> holds too <CON_START> this is being for dollar size pancakes . <START>

<ATTR_WORDS> brown taste dramatic <CON_START> this is s right now , so it has to . <START>

<ATTR_WORDS> num_num pack opened <CON_START> despite the breaking it num _ extend . <START>

<ATTR_WORDS> uses products linksys <CON_START> it in a really small place and is built to . <START>

<ATTR_WORDS> attempt sweet cookbook <CON_START> we that it straight from 

In [44]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)


  0%|          | 0/50000 [00:00<?, ?it/s][A
  0%|          | 14/50000 [00:00<06:14, 133.40it/s][A
  0%|          | 28/50000 [00:00<06:14, 133.55it/s][A
  0%|          | 41/50000 [00:00<06:22, 130.65it/s][A
  0%|          | 54/50000 [00:00<06:29, 128.27it/s][A
  0%|          | 67/50000 [00:00<06:33, 126.99it/s][A
  0%|          | 80/50000 [00:00<06:35, 126.30it/s][A
  0%|          | 93/50000 [00:00<06:37, 125.53it/s][A
  0%|          | 106/50000 [00:00<06:38, 125.12it/s][A
  0%|          | 119/50000 [00:00<06:37, 125.33it/s][A
  0%|          | 132/50000 [00:01<06:38, 125.08it/s][A
  0%|          | 145/50000 [00:01<06:51, 121.21it/s][A
  0%|          | 158/50000 [00:01<06:47, 122.17it/s][A
  0%|          | 171/50000 [00:01<06:46, 122.55it/s][A
  0%|          | 184/50000 [00:01<06:50, 121.28it/s][A
  0%|          | 197/50000 [00:01<06:48, 121.80it/s][A
  0%|          | 210/50000 [00:01<06:47, 122.33it/s][A
  0%|          | 223/50000 [00:01<06:46, 122.57it/s][A
  0%|    

  8%|▊         | 3946/50000 [00:30<05:43, 134.17it/s][A
  8%|▊         | 3960/50000 [00:30<05:43, 133.99it/s][A
  8%|▊         | 3974/50000 [00:30<05:43, 134.19it/s][A
  8%|▊         | 3988/50000 [00:30<05:43, 133.92it/s][A
  8%|▊         | 4002/50000 [00:30<05:42, 134.33it/s][A
  8%|▊         | 4016/50000 [00:30<05:42, 134.32it/s][A
  8%|▊         | 4030/50000 [00:30<05:43, 133.95it/s][A
  8%|▊         | 4044/50000 [00:30<05:43, 133.71it/s][A
  8%|▊         | 4058/50000 [00:31<05:43, 133.72it/s][A
  8%|▊         | 4072/50000 [00:31<05:42, 134.16it/s][A
  8%|▊         | 4086/50000 [00:31<05:42, 133.92it/s][A
  8%|▊         | 4100/50000 [00:31<05:43, 133.56it/s][A
  8%|▊         | 4114/50000 [00:31<05:43, 133.69it/s][A
  8%|▊         | 4128/50000 [00:31<05:45, 132.80it/s][A
  8%|▊         | 4142/50000 [00:31<05:45, 132.83it/s][A
  8%|▊         | 4156/50000 [00:31<05:44, 132.96it/s][A
  8%|▊         | 4170/50000 [00:31<05:45, 132.64it/s][A
  8%|▊         | 4184/50000 [00

 16%|█▌        | 7950/50000 [01:00<05:10, 135.23it/s][A
 16%|█▌        | 7964/50000 [01:00<05:11, 135.00it/s][A
 16%|█▌        | 7978/50000 [01:00<05:11, 135.00it/s][A
 16%|█▌        | 7992/50000 [01:00<05:12, 134.27it/s][A
 16%|█▌        | 8006/50000 [01:00<05:13, 134.02it/s][A
 16%|█▌        | 8020/50000 [01:00<05:13, 134.02it/s][A
 16%|█▌        | 8034/50000 [01:00<05:12, 134.39it/s][A
 16%|█▌        | 8048/50000 [01:01<05:12, 134.27it/s][A
 16%|█▌        | 8062/50000 [01:01<05:12, 134.12it/s][A
 16%|█▌        | 8076/50000 [01:01<05:11, 134.80it/s][A
 16%|█▌        | 8090/50000 [01:01<05:10, 134.94it/s][A
 16%|█▌        | 8104/50000 [01:01<05:09, 135.40it/s][A
 16%|█▌        | 8118/50000 [01:01<05:09, 135.33it/s][A
 16%|█▋        | 8132/50000 [01:01<05:08, 135.72it/s][A
 16%|█▋        | 8146/50000 [01:01<05:09, 135.25it/s][A
 16%|█▋        | 8160/50000 [01:01<05:08, 135.70it/s][A
 16%|█▋        | 8174/50000 [01:02<05:08, 135.36it/s][A
 16%|█▋        | 8188/50000 [01

 24%|██▍       | 11926/50000 [01:30<04:44, 133.87it/s][A
 24%|██▍       | 11940/50000 [01:30<04:44, 133.75it/s][A
 24%|██▍       | 11954/50000 [01:30<04:44, 133.82it/s][A
 24%|██▍       | 11968/50000 [01:30<04:44, 133.66it/s][A
 24%|██▍       | 11982/50000 [01:30<04:44, 133.75it/s][A
 24%|██▍       | 11996/50000 [01:30<04:44, 133.77it/s][A
 24%|██▍       | 12010/50000 [01:30<04:43, 134.18it/s][A
 24%|██▍       | 12024/50000 [01:30<04:43, 133.99it/s][A
 24%|██▍       | 12038/50000 [01:30<04:43, 133.96it/s][A
 24%|██▍       | 12052/50000 [01:31<04:42, 134.16it/s][A
 24%|██▍       | 12066/50000 [01:31<04:43, 134.02it/s][A
 24%|██▍       | 12080/50000 [01:31<04:44, 133.48it/s][A
 24%|██▍       | 12094/50000 [01:31<04:43, 133.68it/s][A
 24%|██▍       | 12108/50000 [01:31<04:42, 133.92it/s][A
 24%|██▍       | 12122/50000 [01:31<04:43, 133.82it/s][A
 24%|██▍       | 12136/50000 [01:31<07:28, 84.49it/s] [A
 24%|██▍       | 12150/50000 [01:31<06:39, 94.85it/s][A
 24%|██▍       

 32%|███▏      | 15874/50000 [02:00<05:01, 113.35it/s][A
 32%|███▏      | 15888/50000 [02:00<04:48, 118.36it/s][A
 32%|███▏      | 15902/50000 [02:00<04:38, 122.33it/s][A
 32%|███▏      | 15916/50000 [02:00<04:32, 125.07it/s][A
 32%|███▏      | 15930/50000 [02:01<04:26, 127.62it/s][A
 32%|███▏      | 15944/50000 [02:01<04:22, 129.55it/s][A
 32%|███▏      | 15958/50000 [02:01<04:20, 130.83it/s][A
 32%|███▏      | 15972/50000 [02:01<04:17, 132.16it/s][A
 32%|███▏      | 15986/50000 [02:01<04:16, 132.50it/s][A
 32%|███▏      | 16000/50000 [02:01<04:15, 133.15it/s][A
 32%|███▏      | 16014/50000 [02:01<04:15, 133.10it/s][A
 32%|███▏      | 16028/50000 [02:01<04:13, 133.78it/s][A
 32%|███▏      | 16042/50000 [02:01<04:14, 133.20it/s][A
 32%|███▏      | 16056/50000 [02:01<04:13, 133.70it/s][A
 32%|███▏      | 16070/50000 [02:02<04:12, 134.21it/s][A
 32%|███▏      | 16084/50000 [02:02<04:13, 133.67it/s][A
 32%|███▏      | 16098/50000 [02:02<04:13, 133.49it/s][A
 32%|███▏     

 40%|███▉      | 19822/50000 [02:30<03:45, 134.02it/s][A
 40%|███▉      | 19836/50000 [02:30<03:45, 133.51it/s][A
 40%|███▉      | 19850/50000 [02:30<03:45, 133.89it/s][A
 40%|███▉      | 19864/50000 [02:30<03:45, 133.72it/s][A
 40%|███▉      | 19878/50000 [02:30<03:45, 133.72it/s][A
 40%|███▉      | 19892/50000 [02:30<03:44, 133.98it/s][A
 40%|███▉      | 19906/50000 [02:30<03:45, 133.70it/s][A
 40%|███▉      | 19920/50000 [02:30<03:44, 134.08it/s][A
 40%|███▉      | 19934/50000 [02:30<03:44, 133.70it/s][A
 40%|███▉      | 19948/50000 [02:31<03:44, 133.85it/s][A
 40%|███▉      | 19962/50000 [02:31<03:44, 133.60it/s][A
 40%|███▉      | 19976/50000 [02:31<03:44, 133.66it/s][A
 40%|███▉      | 19990/50000 [02:31<03:44, 133.48it/s][A
 40%|████      | 20004/50000 [02:31<03:44, 133.60it/s][A
 40%|████      | 20018/50000 [02:31<03:44, 133.32it/s][A
 40%|████      | 20032/50000 [02:31<03:44, 133.78it/s][A
 40%|████      | 20046/50000 [02:31<03:44, 133.51it/s][A
 40%|████     

 48%|████▊     | 23770/50000 [02:59<03:15, 133.89it/s][A
 48%|████▊     | 23784/50000 [03:00<03:16, 133.62it/s][A
 48%|████▊     | 23798/50000 [03:00<03:17, 132.95it/s][A
 48%|████▊     | 23812/50000 [03:00<03:16, 133.27it/s][A
 48%|████▊     | 23826/50000 [03:00<03:16, 133.48it/s][A
 48%|████▊     | 23840/50000 [03:00<03:16, 133.43it/s][A
 48%|████▊     | 23854/50000 [03:00<03:15, 133.49it/s][A
 48%|████▊     | 23868/50000 [03:00<03:15, 133.48it/s][A
 48%|████▊     | 23882/50000 [03:00<03:15, 133.58it/s][A
 48%|████▊     | 23896/50000 [03:00<03:15, 133.44it/s][A
 48%|████▊     | 23910/50000 [03:01<03:15, 133.38it/s][A
 48%|████▊     | 23924/50000 [03:01<03:15, 133.17it/s][A
 48%|████▊     | 23938/50000 [03:01<03:14, 133.75it/s][A
 48%|████▊     | 23952/50000 [03:01<03:15, 133.52it/s][A
 48%|████▊     | 23966/50000 [03:01<03:14, 133.84it/s][A
 48%|████▊     | 23980/50000 [03:01<03:14, 133.76it/s][A
 48%|████▊     | 23994/50000 [03:01<03:14, 134.04it/s][A
 48%|████▊    

 55%|█████▌    | 27718/50000 [03:30<02:46, 133.78it/s][A
 55%|█████▌    | 27732/50000 [03:30<02:46, 133.36it/s][A
 55%|█████▌    | 27746/50000 [03:30<02:46, 133.69it/s][A
 56%|█████▌    | 27760/50000 [03:30<02:46, 133.96it/s][A
 56%|█████▌    | 27774/50000 [03:30<02:45, 134.64it/s][A
 56%|█████▌    | 27788/50000 [03:30<02:44, 134.84it/s][A
 56%|█████▌    | 27802/50000 [03:30<02:45, 134.39it/s][A
 56%|█████▌    | 27816/50000 [03:31<02:44, 135.18it/s][A
 56%|█████▌    | 27830/50000 [03:31<02:44, 135.14it/s][A
 56%|█████▌    | 27844/50000 [03:31<02:44, 134.53it/s][A
 56%|█████▌    | 27858/50000 [03:31<02:44, 134.75it/s][A
 56%|█████▌    | 27872/50000 [03:31<02:44, 134.54it/s][A
 56%|█████▌    | 27886/50000 [03:31<02:43, 135.23it/s][A
 56%|█████▌    | 27900/50000 [03:31<02:43, 134.85it/s][A
 56%|█████▌    | 27914/50000 [03:31<02:43, 135.36it/s][A
 56%|█████▌    | 27928/50000 [03:31<02:43, 135.13it/s][A
 56%|█████▌    | 27942/50000 [03:31<02:43, 134.91it/s][A
 56%|█████▌   

 63%|██████▎   | 31666/50000 [03:59<02:16, 134.21it/s][A
 63%|██████▎   | 31680/50000 [03:59<02:16, 134.49it/s][A
 63%|██████▎   | 31694/50000 [03:59<02:16, 134.56it/s][A
 63%|██████▎   | 31708/50000 [04:00<02:16, 134.40it/s][A
 63%|██████▎   | 31722/50000 [04:00<02:16, 134.22it/s][A
 63%|██████▎   | 31736/50000 [04:00<02:16, 134.25it/s][A
 64%|██████▎   | 31750/50000 [04:00<02:15, 134.35it/s][A
 64%|██████▎   | 31764/50000 [04:00<02:15, 134.13it/s][A
 64%|██████▎   | 31778/50000 [04:00<02:15, 134.15it/s][A
 64%|██████▎   | 31792/50000 [04:00<02:16, 133.32it/s][A
 64%|██████▎   | 31806/50000 [04:00<02:16, 133.56it/s][A
 64%|██████▎   | 31820/50000 [04:00<02:15, 133.97it/s][A
 64%|██████▎   | 31834/50000 [04:00<02:16, 133.44it/s][A
 64%|██████▎   | 31848/50000 [04:01<02:16, 133.24it/s][A
 64%|██████▎   | 31862/50000 [04:01<02:16, 132.55it/s][A
 64%|██████▍   | 31876/50000 [04:01<02:16, 132.43it/s][A
 64%|██████▍   | 31890/50000 [04:01<02:16, 132.23it/s][A
 64%|██████▍  

 71%|███████   | 35614/50000 [04:30<01:47, 134.17it/s][A
 71%|███████▏  | 35628/50000 [04:30<01:47, 133.76it/s][A
 71%|███████▏  | 35642/50000 [04:30<01:46, 134.22it/s][A
 71%|███████▏  | 35656/50000 [04:30<01:47, 134.03it/s][A
 71%|███████▏  | 35670/50000 [04:30<01:47, 133.75it/s][A
 71%|███████▏  | 35684/50000 [04:30<01:47, 133.53it/s][A
 71%|███████▏  | 35698/50000 [04:31<01:47, 133.45it/s][A
 71%|███████▏  | 35712/50000 [04:31<01:46, 133.81it/s][A
 71%|███████▏  | 35726/50000 [04:31<01:46, 133.56it/s][A
 71%|███████▏  | 35740/50000 [04:31<01:46, 133.99it/s][A
 72%|███████▏  | 35754/50000 [04:31<01:46, 134.08it/s][A
 72%|███████▏  | 35768/50000 [04:31<01:46, 133.83it/s][A
 72%|███████▏  | 35782/50000 [04:31<01:45, 134.26it/s][A
 72%|███████▏  | 35796/50000 [04:31<01:46, 133.90it/s][A
 72%|███████▏  | 35810/50000 [04:31<01:45, 134.06it/s][A
 72%|███████▏  | 35824/50000 [04:31<01:45, 133.87it/s][A
 72%|███████▏  | 35838/50000 [04:32<01:45, 134.12it/s][A
 72%|███████▏ 

 79%|███████▉  | 39562/50000 [04:59<01:18, 132.90it/s][A
 79%|███████▉  | 39576/50000 [05:00<01:18, 133.06it/s][A
 79%|███████▉  | 39590/50000 [05:00<01:18, 132.58it/s][A
 79%|███████▉  | 39604/50000 [05:00<01:18, 132.34it/s][A
 79%|███████▉  | 39618/50000 [05:00<01:18, 132.35it/s][A
 79%|███████▉  | 39632/50000 [05:00<01:18, 132.21it/s][A
 79%|███████▉  | 39646/50000 [05:00<01:18, 132.34it/s][A
 79%|███████▉  | 39660/50000 [05:00<01:17, 133.17it/s][A
 79%|███████▉  | 39674/50000 [05:00<01:17, 132.90it/s][A
 79%|███████▉  | 39688/50000 [05:00<01:17, 133.40it/s][A
 79%|███████▉  | 39702/50000 [05:01<01:17, 133.31it/s][A
 79%|███████▉  | 39716/50000 [05:01<01:16, 133.74it/s][A
 79%|███████▉  | 39730/50000 [05:01<01:16, 133.97it/s][A
 79%|███████▉  | 39744/50000 [05:01<01:16, 134.13it/s][A
 80%|███████▉  | 39758/50000 [05:01<01:16, 134.38it/s][A
 80%|███████▉  | 39772/50000 [05:01<01:15, 134.65it/s][A
 80%|███████▉  | 39786/50000 [05:01<01:15, 134.43it/s][A
 80%|███████▉ 

 87%|████████▋ | 43510/50000 [05:29<00:48, 133.84it/s][A
 87%|████████▋ | 43524/50000 [05:29<00:48, 133.76it/s][A
 87%|████████▋ | 43538/50000 [05:29<00:48, 133.90it/s][A
 87%|████████▋ | 43552/50000 [05:29<00:48, 133.80it/s][A
 87%|████████▋ | 43566/50000 [05:29<00:48, 134.03it/s][A
 87%|████████▋ | 43580/50000 [05:29<00:47, 133.93it/s][A
 87%|████████▋ | 43594/50000 [05:29<00:47, 134.03it/s][A
 87%|████████▋ | 43608/50000 [05:30<00:47, 133.71it/s][A
 87%|████████▋ | 43622/50000 [05:30<00:47, 133.94it/s][A
 87%|████████▋ | 43636/50000 [05:30<00:47, 133.96it/s][A
 87%|████████▋ | 43650/50000 [05:30<00:47, 134.16it/s][A
 87%|████████▋ | 43664/50000 [05:30<00:47, 133.94it/s][A
 87%|████████▋ | 43678/50000 [05:30<00:47, 133.62it/s][A
 87%|████████▋ | 43692/50000 [05:30<00:47, 133.46it/s][A
 87%|████████▋ | 43706/50000 [05:30<00:47, 133.36it/s][A
 87%|████████▋ | 43720/50000 [05:30<00:47, 133.41it/s][A
 87%|████████▋ | 43734/50000 [05:31<00:46, 133.37it/s][A
 87%|████████▋

 95%|█████████▍| 47454/50000 [06:01<00:18, 135.49it/s][A
 95%|█████████▍| 47468/50000 [06:01<00:18, 135.71it/s][A
 95%|█████████▍| 47482/50000 [06:01<00:18, 135.73it/s][A
 95%|█████████▍| 47496/50000 [06:01<00:18, 135.60it/s][A
 95%|█████████▌| 47510/50000 [06:01<00:18, 135.53it/s][A
 95%|█████████▌| 47524/50000 [06:01<00:18, 135.60it/s][A
 95%|█████████▌| 47538/50000 [06:01<00:18, 135.55it/s][A
 95%|█████████▌| 47552/50000 [06:01<00:18, 135.74it/s][A
 95%|█████████▌| 47566/50000 [06:02<00:17, 135.60it/s][A
 95%|█████████▌| 47580/50000 [06:02<00:17, 135.93it/s][A
 95%|█████████▌| 47594/50000 [06:02<00:17, 136.02it/s][A
 95%|█████████▌| 47608/50000 [06:02<00:17, 135.99it/s][A
 95%|█████████▌| 47622/50000 [06:02<00:17, 136.17it/s][A
 95%|█████████▌| 47636/50000 [06:02<00:17, 136.01it/s][A
 95%|█████████▌| 47650/50000 [06:02<00:17, 136.44it/s][A
 95%|█████████▌| 47664/50000 [06:02<00:17, 136.15it/s][A
 95%|█████████▌| 47678/50000 [06:02<00:17, 136.55it/s][A
 95%|█████████

In [45]:
train1_tree.build(50)
train1_tree.save('tfidf_train1.ann')

True

In [48]:
with open("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_ref_vecs.shape[0]):
        x = conts_from_neg_ref_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(i, out_str)
        out_fp.write(out_str)

0 <ATTR_WORDS> do pain <CON_START> this is honestly the only case i ve away in the . <START>

1 <ATTR_WORDS> steams wantons dream <CON_START> on what otherwise has been a reliable company with several products . <START>

2 <ATTR_WORDS> first carpet cleaner <CON_START> it s like i got an extension and male on both . <START>

3 <ATTR_WORDS> num_extend batter woes over <CON_START> it was supposed to be a . <START>

4 <ATTR_WORDS> not work for <CON_START> so i guess it s really a of . <START>

5 <ATTR_WORDS> toss back <CON_START> this is from what i expected . <START>

6 <ATTR_WORDS> adjustment better tuning <CON_START> obviously we all have our own for . <START>

7 <ATTR_WORDS> perhaps some <CON_START> the seem rather and well , except one problem . <START>

8 <ATTR_WORDS> put coffee chemex <CON_START> i am actually to open the jars . <START>

9 <ATTR_WORDS> battery phone complaints <CON_START> these great little night for the kids . <START>

10 <ATTR_WORDS> find use <CON_START> i don t w

86 <ATTR_WORDS> clear sound <CON_START> i have _ num bird from poles in my front and back yard . <START>

87 <ATTR_WORDS> specks works great <CON_START> very designed and easy to out . <START>

88 <ATTR_WORDS> spatter covers <CON_START> i had high for this since i bruise easily . <START>

89 <ATTR_WORDS> always wash iron <CON_START> the a aborb water at all . <START>

90 <ATTR_WORDS> keeps hot finish <CON_START> the radtach procable num _ num inch only part way . <START>

91 <ATTR_WORDS> cookies without <CON_START> they are and memory than the older ones . <START>

92 <ATTR_WORDS> worry break piece <CON_START> the is not but it might just be the one i . <START>

93 <ATTR_WORDS> bigger ones <CON_START> for some reason it for me . <START>

94 <ATTR_WORDS> expensive films phone <CON_START> that does not hold for this . <START>

95 <ATTR_WORDS> brita filters claims <CON_START> this is a item that makes sense at all . <START>

96 <ATTR_WORDS> utility <CON_START> since that time i have the .

169 <ATTR_WORDS> one man inferior <CON_START> most operations there is a noticable of num _ extend to num _ num . <START>

170 <ATTR_WORDS> pressure cooker pressurecookerrecipes <CON_START> trying to to a different screen turns it off . <START>

171 <ATTR_WORDS> get cleaning tabs <CON_START> soe decided to push it live a whole num _ num than that . <START>

172 <ATTR_WORDS> blenders chopped ice <CON_START> all of them are , even decent polyester . <START>

173 <ATTR_WORDS> stove backup power <CON_START> can t imagine how it you heat style your . <START>

174 <ATTR_WORDS> better beautiful case <CON_START> i recommend this item for who chew . <START>

175 <ATTR_WORDS> white haze perfect <CON_START> to a two year old why her keeps breaking . <START>

176 <ATTR_WORDS> screen protectors impressed <CON_START> the main to is cinnamon and the slight sweetness of stevia . <START>

177 <ATTR_WORDS> garlic press for <CON_START> this was not in condition when delivered . <START>

178 <ATTR_WORDS> 

253 <ATTR_WORDS> easy clean helpful <CON_START> i like as a rule , so this was a . <START>

254 <ATTR_WORDS> convenience voice dialing <CON_START> don buy this unless you like replacing . <START>

255 <ATTR_WORDS> tonight marinated beer <CON_START> that is up until one day the stopped working . <START>

256 <ATTR_WORDS> pull spinner <CON_START> i thought the was good ; thinks otherwise . <START>

257 <ATTR_WORDS> garlic feels <CON_START> my dog raw and cause bleeding . <START>

258 <ATTR_WORDS> thing huge store <CON_START> knows how to design a very good . <START>

259 <ATTR_WORDS> effective does carpets <CON_START> it for me hopefully other people will have better luck . <START>

260 <ATTR_WORDS> great tham <CON_START> the are over a foot long after being in . <START>

261 <ATTR_WORDS> helps stable <CON_START> overall we will be purchasing this again . <START>

262 <ATTR_WORDS> does cleanup snap <CON_START> sliced in lieu of french on a fruit and cheese plate . <START>

263 <ATTR_WORD

337 <ATTR_WORDS> straining oil patience <CON_START> would have been alot if it simply in with the correct . <START>

338 <ATTR_WORDS> then best coffee <CON_START> i ve it once and i won be using it . <START>

339 <ATTR_WORDS> revise review <CON_START> these run quite even though they are . <START>

340 <ATTR_WORDS> like using to <CON_START> i loved the my as you can read below . <START>

341 <ATTR_WORDS> to works perfectly <CON_START> i will always have , no matter what . <START>

342 <ATTR_WORDS> use rice for <CON_START> this is at all from a brand like sony . <START>

343 <ATTR_WORDS> num_num to num_num minutes done <CON_START> too and there are products out there in single units . <START>

344 <ATTR_WORDS> bought spoon utensils <CON_START> a definite num _ for how they this . <START>

345 <ATTR_WORDS> got been opened <CON_START> i m selling to make for something . <START>

346 <ATTR_WORDS> anything like try <CON_START> one major complaint is the micromanagment of . <START>

347 <ATT

422 <ATTR_WORDS> one hell handles <CON_START> i recommend this for anyone trying to fix a leaky . <START>

423 <ATTR_WORDS> num_extend num_extend charge extended batteries <CON_START> the original unreal tournament is the very of . <START>

424 <ATTR_WORDS> product recommend <CON_START> however , i feel iv is most definitely an overall . <START>

425 <ATTR_WORDS> charging units chargers <CON_START> the was considerably quality than pictured . <START>

426 <ATTR_WORDS> salad shooter replacement <CON_START> it is then no lube because it soooo . <START>

427 <ATTR_WORDS> looks vintage rustic <CON_START> i thought this was the plastic screen and it isn t . <START>

428 <ATTR_WORDS> great for <CON_START> i was surprised as the to be in good . <START>

429 <ATTR_WORDS> case nokia num_extend phone <CON_START> i ve not had this my wusthofs . <START>

430 <ATTR_WORDS> do rediscover cooking <CON_START> this is worth the money and the brand name is . <START>

431 <ATTR_WORDS> existed without drai