-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_to_text_gen.py
81 lines (69 loc) · 3.83 KB
/
convert_to_text_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import sys
# task = "TASD"
# task = sys.argv[1]
def get_opinion_string(task, polarity, aspect_category, target_words):
    """Render one opinion as the text fragment used for the given task.

    The task code selects which of the three pieces appear in the fragment:
    the polarity, the aspect category and/or the target words.  Target words
    are stripped of surrounding whitespace, and only touched for tasks whose
    fragment actually contains them.

    Args:
        task: One of 'TASD', 'AD', 'TD', 'ASD', 'TSD' or 'TAD'.
        polarity: Sentiment label inserted as ``[polarity]``.
        aspect_category: Aspect category inserted as ``[aspect_category]``.
        target_words: Target phrase inserted as ``[target_words]``.

    Returns:
        The formatted fragment, or an empty string for an unrecognised task.
    """
    # Each entry is a thunk so that only the selected template is evaluated.
    renderers = {
        'TASD': lambda: (
            f"[{polarity}] opinion on [{aspect_category}] for [{target_words.strip()}]"
        ),
        'AD': lambda: f"opinion on [{aspect_category}]",
        'TD': lambda: f"opinion for [{target_words.strip()}]",
        'ASD': lambda: f"[{polarity}] opinion on [{aspect_category}]",
        'TSD': lambda: f"[{polarity}] opinion for [{target_words.strip()}]",
        'TAD': lambda: f"opinion on [{aspect_category}] for [{target_words.strip()}]",
    }
    render = renderers.get(task)
    if render is None:
        return ""
    return render()
# Every sentence occupies a fixed-size run of consecutive rows in the TSV.
ROWS_PER_SENTENCE = 36
TASKS = ['TASD', 'AD', 'ASD', 'TD', 'TSD', 'TAD']
PARTITIONS = ["train", "test"]


def _extract_target_phrases(sent_words, target_ner_tags):
    """Collect the target phrases marked by the non-'O' tags.

    Words at consecutive tagged positions are joined with single spaces into
    one phrase; a gap between tagged positions starts a new phrase.

    Args:
        sent_words: The words of the sentence.
        target_ner_tags: One tag per word; any tag other than 'O' marks the
            word at the same position as part of a target.

    Returns:
        The phrases in sentence order, or ``["NULL"]`` when nothing is tagged.

    Raises:
        IndexError: If a tagged position has no corresponding word.
    """
    phrases = []
    current = []
    previous_idx = None
    for idx, tag in enumerate(target_ner_tags):
        if tag == 'O':
            continue
        # A gap since the last tagged word closes the phrase being built.
        if current and idx != previous_idx + 1:
            phrases.append(" ".join(current))
            current = []
        current.append(sent_words[idx])
        previous_idx = idx
    phrases.append(" ".join(current) if current else "NULL")
    return phrases


def _read_sentence_groups(path, rows_per_sentence=ROWS_PER_SENTENCE):
    """Parse a tab-separated file into per-sentence records.

    The first line is skipped as a header and blank lines are ignored.  The
    remaining rows are consumed in runs of ``rows_per_sentence``.  Within a
    row the columns are used as follows: 0 is the sentence id, 1 is a flag
    (only rows whose flag is exactly "1" contribute opinions), 2 holds
    whitespace-separated tokens whose first two form the aspect category and
    whose last is the polarity, 3 is the sentence text and 4 holds one tag
    per word of the sentence.

    Args:
        path: Path of the TSV file to read.
        rows_per_sentence: Number of consecutive rows that make up a sentence.

    Returns:
        A list of ``(sentence_id, sentence_text, opinions)`` tuples, where
        ``opinions`` is a list of ``(polarity, aspect_category, target_words)``
        tuples in file order.

    Raises:
        ValueError: If the rows of one run do not all carry the same sentence.
    """
    # NOTE(review): assumes the TSV files are UTF-8 encoded - confirm.
    with open(path, "r", encoding="utf-8") as fd:
        fd.readline()  # skip the header row
        rows = [line.split("\t") for line in fd if line.strip()]

    groups = []
    for start in range(0, len(rows), rows_per_sentence):
        group = rows[start:start + rows_per_sentence]
        # Explicit check instead of `assert`, which is removed under `-O`.
        if len({row[3] for row in group}) != 1:
            raise ValueError(
                f"{path}: rows {start}-{start + len(group) - 1} (0-based, header "
                f"and blank lines excluded) do not all belong to the same sentence"
            )
        opinions = []
        for row in group:
            if row[1] != "1":
                continue
            label_tokens = row[2].split()
            aspect_category = ' '.join(label_tokens[:2])
            polarity = label_tokens[-1]
            for target_words in _extract_target_phrases(row[3].split(), row[4].split()):
                opinions.append((polarity, aspect_category, target_words))
        groups.append((group[0][0].strip(), group[0][3].strip(), opinions))
    return groups


# Parsing does not depend on the task, so every partition is read only once
# and the parsed records are reused for all tasks.
_parsed_partitions = {
    partition: _read_sentence_groups(f'{partition}_TAS.tsv')
    for partition in PARTITIONS
}

for task in TASKS:
    for partition in PARTITIONS:
        input_text_ids, input_text, target_text = [], [], []
        for sentence_id, sentence_text, opinions in _parsed_partitions[partition]:
            input_text_ids.append(sentence_id)
            input_text.append(sentence_text)
            opinion_list = [
                get_opinion_string(task, polarity, aspect_category, target_words)
                for polarity, aspect_category, target_words in opinions
            ]
            # Auxiliary sentence used as the output of the text generation model.
            target_text.append("The review expressed " + ", ".join(opinion_list))
        df = pd.DataFrame(input_text_ids, columns=["input_text_ids"])
        df["input_text"] = input_text
        df["target_text"] = target_text
        df["prefix"] = task
        # NOTE: writing the result is currently disabled; re-enable to emit the CSVs.
        # df.to_csv(f"{partition}_{task}.csv", header=True, index=False)