**Building the annotation template spreadsheets**

In [1]:
import os
import csv
import json

# Tab delimiter; not referenced by any of the cells visible in this notebook.
TSV_SEPERATOR = "\t"

# Delimiter between a token and its IOB tag inside one tagged item ("token//tag").
BIOTAG_SEPERATOR = "//"
# Delimiter between tokens of a sentence and between items of a tag string.
TOKEN_SEPERATOR = " "
# Delimiter between the sentence and its label list in the written output files.
ANNOTATION_SEPERATOR = "####"
# Delimiter placed between labels inside the bracketed label list.
LABEL_SEPERATOR = ", "

# Maps the polarity code stored on each interim triple to the tag written
# into the output labels.
polarity_annotation_map = {
    "NT": "NEU",
    "NG": "NEG",
    "PO": "POS",
}

In [2]:
def get_elmt_idx(data):
    """
    Locate a single tagged span in a sequence of IOB-tagged items.

    Each item of ``data`` is a ``token<BIOTAG_SEPERATOR>tag`` string. Scanning
    stops at the first "O" tag that follows a "B" tag.

    Returns:
        (start_idx, end_idx): inclusive token indexes of the span, or
        (-1, -1) when no "B" tag is present.

    Raises:
        ValueError: if an item does not contain BIOTAG_SEPERATOR.
    """
    start_idx, end_idx, found = -1, -1, False
    for idx, datum in enumerate(data):
        # Split on the last separator only, so a token that itself contains
        # the separator (e.g. a URL) does not break the two-value unpacking.
        _, tag = datum.rsplit(BIOTAG_SEPERATOR, 1)
        if tag == "B":
            start_idx = idx
            end_idx = idx
            found = True
        elif found:
            if tag == "O":
                break
            # Any non-"O" tag inside an open span extends it. Tracking the end
            # here (instead of only when an "O" is seen) keeps the end correct
            # for a span that runs to the last token.
            end_idx = idx
    return start_idx, end_idx

def get_elmt_idxs(data):
    """
    Locate every tagged span in a sequence of IOB-tagged items.

    Each item of ``data`` is a ``token<BIOTAG_SEPERATOR>tag`` string. A "B"
    tag opens a new span, an "O" tag closes the open span, and any other tag
    extends the open span.

    Returns:
        (start_idxs, end_idxs): two parallel lists holding the inclusive
        start and end token index of each span, in order of appearance.

    Raises:
        ValueError: if an item does not contain BIOTAG_SEPERATOR.
    """
    start_idxs, end_idxs, found = [], [], False
    for idx, datum in enumerate(data):
        # Split on the last separator only, so a token that itself contains
        # the separator (e.g. a URL) does not break the two-value unpacking.
        _, tag = datum.rsplit(BIOTAG_SEPERATOR, 1)
        if tag == "B":
            start_idxs.append(idx)
            end_idxs.append(idx)
            found = True
        elif tag == "O":
            found = False
        elif found:
            # Extend the open span token by token. This keeps the end correct
            # when the span is followed directly by another "B" or reaches
            # the end of the sequence without a closing "O".
            end_idxs[-1] = idx
    return start_idxs, end_idxs

def get_iterate_idx(start_idx, end_idx):
    """
    Return every index from start_idx to end_idx (both inclusive) as a list.
    """
    assert start_idx <= end_idx
    return list(range(start_idx, end_idx + 1))

def parse_interim_implicit(data, valid_only=False):
    """
    Collect, per sentence, the sentiment spans that no triple accounts for.

    For every datum, the spans found in its sentence-level "sent_tags" are
    compared with the span of each triple's "sent_tags"; a sentence-level
    span is dropped when its start index equals the start index of a
    triple's span. Whatever is left is reported as [start, end] pairs.

    When ``valid_only`` is true and the datum's "valid" entry is falsy, the
    sentence is still emitted, with an empty span list.

    Returns:
        A list of [sentence, [[start, end], ...]] entries, one per datum.
    """
    parsed_data = []
    for record in data:
        leftover_spans = []
        if not valid_only or record.get("valid"):
            starts, ends = get_elmt_idxs(
                record.get("sent_tags").split(TOKEN_SEPERATOR)
            )
            for triple in record.get("triples"):
                paired_start, _ = get_elmt_idx(
                    triple.get("sent_tags").split(TOKEN_SEPERATOR)
                )
                # Remove the first sentence-level span starting at the same token.
                if paired_start in starts:
                    position = starts.index(paired_start)
                    del starts[position]
                    del ends[position]
            assert len(starts) == len(ends)
            leftover_spans = [[first, last] for first, last in zip(starts, ends)]

        parsed_data.append([record.get("sentence"), leftover_spans])

    return parsed_data

In [3]:
# Directory layout used by the notebook (paths are relative to the working directory).
RAW_DATA_DIR = "data/raw"
INTERIM_DATA_FILTER_DIR = "data/interim/filter"
INTERIM_DATA_UNFILTER_DIR = "data/interim/unfilter"
PROCESSED_DATA_FILTER_DIR = "data/processed/filter"
PROCESSED_DATA_UNFILTER_DIR = "data/processed/unfilter"

# == Load the unfiltered interim training data and extract the unpaired sentiment spans ==
with open(os.path.join(INTERIM_DATA_UNFILTER_DIR, "train.json")) as f:
    data = json.load(f)

# Parsing does not need the file handle, so it happens after the file is closed.
parsed_datas = parse_interim_implicit(data)

In [4]:
# === Load the skip-label indexes and collect the matching parsed entries ===
import pickle

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("./data/annotation/train_skip_label.pkl", 'rb') as f:
    res = pickle.load(f)

# Entries of parsed_datas whose position is contained in res.
# NOTE(review): presumably these are the implicit-aspect-only reviews - confirm.
only_implicit = [
    datum for position, datum in enumerate(parsed_datas) if position in res
]

In [5]:
# Count every unpaired span across all sentences (one annotation task per span).
counter = sum(len(datum[1]) for datum in parsed_datas)
print("Total Instance to be annotated:", counter)

Total Instance to be annotated: 1362


In [6]:
import pandas as pd

# One entry per unpaired sentiment expression; most of these lists become template columns.
# NOTE: `id` shadows the Python builtin; the name is kept so the cell's globals stay unchanged.
id = []
sents = []
sentiments_exp = []
polarity = []
labels = []
target_repr = []
triplets = []

for row_idx, (sentence, spans) in enumerate(parsed_datas):
    # Rows listed in `res` are left out of the annotation template.
    if row_idx in res:
        continue
    for start, end in spans:
        id.append(row_idx)
        sents.append(sentence)
        # Surface text of the span: tokens start..end inclusive.
        expr = TOKEN_SEPERATOR.join(sentence.split(TOKEN_SEPERATOR)[start:end + 1])
        sentiments_exp.append(expr)
        # Partial label: aspect fixed to [-1], polarity left open for the annotator.
        labels.append(f"([-1], {get_iterate_idx(start, end)}, ")
        polarity.append("-")
        triplets.append("-")
        target_repr.append(f"(hotel, {expr}, ")

template_columns = {
    "id": id,
    "sents": sents,
    "expr": sentiments_exp,
    "polarity": polarity,
    "labels": labels,
    "target_repr": target_repr,
}
df = pd.DataFrame(template_columns)

In [50]:
# Persist the annotation template without the DataFrame index column.
df.to_csv(path_or_buf="annotation-implicit-train-template.csv", index=False)

**Aggregating implicit aspects into the current data**

In [7]:
import pandas as pd

# Load the completed annotation sheet.
# NOTE: this rebinds `data`, which earlier held the parsed interim JSON.
data = pd.read_csv(filepath_or_buffer="annotation-implicit-train-v2.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     820 non-null    int64 
 1   sents                  820 non-null    object
 2   expr                   820 non-null    object
 3   polarity               820 non-null    object
 4   labels                 820 non-null    object
 5   target_repr            820 non-null    object
 6   completed_labels       820 non-null    object
 7   completed_target_repr  797 non-null    object
 8   checkpoint             820 non-null    bool  
dtypes: bool(1), int64(1), object(7)
memory usage: 52.2+ KB


In [8]:
# Number of distinct sentence ids present in the annotation sheet.
len(data["id"].unique())

605

In [9]:
# == Map each sentence id to its completed labels (checkpointed rows only) ==
ids = data.id.unique()
memory = {
    row_id: list(
        data.loc[(data.id == row_id) & (data.checkpoint)].completed_labels.values
    )
    for row_id in ids
}

In [10]:
def combine(triplets_left, triplets_right):
    """
    Return a new list holding the items of triplets_left followed by the
    items of triplets_right.

    NOTE: a later cell in this notebook defines another `combine` with a
    different signature, which replaces this one once that cell has run.
    """
    return [*triplets_left, *triplets_right]

def parse_interim_combined(data, memory, valid_only=False):
    """
    Build the per-sentence label list from the interim triples plus the
    labels stored in ``memory``.

    Each triple becomes the string form of
    (aspect indexes, sentiment indexes, mapped polarity). When the sentence's
    position in ``data`` is a key of ``memory``, the labels stored there are
    appended after the triple-derived ones via ``combine``.

    When ``valid_only`` is true and the datum's "valid" entry is falsy, its
    triples are ignored, but labels from ``memory`` are still attached.

    Returns:
        A list of [sentence, labels] entries, one per datum.
    """
    parsed_data = []
    for position, record in enumerate(data):
        rendered = []
        if not valid_only or record.get("valid"):
            for triple in record.get("triples"):
                aspect_first, aspect_last = get_elmt_idx(
                    triple.get("aspect_tags").split(TOKEN_SEPERATOR)
                )
                opinion_first, opinion_last = get_elmt_idx(
                    triple.get("sent_tags").split(TOKEN_SEPERATOR)
                )
                polarity_code = triple.get("polarity")

                label = (
                    get_iterate_idx(aspect_first, aspect_last),
                    get_iterate_idx(opinion_first, opinion_last),
                    polarity_annotation_map.get(polarity_code),
                )
                rendered.append(str(label))

        if position in memory:
            rendered = combine(rendered, memory[position])
        parsed_data.append([record.get("sentence"), rendered])
    return parsed_data

In [11]:
# == Reload the interim training data and merge in the annotated labels ==
with open(os.path.join(INTERIM_DATA_UNFILTER_DIR, "train.json")) as f:
    data = json.load(f)

# Parsing does not need the file handle, so it happens after the file is closed.
parsed_datas = parse_interim_combined(data, memory)

In [12]:
# Split the [sentence, labels] pairs into two parallel lists.
sents = [datum[0] for datum in parsed_datas]
labels = [datum[1] for datum in parsed_datas]

In [13]:
def write_annotated(sents, labels, target_path):
    """
    Write one line per sentence to target_path in the form
    ``<sentence>####[<label>, <label>, ...]``.

    Args:
        sents: sentences as strings.
        labels: for each sentence, an iterable of labels; each label is
            converted with str() before being joined.
        target_path: file to create or overwrite.
    """
    assert len(sents) == len(labels)
    with open(target_path, "w") as fout:
        for text, label in zip(sents, labels):
            rendered = LABEL_SEPERATOR.join(str(item) for item in label)
            fout.write(text + ANNOTATION_SEPERATOR + "[" + rendered + "]" + "\n")
# Write the combined explicit and annotated labels as the implicit-v2 training file.
write_annotated(
    sents,
    labels,
    target_path=os.path.join("data", "processed", "implicit-v2", "train.txt"),
)

**Aggregating implicit-aspect-only reviews into the current dataset**

In [15]:
# These constants repeat the values assigned in the notebook's first code cell.

# Tab delimiter; not referenced by any of the cells visible in this notebook.
TSV_SEPERATOR = "\t"

# Delimiter between a token and its IOB tag inside one tagged item ("token//tag").
BIOTAG_SEPERATOR = "//"
# Delimiter between tokens of a sentence and between items of a tag string.
TOKEN_SEPERATOR = " "
# Delimiter between the sentence and its label list in the written output files.
ANNOTATION_SEPERATOR = "####"
# Delimiter placed between labels inside the bracketed label list.
LABEL_SEPERATOR = ", "

def load(path):
    """
    Read an annotated CSV and return the rows whose checkpoint is not "False".

    The CSV must provide the columns ``idx``, ``sents``, ``labels`` and
    ``checkpoint``. The ``sents`` and ``labels`` cells hold Python literals
    as text and are evaluated for every row, including rows that are later
    filtered out.

    Returns:
        (filter_idx, filter_sents, filter_labels): three parallel lists.
    """
    df = pd.read_csv(path)

    idx = list(df.idx)
    # NOTE(security): eval executes arbitrary expressions found in the CSV;
    # only use this on trusted files (ast.literal_eval is the safe option
    # when the cells are plain literals).
    sents = [eval(sent) for sent in df.sents]
    labels = [eval(label) for label in df.labels]
    checkpoints = [str(checker) for checker in df.checkpoint]

    # A row is kept unless its checkpoint renders exactly as "False".
    kept_rows = [row for row, flag in enumerate(checkpoints) if flag != "False"]
    filter_idx = [idx[row] for row in kept_rows]
    filter_sents = [sents[row] for row in kept_rows]
    filter_labels = [labels[row] for row in kept_rows]

    assert len(filter_idx) == len(filter_sents) == len(filter_labels)
    return filter_idx, filter_sents, filter_labels

def parse(file, separator):
    """
    Parse annotated lines of the form ``<sentence><separator><labels>``.

    Blank lines are skipped. The sentence is split on whitespace into tokens
    and the label part is evaluated as a Python expression.

    Args:
        file: iterable of text lines (e.g. an open file).
        separator: text separating the sentence from its labels.

    Returns:
        (sents, labels): parallel lists of token lists and evaluated labels.

    Raises:
        ValueError: if a non-blank line does not contain ``separator``.
    """
    sents, labels = [], []
    for line in file:
        line = line.strip()
        if line == "":
            continue
        # Split on the LAST separator only: the labels never contain it, but
        # the sentence text may, and a plain split() would then yield more
        # than two parts and fail to unpack.
        words, targets = line.rsplit(separator, 1)
        sents.append(words.split())
        # NOTE(security): eval executes arbitrary expressions found in the
        # file; only use this on trusted files (ast.literal_eval is the safe
        # option when the labels are plain literals).
        labels.append(eval(targets))
    return sents, labels

def combine(file, seperator, idx, annotated_labels):
    """
    Parse an annotated file and overwrite the labels of selected lines.

    ``idx[i]`` is treated as a 1-based line number (among the non-blank lines
    returned by ``parse``) whose labels are replaced by ``annotated_labels[i]``.

    NOTE: this definition replaces the earlier two-argument ``combine`` in
    this notebook; ``parse_interim_combined`` will call this version if it is
    run after this cell.

    Args:
        file: iterable of text lines (e.g. an open file).
        seperator: text separating the sentence from its labels.
        idx: 1-based line numbers to overwrite.
        annotated_labels: replacement labels, parallel to ``idx``.

    Returns:
        (curr_sents, curr_labels): the parsed sentences and updated labels.

    Raises:
        IndexError: if an idx value is outside 1..number of parsed lines, or
            if ``annotated_labels`` is shorter than ``idx``.
    """
    curr_sents, curr_labels = parse(file, seperator)
    total = len(curr_labels)
    for position, line_no in enumerate(idx):
        target = line_no - 1
        # Reject values below 1 explicitly: a negative target would otherwise
        # wrap around and silently overwrite a line at the end of the file.
        if not 0 <= target < total:
            raise IndexError(
                f"idx value {line_no} is outside the valid range 1..{total}"
            )
        curr_labels[target] = annotated_labels[position]
    return curr_sents, curr_labels

def write_annotated(sents, labels, target_path):
    """
    Write one line per sentence to target_path in the form
    ``<tokens joined by a space>####[<label>, <label>, ...]``.

    NOTE: this definition replaces the earlier ``write_annotated`` in this
    notebook; here each sentence is a sequence of tokens, not a string.

    Args:
        sents: sentences as sequences of token strings.
        labels: for each sentence, an iterable of labels; each label is
            converted with str() before being joined.
        target_path: file to create or overwrite.
    """
    assert len(sents) == len(labels)
    with open(target_path, "w") as fout:
        for tokens, label in zip(sents, labels):
            rendered = LABEL_SEPERATOR.join(str(item) for item in label)
            fout.write(
                " ".join(tokens) + ANNOTATION_SEPERATOR + "[" + rendered + "]" + "\n"
            )

# The training file is read, patched with the reviewed labels, and rewritten in place.
train_txt_path = os.path.join("data", "processed", "implicit-v2", "train.txt")

# Only the row ids and labels of the reviewed annotations are needed here.
idx, _, labels = load(os.path.join("data", "annotation", "train-annotated.csv"))

with open(train_txt_path, 'r') as f:
    sents, labels = combine(f, ANNOTATION_SEPERATOR, idx, labels)

write_annotated(sents, labels, train_txt_path)