In [10]:
import sys
sys.path.append("../")
import os.path as osp
from stanza_utils import *
from glob import glob
from tqdm import tqdm

# Per-domain flat lists, keyed by domain name: POS tags, dependency relations
# and gold labels, one entry per token.
pos, dep, tag = {}, {}, {}
# Maps the polarity strings found in the twitter files to label tags.
sentis = {
    "1": "T-POS",
    "0": "T-NEU",
    "-1": "T-NEG"
}
# Without recursive=True, "**" in a glob pattern behaves like a single "*".
for file in glob("../data/raw/**.train.txt"):
    # The domain is the part of the file name before the first dot.
    domain = osp.basename(file).split('.')[0]
    pos_list, dep_list, tag_list = [], [], []
    # Context manager so the file handle is closed as soon as it has been read.
    with open(file, "r") as fh:
        lines = fh.read().splitlines()
    if domain == "twitter":
        # Twitter files are read as consecutive 3-line records
        # (sentence with a "$T$" placeholder, aspect, polarity) and converted
        # to the "text***labels" format that the shared code below expects.
        ans = []
        for line, aspect, polarity in zip(lines[::3], lines[1::3], lines[2::3]):
            text = line.replace("$T$", aspect)
            words = line.split(' ')
            anns = ['O'] * len(words)
            for i, word in enumerate(words):
                if word != "$T$": continue
                # One label per space-separated aspect word, so the label
                # count matches the text after the placeholder is expanded.
                anns[i] = ' '.join([sentis[polarity]] * len(aspect.split(' ')))
            assert len(words) == len(anns)
            ans.append(f"{text}***{' '.join(anns)}")
        lines = ans
    # Split each line on the last "***": text on the left, labels on the right.
    sentences_list = annotation_plus([line.rsplit("***", maxsplit=1)[0] for line in lines])
    label_list = [line.rsplit("***", maxsplit=1)[1] for line in lines]
    for sentence, labels in tqdm(zip(sentences_list, label_list), total=len(lines), desc=domain):
        # NOTE(review): zip stops at the shorter sequence, so a token/label
        # count mismatch is silently truncated here - confirm that
        # annotation_plus keeps the whitespace tokenization.
        for token, label in zip(sentence.to_dict(), labels.split(' ')):
            pos_list.append(token['xpos'])
            dep_list.append(token['deprel'])
            tag_list.append(label)
    pos[domain] = pos_list
    dep[domain] = dep_list
    tag[domain] = tag_list

laptop: 100%|██████████| 3045/3045 [00:01<00:00, 1738.98it/s]
restaurant: 100%|██████████| 3877/3877 [00:02<00:00, 1865.34it/s]
twitter: 100%|██████████| 6248/6248 [00:04<00:00, 1415.37it/s]


In [12]:
import pandas as pd
from pandas import ExcelWriter
 
# Number of non-'O' tokens in the domain currently being processed.
# `func` reads this as a global; the export loop reassigns it per domain.
total = 0
# One sheet is written per domain collected in `pos`.
domains = pos.keys()


def func(x: pd.Series):
    """Aggregate one column of a group into a non-'O' ratio.

    The denominator depends on the name of the series:

    * ``"tag"``: the number of entries in ``x``, giving the fraction of
      entries in this group that are not ``'O'``.
    * any other name: the module-level ``total``, giving this group's
      share of ``total``.

    Args:
        x: Label strings of a single group.

    Returns:
        The ratio as a float, or ``0.0`` when the denominator is zero
        (previously this raised ``ZeroDivisionError``).
    """
    # Boolean sum counts the non-'O' entries without building a filtered copy.
    tagged = int((x != 'O').sum())
    denominator = len(x) if x.name == "tag" else total
    if denominator == 0:
        # Nothing to divide by, e.g. a domain without any non-'O' label.
        return 0.0
    return tagged * 1.0 / denominator


# Write three workbooks with one sheet per domain:
#   annotation.xlsx - statistics for every (pos, dep) pair
#   pos.xlsx        - the 15 POS tags with the largest summed tag_count
#   dep.xlsx        - the 15 dependency relations with the largest summed tag_count
# The context managers close (and thereby save) every workbook even if an
# exception is raised part-way through the loop.
with ExcelWriter("./annotation.xlsx") as writer, \
        ExcelWriter("./pos.xlsx") as pos_writer, \
        ExcelWriter("./dep.xlsx") as dep_writer:
    for domain in domains:
        data = pd.DataFrame({"pos": pos[domain], "dep": dep[domain], "tag": tag[domain]})
        # `func` reads `total` as a global, so it must be set before agg runs.
        total = len(data[data['tag'] != 'O'])
        # Duplicate the label column so `func` can aggregate it twice:
        # "tag" is divided by the group size, "tag_count" by `total`.
        data['tag_count'] = data['tag'].copy()
        groups = data.groupby(["pos", "dep"], group_keys=False)
        df = groups.agg(func).sort_values(["tag_count", "tag"], ascending=False)
        df.to_excel(writer, sheet_name=domain)
        # `df` is indexed by (pos, dep); grouping by one index level sums
        # tag_count over the other level.
        pos_df = df.groupby("pos").agg({
            "tag_count": "sum"
        }).sort_values("tag_count", ascending=False).head(15)
        pos_df.to_excel(pos_writer, sheet_name=domain)
        dep_df = df.groupby("dep").agg({
            "tag_count": "sum"
        }).sort_values("tag_count", ascending=False).head(15)
        dep_df.to_excel(dep_writer, sheet_name=domain)