**Finding the 5 most frequent aspects to use as general aspects**

In [1]:
import os
import csv
from pprint import pprint
import time

from transformers import T5ForConditionalGeneration, T5Tokenizer

from args import init_args
from src.postprocess import (
    IPostprocess,
    EditDistancePostProcessor,
    EmbeddingDistancePostProcessor,
)
from src.loader import ILoader, HotelLoader
from src.utility import get_config, set_seed
from src.constant import Path, ModelType, PostprocessType, ProcessType
from src.trainer import ITrainer, T5Trainer
from src.generator import IGenerator, T5Generator
from src.evaluation import Evaluator

from src.loader import ILoader, HotelLoader

# == Dependencies Maps (Factory) ==
# Each map goes from a config enum value to the class implementing it.
# Only `tokenizer_config_names` is used in the visible cells; the other
# maps are defined here but not referenced below.
trainer_config_maps = {ModelType.T5Model: T5Trainer}

tokenizer_config_names = {ModelType.T5Model: T5Tokenizer}

generator_config_names = {ModelType.T5Model: T5Generator}

postprocess_config_names = {
    PostprocessType.EDITDISTANCE: EditDistancePostProcessor,
    PostprocessType.EMBEDDING: EmbeddingDistancePostProcessor,
}


# 1. Load the experiment configuration and seed before anything else runs.
config_path = "resources/exp-v3/exp-m0.yaml"
configs = get_config(config_path)
set_seed(configs["main"]["seed"])

# NOTE(review): `mode` is read but not used in the visible cells.
mode = configs.get("main").get("mode")

# The tokenizer is loaded from the checkpoint path when `use_checkpoint` is
# truthy, otherwise from the pretrained model name in the config.
model_type = configs.get("type")
model_name = configs.get("main").get("pretrained")
use_checkpoint = configs.get("trainer").get("use_checkpoint")
if use_checkpoint:
    model_name = configs.get("trainer").get("checkpoint_path")
print(f"Tokenizer type: {model_name}")
# NOTE(review): `.get(model_type)` returns None for a type missing from the
# map, in which case `.from_pretrained` raises AttributeError.
tokenizer = tokenizer_config_names.get(model_type).from_pretrained(model_name)

# 2. Preparing Dataset ...
loader: ILoader = HotelLoader(tokenizer, configs)

# Train / validation splits: loaders, datasets and their sentences.
train_loader, val_loader = loader.get_train_loader(), loader.get_val_loader()
train_dataset, val_dataset = loader.get_train_dataset(), loader.get_val_dataset()
train_sents, val_sents = train_dataset.get_sents(), val_dataset.get_sents()

# Test split: same three objects as above.
test_loader = loader.get_test_loader()
test_dataset = loader.get_test_dataset()
test_sents = test_dataset.get_sents()

Tokenizer type: Wikidepia/IndoT5-base
implicit-v2


In [9]:
def get_expr(labels):
    """Collect the first and second field of every triplet found in `labels`.

    Each label is a string of the form ``"(a, b, c); (d, e, f)"``: triplets
    are separated by ``"; "`` and the three fields inside a triplet by
    ``", "``.

    A triplet is skipped when it does not have exactly three fields, when
    any field is empty after stripping whitespace, or when the same triplet
    already occurred earlier in the same label.

    Args:
        labels: Iterable of label strings. It is traversed once, so a
            one-shot iterator works as well as a list.

    Returns:
        A tuple ``(aspects, sentiments)`` of two equally long lists: the
        first fields and the second fields of all kept triplets, in input
        order. NOTE(review): the second field is returned under the name
        "sentiments" as in the original code; what it represents in the
        dataset is not visible here — confirm.
    """

    def extract(sequence):
        """Parse one label string into a list of unique (a, b, c) tuples."""
        extractions = []
        for elem in sequence.split("; "):
            # Strip surrounding whitespace first so that a stray space or
            # newline is not mistaken for one of the parentheses.
            elem = elem.strip()
            # Remove the wrapping "(" and ")" only when they are present;
            # blind slicing would eat real characters otherwise.
            if elem.startswith("("):
                elem = elem[1:]
            if elem.endswith(")"):
                elem = elem[:-1]

            fields = tuple(part.strip() for part in elem.split(', '))
            # Malformed (wrong arity) or incomplete (empty field) triplets
            # are dropped, as are repeats within the same label.
            if len(fields) != 3 or not all(fields) or fields in extractions:
                continue
            extractions.append(fields)
        return extractions

    aspects = []
    sentiments = []
    # Single pass: each label is parsed once and feeds both output lists.
    for datum in labels:
        for first, second, _ in extract(datum):
            aspects.append(first)
            sentiments.append(second)

    return aspects, sentiments

# For each split, gather the first and second triplet fields from the
# dataset's extracted labels (evaluated in the order train, test, val).
train_aspects, train_sentiments = get_expr(train_dataset.extracted_labels)
test_aspects, test_sentiments = get_expr(test_dataset.extracted_labels)
val_aspects, val_sentiments = get_expr(val_dataset.extracted_labels)
    

In [11]:
# Get the most common aspect expressions to use as general aspects

def get_n_top(collection, n):
    """Print the `n` most common items of `collection` with their counts.

    Args:
        collection: Iterable of hashable items to count.
        n: Number of top entries to show; `None` shows every entry.

    Returns:
        None. The result is only printed, as a list of ``(item, count)``
        pairs preceded by a single space.
    """
    from collections import Counter

    # Compute the ranking once; the original computed it twice and threw
    # the first result away.
    top_items = Counter(collection).most_common(n)
    print("", top_items)

# Print the 10 most common aspects for the train and validation splits.
print("Train:")
get_n_top(train_aspects, 10)
print("Val:")
get_n_top(val_aspects, 10)

# General aspects picked by hand after looking at the counts above.
# NOTE(review): "general" is not among the printed top aspects; presumably
# it is a catch-all added manually — confirm.
# - general
# - kamar
# - pelayanan
# - kamarnya
# - tempat
# - hotel

Train:
 [('hotel', 824), ('kamar', 759), ('pelayanan', 339), ('kamarnya', 285), ('ac', 186), ('kamar mandi', 176), ('tempatnya', 169), ('tempat', 142), ('wifi', 121), ('harga', 121)]
Val:
 [('hotel', 488), ('kamar', 271), ('pelayanan', 131), ('kamarnya', 85), ('kamar mandi', 50), ('tempatnya', 47), ('pelayanannya', 45), ('tempat', 44), ('fasilitas', 43), ('ac', 43)]


**Checking all aspect expressions that contain punctuation**

In [13]:
def show_aspects_with_special_char(aspects,chars):
    """Print each entry of `aspects` that contains any element of `chars`.

    Entries are printed one per line in their original order; an entry that
    occurs several times in `aspects` is printed each time.
    """
    def has_special(text):
        # True as soon as one of the given characters occurs in the text.
        for symbol in chars:
            if symbol in text:
                return True
        return False

    for candidate in filter(has_special, aspects):
        print(candidate)

import string
# Every ASCII punctuation character counts as a "special character" here.
punctuations = string.punctuation

print("Special character:")
print(punctuations)

# List the train and validation aspects that contain any punctuation.
print("Train:")
show_aspects_with_special_char(train_aspects, punctuations)
print("Val:")
show_aspects_with_special_char(val_aspects, punctuations)


Special character:
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Train:
seprai - nya
check - out
air panas /
c/o
selimut/sprey
toilet flash - nya
check - in
set - up airy
pintu kamar - mandi
kasur&bantalnya
sistem check - in
sabun /
ac -
ac -
ac -
remote - nya
cemilan+air minum
wi - fi
wc - nya
seprei/bantal
wi - fi
Val:
exhaust/vacuum
ac - nya
pelayanan'ya
ac - nya
d'kmar tidur


In [7]:
import string
punctuations = string.punctuation

# The original cell looped over `aspects`, a name no visible cell defines,
# so it failed with NameError on a fresh kernel.
# NOTE(review): presumably the test split was meant, since the recorded
# output matches neither the train nor the val listing — confirm.
show_aspects_with_special_char(test_aspects, punctuations)

air panas/hangat
cs /
minum+snack gratis
