**Finding the 5 most frequent aspect expressions to use as general aspects**

In [3]:
import os
import csv
from pprint import pprint
import time

from transformers import T5ForConditionalGeneration, T5Tokenizer

from args import init_args
from src.postprocess import (
    IPostprocess,
    EditDistancePostProcessor,
    EmbeddingDistancePostProcessor,
)
from src.loader import ILoader, HotelLoader
from src.utility import get_config, set_seed
from src.constant import Path, ModelType, PostprocessType, ProcessType
from src.trainer import ITrainer, T5Trainer
from src.generator import IGenerator, T5Generator
from src.evaluation import Evaluator

from src.loader import ILoader, HotelLoader

# == Dependency maps (factory lookups keyed by the enums from src.constant) ==
# Model type -> trainer class.
trainer_config_maps = {ModelType.T5Model: T5Trainer}

# Model type -> tokenizer class.
tokenizer_config_names = {ModelType.T5Model: T5Tokenizer}

# Model type -> generator class.
generator_config_names = {ModelType.T5Model: T5Generator}

# Post-processing strategy -> post-processor class.
postprocess_config_names = {
    PostprocessType.EDITDISTANCE: EditDistancePostProcessor,
    PostprocessType.EMBEDDING: EmbeddingDistancePostProcessor,
}


# 1. Load the experiment configuration and seed before anything else is built.
config_path = "resources/exp-v3/exp-m0.yaml"
configs = get_config(config_path)
set_seed(configs["main"]["seed"])

mode = configs.get("main").get("mode")

# NOTE(review): "type" is read from the top level of the config, whereas the
# other settings come from the "main" / "trainer" sections. If the key is
# missing (assuming `configs` is a plain dict), `model_type` is None and the
# tokenizer lookup below raises AttributeError -- confirm against the YAML.
model_type = configs.get("type")
model_name = configs.get("main").get("pretrained")
use_checkpoint = configs.get("trainer").get("use_checkpoint")
if use_checkpoint:
    # When a checkpoint is used, the tokenizer is loaded from the checkpoint
    # path instead of the pretrained model name.
    model_name = configs.get("trainer").get("checkpoint_path")
print(f"Tokenizer type: {model_name}")
tokenizer = tokenizer_config_names.get(model_type).from_pretrained(model_name)

# 2. Prepare the datasets: loaders, dataset objects and raw sentences per split.
loader: ILoader = HotelLoader(tokenizer, configs)

train_loader, val_loader = loader.get_train_loader(), loader.get_val_loader()
train_dataset, val_dataset = loader.get_train_dataset(), loader.get_val_dataset()
train_sents, val_sents = train_dataset.get_sents(), val_dataset.get_sents()

test_loader = loader.get_test_loader()
test_dataset = loader.get_test_dataset()
test_sents = test_dataset.get_sents()

Tokenizer type: Wikidepia/IndoT5-base
implicit


In [6]:
# Preview the first ten gold label strings straight from the training dataset.
# Reading them from `train_dataset` keeps this cell runnable on a fresh kernel:
# the `labels` variable is only assigned in a later cell.
train_dataset.extracted_labels[:10]

['(ac, tidak berfungsi optimal, negatif); (wifi koneksi, kurang stabil, negatif)',
 '(tempatnya, bagus, positif); (kolam renangnya, bersih, positif)',
 '(ac nya, tidak bisa diatur, negatif)',
 '(pintu geser, kurang rapat, negatif)',
 '(semuanya, nyaman, positif)',
 '(snack, tidak dapat, negatif)',
 '(pelayanan, lumayan baik, positif)',
 '(overall, bagus, positif)',
 '(pelayanan mas nya, sangat baik, positif); (pelayanan mas nya, membantu, positif); (pelayanan mas nya, ramah, positif); (pelayanan mas nya, komunikatif, positif); (kamar, sangat bersih, positif); (kamar mandi, sangat baik, positif); (air panasnya, tidak bisa lama, negatif)',
 '(layanan, sangat mengecewakan, negatif); (kebersihan kamar, sangat mengecewakan, negatif)']

In [8]:
# Gold label strings of the training split. Each string is a "; "-separated
# list of "(aspect, opinion, sentiment)" triplets.
labels = train_dataset.extracted_labels
# Accumulates the aspect term of every extracted triplet; repeated aspects are
# kept on purpose so that they can be counted afterwards.
aspects = []

EMPTY = ''


def extract(sequence: str) -> list:
    """Parse a label string into a list of unique (aspect, opinion, sentiment) triplets.

    The expected format is ``"(a, b, c); (d, e, f)"``: triplets are separated
    by ``"; "``, each triplet is wrapped in parentheses and its three fields
    are separated by ``", "``.

    Args:
        sequence: Label string to parse. Whitespace around each triplet and
            around each field is ignored.

    Returns:
        The triplets as 3-tuples of stripped strings, in order of first
        appearance. Skipped entries: chunks that are not wrapped in
        parentheses, chunks that do not split into exactly three fields
        (e.g. a field that itself contains ``", "``), triplets with an empty
        field, and exact duplicates.
    """
    extractions = []
    seen = set()  # O(1) duplicate detection instead of scanning the result list
    for chunk in sequence.split("; "):
        # Strip first so a trailing newline/space cannot end up inside the
        # last field when the parentheses are removed.
        chunk = chunk.strip()
        # Only remove the outer characters when they really are parentheses;
        # slicing blindly would corrupt (or wrongly accept) malformed chunks.
        if not (chunk.startswith("(") and chunk.endswith(")")):
            continue
        fields = [field.strip() for field in chunk[1:-1].split(", ")]
        if len(fields) != 3 or EMPTY in fields:
            continue
        triplet = tuple(fields)
        if triplet in seen:
            continue
        seen.add(triplet)
        extractions.append(triplet)
    return extractions

# Walk over every training label string and keep only the aspect term
# (the first field) of each triplet it contains.
for label_sequence in labels:
    aspects.extend(
        aspect for aspect, _opinion, _sentiment in extract(label_sequence)
    )

In [10]:
# Rank the aspect expressions by frequency and show the most common ones;
# the general aspects are then picked from this ranking.
from collections import Counter

TOP_K = 10  # how many of the most frequent aspect expressions to list

c = Counter(aspects)
# Computed once: (aspect, count) pairs, most frequent first.
top_aspects = c.most_common(TOP_K)
# The empty first argument makes print emit one leading space before the list.
print("", top_aspects)

# Picked general aspects (hand-selected; not all of them appear in the top-10 list printed by this cell):
# - general
# - kamar
# - pelayanan
# - kamarnya
# - tempat
# - hotel

 [('kamar', 757), ('pelayanan', 336), ('kamarnya', 282), ('ac', 186), ('kamar mandi', 175), ('tempatnya', 169), ('tempat', 141), ('harga', 121), ('wifi', 120), ('lokasi', 105)]
