In [2]:
import pickle
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast
import torch
from transformers import GPT2LMHeadModel, GPT2Config
from torch.utils.data import random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoConfig
import time
import datetime
import numpy as np
from torch.utils.data import ConcatDataset
from decimal import Decimal


from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from tokenizers import Tokenizer
from typing import Dict, List, Optional
from torch.utils.data import Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from IPython.display import display
from typing import Dict

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BartModel

## Data Handling

In [3]:
# Load the tokenizer published with the 'hyunwoongko/kobart' checkpoint and
# declare '<sep>' as its separator token. The datasets below use that token to
# separate preceding context from the sentence being rewritten.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'hyunwoongko/kobart', sep_token='<sep>'
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
class WordReplace:
    """Replace a word by a randomly chosen entry from a similar-words dictionary.

    The dictionary is read from a pickled pandas Series whose index holds the
    words and whose values hold lists of similar words. Words are handled
    character by character; every dictionary word is padded/truncated to ten
    characters and those characters become the levels of a MultiIndex, so a
    lookup can walk the index one character at a time (prefix search).
    """

    def __init__(self, path="../dataset-woorimalsam/similar_words.pkl"):
        # self.tokenizer_encode = lambda x: tokenizer.encode(x)[1:-1]
        # self.tokenizer_decode = lambda x: tokenizer.decode(x)
        # Character-level "tokenizer": a word becomes the list of its characters.
        self.tokenizer_encode = lambda x: [y for y in x]
        self.tokenizer_decode = lambda x: ''.join(x)
        self.pad_token = '_'

        similar = pd.read_pickle(path).sort_index()
        similar = similar.map(lambda xs: [self.tokenizer_encode(x) for x in xs])

        table = similar.rename("similar_words").to_frame()
        table["word"] = table.index.map(lambda x: self.tokenizer_encode(x))

        # One index level per character position, padded with the pad token.
        max_token_len = 10
        padding = [self.pad_token] * max_token_len
        padded_keys = table['word'].apply(lambda x: (x + padding)[:max_token_len]).tolist()
        table.index = np.array(padded_keys).T.tolist()

        self.df_similar_words = table.sort_index()

    def replace_word(self, word):
        """Return a similar word for ``word``, or ``word`` itself if nothing matches.

        The characters of ``word`` (followed by one pad token) are matched
        against the index level by level until a character is missing. Among the
        rows that share the matched prefix, those with the fewest remaining
        non-pad characters are kept, and one of their similar words is drawn
        with ``np.random.choice``.
        """
        frame = self.df_similar_words
        matched = []
        for char in self.tokenizer_encode(word) + [self.pad_token]:
            if char not in frame.index:
                break
            frame = frame.loc[char]
            matched.append(char)

        # Not even the first character is known: leave the word untouched.
        if not matched:
            return word

        residual = frame.index.map(lambda x: sum([1 for y in x if y != self.pad_token]))
        closest = frame[residual <= min(residual)]
        candidates = [y for x in closest["similar_words"].to_list() for y in x]
        # print(closest)
        picked = np.random.choice(len(candidates))
        return self.tokenizer_decode(candidates[picked])

# Build the replacer from the default dictionary pickle. `wr.replace_word` is
# also the default `word_noiser` of ParallelDataset defined further down.
wr = WordReplace()
# Quick smoke test; the replacement is drawn with np.random.choice, so the
# displayed word can differ from run to run.
wr.replace_word("인간성이")

'본성'

In [5]:
# Maximum sequence length in tokens; passed as `max_length` when the datasets
# are built and reused as the generation length limit at inference time.
MAX_LEN = 256
# NOTE(review): not referenced in the visible cells — the training arguments
# hard-code a per-device batch size of 64 instead. Confirm whether this is dead.
BATCH_SIZE = 32

class SplitDataset:
    """Deterministic, interleaved train/test view over an indexable dataset.

    The wrapped dataset is read in consecutive chunks of 100 items. Inside each
    chunk the first ``ratio`` percent of the items belong to the ``'train'``
    view and the remaining ones to the ``'test'`` view, so both views are spread
    evenly over the whole dataset and never overlap.

    Args:
        dataset: any object supporting ``len()`` and integer indexing.
        ratio: fraction (0..1) of every 100-item chunk assigned to training.
        mode: ``'train'`` or ``'test'`` — which side of the split to expose.

    Raises:
        ValueError: if ``mode`` is unknown or ``ratio`` is outside ``[0, 1]``.
    """

    def __init__(self, dataset, ratio=0.8, mode='train'):
        if mode not in ('train', 'test'):
            raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")
        # round() rather than int(): e.g. 0.29 * 100 == 28.999999999999996,
        # which int() would silently truncate to 28.
        percent = int(round(ratio * 100))
        if not 0 <= percent <= 100:
            raise ValueError(f"ratio must be within [0, 1], got {ratio!r}")
        self.dataset = dataset
        self.ratio = percent
        self.mode = mode

    def __len__(self):
        ratio = self.ratio
        total = len(self.dataset)
        # Full chunks contribute `ratio` items each; the trailing partial chunk
        # contributes at most `ratio` of whatever it contains.
        train_len = total // 100 * ratio + min(total % 100, ratio)
        if self.mode == 'train':
            return train_len
        return total - train_len

    def __getitem__(self, idx):
        size = len(self)
        # Without this check an out-of-range index would silently return an
        # item of the other split (or divide by zero for an empty view).
        if not -size <= idx < size:
            raise IndexError(f"index {idx} out of range for split of size {size}")
        if idx < 0:
            idx += size

        ratio = self.ratio
        if self.mode == 'train':
            chunk, offset = divmod(idx, ratio)
            return self.dataset[chunk * 100 + offset]
        chunk, offset = divmod(idx, 100 - ratio)
        return self.dataset[chunk * 100 + ratio + offset]
        

class ParallelDataset(Dataset):
    """Noised-source / clean-target sentence pairs for seq2seq fine-tuning.

    ``file_path`` is a dict with two entries:

    * ``"target_pkl"``: path of a pickled mapping ``key -> target sentence``;
    * ``"source_pkl_list"``: paths of pickled mappings with the same keys that
      hold the alternative source sentences.

    Item ``idx`` uses source file ``idx // len(keys)`` and key
    ``idx % len(keys)``, so the dataset length is ``len(keys) * len(sources)``.
    The source sentence is noised (word replacement, local word shuffling,
    token deletion and masking), prefixed with the separator token and preceded
    by as many un-noised previous source sentences as fit into ``max_length``
    (at most ``max_history - 1`` of them).

    Args:
        file_path: dict described above.
        max_length: token budget for the encoder input and for the labels.
        max_history: the history loop runs over ``range(1, max_history)``.
        word_noiser: callable ``str -> str`` used for word replacement. The
            default is bound to ``wr.replace_word`` when the class is defined.
        delete_ratio: fraction of tokens dropped by token deletion.
        mask_ratio: fraction of tokens replaced by the mask token.
        mask_span: length of the additional contiguous masked span.
        replace_ratio: fraction of words passed through ``word_noiser``.
        diffusion_distance: controls how far words may move when shuffled.
        cache: when False the raw source sentence is re-read on every access.
    """

    def __init__(
        self, file_path, max_length = 256, max_history=5, word_noiser=wr.replace_word,
        delete_ratio = 0.05, mask_ratio = 0.10, mask_span = 3,
        replace_ratio=0.2, diffusion_distance=1,
        cache=True
    ):
        self.load(file_path)
        self.tokenizer = tokenizer
        # encode() output is stripped of its first and last token ([1:-1]).
        self.tokenizer_encode = lambda x: self.tokenizer.encode(x)[1:-1]
        self.tokenizer_decode = lambda x: self.tokenizer.decode(x)
        self.max_length = max_length
        self.max_history = max_history
        # Despite the name, source_enc caches the stripped raw source *text*
        # (it must be re-noised on every access); target_enc caches token ids.
        self.source_enc = {}
        self.target_enc = {}
        self.tag_bos = [self.tokenizer.bos_token_id]
        self.tag_eos = [self.tokenizer.eos_token_id]
        self.tag_pad = [self.tokenizer.pad_token_id]
        self.tag_sep = [self.tokenizer.sep_token_id]
        self.word_noiser = word_noiser

        self.sample_ratio = 1-delete_ratio
        self.mask_ratio = mask_ratio
        self.mask_span = mask_span
        self.replace_ratio = replace_ratio
        self.diffusion_distance = diffusion_distance
        self.cache = cache

    def load(self, file_path):
        """Read the target mapping and every source mapping from their pickles."""
        target_pkl, source_pkl_list = file_path["target_pkl"], file_path["source_pkl_list"]
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point this at pickles produced by a trusted pipeline.
        with open(target_pkl, "rb") as f:
            self.data_target = pickle.load(f)
        self.data_sources = []
        for path in source_pkl_list:
            with open(path, "rb") as f:
                self.data_sources.append(pickle.load(f))
        self.keys = list(self.data_target.keys())

    def __len__(self):
        return len(self.keys) * len(self.data_sources)

    def _noise_token(self, tokens, sample_ratio = 0.95, mask_ratio = 0.10, mask_span = 3):
        """Apply token deletion and masking to a list of token ids."""
        # Token Deletion: keep a random subset of positions, in original order.
        L = len(tokens)
        xmap = np.random.choice(range(L), int(sample_ratio * L), replace=False)
        tokens = [tokens[i] for i in sorted(xmap)]

        # Token Masking: scattered single tokens first ...
        mask_id = self.tokenizer.mask_token_id
        for i in np.random.choice(range(len(tokens)), int(mask_ratio*len(tokens)), replace=False):
            tokens[i] = mask_id
        # ... then one contiguous span, only when the sequence is long enough.
        if len(tokens) > mask_span * 3:
            a = np.random.randint(len(tokens)-mask_span)
            for i in range(a, a+mask_span):
                tokens[i] = mask_id

        return tokens

    def _noise_text(self, text, replace_ratio=0.2, diffusion_distance=1):
        """Apply word replacement and a local word shuffle to a sentence."""
        # Token Replace
        words = text.split()
        for i in np.random.choice(range(len(words)), int(replace_ratio * len(words)), replace=False):
            words[i] = self.word_noiser(words[i])

        # Token Permutation: sort positions after adding Gaussian jitter, so
        # words move only a few places from where they started.
        Lx = len(words)
        xmap = np.argsort( np.arange(0,Lx)+ np.random.normal(0,1+diffusion_distance,Lx) )
        words = [words[i] for i in xmap]

        return ' '.join(words)

    def _query(self, idx, noise):
        """Return ``(source_token_ids, target_token_ids)`` for flat index ``idx``."""
        key_pos = idx % len(self.keys)
        key = self.keys[key_pos]
        if (idx not in self.source_enc) or (not self.cache):
            self.source_enc[idx] = self.data_sources[idx//len(self.keys)][key].strip()
        if key_pos not in self.target_enc:
            self.target_enc[key_pos] = self.tokenizer_encode(self.data_target[key].strip())
        if noise:
            source_enc = self._noise_token(
                self.tokenizer_encode(
                    self._noise_text(
                        self.source_enc[idx],
                        replace_ratio=self.replace_ratio, diffusion_distance=self.diffusion_distance
                    )
                ),
                sample_ratio = self.sample_ratio, mask_ratio = self.mask_ratio, mask_span = self.mask_span
            )
        else:
            source_enc = self.tokenizer_encode(self.source_enc[idx])
        return source_enc, self.target_enc[key_pos]

    def __getitem__(self, idx):
        """Build the model inputs for item ``idx``.

        If the noised source or the target turns out empty, the next index
        (wrapping around) is used instead.

        Raises:
            IndexError: if ``idx`` is out of range, or if no item in the whole
                dataset has a non-empty source and target.
        """
        total = len(self)
        if not -total <= idx < total:
            raise IndexError(f"index {idx} out of range for dataset of size {total}")
        idx %= total

        # The emptiness check has to happen BEFORE the separator is prepended,
        # otherwise the source can never be empty. (The previous code also
        # recursed with `self.__getitem__(self, ...)`, which raised TypeError.)
        for offset in range(total):
            current = (idx + offset) % total
            source_tokens, target_tokens = self._query(current, noise=True)
            if len(source_tokens) > 0 and len(target_tokens) > 0:
                break
        else:
            raise IndexError("no example with a non-empty source and target")

        source_tokens = self.tag_sep + source_tokens

        # Prepend un-noised preceding sentences while they fit. The look-back
        # is limited to the current source file: `current - i` must not wrap
        # into the tail of the previous source, which is unrelated text.
        key_pos = current % len(self.keys)
        for i in range(1, self.max_history):
            if key_pos - i < 0:
                break
            s_tokens, _ = self._query(current-i, noise=False)
            if len(self.tag_bos) + len(s_tokens) + len(source_tokens) + len(self.tag_eos) <= self.max_length:
                source_tokens = s_tokens + source_tokens
            else:
                break
        return self.make_tensor_from_list(source_tokens, target_tokens)

    def make_tensor_from_list(self, encoder_tokens, decoder_tokens):
        """Turn two token-id lists into tokenizer output with a ``labels`` entry.

        Both lists are decoded back to text and re-tokenized with truncation to
        ``max_length``; the label ids come from the target-side tokenization.
        """
        encoder_text = self.tokenizer_decode(encoder_tokens)
        decoder_text = self.tokenizer_decode(decoder_tokens)
        model_inputs = self.tokenizer(encoder_text, max_length=self.max_length, truncation=True)
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(decoder_text, max_length=self.max_length, truncation=True)
        model_inputs['labels'] = labels['input_ids']
        # del model_inputs['token_type_ids']
        return model_inputs
    
# class CloneDataset(ParallelDataset):
        
#     def load(self, text_path):
#         with open(text_path, "rt") as f:
#             self.text = f.readlines()

#     def __len__(self):
#         return len(self.text)

#     def _query(self, idx):
#         dropout_rate = 0.2
        
#         tokens = self.tokenizer_encode(self.text[idx].strip())
#         tokens_partial = [tokens[i] for i in sorted(np.random.choice(len(tokens), int(len(tokens)*(1-dropout_rate)), replace=False))]
#         return tokens_partial, tokens
    
class GeneratedDataset(ParallelDataset):
    """Placeholder for a dataset built from GPT-generated text.

    Not implemented: constructing an instance always raises
    ``NotImplementedError``.
    """
    # GPT generated text
    def __init__(self):
        raise NotImplementedError()
        
def dataset_to_dataloader(ds, num_samples, batch_size = 1):
    """Randomly split ``ds`` and wrap both parts in DataLoaders.

    The validation part holds 3% of the items, capped at 10000; the rest is the
    training part. The training loader draws ``num_samples`` items per pass
    through a RandomSampler, the validation loader iterates sequentially.

    Returns:
        ``(train_dataloader, validation_dataloader)``
    """
    total = len(ds)
    val_size = int(min(10000, total*0.03))
    train_size = total - val_size

    # Split into training and validation sets
    train_set, val_set = random_split(ds, [train_size, val_size])
    print("train_size :",int(num_samples),"/",train_size)
    print("val_size   :",val_size)

    train_sampler = RandomSampler(train_set, num_samples = int(num_samples))
    val_sampler = SequentialSampler(val_set)

    train_dataloader = DataLoader(train_set, sampler = train_sampler, batch_size = batch_size)
    validation_dataloader = DataLoader(val_set, sampler = val_sampler, batch_size = batch_size)
    return train_dataloader, validation_dataloader



In [6]:
# Pickled corpus used below both as the training target and as one of the
# two source variants.
Khala_hq = "../dataset-Khala/ko.pkl"
# Second source variant. Presumably a Korean -> English -> Korean round-trip
# translation of the same sentences, judging by the file name — TODO confirm.
Khala_lq = "../dataset-Khala/ko_en_ko.pkl"

In [8]:
max_history = 5
ratio = 0.99


def _khala_splits(mode):
    """Build one SplitDataset per history setting for the requested split side."""
    return tuple(
        SplitDataset(
            ParallelDataset(
                {
                    "target_pkl": Khala_hq,
                    "source_pkl_list": [Khala_hq, Khala_lq],
                },
                max_length=MAX_LEN,
                max_history=history,
                delete_ratio=0.05,
                mask_ratio=0.10,
                mask_span=1,
                replace_ratio=0.2,
                diffusion_distance=1.5,
            ),
            ratio=ratio,
            mode=mode,
        )
        for history in range(max_history)
    )


# Khala parallel corpus: the same data wrapped once per history setting
# (max_history = 0 .. 4), exposed as a train side and a test side.
dss_train = _khala_splits('train')
dss_test = _khala_splits('test')

# {k:tokenizer.decode(v) for k,v in dss_train[3][len(dss_train[3])-1].items()}
# {k:tokenizer.decode(v) for k,v in dss_test[3][len(dss_test[3])-1].items()}

ds_train = ConcatDataset(dss_train)
ds_test = ConcatDataset(dss_test)

In [9]:
# Sanity check: number of examples in the concatenated train and test sets.
print(len(ds_train))
print(len(ds_test))

144540
1460


## Model

In [10]:
# See also : https://colab.research.google.com/drive/1IHMJHPwoOvAKH7NvyzPjm9cZRSVbLeYR?usp=sharing#scrollTo=LuHj3IJPjrAZ
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained('hyunwoongko/kobart')
# The tokenizer was loaded with an extra '<sep>' token, so the embedding
# matrix is resized to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30001, 768)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30001, 768)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    

In [11]:
# Collator handed to the trainer. The datasets tokenize with truncation only
# (no padding argument), so batching the variable-length examples is left to
# this collator.
data_collator = DataCollatorForSeq2Seq( tokenizer=tokenizer, model=model )

In [12]:
# All fine-tuning hyper-parameters in one mapping, unpacked into the
# arguments object below.
training_options = dict(
    output_dir="./bart_model/",      # where checkpoints are written
    overwrite_output_dir=True,       # reuse the directory if it already exists
    num_train_epochs=2,              # an earlier setting used 24
    per_device_train_batch_size=64,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    eval_steps=500,                  # update steps between two evaluations
    save_steps=1000,                 # update steps between two checkpoints
    warmup_steps=300,                # warmup steps of the learning-rate scheduler
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3,
)
training_args = Seq2SeqTrainingArguments(**training_options)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_test,
)

In [13]:
# Run the fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 144540
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4518
  Number of trainable parameters = 123860736
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,3.0201,2.447293
1000,2.2333,2.342017
1500,2.0124,2.328092
2000,1.847,2.315989
2500,1.6571,2.354383
3000,1.5214,2.366665
3500,1.4378,2.376867
4000,1.382,2.388084
4500,1.3481,2.384627


***** Running Evaluation *****
  Num examples = 1460
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1460
  Batch size = 64
Saving model checkpoint to ./bart_model/checkpoint-1000
Configuration saved in ./bart_model/checkpoint-1000/config.json
Model weights saved in ./bart_model/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [bart_model/checkpoint-2000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1460
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1460
  Batch size = 64
Saving model checkpoint to ./bart_model/checkpoint-2000
Configuration saved in ./bart_model/checkpoint-2000/config.json
Model weights saved in ./bart_model/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [bart_model/checkpoint-4000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1460
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1460
  Batch size = 64
Saving model checkpoint to

TrainOutput(global_step=4518, training_loss=1.8269861026272514, metrics={'train_runtime': 3244.2228, 'train_samples_per_second': 89.106, 'train_steps_per_second': 1.393, 'total_flos': 2.727474315890688e+16, 'train_loss': 1.8269861026272514, 'epoch': 2.0})

## Inference with the text2text-generation pipeline

In [30]:
from transformers import PreTrainedTokenizerFast
# Same tokenizer setup as in the training section, repeated here so the
# inference cells do not need the training cells to have been run.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'hyunwoongko/kobart', sep_token='<sep>'
)
MAX_LEN=256

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from transformers import pipeline

# Load a fine-tuned checkpoint into a text2text-generation pipeline.
# NOTE(review): the training log in this notebook shows the lowest validation
# loss at step 2000, not 4000 — confirm that checkpoint-4000 is the intended one.
nlg_pipeline = pipeline('text2text-generation', model="./bart_model/checkpoint-4000/", tokenizer=tokenizer)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [10]:
def generate_text(pipe, text, context, num_return_sequences=5, max_length=60):
    """Run ``pipe`` on ``"{context}<sep>{text}"`` and return the generated strings.

    Args:
        pipe: callable taking the prompt plus ``num_return_sequences`` and
            ``max_length`` keywords, returning dicts with a ``'generated_text'`` key.
        text: sentence to rewrite.
        context: preceding text, placed before the ``<sep>`` marker.
        num_return_sequences: forwarded to ``pipe``.
        max_length: forwarded to ``pipe``.

    Returns:
        List with the ``'generated_text'`` value of every returned candidate.
    """
    prompt = f"{context}<sep>{text}"
    candidates = pipe(
        prompt,
        num_return_sequences=num_return_sequences,
        max_length=max_length,
    )
    generated = []
    for candidate in candidates:
        generated.append(candidate['generated_text'])
    return generated

In [11]:
# Preceding utterances given to the model as context; generate_text joins them
# to the input sentence with '<sep>'.
context = "표절 아닌 것은 세상에 없어요. 다 표절입니다. 난 그렇게 생각해요. (문제 본질은) 문학 권력 투쟁 아니요? 쉽게 말하면 무슨 출판사, 무슨 출판사, 그거 아니요? 그게 문제였지. 뭐, 다른 게 무슨 문제였어요? 표절 아닌 게 세상에 있는 줄 압니까? 우리 말도 다 표절이에요. 엄마 말을 가지고 표절 하는 것 아니에요? 우리가 쓰는 말도."
# Sentence to rewrite.
src_text = "나는 우습다고 생각해요. 그 권력에 붙어 있는 사람도 우습고, 권력을 이용하는 사람도 우습고. 나는 그렇게 생각해요."

# "입력 문장" means "input sentence"; only the first candidate is printed.
print("입력 문장:", src_text)
print(generate_text(nlg_pipeline, src_text, context, num_return_sequences=1, max_length=1000)[0])

입력 문장: 나는 우습다고 생각해요. 그 권력에 붙어 있는 사람도 우습고, 권력을 이용하는 사람도 우습고. 나는 그렇게 생각해요.
나는 우습다고 생각해요. 그 권력에 붙어 있는 사람도 우습고, 그 권력에 이용하는 사람도 우습고. 나는 그렇게 생각해요.


## Inference 2: sampling with a custom logits processor

In [35]:
from transformers import PreTrainedTokenizerFast
# Same tokenizer setup as in the training section, repeated so that this
# section can be run without the cells above it.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'hyunwoongko/kobart', sep_token='<sep>'
)
MAX_LEN=256

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the fine-tuned weights from a checkpoint written during training.
model = AutoModelForSeq2SeqLM.from_pretrained("./bart_model/checkpoint-4000/")
# Keep the embedding matrix in step with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [63]:
from transformers import LogitsProcessor, LogitsProcessorList
# Fraction of the highest scores that gets pushed down during generation
# (an earlier experiment used 2e-6).
avoid_same_thres = 1e-5


class MyLogitsProcessor(LogitsProcessor):
    """Mirror the very highest scores below a quantile cut-off.

    The cut-off is the ``1 - avoid_same_thres`` quantile taken over the whole
    ``scores`` tensor. Every score above it is replaced by its reflection
    ``2 * cutoff - score``, so the most likely tokens end up below the cut-off.
    ``scores`` is modified in place and returned.
    """

    def __init__(self, avoid_same_thres):
        self.avoid_same_thres = avoid_same_thres
        super().__init__()

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        cutoff = torch.quantile(scores, 1 - self.avoid_same_thres).tolist()
        above_cutoff = scores > cutoff
        scores[above_cutoff] = 2 * cutoff - scores[above_cutoff]
        return scores


# Default processor list; `infer` binds it as the default of its
# `logit_processors` parameter.
logit_processors = LogitsProcessorList(
    [MyLogitsProcessor(avoid_same_thres=avoid_same_thres)]
)

In [60]:
# from transformers.generation_utils import GenerationConfig
def infer(context, src_text, 
          temperature = 1.7, max_len=MAX_LEN, do_sample=True, repetition_penalty=2.0,
          num_beams=9, typical_p=0.7, logit_processors=logit_processors,
         ):
    """Generate one rewrite of ``src_text`` given ``context``.

    The prompt is ``context + "<sep>" + src_text``. It is tokenized, moved to
    the module-level ``device`` and passed to the module-level ``model``.

    Args:
        context: preceding text placed before the ``<sep>`` marker.
        src_text: sentence to rewrite.
        temperature, do_sample, repetition_penalty, num_beams, typical_p:
            forwarded unchanged to ``model.generate``.
        max_len: maximum generated length, forwarded as ``max_length``.
        logit_processors: forwarded as ``logits_processor``.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    prompt = context + "<sep>" + src_text
    input_ids = torch.tensor([tokenizer(prompt)['input_ids']]).to(device)
    gen_ids = model.generate(
        inputs = input_ids,
        # Honour the caller's limit; this used to read the global MAX_LEN,
        # which silently ignored the `max_len` argument.
        max_length=max_len,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        use_cache=False,

        do_sample=do_sample, 
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_beams=num_beams,
        typical_p = typical_p,
        renormalize_logits = True,
        logits_processor = logit_processors
    )
    result = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0]
    return result

In [65]:
def generate(context, src_text, n=10, avoid_same_thres=1e-5, temperature=1.7, num_beams=9, typical_p=0.7):
    """Print the settings, then ``n`` sampled rewrites of ``src_text``.

    Args:
        context: preceding text given to the model as context.
        src_text: sentence to rewrite.
        n: number of outputs to generate and print.
        avoid_same_thres: threshold for the MyLogitsProcessor built for this call.
        temperature, num_beams, typical_p: sampling settings passed to ``infer``.
    """
    print(f"avoid_same_thres : {avoid_same_thres}")
    print(f"temperature : {temperature}")
    print(f"num_beams : {num_beams}")
    print(f"typical_p : {typical_p}")

    print(f"Context : {context}")
    print(f"Input : {src_text}")
    print()

    logit_processors = LogitsProcessorList([MyLogitsProcessor(avoid_same_thres=avoid_same_thres)])
    for i in range(n):
        # Pass the settings through. Previously they were only printed and
        # `infer` ran with its own defaults, so changing an argument here had
        # no effect on the generated text.
        output = infer(
            context, src_text,
            temperature=temperature,
            num_beams=num_beams,
            typical_p=typical_p,
            logit_processors=logit_processors,
        )
        print(f"Output {i} : {output}")

In [67]:
# Sample rewrites of one sentence with the default settings of `generate`;
# the outputs are sampled, so they differ from run to run.
generate(
    context = "표절 아닌 것은 세상에 없어요. 다 표절입니다. 난 그렇게 생각해요. (문제 본질은) 문학 권력 투쟁 아니요? 쉽게 말하면 무슨 출판사, 무슨 출판사, 그거 아니요? 그게 문제였지. 뭐, 다른 게 무슨 문제였어요? 표절 아닌 게 세상에 있는 줄 압니까? 우리 말도 다 표절이에요. 엄마 말을 가지고 표절 하는 것 아니에요? 우리가 쓰는 말도.",
    src_text = "나는 우습다고 생각해요. 그 권력에 붙어 있는 사람도 우습고, 권력을 이용하는 사람도 우습고. 나는 그렇게 생각해요.",
)

avoid_same_thres : 1e-05
temperature : 1.7
num_beams : 9
typical_p : 0.7
Context : 표절 아닌 것은 세상에 없어요. 다 표절입니다. 난 그렇게 생각해요. (문제 본질은) 문학 권력 투쟁 아니요? 쉽게 말하면 무슨 출판사, 무슨 출판사, 그거 아니요? 그게 문제였지. 뭐, 다른 게 무슨 문제였어요? 표절 아닌 게 세상에 있는 줄 압니까? 우리 말도 다 표절이에요. 엄마 말을 가지고 표절 하는 것 아니에요? 우리가 쓰는 말도.
Input : 나는 우습다고 생각해요. 그 권력에 붙어 있는 사람도 우습고, 권력을 이용하는 사람도 우습고. 나는 그렇게 생각해요.

Output 0 : 나는 그건 또 우습다고 생각해. 그 권력에 붙어 있는 사람도 우습고, 그 권력을 이용하는 사람도 우습고. 나는 그렇게 생각한다, 고 말이네.
Output 1 : 나는 생각해봐. 그 권력에 붙어 있는 것도 우습고, 그 권력을 이용하는 사람도 우습고, 나는 그런다고 생각해. 너는 그렇게 생각했재.
Output 2 : 나는 그렇게 우스웠다고 생각해. 그 권력에 붙어 있는 것도 우스우고, 그 권력을 이용하는 것도 우스우고, 그랬는데 저는 그렇게 생각해요.”
Output 3 : 나는 힘겹고 저러니까 안되는 거죠. 그 힘에 붙어 있는 사람도 우습고, 그 권력을 끌어가는 사람도 안되어 안되는 거지만 나는 그렇게 생각한단 말이야 뭐여.
Output 4 : 나는 생각해봐라. 그 권력에 붙어 있는 사람들도 우습고, 그 권력과 결부되어 있는 사람들도 우습고. 나는 그렇게 생각해본다네.
Output 5 : 나는 생각해봐, 우습다고. 그 권력에 붙어 있는 것도 우습고, 그 힘을 이용하는 사람도 우습고, 나는 그런 사람이 아니라고 생각해요.
Output 6 : 나는 힘에 겹고 그 권력에 붙어 있는 사람도 힘겹고 그 권력에 붙어 있는 사람도 우습고. 나는 그런 생각을 해본 적이 없어요.
Output 7 : 나는 힘도 부질없다고 생각해. 권력이 붙든 지위