# 데이터 로더

## 라이브러리 설치

In [None]:
# Notebook shell escape: install the `transformers` package, which the
# tokenizer imports in the cells below rely on.
!pip install transformers

## Drive Mount

In [None]:
# Mount Google Drive at /content/drive so files stored there can be
# addressed by path from the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Set this to your own project directory; the data path used further down
# is appended to it.
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/GW"

## 실행 테스트 코드

### 라이브러리 Import

In [14]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from transformers import BartTokenizer

import json
import numpy as np
from functools import cmp_to_key
from itertools import chain

### Dataset, DataLoader 사용한 코드

In [6]:
# Comparison functions meant to be wrapped with functools.cmp_to_key.
def cmp_aspect(v1, v2):
    """Compare two (aspect, opinion) pairs, aspect start first.

    Each argument is indexable so that element 0 is the aspect and element 1
    is the opinion, both exposing a numeric 'from' entry.

    Returns a negative, zero or positive number: the difference of the aspect
    'from' values, or, when those are equal, the difference of the opinion
    'from' values.
    """
    aspect_left = v1[0]['from']
    aspect_right = v2[0]['from']
    if aspect_left != aspect_right:
        return aspect_left - aspect_right
    # Aspect starts tie: fall back to the opinion start.
    return v1[1]['from'] - v2[1]['from']


def cmp_opinion(v1, v2):
    """Compare two (aspect, opinion) pairs, opinion start first.

    Each argument is indexable so that element 0 is the aspect and element 1
    is the opinion, both exposing a numeric 'from' entry.

    Returns a negative, zero or positive number: the difference of the
    opinion 'from' values, or, when those are equal, the difference of the
    aspect 'from' values.
    """
    opinion_left, opinion_right = v1[1]['from'], v2[1]['from']
    if opinion_left == opinion_right:
        # Opinion starts tie: fall back to the aspect start.
        result = v1[0]['from'] - v2[0]['from']
    else:
        result = opinion_left - opinion_right
    return result

In [7]:
class ABSADataset(Dataset):
    """Map-style dataset turning ABSA JSON records into index-based targets.

    Every record is a dict with a token list (see ``_get_words``) and two
    parallel lists, 'aspects' and 'opinions'. Entries of both lists carry
    'index', 'from' and 'to'; aspect entries additionally carry 'polarity'
    ('POS', 'NEG' or 'NEU').

    Target id layout produced by ``prepare_target``:
      * 0 marks the start of the target and 1 marks its end,
      * 2 .. len(mapping) + 1 are the polarity classes,
      * ids >= ``target_shift`` are positions in the flattened source
        sequence, offset by ``target_shift``.

    Args:
        path: Path of the JSON file holding a list of records.
        tokenizer: Tokenizer exposing ``vocab_size``, ``unk_token_id``,
            ``bos_token_id``, ``eos_token_id``, ``unique_no_split_tokens``,
            ``tokenize``, ``convert_tokens_to_ids`` and ``add_tokens``.
            It is mutated: the three polarity tokens are added to it.
        opinion_first: When true, pairs are ordered by opinion start and each
            span lists the opinion indices before the aspect indices;
            otherwise aspect comes first in both respects.
        limit: Keep only the first ``limit`` records. Any falsy value
            (``None`` or 0) keeps all records.
        verbose: Print the intermediate values of every sample while it is
            being encoded. Off by default so iterating the dataset is silent.
    """

    def __init__(self, path, tokenizer, opinion_first=True, limit=None,
                 verbose=False):
        super().__init__()
        self.limit = limit
        self.tokenizer = tokenizer
        self.opinion_first = opinion_first
        self.verbose = verbose
        self.data = self._load_data(path)

        self.mapping = {
            'POS': '<<positive>>',
            'NEG': '<<negative>>',
            'NEU': '<<neutral>>'
        }
        # Two ids for start/end markers plus one per polarity class precede
        # the source-position ids in the target vocabulary.
        self.target_shift = len(self.mapping) + 2

        cur_num_tokens = self.tokenizer.vocab_size
        self.cur_num_token = cur_num_tokens

        # Longest tokens first.
        sorted_add_tokens = sorted(
            self.mapping.values(), key=len, reverse=True)

        unique_no_split_tokens = self.tokenizer.unique_no_split_tokens
        for tok in sorted_add_tokens:
            # The polarity tokens must not exist in the tokenizer yet.
            assert self.tokenizer.convert_tokens_to_ids(
                [tok])[0] == self.tokenizer.unk_token_id
        self.tokenizer.unique_no_split_tokens = (
            unique_no_split_tokens + sorted_add_tokens)
        self.tokenizer.add_tokens(sorted_add_tokens)

        self.mapping2id = {}        # polarity label -> tokenizer id
        self.mapping2targetid = {}  # polarity label -> 0-based class index
        for key, value in self.mapping.items():
            key_id = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(value))
            # Each polarity token has to survive tokenization as one piece
            # and receive an id at or above the original vocabulary size.
            assert len(key_id) == 1, value
            assert key_id[0] >= cur_num_tokens
            self.mapping2id[key] = key_id[0]
            self.mapping2targetid[key] = len(self.mapping2targetid)

    def _load_data(self, path):
        """Read the JSON file at ``path`` and apply ``self.limit``."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A falsy limit (None or 0) means "no limit".
        return data[:self.limit] if self.limit else data

    @staticmethod
    def _get_words(ins):
        """Return the token list of a record.

        'words' is preferred and 'raw_words' is accepted as a fallback, so
        ``__getitem__`` and ``prepare_target`` always read the same field.
        NOTE(review): 'from'/'to' are used as indices into this list, so it
        presumably has to be a list of tokens rather than a raw sentence
        string -- confirm against the data file.
        """
        if 'words' in ins:
            return ins['words']
        return ins['raw_words']

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx``.

        Returns:
            dict with 'src_tokens', 'tgt_tokens', 'target_span' (as produced
            by ``prepare_target``) plus the lengths 'src_seq_len' and
            'tgt_seq_len'.
        """
        ins = self.data[idx]
        if self.verbose:
            print("raw_words", self._get_words(ins))
            print("aspects", ins['aspects'])
            print("opinions", ins['opinions'])

        # prepare_target returns a dict; read the entries by key rather than
        # unpacking it (unpacking a dict yields its keys, not its values).
        encoded = self.prepare_target(ins)
        src_tokens = encoded['src_tokens']
        target = encoded['tgt_tokens']
        return {
            'src_tokens': src_tokens,
            'tgt_tokens': target,
            'target_span': encoded['target_span'],
            'src_seq_len': len(src_tokens),
            'tgt_seq_len': len(target)
        }

    def prepare_target(self, ins):
        """Tokenize one record and build its target sequence.

        Returns:
            dict with
              'src_tokens': flat list of ids, bos + word pieces + eos,
              'target_span': list of 5-tuples
                  (start, end, start, end, polarity_class), opinion indices
                  first when ``opinion_first`` is set, aspect first otherwise,
              'tgt_tokens': [0] + all span values flattened + [1].
        """
        # Byte-pair encode word by word so word indices can be mapped to
        # positions in the flattened id sequence.
        word_bpes = [[self.tokenizer.bos_token_id]]
        for word in self._get_words(ins):
            bpes = self.tokenizer.tokenize(word, add_prefix_space=True)
            word_bpes.append(self.tokenizer.convert_tokens_to_ids(bpes))
        word_bpes.append([self.tokenizer.eos_token_id])

        # Entry 0 of word_bpes is the bos piece, so cum_lens[i] is the flat
        # position of the first piece of word i.
        cum_lens = np.cumsum([len(bpes) for bpes in word_bpes]).tolist()

        # Same ordering as the cmp_opinion / cmp_aspect comparators:
        # primary start index first, the other start index as tie-breaker.
        pairs = zip(ins['aspects'], ins['opinions'])
        if self.opinion_first:
            pairs = sorted(
                pairs, key=lambda pair: (pair[1]['from'], pair[0]['from']))
        else:
            pairs = sorted(
                pairs, key=lambda pair: (pair[0]['from'], pair[1]['from']))

        shift = self.target_shift
        target_spans = []
        for aspect, opinion in pairs:
            if self.verbose:
                print("aspects", aspect)
                print("opinions", opinion)
            assert aspect['index'] == opinion['index']

            # Both ends point at the first piece of a word: the first word
            # of the span and its last word ('to' is exclusive).
            a_start_bpe = cum_lens[aspect['from']] + shift
            a_end_bpe = cum_lens[aspect['to'] - 1] + shift
            o_start_bpe = cum_lens[opinion['from']] + shift
            o_end_bpe = cum_lens[opinion['to'] - 1] + shift

            # +2 skips the ids reserved for the start/end markers.
            polarity_id = self.mapping2targetid[aspect['polarity']] + 2

            if self.opinion_first:
                span = (o_start_bpe, o_end_bpe,
                        a_start_bpe, a_end_bpe, polarity_id)
            else:
                span = (a_start_bpe, a_end_bpe,
                        o_start_bpe, o_end_bpe, polarity_id)
            target_spans.append(span)
            if self.verbose:
                print("target_spans", target_spans)

        target = [0]  # start-of-sequence marker
        target.extend(chain.from_iterable(target_spans))
        target.append(1)  # end-of-sequence marker
        if self.verbose:
            print("target", target)

        return {
            'tgt_tokens': target,
            'target_span': target_spans,
            'src_tokens': list(chain.from_iterable(word_bpes))
        }

In [None]:
# Dataset location, relative to BASE_DIR.
data_path = '/data/train_convert.json'
print(f"{BASE_DIR}{data_path}")
# Number of samples per DataLoader batch.
batch_size = 4

In [15]:
# Load the pretrained BART-base tokenizer used to encode the dataset.
tokenizer = BartTokenizer.from_pretrained(
    'facebook/bart-base',
    add_prefix_space=True,
)

In [16]:
def collate_samples(samples):
    """Group a list of per-sample dicts into one dict of lists.

    The samples hold Python lists of different lengths ('src_tokens',
    'tgt_tokens', 'target_span'), which the DataLoader's default collate
    function cannot batch. No padding or tensor conversion is done here;
    every field simply becomes a list with one entry per sample.
    """
    return {
        field: [sample[field] for sample in samples]
        for field in samples[0]
    }


# Small smoke-test dataset: only the first 10 records are loaded.
dataset = ABSADataset(BASE_DIR + data_path, tokenizer, limit=10)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_samples,
)

In [None]:
# Smoke test: walk the DataLoader once and print every batch it yields.
for batch in dataloader:
    print("Batch", batch)

print("DONE")