In [1]:
import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder , TransformerEncoderLayer

import torchtext
from torchtext.data.utils import get_tokenizer

In [39]:
class Transformer(nn.Module) : # model definition
    '''
    Encoder-only Transformer language model.

    Token ids are embedded, scaled, given positional encodings, passed through
    a stack of self-attention encoder layers under a causal mask, and projected
    to per-token vocabulary logits by a linear layer.

    num_token : number of tokens (size of the embedding table and of the output logits)
    num_inputs : embedding dimension
    num_heads : number of attention heads
    num_hidden : feed-forward dimension of each encoder layer
    num_layers : number of stacked encoder layers
    dropout : dropout ratio, default 0.3
    '''
    def __init__(self, num_token, num_inputs, num_heads, num_hidden, num_layers, dropout = 0.3):
        super(Transformer, self).__init__()
        #======================= initial setup =============================
        self.model_name = "transformer"
        self.mask_source = None # causal mask, built lazily in forward() and cached
        self.position_enc = PosEnc(num_inputs, dropout) # PosEnc is defined in a separate cell

        layers_enc = TransformerEncoderLayer(num_inputs, num_heads, num_hidden, dropout) # one encoder block
        self.enc_transformer = TransformerEncoder(layers_enc, num_layers) # stack num_layers encoder blocks
        self.enc = nn.Embedding(num_token, num_inputs) # token id -> embedding vector
        self.num_inputs = num_inputs
        self.dec = nn.Linear(num_inputs, num_token) # embedding vector -> vocabulary logits
        self.init_params()

    def _gen_sqr_nxt_mask(self, size):
        '''
        Build the additive causal mask of shape [size, size].

        Entry (i, j) is 0.0 when j <= i (position i may attend to position j)
        and -inf when j > i (later positions are hidden).

            ex) size = 3
                tensor([[0., -inf, -inf],
                        [0.,   0., -inf],
                        [0.,   0.,   0.]])
        '''
        # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes everything else.
        return torch.triu(torch.full((size, size), float("-inf"), dtype=torch.float), diagonal=1)

    def init_params(self):
        '''
        Initialize the embedding and output-projection parameters.
        '''
        initial_rng = 0.12
        self.enc.weight.data.uniform_(-initial_rng , initial_rng) # uniform in [-initial_rng, initial_rng]
        self.dec.bias.data.zero_() # bias starts at 0
        self.dec.weight.data.uniform_(-initial_rng , initial_rng)

    def forward(self, source):
        '''
        source : tensor of token ids; its first dimension is used as the
                 sequence length when sizing the causal mask.

        Returns the output of the final linear layer applied to the encoder output.
        '''
        seq_len = source.size(0)
        mask_is_stale = (
            self.mask_source is None
            or self.mask_source.size(0) != seq_len
            # a cached mask left on another device would make attention fail
            or self.mask_source.device != source.device
        )
        if mask_is_stale:
            self.mask_source = self._gen_sqr_nxt_mask(seq_len).to(source.device)

        # scale the embeddings by sqrt(embedding dim) before adding positions
        source = self.enc(source) * math.sqrt(self.num_inputs)
        source = self.position_enc(source) # positional encoding

        op = self.enc_transformer(source, self.mask_source)
        op = self.dec(op) # decode with a plain linear layer
        return op

In [85]:
class PosEnc(nn.Module):
    '''
    Sinusoidal positional encoding followed by dropout.

    A table of shape [size_limit, 1, d_m] is precomputed: even feature columns
    hold sin(pos * freq) and odd feature columns hold cos(pos * freq).

    d_m : embedding dimension (odd values are supported)
    dropout : dropout ratio applied after adding the encoding, default 0.2
    size_limit : number of positions precomputed, default 5000
    '''
    def __init__(self, d_m, dropout = 0.2 , size_limit = 5000):
        super(PosEnc,self).__init__()
        self.dropout = nn.Dropout(dropout)

        p_enc = torch.zeros(size_limit, d_m) # one row per position, one column per feature

        pos = torch.arange(0, size_limit, dtype = torch.float).unsqueeze(1) # [size_limit] > [size_limit,1]

        # one frequency per even feature index: ceil(d_m / 2) values
        divider = torch.exp(torch.arange(0, d_m, 2).float() * (-math.log(10000.0) / d_m))
        angles = pos * divider # [size_limit, ceil(d_m / 2)]

        p_enc[:, 0::2] = torch.sin(angles)
        # For odd d_m there is one fewer odd column than even columns, so trim
        # the angles to d_m // 2 columns; for even d_m the slice is a no-op.
        p_enc[:, 1::2] = torch.cos(angles[:, :d_m // 2])
        p_enc = p_enc.unsqueeze(0).transpose(0,1) # [size_limit,d_m] > [1,size_limit,d_m] > [size_limit,1,d_m]

        # buffer: moves with the module across devices but is not a trainable parameter
        self.register_buffer("p_enc", p_enc)

    def forward(self,x):
        '''
        Add the encodings for the first x.size(0) positions to x, then apply dropout.
        '''
        return self.dropout(x + self.p_enc[:x.size(0),:])

In [92]:
# Inspect the callable returned for the "basic_english" tokenizer (the result is not stored).
get_tokenizer("basic_english")

<function torchtext.data.utils._basic_english_normalize(line)>

In [139]:
# https://tutorials.pytorch.kr/beginner/text_sentiment_ngrams_tutorial.html
# The code below is adapted from the tutorial linked above.

# 1. Build the vocabulary

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

# AG_NEWS records are (label, text) pairs; peek at one with next(iter(train_iter)).
# (The earlier `train_iter = iter(...)` assignment was overwritten before any use, so it is dropped.)
train_iter = torchtext.datasets.AG_NEWS(split="train")

def yield_tokens(data_iter):
    """Lazily tokenize the text of every (label, text) record in ``data_iter``.

    The label is ignored; the text is passed to the module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# Vocabulary over the AG_NEWS training split.
# NOTE(review): max_tokens = 10 keeps the vocabulary tiny — looks like a debugging value, confirm.
vocab = build_vocab_from_iterator(yield_tokens(train_iter),
                                  specials=["<unk>"],
                                  max_tokens = 10)

# Out-of-vocabulary lookups fall back to the dedicated "<unk>" entry instead of
# being conflated with a real word.
vocab.set_default_index(vocab["<unk>"])

In [148]:
# Load WikiText2 and unpack its three splits.
training_text, validation_text, testing_text = torchtext.datasets.WikiText2() # provides train, val, test


(ShardingFilterIterDataPipe,
 ShardingFilterIterDataPipe,
 ShardingFilterIterDataPipe)

In [240]:
from torchtext.vocab import build_vocab_from_iterator
# Tokenizer used by yield_tokens() and data_process() below.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    """Lazily tokenize every item of ``data_iter`` with the module-level ``tokenizer``.

    Each item is passed to the tokenizer as-is (no label/text unpacking).
    """
    yield from (tokenizer(line) for line in data_iter)

# Compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# WikiText2 train / validation / test splits.
training_text, validation_text, testing_text = torchtext.datasets.WikiText2()

# Vocabulary built from the training split only.
vocab = build_vocab_from_iterator(
    yield_tokens(training_text),
    specials=["<sos>", "<eos>", "<unk>"],
)
# Tokens missing from the vocabulary resolve to "<unk>".
vocab.set_default_index(vocab["<unk>"])


def data_process(raw_text_iter):
    """
    Numericalize raw text into one flat tensor of token ids.

    Each item of ``raw_text_iter`` is tokenized with the module-level
    ``tokenizer`` and mapped to ids with the module-level ``vocab``. Items
    that produce no tokens are dropped; the rest are concatenated in order.

    Args:
        raw_text_iter: iterable of raw text items.

    Returns:
        1-D ``torch.long`` tensor of token ids; an empty tensor when no item
        yields any token.
    """
    # convert the text items to ids one at a time
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    non_empty = [ids for ids in data if ids.numel() > 0]
    if not non_empty:
        # torch.cat rejects an empty sequence, so return an explicit empty tensor
        return torch.empty(0, dtype=torch.long)
    return torch.cat(non_empty)

                                                                  


In [244]:
# Numericalize every training item into its own tensor of token ids.
data = [
    torch.tensor(vocab(tokenizer(line)), dtype=torch.long)
    for line in training_text
]

In [255]:
# Keep only the tensors that contain at least one token id.
tuple(ids for ids in data if ids.numel() > 0)

23767

In [257]:
# filter() is lazy: this only builds the iterator over non-empty tensors, it does not evaluate it.
filter(lambda t: t.numel() > 0, data)

<filter at 0x1378dbf3df0>

In [259]:
# First numericalized entry; in the recorded run it is an empty tensor.
data[0]

tensor([], dtype=torch.int64)

In [249]:
# Number of entries in `data`, empty tensors included.
len(data)

36718

In [242]:
# NOTE(review): text_tokenizer is not defined in any visible cell, and the recorded run of this
# call raised TypeError (str concatenated with a ShardingFilterIterDataPipe) — confirm whether
# this cell is still needed.
text_tokenizer(training_text)

TypeError: can only concatenate str (not "ShardingFilterIterDataPipe") to str

In [232]:
def gen_batches(text_dataset, batch_size):
    """
    Arrange a token stream into ``batch_size`` parallel columns.

    Args:
        text_dataset: either an already-numericalized tensor of token ids, or
            a dataset object whose first example's ``text`` is numericalized
            through the module-level ``TEXT`` field.
        batch_size: number of columns in the result.

    Returns:
        Tensor moved to the module-level ``device``. For a 1-D tensor input of
        N ids the shape is (N // batch_size, batch_size); ids that do not fill
        a complete row are dropped.
    """
    if torch.is_tensor(text_dataset):
        # already numericalized — no TEXT field required
        token_ids = text_dataset
    else:
        token_ids = TEXT.numericalize([text_dataset.examples[0].text])
    # number of full rows once the stream is split into batch_size columns
    rows_per_column = token_ids.size(0) // batch_size
    # remove data points that lie outside batches (remainders)
    token_ids = token_ids.narrow(0, 0, rows_per_column * batch_size)
    # each column becomes one contiguous slice of the stream
    token_ids = token_ids.view(batch_size, -1).t().contiguous()
    return token_ids.to(device)

In [None]:
# NOTE(review): the recorded run of this cell raised
# "AttributeError: module 'torchtext.data' has no attribute 'Field'", so this Field-based
# setup does not run in the environment the notebook was executed in — confirm whether to keep it.
# Field configuration: basic_english tokenizer, lowercasing, <sos>/<eos> markers.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"), lower=True, eos_token='<eos>', init_token='<sos>')
training_text, validation_text, testing_text = torchtext.datasets.WikiText2.splits(TEXT)
# vocabulary is built from the training split only
TEXT.build_vocab(training_text)

# Compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def gen_batches(text_dataset, batch_size):
    """
    Arrange a token stream into ``batch_size`` parallel columns.

    Args:
        text_dataset: either an already-numericalized tensor of token ids, or
            a dataset object whose first example's ``text`` is numericalized
            through the module-level ``TEXT`` field.
        batch_size: number of columns in the result.

    Returns:
        Tensor moved to the module-level ``device``. For a 1-D tensor input of
        N ids the shape is (N // batch_size, batch_size); ids that do not fill
        a complete row are dropped.
    """
    if torch.is_tensor(text_dataset):
        # already numericalized — no TEXT field required
        token_ids = text_dataset
    else:
        token_ids = TEXT.numericalize([text_dataset.examples[0].text])
    # number of full rows once the stream is split into batch_size columns
    rows_per_column = token_ids.size(0) // batch_size
    # remove data points that lie outside batches (remainders)
    token_ids = token_ids.narrow(0, 0, rows_per_column * batch_size)
    # each column becomes one contiguous slice of the stream
    token_ids = token_ids.view(batch_size, -1).t().contiguous()
    return token_ids.to(device)

# Batch widths: 32 columns for training, 16 for evaluation.
training_batch_size, evaluation_batch_size = 32, 16

# Reshape each split into its batched column layout.
training_data = gen_batches(training_text, training_batch_size)
validation_data = gen_batches(validation_text, evaluation_batch_size)
testing_data = gen_batches(testing_text, evaluation_batch_size)

AttributeError: module 'torchtext.data' has no attribute 'Field'

In [123]:
# Look up a token that is not in the vocabulary; the recorded run raised RuntimeError
# because no default index had been set.
vocab["안녕!"]

RuntimeError: Token 안녕! not found and default index is not set

In [86]:
# NOTE(review): the recorded run of this cell raised
# "AttributeError: module 'torchtext.data' has no attribute 'Field'", so this Field-based
# setup does not run in the environment the notebook was executed in — confirm whether to keep it.
# Field configuration: basic_english tokenizer, lowercasing, <sos>/<eos> markers.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"), lower=True, eos_token='<eos>', init_token='<sos>')
training_text, validation_text, testing_text = torchtext.datasets.WikiText2.splits(TEXT)
# vocabulary is built from the training split only
TEXT.build_vocab(training_text)

# Compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def gen_batches(text_dataset, batch_size):
    """
    Arrange a token stream into ``batch_size`` parallel columns.

    Args:
        text_dataset: either an already-numericalized tensor of token ids, or
            a dataset object whose first example's ``text`` is numericalized
            through the module-level ``TEXT`` field.
        batch_size: number of columns in the result.

    Returns:
        Tensor moved to the module-level ``device``. For a 1-D tensor input of
        N ids the shape is (N // batch_size, batch_size); ids that do not fill
        a complete row are dropped.
    """
    if torch.is_tensor(text_dataset):
        # already numericalized — no TEXT field required
        token_ids = text_dataset
    else:
        token_ids = TEXT.numericalize([text_dataset.examples[0].text])
    # number of full rows once the stream is split into batch_size columns
    rows_per_column = token_ids.size(0) // batch_size
    # remove data points that lie outside batches (remainders)
    token_ids = token_ids.narrow(0, 0, rows_per_column * batch_size)
    # each column becomes one contiguous slice of the stream
    token_ids = token_ids.view(batch_size, -1).t().contiguous()
    return token_ids.to(device)

# Batch widths: 32 columns for training, 16 for evaluation.
training_batch_size, evaluation_batch_size = 32, 16

# Reshape each split into its batched column layout.
training_data = gen_batches(training_text, training_batch_size)
validation_data = gen_batches(validation_text, evaluation_batch_size)
testing_data = gen_batches(testing_text, evaluation_batch_size)

AttributeError: module 'torchtext.data' has no attribute 'Field'