## Dataloader

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The model/data paths defined later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the sentencepiece package (imported below as `spm`) into the runtime.
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.functional import pad
from torch.utils.data.distributed import DistributedSampler
import sentencepiece as spm
import pandas as pd

class TFDataset(Dataset):
    '''Parallel-sentence dataset tokenized on the fly with a SentencePiece model.

    Each item is a pair of 1-D id tensors ``(src, tar)``; every sequence is
    laid out as ``[bos_id] + encoded_ids + [eos_id]``.
    '''

    def __init__(self, bpe_model, tsv_file):
        '''
        Args:
            bpe_model: path of the SentencePiece model file handed to ``sp.load``.
            tsv_file: tab-separated file readable by ``pd.read_csv`` that
                contains (at least) the columns ``src`` and ``tar``.
        '''
        sp = spm.SentencePieceProcessor()
        sp.load(bpe_model)

        self.sp = sp
        self.bos_id = sp.bos_id()  # 1 according to the original author's note
        self.eos_id = sp.eos_id()  # 2 according to the original author's note

        self.tsv_file = pd.read_csv(tsv_file, delimiter='\t', usecols=['src', 'tar'])

    def __len__(self):
        '''Number of sentence pairs (rows) in the TSV file.'''
        return len(self.tsv_file)

    def __getitem__(self, idx):
        '''Return ``(src_ids, tar_ids)`` tensors for row ``idx``.'''
        # Select by column NAME, not position: `usecols` does not reorder the
        # frame, so a file that stores `tar` before `src` would otherwise swap
        # source and target silently.
        src_sent = self.tsv_file['src'].iloc[idx]
        tar_sent = self.tsv_file['tar'].iloc[idx]
        src_encoded = [self.bos_id] + self.sp.encode_as_ids(src_sent) + [self.eos_id]
        tar_encoded = [self.bos_id] + self.sp.encode_as_ids(tar_sent) + [self.eos_id]

        return torch.tensor(src_encoded), torch.tensor(tar_encoded)

def collate_fn(batch, max_pad=128):
    '''Right-pad every sequence of a batch to ``max_pad`` and stack the results.

    batch : [(src_tensor, tar_tensor), ...]
    returns : (src, tar) -- one stacked tensor per side, one row per batch item
    '''

    def _right_pad(seq):
        # pad only after the sentence, up to max_pad (zero fill is `pad`'s default);
        # the pad amount is handed to `pad` as-is, even when it is negative
        return pad(seq, (0, max_pad - len(seq)))

    # e.g. three items of length <= 128 -> tensor of size [3, 128]
    src_batch = torch.stack([_right_pad(src_seq) for src_seq, _ in batch])
    tar_batch = torch.stack([_right_pad(tar_seq) for _, tar_seq in batch])

    return (src_batch, tar_batch)

def create_dataloader(bpe_model, tsv_file, is_distributed=False, batch_size=128):
    '''Build a DataLoader over a TFDataset.

    Args:
        bpe_model: forwarded to ``TFDataset`` (SentencePiece model path).
        tsv_file: forwarded to ``TFDataset`` (TSV with the sentence pairs).
        is_distributed: when truthy, a ``DistributedSampler`` drives the
            iteration order; shuffling by the loader itself is enabled only
            when this is exactly ``False``.
        batch_size: number of sentence pairs per batch.

    Returns:
        A ``DataLoader`` whose batches are produced by ``collate_fn``.
    '''
    dataset = TFDataset(bpe_model, tsv_file)

    if is_distributed:
        sampler = DistributedSampler(dataset)
    else:
        sampler = None

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(is_distributed is False),
        sampler=sampler,
        collate_fn=collate_fn,
    )

In [3]:
# Google Drive locations (under the /content/drive mount) of the BPE model file
# and the training TSV.
# NOTE(review): presumably these are the arguments given to create_dataloader -- confirm in the calling cell.
bpe_model = "/content/drive/MyDrive/dmis/Transformer/bpe/bpe_250k.model"
tsv_path = "/content/drive/MyDrive/dmis/Transformer/bpe/train_df_250k.tsv"

## Embedding, Positional Encoding

In [14]:
import math

class Embeddings(nn.Module):
    '''Token-id -> d_model vector lookup, scaled by sqrt(d_model).'''

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        # embedding table of shape (vocab_size x d_model)
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.embed(x) * scale

In [10]:
class PositionalEncoding(nn.Module):
    '''Adds fixed sinusoidal position information to embeddings, then applies dropout.'''

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # position index as a column: [max_len] -> [max_len, 1]
        positions = torch.arange(0, max_len).unsqueeze(1)
        # 1 / 10000^(2i / d_model), one value per sin/cos pair
        inv_freq = 1/torch.pow(10000, torch.arange(0, d_model, 2)/d_model)
        angles = positions*inv_freq

        # table for one sentence of maximum length: [max_len, d_model]
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even feature indices
        table[:, 1::2] = torch.cos(angles)  # odd feature indices

        # leading batch axis -> [1, max_len, d_model]; a buffer, so it is fixed and not trained
        self.register_buffer("pos_enc", table.unsqueeze(0))

    def forward(self, x):
        # x : batch_dim x seq_len x d_model
        seq_len = x.size(1)
        # only the first seq_len positions of the table are used
        encoding = self.pos_enc[:, :seq_len].requires_grad_(False)
        return self.dropout(x + encoding)

## Attention

In [37]:
def attention(query, key, value, mask=None, dropout=None):

    '''Scaled dot-product attention.

    Think of query as (1 x d_k) and key/value as (n x d_k); in practice query
    is an (n x d_k) word matrix as well.
    Scaling by sqrt(d_k) keeps the softmax from saturating (vanishing
    gradients) as d_k grows.

    Args:
        query, key, value: tensors whose last two axes are (length, d_k);
            any leading axes are batch-like and are broadcast by matmul.
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are excluded from attention.
        dropout: optional callable (e.g. ``nn.Dropout``) applied to the
            attention probabilities.

    Returns:
        (weighted_values, prob): the attention output and the attention
        probabilities, which sum to 1 over the key axis (last axis).
    '''

    d_k = query.size(-1)
    # similarity of every query position with every key position
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    # `mask is not None` rather than truthiness: bool() of a multi-element tensor raises
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)  # pad masking
    # normalize over the key axis (last axis), not over the batch axis
    prob = scores.softmax(dim=-1)
    if dropout is not None:
        prob = dropout(prob)

    weighted_query = torch.matmul(prob, value)

    return weighted_query, prob

In [39]:
import copy

def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of `module`."
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

In [40]:
class MultiHeadAttention(nn.Module):
    '''Multi-head attention: project q/k/v, attend per head, concatenate, project again.

    Args:
        h: number of attention heads; must divide ``d_model``.
        d_model: model (feature) dimension of inputs and outputs.
        dropout: dropout probability applied to the attention probabilities.
    '''
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        self.d_model = d_model
        self.d_k = d_model // h  # per-head feature size
        self.h = h
        self.dropout = nn.Dropout(p=dropout)
        # kept in a ModuleList so torch registers the parameters:
        # three projections for q, k, v and one for the final output
        self.modulelist = clones(nn.Linear(d_model, d_model), N=4)
        self.att_prob = None  # attention probabilities of the most recent forward call

    def forward(self, query, key, value, mask=None):
        '''
        Args:
            query, key, value: tensors of shape (batch, length, d_model).
            mask: optional mask tensor; an axis is inserted at position 1 so
                the same mask is broadcast across all heads.
                NOTE(review): assumes the mask has no head axis yet -- confirm against callers.

        Returns:
            Tensor of shape (batch, query_length, d_model).
        '''
        # `is not None`, because bool() of a multi-element tensor raises
        if mask is not None:
            mask = mask.unsqueeze(1)
        num_batch = query.size(0)

        # (batch, length, d_model) -> (batch, h, length, d_k) for each of q, k, v;
        # zip stops after three items, so the fourth linear layer is left for the output
        query, key, value = [
            lin(x).view(num_batch, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.modulelist, (query, key, value))
        ]

        # attend with the PROJECTED, head-split tensors
        weighted_query, self.att_prob = attention(query, key, value, mask=mask, dropout=self.dropout)

        # concat heads: (batch, h, length, d_k) -> (batch, length, h*d_k)
        weighted_query = weighted_query.transpose(1, 2).contiguous().view(num_batch, -1, self.h * self.d_k)

        return self.modulelist[-1](weighted_query)

## FFN

In [41]:
class PositionwiseFeedForward(nn.Module):
    '''Two-layer feed-forward block: linear -> ReLU -> dropout -> linear.'''

    def __init__(self, d_model, d_inner, dropout):
        super().__init__()
        # expand from the model width to the hidden width ...
        self.w_1 = nn.Linear(d_model, d_inner)
        # ... and project back to the model width
        self.w_2 = nn.Linear(d_inner, d_model)
        # regularization on the hidden activations
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## EncoderDecoder

In [42]:
class EncoderDecoder(nn.Module):
    '''Generic encoder-decoder wrapper.

    Holds the encoder, the decoder, the source/target embedding pipelines and
    the output generator. ``forward`` returns the decoder output; the
    generator is stored but not applied here.
    '''
    def __init__(self, encoder, decoder, src_embed, tar_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tar_embed = tar_embed
        self.generator = generator

    def encode(self, src, src_mask):
        '''Embed the source and run it through the encoder.'''
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tar, tar_mask):
        '''Embed the target and decode it against the encoder output ``memory``.'''
        embedded_tar = self.tar_embed(tar)
        return self.decoder(embedded_tar, memory, src_mask, tar_mask)

    def forward(self, src, tar, src_mask, tar_mask):
        '''Encode ``src`` and then decode ``tar`` against the result.'''
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tar, tar_mask)

In [43]:
class LayerNorm(nn.Module):
    '''Normalize activations along the last dimension.

    Formula : a_2 * (x - mean) / (std + eps) + b_2

    Args:
        size: length of the last (feature) dimension; sets the shape of the
            learnable scale and bias.
        eps: small constant added to the standard deviation so the division
            stays finite when std is 0.
    '''
    def __init__(self, size, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(size))   # scaling param
        self.b_2 = nn.Parameter(torch.zeros(size))  # bias param
        self.eps = eps  # must be stored: forward() reads self.eps

    def forward(self, x):
        # compute mean, std along the last dimension
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)

        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [44]:
class ResConnection(nn.Module):
    '''Residual connection followed by layer normalization.

    The residual path
    1) preserves the original information
    2) helps against vanishing gradients
    '''
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # sublayer output goes through dropout, is added to the input, then normalized
        branch = self.dropout(sublayer(x))
        return self.norm(x + branch)

In [45]:
class Encoder(nn.Module):
    '''Stack of N identical encoder layers followed by a final LayerNorm.

    input : embedded src with positional encoding
    output : hidden representations z (after multi-head attention and feed-forward)

    Args:
        layer: encoder layer to replicate; must expose ``size`` (its feature
            dimension) and accept ``(x, mask)`` when called.
        N: number of stacked copies.
    '''
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # LayerNorm needs the feature size, not the layer module itself
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        '''Pass ``x`` (with ``mask``) through every layer in order, then normalize.'''
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [46]:
class EncoderLayer(nn.Module):
    '''One encoder block: self-attention, then feed-forward, each wrapped in a ResConnection.'''
    def __init__(self, size, self_att, ff, dropout):
        super().__init__()
        self.self_att = self_att
        self.ff = ff
        # two wrappers: one around self-attention, one around the feed-forward net
        self.sublayer = clones(ResConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        def self_attend(h):
            # the same tensor serves as query, key and value
            return self.self_att(h, h, h, mask)

        attended = self.sublayer[0](x, self_attend)
        return self.sublayer[1](attended, self.ff)

In [47]:
class Decoder(nn.Module):
    '''Stack of N decoder layers (with masked attention) followed by a final LayerNorm.'''
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tar_mask):
        hidden = x
        # every layer receives the same memory and masks
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tar_mask)
        return self.norm(hidden)

In [48]:
class DecoderLayer(nn.Module):
    '''One decoder block: masked self-attention, attention over memory, feed-forward.'''
    def __init__(self, size, self_att, src_att, ff, dropout):
        super().__init__()
        self.size = size
        self.self_att = self_att
        self.src_att = src_att
        self.ff = ff
        # three wrappers: masked self-attention, source attention, feed-forward
        self.sublayer = clones(ResConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tar_mask):
        def masked_self_attend(h):
            return self.self_att(h, h, h, tar_mask)

        def attend_to_memory(h):
            # queries come from the decoder, keys/values from the encoder memory
            return self.src_att(h, memory, memory, src_mask)

        x = self.sublayer[0](x, masked_self_attend)
        x = self.sublayer[1](x, attend_to_memory)
        return self.sublayer[2](x, self.ff)

In [49]:
from torch import nn
from torch.nn.functional import log_softmax

class Generator(nn.Module):
    '''Output head: linear projection to the vocabulary, then log-softmax over it.'''
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        logits = self.proj(x)
        # log-probabilities over the last (vocabulary) axis
        return log_softmax(logits, dim=-1)

## Full Model

In [53]:
def make_model(
    src_vocab, tar_vocab, N=6, d_model=512, d_inner=2048, h=8, dropout=0.1
):
    '''Assemble the full encoder-decoder Transformer from the building blocks.

    Args:
        src_vocab: source vocabulary size (rows of the source embedding).
        tar_vocab: target vocabulary size (target embedding rows and generator outputs).
        N: number of encoder layers and of decoder layers.
        d_model: model feature dimension.
        d_inner: hidden width of the position-wise feed-forward blocks.
        h: number of attention heads.
        dropout: dropout probability used by attention, feed-forward,
            positional encoding and the residual wrappers.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        are Xavier-uniform initialized.
    '''
    # pass `dropout` on, otherwise attention silently keeps its own default
    attn = MultiHeadAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_inner, dropout)
    position = PositionalEncoding(d_model, dropout)

    # every consumer gets its own deep copy so that no weights are shared
    model = EncoderDecoder(
        Encoder(EncoderLayer(size=d_model, self_att=copy.deepcopy(attn), ff=copy.deepcopy(ff), dropout=dropout), N),
        Decoder(DecoderLayer(size=d_model, self_att=copy.deepcopy(attn), src_att=copy.deepcopy(attn), ff=copy.deepcopy(ff), dropout=dropout), N),
        nn.Sequential(Embeddings(src_vocab, d_model), copy.deepcopy(position)),
        nn.Sequential(Embeddings(tar_vocab, d_model), copy.deepcopy(position)),
        Generator(d_model, tar_vocab)
    )

    # weight matrices get Xavier-uniform init; 1-D parameters (biases, norm scale/shift) are left as constructed
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model