In [None]:
!pip install -U datasets transformers

In [None]:
import copy
import torch
import math
import gc
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# Tokenizer belonging to the klue/roberta-base checkpoint.
tokenizer=AutoTokenizer.from_pretrained('klue/roberta-base')
# squad_kor_v1 dataset; the cell output shows a train and a validation split being generated.
data=load_dataset('squad_kor_v1')

train-00000-of-00001.parquet:   0%|          | 0.00/11.6M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60407 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5774 [00:00<?, ? examples/s]

In [None]:
class config:
  """Shared hyperparameters and runtime settings used throughout the notebook."""
  # batching / sequence settings
  batch_size = 8
  max_length = 256
  is_training = True
  voca_size = tokenizer.vocab_size

  # dimensions of the hand-written transformer
  hidden_dim = 512
  num_heads = 8
  num_layers = 4
  head_dim = hidden_dim // num_heads
  ffn_dim = 4 * hidden_dim
  dropout_rate = 0.1

  # prefer the GPU when one is visible, otherwise fall back to the CPU
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class QAdataset(torch.utils.data.Dataset):
  """Extractive-QA dataset that converts character-level answers into token spans.

  Each row of `data` is tokenized as a (question, context) pair, padded and
  truncated to `config.max_length`. The character range of the first answer is
  mapped to the indices of the context tokens covering its first and last
  character. Rows whose answer cannot be located in the tokenized context are
  skipped, so `len(self)` may be smaller than `len(data)`.

  Args:
    config: object exposing `max_length` and `is_training`.
    data: column-addressable dataset with 'question', 'context' and 'answers'
      columns; each answer holds 'text' and 'answer_start' lists.
    tokenizer: callable tokenizer whose result supports item access and
      `sequence_ids()`.
  """
  def __init__(self,config,data,tokenizer):
    self.data = data
    self.tokenizer = tokenizer
    self.config = config
    self.sequence = []
    self._build()

  @staticmethod
  def _find_span(sequence_ids,offset_mapping,first_char,last_char):
    """Return (start, end) token indices covering the two characters; -1 marks "not found"."""
    # First position (after index 0) that belongs to the second sequence, i.e. the context.
    context_index = -1
    for index,val in enumerate(sequence_ids):
      if index > 0 and val == 1 :
        context_index = index
        break

    start = -1
    end = -1
    # Only positions from the context onwards are inspected, so question tokens
    # with coincidentally matching offsets are never selected.
    for index in range(max(context_index,0),len(offset_mapping)):
      start_offset,end_offset = offset_mapping[index]
      if start == -1 and start_offset <= first_char < end_offset :
        start = index
      if start_offset <= last_char < end_offset :
        end = index
    return start,end

  def _build(self):
    """Tokenize every row and cache the ones whose answer span could be located."""
    question = self.data['question']
    context = self.data['context']
    answers = self.data['answers']

    for i in range(len(self.data)):
      q = question[i]
      c = context[i]
      answer = answers[i]
      # Only the first annotated answer is used.
      answer_text = answer['text'][0]
      answer_start = answer['answer_start'][0]
      answer_end = answer_start + len(answer_text)

      # Use the tokenizer handed to the constructor, not a notebook-level global.
      tokenized = self.tokenizer(q,c,return_attention_mask=True,return_offsets_mapping=True,return_token_type_ids=True,padding = 'max_length',max_length = self.config.max_length,truncation = True)

      offset_mapping = tokenized['offset_mapping']
      start,end = self._find_span(tokenized.sequence_ids(),offset_mapping,answer_start,answer_end-1)
      if start == -1 or end == -1 :
        # The answer is not covered by any context token: drop the row.
        continue

      sequence = {
          'input_ids':tokenized['input_ids'],
          'attention_mask':tokenized['attention_mask'],
          'offset_mapping' : offset_mapping,
          'c':c,
          'answer_text':answer_text
          }
      if self.config.is_training :
        sequence['start'] = start
        sequence['end'] = end

      self.sequence.append(sequence)

  def __len__(self):
    """Number of rows that survived span matching."""
    return len(self.sequence)

  def __getitem__(self,i):
    """Return row `i` with the numeric fields converted to long tensors."""
    sample = self.sequence[i]

    output ={
        'input_ids' : torch.tensor(sample['input_ids'],dtype = torch.long),
        'attention_mask' : torch.tensor(sample['attention_mask'],dtype = torch.long),
        'offset_mapping' : torch.tensor(sample['offset_mapping'],dtype = torch.long),
        'c' : sample['c'],
        'answer_text' : sample['answer_text']
    }
    if self.config.is_training :
      output['start'] = torch.tensor(sample['start'],dtype = torch.long)
      output['end'] = torch.tensor(sample['end'],dtype = torch.long)
    return output

In [None]:
# Training examples; QAdataset tokenizes everything eagerly in its constructor.
qadata=QAdataset(config,data['train'],tokenizer)

In [None]:
# Validation examples, built with the same config (so start/end labels are included).
val_qadata=QAdataset(config,data['validation'],tokenizer)

In [None]:
# Training batches: reshuffled every epoch so the model does not see the
# examples in the fixed dataset order; the incomplete final batch is dropped.
load=torch.utils.data.DataLoader(qadata,batch_size = config.batch_size,shuffle = True,drop_last = True)

In [None]:
# Validation batches: keep the final partial batch so every validation
# example contributes to the reported loss.
val_load=torch.utils.data.DataLoader(val_qadata,batch_size = config.batch_size,drop_last = False)

In [None]:
class MultiHeadAttention(torch.nn.Module):
  """Scaled dot-product multi-head attention with key-padding and causal masking.

  Args:
    config: object exposing `hidden_dim`, `num_heads` and `head_dim`
      (with hidden_dim == num_heads * head_dim).
  """
  def __init__(self,config):
    super().__init__()
    self.config = config
    self.linearQ = torch.nn.Linear(config.hidden_dim,config.hidden_dim)
    self.linearK = torch.nn.Linear(config.hidden_dim,config.hidden_dim)
    self.linearV = torch.nn.Linear(config.hidden_dim,config.hidden_dim)
    self.linearO = torch.nn.Linear(config.hidden_dim,config.hidden_dim)

  def forward(self,query,key,value,attention_mask):
    """Attend from `query` positions to `key`/`value` positions.

    Args:
      query: tensor of shape (B, S, hidden_dim).
      key, value: tensors of shape (B, T, hidden_dim).
      attention_mask: optional (B, T) tensor; zero entries mark keys that
        must not be attended to. Pass None to skip key masking.

    Returns:
      Tensor of shape (B, S, hidden_dim).
    """
    B,S,H = query.shape
    T = key.size(1)
    N = self.config.num_heads
    HD = self.config.head_dim

    # Split the hidden dimension into heads: (B, N, length, HD).
    Q = self.linearQ(query).view(B,S,N,HD).transpose(2,1)
    K = self.linearK(key).view(B,T,N,HD).transpose(2,1)
    V = self.linearV(value).view(B,T,N,HD).transpose(2,1)

    qk = torch.matmul(Q,K.transpose(3,2)) / math.sqrt(HD)

    if attention_mask is not None :
      # key masking: additive -inf on the masked key columns. The mask is built
      # on the scores' own device/dtype so the module follows its inputs.
      mask = torch.zeros(attention_mask.shape,dtype = qk.dtype,device = qk.device)
      mask.masked_fill_(attention_mask.logical_not().to(qk.device),float('-inf'))
      qk = qk + mask.unsqueeze(1).unsqueeze(2)

    # Causal mask: -inf strictly above the diagonal, 0 elsewhere.
    causal = torch.full((S,T),float('-inf'),dtype = qk.dtype,device = qk.device).triu(diagonal = 1)
    qk = qk + causal.unsqueeze(0).unsqueeze(0)

    alpha = torch.softmax(qk, dim = -1)

    # The attended values have one row per *query* position, hence S (not T).
    scores = torch.matmul(alpha,V).transpose(2,1).contiguous().view(B,S,H)

    out = self.linearO(scores)

    return out

In [None]:
class Layer(torch.nn.Module):
  """One pre-norm transformer layer: self-attention then feed-forward, each with a residual."""
  def __init__(self,config):
    super().__init__()
    self.config = config
    hidden,inner = config.hidden_dim,config.ffn_dim

    # attention sub-layer
    self.mha_norm = torch.nn.LayerNorm(hidden)
    self.mha = MultiHeadAttention(config)
    self.mha_dropout = torch.nn.Dropout(config.dropout_rate)

    # feed-forward sub-layer; its own LayerNorm is the first stage of the stack
    ffn_stages = [
        torch.nn.LayerNorm(hidden),
        torch.nn.Linear(hidden,inner),
        torch.nn.ReLU(),
        torch.nn.Linear(inner,hidden),
        torch.nn.Dropout(config.dropout_rate),
    ]
    self.ffn = torch.nn.Sequential(*ffn_stages)

  def forward(self,src,attention_mask):
    """Apply both sub-layers to `src` and return a tensor of the same shape."""
    normed = self.mha_norm(src)
    attended = self.mha(normed,normed,normed,attention_mask)
    after_attention = src + self.mha_dropout(attended)
    return after_attention + self.ffn(after_attention)

In [None]:
# Scratch padding mask: 16 rows of 256 ones followed by 256 zeros.
attention_mask_test = torch.cat((torch.ones(16,256),torch.zeros(16,256)),dim = 1)

In [None]:
class Block(torch.nn.Module):
  """Stack of `config.num_layers` Layer modules applied one after another."""
  def __init__(self,config):
    super().__init__()
    self.config = config
    stack = []
    for _ in range(config.num_layers):
      stack.append(Layer(config))
    self.layers = torch.nn.ModuleList(stack)

  def forward(self,src,attention_mask):
    """Feed `src` through every layer in order, passing the same mask to each."""
    hidden = src
    for depth in range(len(self.layers)):
      hidden = self.layers[depth](hidden,attention_mask)
    return hidden

In [None]:
class Transformer(torch.nn.Module):
  """From-scratch QA model: token + position embeddings, a Block, and two span heads.

  NOTE: a later cell defines another class that is also named `Transformer`;
  once that cell runs, it replaces this definition in the notebook namespace.

  Args:
    config: object exposing `voca_size`, `max_length`, `hidden_dim` and the
      fields required by Block.
  """
  def __init__(self,config):
    super().__init__()
    self.config = config
    self.token_embedding = torch.nn.Embedding(config.voca_size,config.hidden_dim)
    self.pos_embedding = torch.nn.Embedding(config.max_length,config.hidden_dim)
    self.block = Block(config)
    self.out_layer_norm = torch.nn.LayerNorm(config.hidden_dim)
    self.start_out = torch.nn.Linear(config.hidden_dim,1)
    self.end_out = torch.nn.Linear(config.hidden_dim,1)

  def forward(self,src,attention_mask):
    """Return per-token start and end logits.

    Args:
      src: (B, S) tensor of token ids, with S <= config.max_length.
      attention_mask: (B, S) mask forwarded to every attention layer.

    Returns:
      Tuple (start, end), each of shape (B, S).
    """
    S = src.size(1)
    tk_emb = self.token_embedding(src)
    # Positions live on the same device as the input ids.
    seq = torch.arange(S,dtype = torch.long,device = src.device)
    pos_emb = self.pos_embedding(seq)
    emb = tk_emb + pos_emb.unsqueeze(0)

    out = self.block(emb,attention_mask)
    out = self.out_layer_norm(out)

    # Each head emits (B, S, 1); drop the trailing axis so the logits are
    # (B, S), which is what a cross-entropy over sequence positions expects.
    start = self.start_out(out).squeeze(-1)
    end = self.end_out(out).squeeze(-1)
    return start,end


In [None]:
from transformers import AutoModel,AutoConfig

In [None]:
# Configuration object of the pretrained klue/roberta-base checkpoint.
bert_config=AutoConfig.from_pretrained('klue/roberta-base')

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

In [None]:
class Transformer(torch.nn.Module):
  """QA model: pretrained klue/roberta-base encoder plus start/end span heads.

  Args:
    config: notebook config object; stored on the module but not otherwise
      read by this class.
  """
  def __init__(self,config):
    super().__init__()
    self.config = config
    self.pre_trained = AutoModel.from_pretrained('klue/roberta-base')
    # Size the heads from the backbone that was actually loaded instead of a
    # separate notebook-level config object that could be missing or stale.
    hidden_size = self.pre_trained.config.hidden_size
    self.out_layer_norm = torch.nn.LayerNorm(hidden_size)
    self.start_out = torch.nn.Linear(hidden_size,1)
    self.end_out = torch.nn.Linear(hidden_size,1)

  def forward(self,inputs):
    """Return per-token start and end logits.

    Args:
      inputs: dict of keyword arguments forwarded to the backbone
        (the training loop passes 'input_ids' and 'attention_mask').

    Returns:
      Tuple (start, end); each head's trailing size-1 axis is squeezed away.
    """
    out = self.pre_trained(**inputs).last_hidden_state
    out = self.out_layer_norm(out)

    start = self.start_out(out).squeeze(-1)
    end = self.end_out(out).squeeze(-1)
    return start,end

In [None]:
# Release a model left over from a previous run. On a fresh kernel `tr` does
# not exist yet, so guard the delete to keep Restart & Run All working.
if 'tr' in globals():
  del tr

In [None]:
# Collect unreachable Python objects first so the tensors they hold are freed,
# then release the now-unused cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Build the QA model; the output below shows the klue/roberta-base weights being loaded.
tr = Transformer(config)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Move the model to the configured device; as the cell's last expression it also prints the module tree.
tr.to(config.device)

Transformer(
  (pre_trained): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [None]:
# Upper bound on the number of training epochs; the training loop may break earlier via early stopping.
epochs = 5

In [None]:
# Separate cross-entropy criteria for the start-position and end-position logits.
start_loss = torch.nn.CrossEntropyLoss()
end_loss = torch.nn.CrossEntropyLoss()

Discriminative Learning Rates / Layer-wise LR Decay도 적용해볼 수 있으나, 여기에서는 2e-5로 일률적으로 적용

In [None]:
# Single Adam optimizer over every model parameter with one shared learning rate (2e-5).
opt = torch.optim.Adam(tr.parameters(),lr = 2e-5)

In [None]:
# Mean training loss per epoch, appended by the training loop.
losses = []

In [None]:
# Mean validation loss per epoch, appended by the training loop.
val_losses = []

In [None]:
from tqdm.auto import tqdm

In [None]:
class EarlyStop :
  """Early-stopping tracker that checkpoints the model whenever the metric improves.

  Args:
    method: 'min' if a lower metric is better (e.g. a loss); any other value
      treats a higher metric as better.
    delta: minimum margin by which the metric must beat the best value to
      count as an improvement.
    patience: number of consecutive non-improving calls after which `stop`
      becomes True.
    path: file the best model's state_dict is written to.

  Attributes:
    best_score: best (sign-normalised) metric seen so far, None before the first call.
    no_impr: current count of consecutive non-improving calls.
    stop: True once `no_impr` has reached `patience`.
  """
  def __init__(self,method='max',delta = 0,patience = 3,path = './qabest.pt'):
    self.best_score = None
    self.path = path
    self.patience = patience
    self.method = method
    self.no_impr = 0
    self.stop = False
    self.delta = delta


  def __call__(self,model,val_metric):
    """Record one validation result; save `model` if it is the best so far."""
    # Normalise so that "larger is better" holds for both modes.
    score = -val_metric if self.method == 'min' else val_metric

    improved = self.best_score is None or score > self.best_score + self.delta
    if improved :
      self.best_score = score
      self.no_impr = 0
      self.checkpoint(model)
      return

    self.no_impr +=1
    if self.no_impr >= self.patience :
      self.stop = True

  def checkpoint(self,model):
    """Write the model's state_dict to `self.path`."""
    torch.save(model.state_dict(),self.path)

In [None]:
# Early stopping on validation loss (lower is better, no margin).
# NOTE: the name is misspelled ("ealrystop") but the training loop refers to it by this exact name.
ealrystop = EarlyStop('min',0)

In [None]:
# Fine-tuning loop: one training pass and one validation pass per epoch,
# followed by the early-stopping check on the mean validation loss.
for epoch in range(epochs):
  # ---- training pass ----
  tr.train()
  epoch_loss = 0
  for b in tqdm(load,desc=f"Training Epoch {epoch+1}") :
    inputs = {key:b[key].to(config.device) for key in ('input_ids','attention_mask')}
    start_true = b['start'].to(config.device)
    end_true = b['end'].to(config.device)

    opt.zero_grad()
    start_pred,end_pred = tr(inputs)

    # Total loss is the sum of the start-position and end-position losses.
    loss = start_loss(start_pred,start_true) + end_loss(end_pred,end_true)
    loss.backward()
    opt.step()
    epoch_loss += loss.item()

  epoch_loss = epoch_loss / len(load)
  losses.append(epoch_loss)
  print(f"Epoch {epoch+1} Train Loss: {epoch_loss:.4f}")

  # ---- validation pass (no gradients, no parameter updates) ----
  tr.eval()
  epoch_loss = 0
  with torch.no_grad() :
    for b in tqdm(val_load, desc=f"Validation Epoch {epoch+1}") :
      inputs = {key:b[key].to(config.device) for key in ('input_ids','attention_mask')}
      start_true = b['start'].to(config.device)
      end_true = b['end'].to(config.device)

      start_pred,end_pred = tr(inputs)
      loss = start_loss(start_pred,start_true) + end_loss(end_pred,end_true)
      epoch_loss += loss.item()

  epoch_loss = epoch_loss / len(val_load)
  val_losses.append(epoch_loss)
  print(f"Epoch {epoch+1} Val Loss: {epoch_loss:.4f}")

  # Checkpoint on improvement; leave the loop once patience is exhausted.
  ealrystop(tr,epoch_loss)
  if ealrystop.stop :
    break

Training Epoch 1:   0%|          | 0/6820 [00:00<?, ?it/s]

Epoch 1 Train Loss: 1.0226


Validation Epoch 1:   0%|          | 0/651 [00:00<?, ?it/s]

Epoch 1 Val Loss: 0.7962


Training Epoch 2:   0%|          | 0/6820 [00:00<?, ?it/s]

Epoch 2 Train Loss: 0.5695


Validation Epoch 2:   0%|          | 0/651 [00:00<?, ?it/s]

Epoch 2 Val Loss: 0.8915


Training Epoch 3:   0%|          | 0/6820 [00:00<?, ?it/s]

Epoch 3 Train Loss: 0.4049


Validation Epoch 3:   0%|          | 0/651 [00:00<?, ?it/s]

Epoch 3 Val Loss: 0.9702


Training Epoch 4:   0%|          | 0/6820 [00:00<?, ?it/s]

Epoch 4 Train Loss: 0.3171


Validation Epoch 4:   0%|          | 0/651 [00:00<?, ?it/s]

Epoch 4 Val Loss: 1.0270


In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the working directory; the output below shows the saved qabest.pt checkpoint.
!ls -l

total 432200
drwx------ 7 root root      4096 May 23 08:56 drive
-rw-r--r-- 1 root root 442564039 May 23 06:44 qabest.pt
drwxr-xr-x 1 root root      4096 May 14 13:38 sample_data


In [None]:
# Copy the saved checkpoint into the mounted Google Drive folder.
!cp ./qabest.pt /content/drive/MyDrive/

One epoch takes about 41 minutes.