# BERT

In [None]:
pip install Korpora



In [None]:
# 7.15 네이버 영화 리뷰 데이터 불러오기

import numpy as np
import pandas as pd
from Korpora import Korpora

# Load the NSMC corpus via Korpora and draw a fixed 20,000-row sample from its
# test split (seeded so the sample is reproducible).
corpus = Korpora.load("nsmc")
df = pd.DataFrame(corpus.test).sample(20000, random_state=42)

# Shuffle once, then cut the shuffled frame 6:2:2 into train/valid/test.
# Positional slices give the same partitions np.split(frame, [a, b]) produced,
# without routing a DataFrame through NumPy's split, which relies on the
# deprecated DataFrame.swapaxes and emits a FutureWarning.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train_df = shuffled_df.iloc[:train_end]
valid_df = shuffled_df.iloc[train_end:valid_end]
test_df = shuffled_df.iloc[valid_end:]

# Report the size of each split.
print(f"Training Data Size: {len(train_df)}")
print(f"Validation Data Size: {len(valid_df)}")
print(f"Testing Data Size: {len(test_df)}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ra

  return bound(*args, **kwds)


In [None]:
# 7.16 BERT 입력 텐서 생성

import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

def make_dataset(data, tokenizer, device):
  """Tokenize the texts in ``data`` and pack them into a ``TensorDataset``.

  Args:
    data: Frame-like object with a ``text`` column (read via ``.tolist()``)
      and a ``label`` column (read via ``.values``).
    tokenizer: Callable invoked with ``text``, ``padding``, ``truncation`` and
      ``return_tensors``; its result must provide ``"input_ids"`` and
      ``"attention_mask"`` tensors.
    device: Device the resulting tensors are moved to.

  Returns:
    TensorDataset of ``(input_ids, attention_mask, labels)``.
  """
  encoding = tokenizer(
      text=data.text.tolist(),
      padding="longest",  # pad every text to the longest one in ``data``
      truncation=True,
      return_tensors="pt",
  )
  tensors = [encoding[key].to(device) for key in ("input_ids", "attention_mask")]
  tensors.append(torch.tensor(data.label.values, dtype=torch.long).to(device))
  return TensorDataset(*tensors)

def get_dataloader(dataset, sampler, batch_size):
  """Build a ``DataLoader`` whose iteration order is decided by ``sampler``.

  Args:
    dataset: Dataset to draw batches from.
    sampler: Sampler class (not an instance); it is instantiated with ``dataset``.
    batch_size: Number of samples per batch.

  Returns:
    DataLoader over ``dataset``.
  """
  return DataLoader(dataset, batch_size=batch_size, sampler=sampler(dataset))

# Run configuration for BERT fine-tuning.
epochs = 3
batch_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    do_lower_case=False
)

# The NSMC splits are bound to train_df / valid_df / test_df by the data-loading
# cell. The bare names train / valid / test are not defined at this point on a
# fresh kernel, so using them here raised NameError (or silently picked up
# stale objects left over from another section).
train_dataset = make_dataset(train_df, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)
# RandomSampler: draws samples in random order -> used for training

valid_dataset = make_dataset(valid_df, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)
# SequentialSampler: returns samples in a fixed order -> used for validation & test batches

test_dataset = make_dataset(test_df, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

(tensor([   101,  58466,   9812, 118956, 119122,  59095,  10892,   9434, 118888,
           117,   9992,  40032,  30005,    117,   9612,  37824,   9410,  12030,
         42337,  10739,  83491,  12508,    106,    106,    102,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,

In [None]:
# 7.17 BERT 모델 선언

from torch import optim
from transformers import BertForSequenceClassification

# Load the multilingual BERT checkpoint with a 2-label classification head and
# move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=2
)
model = model.to(device)
# AdamW: a variant of the Adam optimizer with weight decay added.
optimizer = optim.AdamW(model.parameters(), eps=1e-8, lr=1e-5)

# Print the module hierarchy, four levels deep, as an indented tree.
for level1_name, level1_module in model.named_children():
  print(level1_name)
  for level2_name, level2_module in level1_module.named_children():
    print("L", level2_name)
    for level3_name, level3_module in level2_module.named_children():
      print("| L", level3_name)
      for level4_name, _ in level3_module.named_children():
        print("| | L", level4_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert
L embeddings
| L word_embeddings
| L position_embeddings
| L token_type_embeddings
| L LayerNorm
| L dropout
L encoder
| L layer
| | L 0
| | L 1
| | L 2
| | L 3
| | L 4
| | L 5
| | L 6
| | L 7
| | L 8
| | L 9
| | L 10
| | L 11
L pooler
| L dense
| L activation
dropout
classifier


In [None]:
# 7.13 모델 학습 및 검증
import numpy as np
from torch import nn

def calc_accuracy(preds, labels):
  """Return the fraction of samples whose arg-max class equals the label.

  Args:
    preds: NumPy array of per-class scores; the arg-max is taken over axis 1.
    labels: NumPy array of class ids; flattened before the comparison.

  Returns:
    Number of matching positions divided by the number of labels.
  """
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()
  matches = predicted == expected
  return np.sum(matches) / len(expected)

def train(model, optimizer, dataloader):
  """Run one training epoch and return the mean per-batch loss.

  Args:
    model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
      keyword arguments; its output must expose ``.loss``.
    optimizer: Optimizer that updates the model's parameters.
    dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    Sum of the batch losses divided by ``len(dataloader)``.
  """
  model.train()
  batch_losses = []

  for batch in dataloader:
    input_ids, attention_mask, labels = batch
    loss = model(
        input_ids=input_ids, attention_mask=attention_mask, labels=labels
    ).loss
    batch_losses.append(loss.item())

    # Clear stale gradients, back-propagate this batch, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  return sum(batch_losses) / len(dataloader)

def evaluation(model, dataloader):
  """Compute the mean loss and mean accuracy of ``model`` over ``dataloader``.

  The model is switched to eval mode and gradients are disabled for the pass.

  Args:
    model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
      keyword arguments; its output must expose ``.logits``.
    dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    ``(val_loss, val_accuracy)``: cross-entropy loss and accuracy, each summed
    per batch and divided by ``len(dataloader)``. ``val_loss`` is a Python float.
  """
  model.eval()
  criterion = nn.CrossEntropyLoss()
  val_loss, val_accuracy = 0.0, 0.0

  with torch.no_grad():
    for input_ids, attention_mask, labels in dataloader:
      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=labels
      )
      logits = outputs.logits
      loss = criterion(logits, labels)

      # calc_accuracy works on NumPy arrays, so copy both tensors to the host.
      accuracy = calc_accuracy(
          logits.detach().cpu().numpy(), labels.to("cpu").numpy()
      )

      # .item() converts the 0-dim loss tensor to a float. Adding the tensor
      # itself made the running total (and the returned loss) a tensor on the
      # model's device, unlike train(), which returns a plain float.
      val_loss += loss.item()
      val_accuracy += accuracy

  val_loss = val_loss / len(dataloader)
  val_accuracy = val_accuracy / len(dataloader)
  return val_loss, val_accuracy

import os

# File that receives the weights of the epoch with the lowest validation loss.
checkpoint_path = "../models/BERTForSequenceClassification.pt"

# Start from +inf so the first epoch always counts as an improvement.
best_loss = float("inf")
for epoch in range(epochs):
  train_loss = train(model, optimizer, train_dataloader)
  val_loss, val_accuracy = evaluation(model, valid_dataloader)
  print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val accuracy {val_accuracy:.4f}")

  # Checkpointing used to be disabled by wrapping it in a string literal, which
  # left best_loss unused and no fine-tuned weights on disk.
  if val_loss < best_loss:
    best_loss = val_loss
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(model.state_dict(), checkpoint_path)
    print("Saved the model weights")

Epoch 1: Train Loss: 0.4014 Val Loss: 0.4196 Val accuracy 0.8127
Epoch 2: Train Loss: 0.3209 Val Loss: 0.4251 Val accuracy 0.8135
Epoch 3: Train Loss: 0.2539 Val Loss: 0.4674 Val accuracy 0.8123


In [None]:
# 7.14 모델 평가
import os

checkpoint_path = "../models/BERTForSequenceClassification.pt"
if os.path.exists(checkpoint_path):
  # Restore saved fine-tuned weights into a freshly constructed model.
  model = BertForSequenceClassification.from_pretrained(
      pretrained_model_name_or_path="bert-base-multilingual-cased",
      num_labels=2
  ).to(device)
  model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
  # Keep the model fine-tuned in memory by the training cell. Re-creating it
  # from the pretrained checkpoint without loading fine-tuned weights leaves
  # the classifier head randomly initialised; the recorded run of that version
  # scored 0.5078 test accuracy, i.e. chance level for two classes.
  print(f"Checkpoint {checkpoint_path} not found; evaluating the in-memory model.")

test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 0.6987
Test Accuracy: 0.5078


# BART

In [None]:
pip install datasets



In [None]:
pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [None]:
# 7.18 뉴스 요약 데이터세트 불러오기
import numpy as np
from datasets import load_dataset

# News-summary dataset: (input) news article body + (output) short summary text.
# 5,000 rows are sampled and split 6:2:2.

news = load_dataset("argilla/news-summary",split="test")
df = news.to_pandas().sample(5000, random_state=42)[["text","prediction"]]
# Each "prediction" entry is a sequence whose first element holds the summary under "text".
df["prediction"] = df["prediction"].map(lambda x: x[0]["text"])

# Shuffle once, then slice by position. This yields the same partitions as
# np.split(frame, [a, b]) but avoids NumPy's split on a DataFrame, which relies
# on the deprecated DataFrame.swapaxes and emits a FutureWarning.
# NOTE(review): this rebinds train_df, which the BERT section bound to the NSMC
# training split.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train_df = shuffled_df.iloc[:train_end]
valid = shuffled_df.iloc[train_end:valid_end]
test = shuffled_df.iloc[valid_end:]

print(f"Source News: {train_df.text.iloc[0][:200]}")
print(f"Summarization: {train_df.prediction.iloc[0][:50]}")
print(f"Training Data size: {len(train_df)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size: {len(test)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

(…)-00000-of-00001-ebc48879f34571f6.parquet:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

(…)-00000-of-00001-6227bd8eb10a9b50.parquet:   0%|          | 0.00/31.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20417 [00:00<?, ? examples/s]

Source News: DANANG, Vietnam (Reuters) - Russian President Vladimir Putin said on Saturday he had a normal dialogue with U.S. leader Donald Trump at a summit in Vietnam, and described Trump as civil, well-educated
Summarization: Putin says had useful interaction with Trump at Vi
Training Data size: 3000
Validation Data Size : 1000
Testing Data Size: 1000


  return bound(*args, **kwds)


In [None]:
# 7.19 BART 입력 텐서 생성

import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

def make_dataset(data, tokenizer, device, max_length=1024):
  """Tokenize articles and summaries and pack them into a ``TensorDataset``.

  Args:
    data: Frame-like object with a ``text`` column (articles) and a
      ``prediction`` column (reference summaries).
    tokenizer: Tokenizer that is callable on a list of texts and also offers
      ``encode(text, return_tensors="pt")``.
    device: Device the resulting tensors are moved to.
    max_length: Upper bound on the tokenized article length. Without an
      explicit bound ``truncation=True`` had no effect (the tokenizer warned
      "no maximum length is provided ... Default to no truncation").
      NOTE(review): 1024 is assumed to be the position limit of
      facebook/bart-base; confirm against ``model.config.max_position_embeddings``.

  Returns:
    TensorDataset of ``(input_ids, attention_mask, labels)``.
  """
  tokenized = tokenizer(
      text=data.text.tolist(),
      padding="longest",
      truncation=True,
      max_length=max_length,
      return_tensors="pt"
  )
  input_ids = tokenized["input_ids"].to(device)
  attention_mask = tokenized["attention_mask"].to(device)

  # Summaries differ in length, so encode them one by one and pad afterwards.
  # squeeze(0) drops only the batch axis, so a one-token summary stays 1-D.
  labels = [
      tokenizer.encode(target, return_tensors="pt").squeeze(0)
      for target in data.prediction
  ]
  # -100 is used as the padding value so that losses such as cross-entropy
  # ignore the padded label positions.
  labels = pad_sequence(labels, batch_first=True, padding_value=-100).to(device)
  return TensorDataset(input_ids, attention_mask, labels)

def get_dataloader(dataset, sampler, batch_size):
  """Wrap ``dataset`` in a ``DataLoader`` driven by a freshly built sampler.

  Args:
    dataset: Dataset to draw batches from.
    sampler: Sampler class (not an instance); it is instantiated with ``dataset``.
    batch_size: Number of samples per batch.

  Returns:
    DataLoader over ``dataset``.
  """
  return DataLoader(dataset, batch_size=batch_size, sampler=sampler(dataset))

# Run configuration for BART fine-tuning.
epochs, batch_size = 3, 8
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Tokenize the three splits.
train_dataset = make_dataset(train_df, tokenizer, device)
valid_dataset = make_dataset(valid, tokenizer, device)
test_dataset = make_dataset(test, tokenizer, device)

# RandomSampler draws samples in random order -> used for training.
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)
# SequentialSampler returns samples in a fixed order -> used for validation & test.
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(tensor([   0,  495, 1889,  ...,    1,    1,    1]), tensor([1, 1, 1,  ..., 0, 0, 0]), tensor([    0, 35891,   161,    56,  5616, 10405,    19,   140,    23,  5490,
         3564,     2,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]))


In [None]:
# 7.20 BART 모델 선언
from torch import optim
from transformers import BartForConditionalGeneration

# facebook/bart-base uses 6 encoder/decoder layers rather than 12;
# the 12-layer model can be loaded with facebook/bart-large.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), eps=1e-8, lr=5e-5)

# Print the module hierarchy, four levels deep, as an indented tree.
for level1_name, level1_module in model.named_children():
  print(level1_name)
  for level2_name, level2_module in level1_module.named_children():
    print("L", level2_name)
    for level3_name, level3_module in level2_module.named_children():
      print("| L", level3_name)
      for level4_name, _ in level3_module.named_children():
        print("| | L", level4_name)

model
L shared
L encoder
| L embed_tokens
| L embed_positions
| L layers
| | L 0
| | L 1
| | L 2
| | L 3
| | L 4
| | L 5
| L layernorm_embedding
L decoder
| L embed_tokens
| L embed_positions
| L layers
| | L 0
| | L 1
| | L 2
| | L 3
| | L 4
| | L 5
| L layernorm_embedding
lm_head


In [None]:
pip install evaluate rouge_score absl-py



In [None]:
# 7.21 BART 모델 학습 및 평가
import numpy as np
import evaluate

def calc_rouge(preds, labels):
  """Return the ROUGE-2 score between arg-max predictions and references.

  Reads the module-level ``tokenizer`` and ``rouge_score`` objects.

  Args:
    preds: NumPy array of scores; the arg-max over the last axis gives token ids.
    labels: NumPy array of reference token ids in which -100 marks padding.

  Returns:
    The ``"rouge2"`` entry of the metric result.
  """
  predicted_ids = preds.argmax(axis=-1)
  # -100 is not a real token id; swap it for the pad id before decoding.
  reference_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)

  scores = rouge_score.compute(
      predictions=tokenizer.batch_decode(predicted_ids, skip_special_tokens=True),
      references=tokenizer.batch_decode(reference_ids, skip_special_tokens=True),
  )
  return scores["rouge2"]


def train(model, optimizer, dataloader):
  """Run one training epoch and return the mean per-batch loss.

  Args:
    model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
      keyword arguments; its output must expose ``.loss``.
    optimizer: Optimizer that updates the model's parameters.
    dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    Sum of the batch losses divided by ``len(dataloader)``.
  """
  model.train()
  batch_losses = []

  for batch in dataloader:
    input_ids, attention_mask, labels = batch
    loss = model(
        input_ids=input_ids, attention_mask=attention_mask, labels=labels
    ).loss
    batch_losses.append(loss.item())

    # Clear stale gradients, back-propagate this batch, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  return sum(batch_losses) / len(dataloader)

def evaluation(model, dataloader):
  """Compute the mean loss and mean ROUGE-2 of ``model`` over ``dataloader``.

  The model is switched to eval mode and gradients are disabled for the pass.

  Args:
    model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
      keyword arguments; its output must expose ``.logits`` and ``.loss``.
    dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    ``(val_loss, val_rouge)``: the model loss and the ``calc_rouge`` score,
    each summed per batch and divided by ``len(dataloader)``. ``val_loss`` is
    a Python float.
  """
  model.eval()
  val_loss, val_rouge = 0.0, 0.0

  with torch.no_grad():
    for input_ids, attention_mask, labels in dataloader:
      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=labels
      )

      # calc_rouge works on NumPy arrays, so copy both tensors to the host.
      rouge = calc_rouge(
          outputs.logits.detach().cpu().numpy(), labels.to("cpu").numpy()
      )

      # .item() converts the 0-dim loss tensor to a float. Adding the tensor
      # itself made the running total (and the returned loss) a tensor on the
      # model's device, unlike train(), which returns a plain float.
      val_loss += outputs.loss.item()
      val_rouge += rouge

  val_loss = val_loss / len(dataloader)
  val_rouge = val_rouge / len(dataloader)
  return val_loss, val_rouge

import os

rouge_score = evaluate.load("rouge", tokenizer=tokenizer)

# The BART evaluation cell restores its weights from this exact file, but
# nothing wrote it: best_loss was initialised and never used. The best epoch
# is now saved here.
checkpoint_path = "../models/BartsForConditionalGeneration.pt"

# Start from +inf so the first epoch always counts as an improvement.
best_loss = float("inf")
epochs=1
for epoch in range(epochs):
  train_loss = train(model, optimizer, train_dataloader)
  val_loss, val_rouge = evaluation(model, valid_dataloader)
  print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Rouge {val_rouge:.4f}")

  if val_loss < best_loss:
    best_loss = val_loss
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(model.state_dict(), checkpoint_path)
    print("Saved the model weights")

In [None]:
# 7.22 BART 모델 평가
import os

checkpoint_path = "../models/BartsForConditionalGeneration.pt"
if os.path.exists(checkpoint_path):
  # Restore saved fine-tuned weights into a freshly constructed model.
  # map_location keeps the load working when the checkpoint was written on a
  # different device than the current one.
  model = BartForConditionalGeneration.from_pretrained(
      pretrained_model_name_or_path = "facebook/bart-base"
  ).to(device)
  model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
  # No checkpoint on disk: keep the model trained in memory instead of
  # discarding it and then failing with FileNotFoundError in torch.load.
  print(f"Checkpoint {checkpoint_path} not found; evaluating the in-memory model.")

test_loss, test_rouge_score = evaluation(model, test_dataloader)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test ROUGE-2 Score: {test_rouge_score:.4f}")

In [None]:
# 7.23 문장 요약문 비교

from transformers import pipeline

# Summarization pipeline built on the current model and tokenizer, run on CPU.
summarizer = pipeline(
    task = "summarization",
    model = model,
    tokenizer = tokenizer,
    max_length=54,
    device="cpu"
)

# Print the reference summary next to the generated one for the first five test articles.
for index in range(5):
  news_text = test.text.iloc[index]
  summarization = test.prediction.iloc[index]
  predicted_summarization = summarizer(news_text)[0]["summary_text"]
  print(f"정답 요약문 : {summarization}")
  # The print used to reference the misspelled, undefined name
  # `predicted_summarizaition`, which raised NameError.
  print(f"모델 요약문 : {predicted_summarization}\n")

# ELECTRA

In [None]:
# 7.16 ELECTRA 입력 텐서 생성

import torch
from transformers import ElectraTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

def make_dataset(data, tokenizer, device, max_length: int = 128):
  """Tokenize the texts in ``data`` and pack them into a ``TensorDataset``.

  Args:
    data: Frame-like object with a ``text`` column (read via ``.tolist()``)
      and a ``label`` column (read via ``.values``).
    tokenizer: Callable invoked with ``text``, ``padding``, ``truncation``,
      ``max_length`` and ``return_tensors``; its result must provide
      ``"input_ids"`` and ``"attention_mask"`` tensors.
    device: Device the resulting tensors are moved to.
    max_length: Length every sequence is padded or truncated to.

  Returns:
    TensorDataset of ``(input_ids, attention_mask, labels)``.
  """
  tokenized = tokenizer(
      text=data.text.tolist(),
      padding="max_length",
      truncation=True,
      # max_length was accepted by this function but never forwarded, so the
      # tokenizer fell back to its own default length instead of the requested one.
      max_length=max_length,
      return_tensors="pt"
  )
  input_ids = tokenized["input_ids"].to(device)
  attention_mask = tokenized["attention_mask"].to(device)
  labels = torch.tensor(data.label.values, dtype=torch.long).to(device)
  return TensorDataset(input_ids, attention_mask, labels)

def get_dataloader(dataset, sampler, batch_size):
  """Return a ``DataLoader`` for ``dataset`` using a sampler built from ``sampler``.

  Args:
    dataset: Dataset to draw batches from.
    sampler: Sampler class (not an instance); it is instantiated with ``dataset``.
    batch_size: Number of samples per batch.

  Returns:
    DataLoader over ``dataset``.
  """
  return DataLoader(dataset, batch_size=batch_size, sampler=sampler(dataset))

# Run configuration for KoELECTRA fine-tuning.
epochs = 3
batch_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = ElectraTokenizer.from_pretrained(
    pretrained_model_name_or_path="monologg/koelectra-base-v3-discriminator",
    do_lower_case=False
)

# Rebuild the NSMC 6:2:2 split with the same sample size and seeds as the NSMC
# data-loading cell. The BART section rebinds train_df to the news-summary
# frame (text / prediction columns, no label), so on a top-to-bottom run
# make_dataset(train_df, ...) failed on the missing `label` column.
nsmc_sample = pd.DataFrame(Korpora.load("nsmc").test).sample(20000, random_state=42)
nsmc_shuffled = nsmc_sample.sample(frac=1, random_state=42)
train_end = int(0.6 * len(nsmc_shuffled))
valid_end = int(0.8 * len(nsmc_shuffled))
train_df = nsmc_shuffled.iloc[:train_end]
valid_df = nsmc_shuffled.iloc[train_end:valid_end]
test_df = nsmc_shuffled.iloc[valid_end:]

train_dataset = make_dataset(train_df, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)
# RandomSampler: draws samples in random order -> used for training

valid_dataset = make_dataset(valid_df, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)
# SequentialSampler: returns samples in a fixed order -> used for validation & test batches

test_dataset = make_dataset(test_df, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(tensor([    2,  6511, 14347,  4087,  4665,  4112,  2924,  4806,    16,  3809,
         4309,  4275,    16,  3201,  4376,  2891,  4139,  4212,  4007,  6557,
         4200,     5,     5,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,

In [None]:
# 7.25 KoELECTRA 모델 선언
from torch import optim
from transformers import ElectraForSequenceClassification

# Load the KoELECTRA discriminator checkpoint with a 2-label classification
# head and move it to the selected device.
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator", num_labels=2
)
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), eps=1e-8, lr=1e-5)

# Print the module hierarchy, four levels deep, as an indented tree.
for level1_name, level1_module in model.named_children():
  print(level1_name)
  for level2_name, level2_module in level1_module.named_children():
    print("L", level2_name)
    for level3_name, level3_module in level2_module.named_children():
      print("| L", level3_name)
      for level4_name, _ in level3_module.named_children():
        print("| | L", level4_name)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


electra
L embeddings
| L word_embeddings
| L position_embeddings
| L token_type_embeddings
| L LayerNorm
| L dropout
L encoder
| L layer
| | L 0
| | L 1
| | L 2
| | L 3
| | L 4
| | L 5
| | L 6
| | L 7
| | L 8
| | L 9
| | L 10
| | L 11
classifier
L dense
L activation
L dropout
L out_proj


In [None]:
# 7.13 모델 학습 및 검증
import numpy as np
from torch import nn

def calc_accuracy(preds, labels):
  """Return the share of samples whose highest-scoring class equals the label.

  Args:
    preds: NumPy array of per-class scores; the arg-max is taken over axis 1.
    labels: NumPy array of class ids; flattened before the comparison.

  Returns:
    Number of matching positions divided by the number of labels.
  """
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()
  matches = predicted == expected
  return np.sum(matches) / len(expected)

def train(model, optimizer, dataloader):
  """Run one training epoch and return the mean per-batch loss.

  Args:
    model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
      keyword arguments; its output must expose ``.loss``.
    optimizer: Optimizer that updates the model's parameters.
    dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    Sum of the batch losses divided by ``len(dataloader)``.
  """
  model.train()
  batch_losses = []

  for batch in dataloader:
    input_ids, attention_mask, labels = batch
    loss = model(
        input_ids=input_ids, attention_mask=attention_mask, labels=labels
    ).loss
    batch_losses.append(loss.item())

    # Clear stale gradients, back-propagate this batch, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  return sum(batch_losses) / len(dataloader)

def evaluation(model, dataloader):
  """Compute the mean loss and mean accuracy of ``model`` over ``dataloader``.

  The model is switched to eval mode and gradients are disabled for the pass.

  Args:
    model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
      keyword arguments; its output must expose ``.logits``.
    dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    ``(val_loss, val_accuracy)``: cross-entropy loss and accuracy, each summed
    per batch and divided by ``len(dataloader)``. ``val_loss`` is a Python float.
  """
  model.eval()
  criterion = nn.CrossEntropyLoss()
  val_loss, val_accuracy = 0.0, 0.0

  with torch.no_grad():
    for input_ids, attention_mask, labels in dataloader:
      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=labels
      )
      logits = outputs.logits
      loss = criterion(logits, labels)

      # calc_accuracy works on NumPy arrays, so copy both tensors to the host.
      accuracy = calc_accuracy(
          logits.detach().cpu().numpy(), labels.to("cpu").numpy()
      )

      # .item() converts the 0-dim loss tensor to a float. Adding the tensor
      # itself made the running total (and the returned loss) a tensor on the
      # model's device, unlike train(), which returns a plain float.
      val_loss += loss.item()
      val_accuracy += accuracy

  val_loss = val_loss / len(dataloader)
  val_accuracy = val_accuracy / len(dataloader)
  return val_loss, val_accuracy

import os

# File that receives the weights of the epoch with the lowest validation loss.
checkpoint_path = "../models/ELECTRAForSequenceClassification.pt"

# Start from +inf so the first epoch always counts as an improvement.
best_loss = float("inf")
for epoch in range(epochs):
  train_loss = train(model, optimizer, train_dataloader)
  val_loss, val_accuracy = evaluation(model, valid_dataloader)
  print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val accuracy {val_accuracy:.4f}")

  if val_loss < best_loss:
    best_loss = val_loss
    # torch.save does not create missing parent directories, so make sure
    # ../models exists before writing.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(model.state_dict(), checkpoint_path)
    print("Saved the model weights")

In [None]:
# 7.14 모델 평가
# The checkpoint and the saved state dict belong to an ELECTRA classifier, so
# the model must be built with ElectraForSequenceClassification. The BERT class
# used here before does not match the architecture that was trained and saved.
model = ElectraForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="monologg/koelectra-base-v3-discriminator",
    num_labels=2
).to(device)

# map_location keeps the load working when the checkpoint was written on a
# different device than the current one.
model.load_state_dict(
    torch.load("../models/ELECTRAForSequenceClassification.pt", map_location=device)
)
test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# T5

In [None]:
# 7.26
import numpy as np
from datasets import load_dataset

# Sample 5,000 news articles and prepend the "summarize: " task prefix to each one.
news = load_dataset("argilla/news-summary",split="test")
df = news.to_pandas().sample(5000, random_state=42)[["text","prediction"]]
df["text"] = "summarize: "+df["text"]
# Each "prediction" entry is a sequence whose first element holds the summary under "text".
df["prediction"] = df["prediction"].map(lambda x: x[0]["text"])

# Shuffle once, then slice 6:2:2 by position. This gives the same partitions as
# np.split(frame, [a, b]) but avoids NumPy's split on a DataFrame, which relies
# on the deprecated DataFrame.swapaxes.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled_df.iloc[:train_end]
valid = shuffled_df.iloc[train_end:valid_end]
test = shuffled_df.iloc[valid_end:]

# train.text holds plain strings, so indexing one with ['text'] raised
# TypeError; show the first 200 characters of the article instead.
print(f"Source News: {train.text.iloc[0][:200]}")
print(f"Summarization: {train.prediction.iloc[0][:50]}")

In [None]:
# 7.27 뉴스 요약 데이터세트 전처리

import torch
from transformers import T5Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

def make_dataset(data, tokenizer, device, max_length=128):
  """Tokenize articles and summaries into a four-tensor ``TensorDataset``.

  The T5 ``train`` and ``evaluation`` loops unpack every batch as
  ``(source_ids, source_mask, target_ids, target_mask)`` and mask pad tokens in
  the targets themselves. The three-tensor dataset returned here before made
  that unpacking fail, so the dataset now matches its consumers.

  Args:
    data: Frame-like object with a ``text`` column (articles) and a
      ``prediction`` column (reference summaries).
    tokenizer: Callable invoked with ``text``, ``padding``, ``max_length``,
      ``truncation`` and ``return_tensors``; its result must provide
      ``"input_ids"`` and ``"attention_mask"`` tensors.
    device: Device the resulting tensors are moved to.
    max_length: Length every source and target sequence is padded or
      truncated to.

  Returns:
    TensorDataset of ``(source_ids, source_mask, target_ids, target_mask)``.
  """
  def encode(texts):
    # padding="max_length" already pads to max_length; the deprecated
    # pad_to_max_length flag that used to be passed alongside it is dropped.
    return tokenizer(
        text=texts,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

  source = encode(data.text.tolist())
  target = encode(data.prediction.tolist())

  source_ids = source["input_ids"].to(device)
  source_mask = source["attention_mask"].to(device)
  target_ids = target["input_ids"].to(device)
  target_mask = target["attention_mask"].to(device)
  return TensorDataset(source_ids, source_mask, target_ids, target_mask)

def get_dataloader(dataset, sampler, batch_size):
  """Create a ``DataLoader`` over ``dataset`` ordered by an instance of ``sampler``.

  Args:
    dataset: Dataset to draw batches from.
    sampler: Sampler class (not an instance); it is instantiated with ``dataset``.
    batch_size: Number of samples per batch.

  Returns:
    DataLoader over ``dataset``.
  """
  return DataLoader(dataset, batch_size=batch_size, sampler=sampler(dataset))

# Run configuration for T5 fine-tuning.
epochs, batch_size = 3, 8
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the three splits.
train_dataset = make_dataset(train, tokenizer, device)
valid_dataset = make_dataset(valid, tokenizer, device)
test_dataset = make_dataset(test, tokenizer, device)

# RandomSampler draws samples in random order -> used for training.
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)
# SequentialSampler returns samples in a fixed order -> used for validation & test.
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

In [None]:
# 7.28 모델 선언
from torch import optim
from transformers import T5ForConditionalGeneration

# Load the t5-small checkpoint and move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path = "t5-small",
).to(device)
# The learning rate was written as the bare expression `lre-5`, i.e. the
# undefined name `lre` minus 5, which raised NameError. It is now passed
# explicitly as lr=1e-5.
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
# 7.29 모델 학습 및 평가
import numpy as np
from torch import nn

def train(model, optimizer, dataloader):
  """Run one training epoch and return the mean per-batch loss.

  Reads the module-level ``tokenizer`` for its ``pad_token_id``.

  Args:
    model: Module called with ``input_ids``, ``attention_mask``,
      ``decoder_input_ids`` and ``labels``; its output must expose ``.loss``.
    optimizer: Optimizer that updates the model's parameters.
    dataloader: Sized iterable of
      ``(source_ids, source_mask, target_ids, target_mask)`` batches.

  Returns:
    Sum of the batch losses divided by ``len(dataloader)``.
  """
  model.train()
  train_loss = 0.0

  for source_ids, source_mask, target_ids, target_mask in dataloader:
    # Decoder input is the target without its last token and the labels are
    # the target without its first token, the same pairing evaluation() uses.
    # The assignment used to be misspelled (`decoder_inoput_ids`) and sliced
    # [:, 1:], so the name passed to the model was never defined.
    decoder_input_ids = target_ids[:, :-1].contiguous()
    labels = target_ids[:, 1:].clone().detach()
    # Replace pad positions with -100 in the labels.
    labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

    outputs = model(
        input_ids=source_ids,
        attention_mask=source_mask,
        decoder_input_ids=decoder_input_ids,  # was followed by "." (SyntaxError)
        labels=labels,
    )

    loss = outputs.loss
    train_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  train_loss = train_loss / len(dataloader)
  return train_loss

def evaluation(model, dataloader):
  """Compute the mean loss of ``model`` over ``dataloader``.

  The model is switched to eval mode and gradients are disabled for the pass.
  Reads the module-level ``tokenizer`` for its ``pad_token_id``.

  Args:
    model: Module called with ``input_ids``, ``attention_mask``,
      ``decoder_input_ids`` and ``labels``; its output must expose ``.loss``.
    dataloader: Sized iterable of
      ``(source_ids, source_mask, target_ids, target_mask)`` batches.

  Returns:
    Sum of the batch losses divided by ``len(dataloader)``, as a Python float.
  """
  model.eval()
  val_loss = 0.0

  with torch.no_grad():
    for source_ids, source_mask, target_ids, target_mask in dataloader:
      # Decoder input: target without its last token; labels: target without
      # its first token, with pad positions replaced by -100.
      decoder_input_ids = target_ids[:, :-1].contiguous()
      labels = target_ids[:, 1:].clone().detach()
      labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

      outputs = model(
          input_ids=source_ids,
          attention_mask=source_mask,
          decoder_input_ids=decoder_input_ids,
          labels=labels,
      )

      # .item() converts the 0-dim loss tensor to a float. Adding the tensor
      # itself made the running total (and the returned loss) a tensor on the
      # model's device, unlike train(), which returns a plain float.
      val_loss += outputs.loss.item()

  val_loss = val_loss / len(dataloader)
  return val_loss

import os

# File that receives the weights of the epoch with the lowest validation loss.
checkpoint_path = "../models/T5ForConditionalGeneration.pt"

# Start from +inf so the first epoch always counts as an improvement.
best_loss = float("inf")
# Three defects are fixed in this loop: `for epcoh in (epochs)` iterated over
# an int (TypeError) and never bound `epoch`; `evalutaion` was a misspelling of
# evaluation(); and the save target directory was never created.
for epoch in range(epochs):
  train_loss = train(model, optimizer, train_dataloader)
  val_loss = evaluation(model, valid_dataloader)
  print(f"Epoch {epoch +1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f}")

  if val_loss < best_loss:
    best_loss = val_loss
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(model.state_dict(), checkpoint_path)
    print("Saved the model weights")