<a href="https://colab.research.google.com/github/PRESSANDPULL/hanghae99/blob/main/%5B3%EC%A3%BC%EC%B0%A8%5D_%EA%B8%B0%EB%B3%B8%EA%B3%BC%EC%A0%9C_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [3주차 기본과제] DistilBERT로 뉴스 기사 분석하기

In [2]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets
# NOTE(review): consider `%pip install` with pinned versions so the install targets
# the running kernel's environment and the notebook stays reproducible.

# tqdm: progress-bar library that shows the live status of loops and long-running processes
# boto3: SDK for driving AWS services from Python code
# requests: HTTP library that simplifies sending HTTP requests (GET, POST, ...)
# regex: string-processing library with advanced regular-expression support for searching, matching and substituting text
# sentencepiece: text-processing library implementing a subword-level tokenizer
# sacremoses: Python implementation of the tokenizer and detokenizer used by the Moses machine-translation system



In [4]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Fetch the pretrained DistilBERT tokenizer through torch.hub; it is used by
# collate_fn to turn raw text into input ids and attention masks.
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'distilbert-base-uncased',
)

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Load the AG News dataset by its Hugging Face Hub id. The 'train' and 'test'
# splits feed the DataLoaders below; each row is read through its 'text' and
# 'label' fields in collate_fn.
ds = load_dataset("fancyzhx/ag_news")

def collate_fn(batch):
    """Turn a list of dataset rows into padded model inputs.

    Args:
        batch: Iterable of rows, each exposing a ``'text'`` string and an
            integer ``'label'``.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` where the first two
        come from the module-level ``tokenizer`` (padded to the longest text in
        the batch) and ``labels`` is a ``torch.long`` tensor with one entry per
        row.
    """
    texts = [row['text'] for row in batch]
    labels = [row['label'] for row in batch]

    # Build input_ids and attention_mask with the tokenizer.
    # truncation=True caps every sequence at the tokenizer's maximum model
    # length; without it, an over-long article would index past the encoder's
    # 512 position embeddings and crash the forward pass.
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]
    labels = torch.tensor(labels, dtype=torch.long)

    return input_ids, attention_mask, labels


# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    ds['train'],
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)

# Evaluation batches keep the dataset order.
test_loader = DataLoader(
    ds['test'],
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

In [12]:
# Load the bare DistilBERT encoder and display it to inspect its layer layout.
# `model` is rebound to the full TextClassifier in a later cell.
model = torch.hub.load(
    'huggingface/pytorch-transformers',
    'model',
    'distilbert-base-uncased',
)
model

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [13]:
from torch import nn


class TextClassifier(nn.Module):
    """Pretrained DistilBERT encoder followed by a linear classification head.

    Args:
        num_labels: Number of output classes. Defaults to 4, the value that
            was previously hard-coded.
        model_name: Checkpoint name handed to ``torch.hub.load``. Defaults to
            ``'distilbert-base-uncased'``, the checkpoint used originally.
    """

    def __init__(self, num_labels=4, model_name='distilbert-base-uncased'):
        super().__init__()
        self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', model_name)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so the layer stays correct if a different checkpoint is used.
        hidden_size = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return one row of class logits per input sequence.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the encoder's hidden
            state at sequence position 0.
        """
        hidden_states = self.encoder(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        # Only the first token position is classified (the [CLS] token for
        # BERT-style tokenizers).
        return self.classifier(hidden_states[:, 0])


model = TextClassifier()

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


In [None]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


# NOTE(review): every encoder weight is trainable here, and 1e-3 is high for
# fine-tuning a pretrained transformer (values around 2e-5 to 5e-5 are typical).
# Consider lowering it or freezing `model.encoder` — confirm the intended setup.
lr = 0.001

# Use the GPU when one is attached, otherwise fall back to CPU so the cell
# does not crash on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

for epoch in range(n_epochs):
    total_loss = 0.
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        preds = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(preds, labels)  # multi-class cross-entropy on the raw logits
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the sum of per-batch losses over the epoch, not the mean.
    print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

시간이 부족하여 결과를 돌려보지 못했습니다... 4주차부터는 유료 결제를 해야겠습니다...

In [None]:
def accuracy(model, dataloader):
    """Compute the fraction of examples the model classifies correctly.

    The model is switched to eval mode and run without gradient tracking for
    the duration of the call; its previous train/eval mode is restored
    afterwards.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` that
            returns per-class scores along the last dimension.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor triples.

    Returns:
        ``correct / total`` as a float, or ``0.0`` if the dataloader yields no
        examples.
    """
    # Follow the model's own device rather than assuming a GPU is present.
    # A model without parameters leaves the batch tensors where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()
    total = 0
    correct = 0
    try:
        with torch.no_grad():
            for input_ids, attention_mask, labels in dataloader:
                if device is not None:
                    input_ids = input_ids.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)

                logits = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(logits, dim=-1)  # predicted class = highest score

                total += labels.shape[0]
                correct += (labels == preds).sum().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    return correct / total if total else 0.0


# Score the model on both splits in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)

print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")