In [1]:
!pip install datasets transformers



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

In [3]:
# ==========================================
# 1. Transformer 핵심 모듈 직접 구현
# ==========================================

In [4]:
# [1-1] Positional Encoding (사인/코사인 방식)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        return x + self.pe[:, :x.size(1)]

In [5]:
# [1-2] Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):
    def forward(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        # 수식: softmax(QK^T / sqrt(d_k))V
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(scores, dim=-1)
        return torch.matmul(attn_probs, V), attn_probs

In [6]:
# [1-3] Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)
        self.attention = ScaledDotProductAttention()

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        # 선형 변환 후 헤드 분할: [batch, num_heads, seq, d_k]
        q = self.w_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.w_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.w_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        context, _ = self.attention(q, k, v, mask)
        # 헤드 병합: [batch, seq, d_model]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        return self.fc(context)

In [7]:
# [1-4] Position-wise Feed Forward
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [8]:
# [1-5] Encoder Layer (Residual + LayerNorm 조립)
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention + Residual + Norm
        attn_out = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        # FFN + Residual + Norm
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_out))
        return x

In [9]:
# ==========================================
# 2. AG News 데이터셋 준비
# ==========================================

In [10]:
print("데이터셋 로드 및 토크나이징 중...")
raw_ds = load_dataset("ag_news")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

데이터셋 로드 및 토크나이징 중...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def tokenize_fn(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

In [12]:
# 실습용 데이터 축소 (학습 2000개, 테스트 400개)
train_ds = raw_ds['train'].shuffle(seed=42).select(range(10000)).map(tokenize_fn, batched=True)
test_ds = raw_ds['test'].shuffle(seed=42).select(range(400)).map(tokenize_fn, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [13]:
train_ds.set_format(type='torch', columns=['input_ids', 'label'])
test_ds.set_format(type='torch', columns=['input_ids', 'label'])

In [14]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

In [15]:
# ==========================================
# 3. 뉴스 분류용 Transformer 모델 조립
# ==========================================

In [16]:
class NewsClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        mask = (x != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
        x = self.dropout(self.pos_encoding(self.embedding(x)))
        for layer in self.layers:
            x = layer(x, mask)
        # Global Average Pooling (문장 전체 특징 집약)
        return self.fc(x.mean(dim=1))

In [17]:
# 모델 초기화
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NewsClassifier(tokenizer.vocab_size, 128, 4, 256, 2, 4).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [18]:
# ==========================================
# 4. 학습 및 테스트 실행
# ==========================================

In [19]:
print(f"학습 시작 (Device: {device})...")
for epoch in range(10):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        ids, labels = batch['input_ids'].to(device), batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/10 | Loss: {epoch_loss/len(train_loader):.4f}")

학습 시작 (Device: cuda)...
Epoch 1/10 | Loss: 1.3184
Epoch 2/10 | Loss: 1.0021
Epoch 3/10 | Loss: 0.7693
Epoch 4/10 | Loss: 0.6483
Epoch 5/10 | Loss: 0.5799
Epoch 6/10 | Loss: 0.5223
Epoch 7/10 | Loss: 0.4820
Epoch 8/10 | Loss: 0.4543
Epoch 9/10 | Loss: 0.4221
Epoch 10/10 | Loss: 0.4019


In [20]:
# 결과 확인 전용 함수
def check_news(text):
    model.eval()
    labels_map = ["World", "Sports", "Business", "Sci/Tech"]
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=64).to(device)
    with torch.no_grad():
        out = model(inputs['input_ids'])
        pred = torch.argmax(out, dim=1).item()
    print(f"\n입력 뉴스: {text}\n분류 결과: {labels_map[pred]}")

In [21]:
print("-" * 50)
check_news("The final match of the world cup was incredibly intense.")
check_news("The stock market saw a significant drop after the federal report.")

--------------------------------------------------

입력 뉴스: The final match of the world cup was incredibly intense.
분류 결과: Sports

입력 뉴스: The stock market saw a significant drop after the federal report.
분류 결과: Business
