In [1]:
import torch
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokernizer = get_tokenizer("basic_english")

train_iter = AG_NEWS(split="train")


In [2]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokernizer(text)


In [3]:
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])

vocab.set_default_index(vocab["<pad>"])


In [4]:
vocab(["<pad>", "<unk>", "here", "is", "an", "example"])


[0, 1, 476, 22, 31, 5298]

In [5]:
text_pipeline = lambda x: vocab(tokernizer(x))

label_pipeline = lambda x: int(x) - 1


In [6]:
train_iter = AG_NEWS(split="train")
length_dict = {}
for label_, text_ in train_iter:
    length = len(text_pipeline(text_))
    if length in length_dict.keys():
        length_dict[length] += 1
    else:
        length_dict[length] = 1


In [7]:
PAD_ID = text_pipeline("<pad>")[0]
MAX_LENGTH = sorted(length_dict.items(), key=lambda x: x[1], reverse=True)[0][0]
BATCH_SIZE = 128


In [8]:
def pad_sequence(sentences, max_length, pad, start="R"):
    sentences = sentences[:max_length] if start == "R" else sentences[-max_length:]
    result = (
        (sentences + [pad] * (max_length - len(sentences)))
        if start == "R"
        else ([pad] * (max_length - len(sentences)) + sentences)
    )
    return result


In [9]:
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]

    for _label, _text in batch:
        label_list.append(label_pipeline(_label))

        processed_text = pad_sequence(text_pipeline(_text), MAX_LENGTH, PAD_ID)
        # processed_text = torch.tensor(padding_text, dtype=torch.int64)
        text_list.append(processed_text)

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.tensor(text_list, dtype=torch.int64)

    return text_list.to(device), label_list.to(device)


In [10]:
train_iter = AG_NEWS(split="train")
test_iter = AG_NEWS(split="test")
trainDL = DataLoader(
    train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
testDL = DataLoader(
    test_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)


In [11]:
### ===> 분류 클래스 수와 단어사전 개수
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)

print(f"num_class : {num_class}    vocab_size : {vocab_size}")


num_class : 4    vocab_size : 95812


In [12]:
from torch import nn


class SentenceClassifier(nn.Module):
    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layer,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm",
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0
        )
        if model_type == "rnn":
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layer,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layer,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )

        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, num_class)
        else:
            self.classifier = nn.Linear(hidden_dim, num_class)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits


In [13]:
from torch import optim

n_vocab = len(vocab)
hidden_dim = 64
embedding_dim = 128
n_layer = 2

device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = SentenceClassifier(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layer=n_layer
).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(classifier.parameters())


In [14]:
from torchinfo import summary

summary(classifier)


Layer (type:depth-idx)                   Param #
SentenceClassifier                       --
├─Embedding: 1-1                         12,263,936
├─LSTM: 1-2                              198,656
├─Linear: 1-3                            516
├─Dropout: 1-4                           --
Total params: 12,463,108
Trainable params: 12,463,108
Non-trainable params: 0

In [15]:
import numpy as np
from torchmetrics.functional import accuracy


def train(model, datasets, criterion, optimizer, device, interval):
    model.train()
    losses = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval == 0:
            print(f"Train Loss {step} : {np.mean(losses)}")


def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()

    with torch.no_grad():
        for step, (input_ids, labels) in enumerate(datasets):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            logits = model(input_ids)
            loss = criterion(logits, labels)
            losses.append(loss.item())
            corrects.extend(torch.eq(logits.argmax(dim=1), labels).cpu().tolist())

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")


epochs = 5
interval = 100

for epoch in range(epochs):
    train(classifier, trainDL, criterion, optimizer, device, interval)
    test(classifier, testDL, criterion, device)


Train Loss 0 : 1.3870983123779297
Train Loss 100 : 1.303557273184899
Train Loss 200 : 1.119230790814357
Train Loss 300 : 0.991891983834612
Train Loss 400 : 0.906186282114495
Train Loss 500 : 0.8372362538012202
Train Loss 600 : 0.7764976990599799
Train Loss 700 : 0.7308536968200591
Train Loss 800 : 0.6938664631897145
Train Loss 900 : 0.6624040723706985
Val Loss : 0.38863015845417975, Val Accuracy : 0.866578947368421
Train Loss 0 : 0.4532986581325531
Train Loss 100 : 0.4357316732996761
Train Loss 200 : 0.40859868351499834
Train Loss 300 : 0.3922239392005724
Train Loss 400 : 0.3781857704952768
Train Loss 500 : 0.367749891833155
Train Loss 600 : 0.35231509127850935
Train Loss 700 : 0.3424625628642451
Train Loss 800 : 0.33458788679928964
Train Loss 900 : 0.32721772432724194
Val Loss : 0.32519606028993925, Val Accuracy : 0.8944736842105263
Train Loss 0 : 0.263317346572876
Train Loss 100 : 0.32787886896345875
Train Loss 200 : 0.30464180036267235
Train Loss 300 : 0.29152135595530765
Train Loss