In [2]:
print('Hello world!')

Hello world!


In [109]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
import random
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [59]:
# 데이터셋 클래스 정의
class AdditionDataset(Dataset):
    def __init__(self, num_samples):
        self.samples = [self.generate_sample() for _ in range(num_samples)]

    @staticmethod
    def generate_sample():
        num1 = random.randint(10, 99)
        num2 = random.randint(10, 99)
        return f"{num1}+{num2}={num1+num2}"

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [6]:
# 모델 정의
class TransformerDecoderModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, dropout=0.1):
        super(TransformerDecoderModel, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 15, embed_size))
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        src = self.dropout(self.token_emb(src) + self.positional_encoding[:, :src.size(1), :])
        memory = torch.zeros_like(src)  # No encoder output
        output = self.transformer_decoder(src, memory)
        return self.fc_out(output)

In [7]:
# 토큰화 함수
def char_to_idx(char):
    return {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
            '+': 10, '=': 11, ' ': 12}[char]

def idx_to_char(idx):
    return '0123456789+= '[idx]

# 하이퍼파라미터 설정
VOCAB_SIZE = 13  # 숫자 0-9, '+', '=', 공백
EMBED_SIZE = 256
NUM_HEADS = 8
NUM_LAYERS = 3
DROPOUT = 0.1
BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCHS = 10

In [8]:
# 모델, 손실 함수, 옵티마이저 초기화
model = TransformerDecoderModel(VOCAB_SIZE, EMBED_SIZE, NUM_HEADS, NUM_LAYERS, DROPOUT)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [9]:
# 데이터 로더 생성
train_dataset = AdditionDataset(num_samples=5000)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [11]:
model = model.to(device)

In [14]:
# 학습 과정
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for batch in tqdm(train_loader):
        batch_loss = 0
        for data in batch:
            input_tensor = torch.tensor([char_to_idx(char) for char in data[:-1]], dtype=torch.long).unsqueeze(0).to(device)
            target_tensor = torch.tensor([char_to_idx(char) for char in data[1:]], dtype=torch.long).unsqueeze(0).to(device)

            optimizer.zero_grad()
            output = model(input_tensor)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target_tensor.view(-1))
            loss.backward()
            optimizer.step()
            batch_loss += loss.item()
        total_loss += batch_loss / len(batch)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

Epoch 1, Loss: 1.565700283887077
Epoch 2, Loss: 1.5364091452211142
Epoch 3, Loss: 1.5327580948654986
Epoch 4, Loss: 1.528622689574391
Epoch 5, Loss: 1.5244074948132038
Epoch 6, Loss: 1.5053921369953622
Epoch 7, Loss: 1.4965761606404675
Epoch 8, Loss: 1.4911226471532373
Epoch 9, Loss: 1.4880041029964444
Epoch 10, Loss: 1.4848910269056317


In [81]:
labels = '0123456789+= '
data = model.token_emb(torch.tensor([char_to_idx(token) for token in labels]).to(device))
data.shape

torch.Size([13, 256])

In [118]:
def vis_pca(data, labels, title='', show=False):
    
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    
    pca = PCA(n_components=5)
    transformed_list = pca.fit_transform(data.cpu().detach().numpy())  # (label cnt, pc cnt)

    for pcx, pcy in [(0,1)]:
        for i, transformed in enumerate(transformed_list):
            plt.xlabel(f'PC{pcx+1} ({pca.explained_variance_ratio_[pcx]*100:.1f}%)')
            plt.ylabel(f'PC{pcy+1} ({pca.explained_variance_ratio_[pcy]*100:.1f}%)')
            plt.scatter(transformed[pcx], transformed[pcy], alpha=0.25)
            plt.text(transformed[pcx], transformed[pcy], f"'{labels[i]}'")
        plt.title(title)

        if show:
            plt.show()
        else:
            plt.savefig(f'emb_vis/pca_{time.strftime("%Y%m%d-%H%M%S")}.png')
            plt.close()

vis_pca(data, labels, 'Test title')

In [66]:
test_dataset = AdditionDataset(num_samples=100)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)  # 출력의 편의성을 위해 batch는 1로 고정

# 평가 함수
def evaluate_model(model, data):
    model.eval()  # 모델을 평가 모드로 설정
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            for data in batch:
                input_text = data[:-1]  # 입력 (예: '23+47=07')
                target_text = data[1:]  # 정답 (예: '3+47=070')
    
                input_tensor = torch.tensor([char_to_idx(char) for char in data[:-1]], dtype=torch.long).unsqueeze(0).to(device)
                target_tensor = torch.tensor([char_to_idx(char) for char in data[1:]], dtype=torch.long).unsqueeze(0).to(device)
    
                output = model(input_tensor)
                output_onehot = np.argmax(output.cpu().detach(), axis=2)

                print('sentence: ', data)
                print('target_char: ', [idx_to_char(i) for i in target_tensor[0]])
                print('output_char: ', [idx_to_char(i) for i in output_onehot[0]])

                total += len(target_tensor[0])
                correct += sum(1 for i, j in zip(target_tensor[0], output_onehot[0]) if i == j)

    return {'accuracy': correct/total}

evaluate_model(model, test_dataset)['accuracy']

sentence:  32+55=87
target_char:  ['2', '+', '5', '5', '=', '8', '7']
output_char:  ['4', '+', '5', '4', '=', '1', '2']
sentence:  99+49=148
target_char:  ['9', '+', '4', '9', '=', '1', '4', '8']
output_char:  ['4', '+', '5', '4', '=', '1', '2', '4']
sentence:  98+24=122
target_char:  ['8', '+', '2', '4', '=', '1', '2', '2']
output_char:  ['4', '+', '5', '4', '=', '1', '2', '4']
sentence:  45+40=85
target_char:  ['5', '+', '4', '0', '=', '8', '5']
output_char:  ['4', '+', '5', '4', '=', '1', '2']
sentence:  74+57=131
target_char:  ['4', '+', '5', '7', '=', '1', '3', '1']
output_char:  ['4', '+', '5', '4', '=', '1', '2', '4']
sentence:  34+47=81
target_char:  ['4', '+', '4', '7', '=', '8', '1']
output_char:  ['4', '+', '5', '4', '=', '1', '2']
sentence:  74+33=107
target_char:  ['4', '+', '3', '3', '=', '1', '0', '7']
output_char:  ['4', '+', '5', '4', '=', '1', '2', '4']
sentence:  85+29=114
target_char:  ['5', '+', '2', '9', '=', '1', '1', '4']
output_char:  ['4', '+', '5', '4', '=', 

0.40185676392572944

**TODO**
- Operator ('+', '=') 제외 후 accuracy 구해보기
- Right-hand-side의 accuracy 구해보기  