In [12]:
pip install torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# positional_encoding.py
from src.tokenizer import BytePairTokenizer, load_tokenizer
import torch
from torch import Tensor
import math

In [4]:
# 1. 필요한 모듈 임포트
import torch
from torch import Tensor
import math

# Tokenizer round-trip check: encode a sample sentence, decode it back,
# and show both results.
tokenizer = load_tokenizer()
text = "Sean Bean has a hard time leaving his role as Eddard Stark . He vows to get revenge against those that assisted in his execution , starting with George R. R. Martin"
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

# One print call, one line per result (same output as two separate prints).
print(
    f"Encoded: {encoded}",
    f"Decoded: {decoded}",
    sep="\n",
)


Encoded: [1777, 4313, 2964, 4313, 3279, 2804, 3914, 3066, 4889, 2871, 10120, 2896, 3070, 3399, 3182, 3474, 5091, 2765, 2963, 3001, 3580, 2796, 3181, 10557, 3698, 3496, 2854, 3874, 4855, 2837, 2871, 7153, 5263, 2772, 5468, 2893, 7311, 3175, 2764, 3175, 2764, 10580]
Decoded: Sean Bean has a hard time leaving his role as Eddard Stark . He vows to get revenge against those that assisted in his execution , starting with George R. R. Martin


10948

In [7]:
# 3. Embedding class definition
class Embedding:
    """Lookup-table embedding layer with a hand-written backward pass.

    The layer does not use autograd: ``forward`` caches the indices it was
    called with and ``backward`` scatters the incoming gradient into
    ``grad_weights``.

    Attributes:
        input_dim: Number of rows in the table.
        output_dim: Width of each embedding vector.
        weights: ``(input_dim, output_dim)`` table, drawn from ``torch.randn``
            and scaled by 0.01.
        grad_weights: Gradient buffer with the same shape as ``weights``.
    """

    def __init__(self, input_dim: int, output_dim: int) -> None:
        """Create the embedding table and a zeroed gradient buffer.

        Args:
            input_dim: Number of distinct indices that can be looked up.
            output_dim: Size of the vector returned for each index.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights: Tensor = torch.randn(input_dim, output_dim) * 0.01
        self.grad_weights: Tensor = torch.zeros_like(self.weights)

    def forward(self, input_indices: Tensor) -> Tensor:
        """Look up the embedding vector of every index.

        Args:
            input_indices: Integer tensor of any shape holding row indices
                into ``weights``.

        Returns:
            Tensor of shape ``input_indices.shape + (output_dim,)``.
        """
        # Cached so that backward() knows which rows received gradient.
        self.input_indices = input_indices
        self.output = self.weights[input_indices]
        return self.output

    def backward(self, grad_output: Tensor) -> None:
        """Accumulate the gradient of the loss w.r.t. ``weights``.

        Must be called after ``forward``; it relies on the indices cached
        there. ``grad_weights`` is reset on every call, so it holds the
        gradient of the most recent forward/backward pair only.

        Args:
            grad_output: Gradient w.r.t. the output of ``forward``; its
                trailing dimension must be ``output_dim``.

        Returns:
            None. Indices are discrete, so there is no gradient to pass on
            to the input.
        """
        # reshape() instead of view(): gradients and indices produced by
        # transpose/permute are non-contiguous and view() would raise on them.
        grad_flat = grad_output.reshape(-1, self.output_dim)
        input_flat = self.input_indices.reshape(-1)
        self.grad_weights.zero_()
        # index_add_ sums the rows of repeated indices instead of overwriting.
        self.grad_weights.index_add_(0, input_flat, grad_flat)
        return None

    def __str__(self) -> str:
        """Return the display name of the layer."""
        return "CustomEmbedding"

# Embedding smoke test: embed the encoded sentence and report the shapes.
vocab_size: int = len(tokenizer.token_map)
embed_size: int = 1536
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_size)
embedded_tokens = embedding_layer.forward(torch.tensor(encoded))
print(
    f"Token Size : {len(encoded)}",
    f"Embedded Tokens Shape: {embedded_tokens.shape}",
    sep="\n",
)

Token Size : 42
Embedded Tokens Shape: torch.Size([42, 1536])


In [9]:
# 4. Positional Encoding class definition
class PositionalEncoding:
    """Fixed sinusoidal positional encoding.

    A table of shape ``(1, max_seq_len, embed_size)`` is precomputed once.
    For position ``p`` and even column ``c`` the table holds
    ``sin(p * exp(-ln(10000) * c / embed_size))``; the following odd column
    holds the cosine of the same angle.

    Attributes:
        embed_size: Width of the vectors the encoding is added to.
        pos_encoding: The precomputed ``(1, max_seq_len, embed_size)`` table.
    """

    def __init__(self, max_seq_len: int, embed_size: int) -> None:
        """Precompute the encoding table.

        Args:
            max_seq_len: Number of positions to precompute.
            embed_size: Embedding width; may be even or odd.
        """
        self.embed_size = embed_size
        self.pos_encoding = torch.zeros(max_seq_len, embed_size)

        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        angles = position * div_term  # (max_seq_len, ceil(embed_size / 2))
        self.pos_encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer cosine column than sine
        # columns, so drop the surplus angle instead of failing on assignment.
        self.pos_encoding[:, 1::2] = torch.cos(angles[:, : embed_size // 2])
        # Leading axis of size 1 so the table broadcasts over the batch.
        self.pos_encoding = self.pos_encoding.unsqueeze(0)

    def forward(self, x: Tensor) -> Tensor:
        """Add the positional encoding to ``x`` and flatten to two dimensions.

        Args:
            x: Embeddings of shape ``(batch, seq_len, embed_size)``. A 2-D
                ``(seq_len, embed_size)`` tensor is treated as one sequence.

        Returns:
            Tensor of shape ``(batch * seq_len, embed_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed
                positions.
        """
        if x.dim() == 2:
            # Single un-batched sequence: add the batch axis explicitly.
            x = x.unsqueeze(0)

        seq_length = x.size(1)  # sequence length of the input
        max_seq_len = self.pos_encoding.size(1)
        if seq_length > max_seq_len:
            # Without this check a table with a single position would be
            # silently broadcast over the whole sequence.
            raise ValueError(
                f"sequence length {seq_length} exceeds max_seq_len {max_seq_len}"
            )

        # Keep the table on the same device as the input before adding.
        encoding = self.pos_encoding[:, :seq_length, :].to(x.device)
        x_with_pos = x + encoding
        # reshape() also copes with outputs that are not contiguous.
        return x_with_pos.reshape(-1, self.embed_size)

# Positional Encoding test
#
# Reuses `embed_size` and `embedded_tokens` from the Embedding test cell rather
# than building a second, differently initialised Embedding layer here.
max_seq_len = len(encoded)
pos_encoding_layer = PositionalEncoding(max_seq_len, embed_size)
# forward() expects a leading batch axis, so wrap the single sequence.
output_with_pos_encoding = pos_encoding_layer.forward(embedded_tokens.unsqueeze(0))

# Adding the encoding must not change the (tokens, embed_size) layout.
assert output_with_pos_encoding.shape == embedded_tokens.shape

print(f"Token Size : {len(encoded)}")
print(f"Embedded Tokens Shape: {embedded_tokens.shape}")
print(f"Output with Positional Encoding Shape: {output_with_pos_encoding.shape}")

Token Size : 42
Embedded Tokens Shape: torch.Size([42, 1536])
Output with Positional Encoding Shape: torch.Size([42, 1536])
