In [1]:
import numpy as np

def rotary_positional_embedding(x: np.ndarray, 
                              seq_len: int, 
                              embed_dim: int):
    
    """
    Áp dụng Rotary Positional Embedding cho vector ẩn.
    
    Args:
        x: Tensor input (batch_size, seq_len, dim).
        seq_len: Chiều dài chuỗi (sequence length).
        dim: Kích thước vector (embedding dimension).
        
    Returns:
        Tensor sau khi áp dụng RoPE.
    """

    # 1. Tách vector thành (even, odd)
    x_even = x[:, :, 0::2]  # Các giá trị tại chỉ số chẵn
    x_odd = x[:, :, 1::2]   # Các giá trị tại chỉ số lẻ


    # 2. Tính góc xoay
    positions = np.arange(start=0, stop=seq_len)[:, None] # # Vị trí từng token, shape: [seq_len, 1]
    theta = np.arange(0, embed_dim, 2) / embed_dim # 2* k / d. Shape: [embed_dim / 2, ]
    theta = 1 / (10000 ** (theta)) # 1 / (10000 ** (2 *k / d)). Shape: [embed_dim / 2, ]
    # print("positions: \n", positions)
    # print("theta: \n", theta)
    angles = positions * theta # theta
    # print("angles: \n", angles.shape)

    # 3. Tính cos/sin của góc
    cos_angles = np.cos(angles)
    sin_angles = np.sin(angles)

    # 4. Áp dụng phép xoay cho từng cặp
    x_rotated_even = x_even * cos_angles - x_odd * sin_angles
    x_rotated_odd = x_even * sin_angles + x_odd * cos_angles

    # ghép lại
    x_rotated = np.ones_like(x)
    x_rotated[:, :, 0::2] = x_rotated_even
    x_rotated[:, :, 1::2] = x_rotated_odd

    return x_rotated

N = 2
seq_len = 5
embed_dim = 4

mock_data = np.random.randn(N, seq_len, embed_dim)

# In dữ liệu gốc
print("Input ban đầu: \n", mock_data)

# Áp dụng RoPE
x_rotated = rotary_positional_embedding(x=mock_data, seq_len=seq_len, embed_dim=embed_dim)

# In kết quả
print(x_rotated.shape)
print("\nKết quả sau khi áp dụng RoPE:\n",  x_rotated)


Input ban đầu: 
 [[[ 2.24993798  0.93003304  0.64636707 -0.16226251]
  [-0.14318004  0.35855661  1.06733834  1.53455477]
  [-0.486989    1.47998358 -0.15091206 -1.05794049]
  [ 0.6594266  -0.17006548  1.80552295  1.55287562]
  [ 1.33808956 -0.74990166  0.21630472 -0.45453599]]

 [[ 0.65699609 -0.28014326  0.1557733  -0.68552987]
  [ 0.99961896 -1.51971721  0.23570077  0.35635283]
  [ 0.0224391   1.01097942  0.44795195  2.14752525]
  [ 0.19138886  0.93224199 -1.40751126 -0.3284627 ]
  [-0.14883641 -1.22249176  0.77987447  0.38287989]]]
(2, 5, 4)

Kết quả sau khi áp dụng RoPE:
 [[[ 2.24993798  0.93003304  0.64636707 -0.16226251]
  [-0.37907549  0.07324711  1.05193968  1.54515125]
  [-1.14308633 -1.05870833 -0.12972448 -1.06074694]
  [-0.62882774  0.26142183  1.75813125  1.60633444]
  [-1.44216116 -0.52250108  0.23430829 -0.44552253]]

 [[ 0.65699609 -0.28014326  0.1557733  -0.68552987]
  [ 1.81889437  0.02004364  0.23212551  0.35869198]
  [-0.92861895 -0.40031207  0.40491472  2.1560542 ]

---


In [2]:
%cd .

/home/ducpham/Documents/Pytorch-Coding/LLMs/LLama-2


In [3]:
import torch 
import torch.nn as nn 
import pandas as pd

In [4]:
train_df = pd.read_csv("./data/train_set.csv")
train_df.head()

Unnamed: 0,_id,label,title,label_numeric
0,66b5aabf8a38820e82e0b6ce,Xu hướng,"100+ STT Né thính, Cap né thính hài hước, NÉT ...",7
1,66b5a9838a38820e82e0b64d,Xu hướng,"Top 111+ stt cuộc sống an nhiên, bình dị tự tạ...",7
2,66b5cb358a38820e82e0c408,Xu hướng,"Top hạt giống hoa dễ trồng, nở quanh năm cho n...",7
3,66b5c7548a38820e82e0c271,Dinh dưỡng,Chi tiết 3 cách nấu rau bò khai đơn giản mà th...,1
4,66b5c7a78a38820e82e0c294,Nhà,Top 10 quạt cây hơi nước được ưa chuộng nhất h...,4


In [5]:
print(dict(train_df['label_numeric'].value_counts()))
print(dict(train_df['label'].value_counts()))


{7: 1801, 3: 1435, 5: 433, 4: 286, 0: 241, 6: 192, 1: 84, 2: 41, 8: 30}
{'Xu hướng': 1801, 'Làm Đẹp': 1435, 'Trẻ em': 433, 'Nhà': 286, 'Công Nghệ': 241, 'Tài chính': 192, 'Dinh dưỡng': 84, 'Khuyến mãi': 41, 'Đánh giá': 30}


In [6]:
class RotaryPositionalEmbedding(nn.Module):
    def __init__(self, 
                 depth: int, # d_model // num_heads
                 base: int = 10000):
        super().__init__()
        theta = 1.0 / (base ** (torch.arange(0, depth, 2) / depth)) # [embed_dim //2 ,]
        self.register_buffer(name="theta", tensor=theta)
        
    def forward(self, x: torch.Tensor):
        # x: [N, num_heads, seq_len, depth] 
        # do seq_len kh cố định < tokenizer của gpt2 thiết kế cho chiều dài đông>, 
        # vì vậy ta sẽ tạo postions khi có seq_len đưuọc lấy từ x
        seq_len = x.size(2) 
        positions = torch.arange(0, seq_len)[:, None]
        # print("positions: ", positions.shape)
        # print("theta: ", self.theta.shape)
        theta_cp = positions * self.theta
        x_even, x_odd = x[..., 0::2], x[..., 1::2] # [N, num_heads, seq_len, depth//2]
        
        sin_angles = torch.sin(theta_cp)
        cos_angles = torch.cos(theta_cp)

        x_even_rotated = x_even * cos_angles - x_odd * sin_angles
        x_odd_rotated = x_even * sin_angles + x_odd * cos_angles

        x_rotated = torch.ones_like(x)
        x_rotated[..., 0::2] = x_even_rotated
        x_rotated[..., 1::2] = x_odd_rotated

        return x_rotated
 

mock_data = torch.randint(0, 1, (1, 4, 4))
embed_model = RotaryPositionalEmbedding(depth= 4//1)
output = embed_model.forward(mock_data)
print("output: ", output.shape)


output:  torch.Size([1, 4, 4])


In [7]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model: int, 
                 num_heads: int, 
                 ff_dim: int, 
                 dropout: float = 0.1):
        super().__init__()
        self.d_model = d_model
        self.depth = d_model // num_heads
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout = dropout

        self.rotary_embed = RotaryPositionalEmbedding(depth = self.depth)
        
        self.Wq = nn.Linear(in_features=d_model, out_features=d_model)
        self.Wk = nn.Linear(in_features=d_model, out_features=d_model)
        self.Wv = nn.Linear(in_features=d_model, out_features=d_model)

        self.Wo = nn.Linear(in_features=d_model, out_features=d_model)

        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=ff_dim), 
            nn.ReLU(), 
            nn.Linear(in_features=ff_dim, out_features=d_model)
        )

        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def split_heads(self, x: torch.Tensor, batch_size: int):
        # [N, seq_len, embed_dim] -> [N, num_heads, seq_len, embed_dim//num_heads]
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)
    
    def scaled_dot_product_attention(self, Q, K, V, mask =None):
        matmul_qk = torch.matmul(Q, K.transpose(-2, -1))
        dk = torch.tensor(K.size(-1), dtype=torch.float32)
        scaled_attention_logits = matmul_qk / torch.sqrt(dk)

        if mask is not None:
            scaled_attention_logits = scaled_attention_logits.masked_fill(mask == 0, -1e9)

        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, V)

        return output, attention_weights
    
    def forward(self, x: torch.Tensor, mask = None):
        # shape x: [N, seq_len, d_model]
        batch_size = x.size(0)
        # Apply linear layers and split into heads
        Q = self.split_heads(x=self.Wq(x), batch_size=batch_size) # [N, num_heads, seq_len, depth]
        K = self.split_heads(x=self.Wk(x), batch_size=batch_size) # [N, num_heads, seq_len, depth]
        V = self.split_heads(x=self.Wv(x), batch_size=batch_size) # [N, num_heads, seq_len, depth]
        # Rotary position embedding
        Q_rotated = self.rotary_embed(Q) # [N, num_heads, seq_len, depth]
        K_rotated = self.rotary_embed(K) # [N, num_heads, seq_len, depth]


        # Apply the custom scaled dot-product attention
        scaled_attention, _ = self.scaled_dot_product_attention(Q_rotated, K_rotated, V, mask)
        # print(scaled_attention.shape) # [N, num_heads, seq_len, depth]

        # Transpose and reshape back to (batch_size, seq_len, d_model)
        scaled_attention = scaled_attention.transpose(1, 2).contiguous() # [N, seq_len, num_heads, depth]
        concat_attention = scaled_attention.view(batch_size, -1, self.d_model) # [N, seq_len, d_model]

        # Apply the final linear layer to combine the heads
        attn_output = self.Wo(concat_attention) # [N, seq_len, d_model]

        # Add & Norm
        x = self.layernorm1(x + self.dropout(attn_output)) # # [N, seq_len, d_model]

        # Feed-forward
        ff_output = self.feed_forward(x) # [N, seq_len, d_model]

        # Add & Norm
        x = self.layernorm2(x + self.dropout(ff_output)) # [N, seq_len, d_model]

        return x # [N, seq_len, d_model]
    

model = TransformerEncoderLayer(d_model=256, num_heads=8, ff_dim=512)
output = model(torch.randn(1, 1000, 256))
print("output: ", output.shape)

output:  torch.Size([1, 1000, 256])


In [8]:
class TransformerModel(nn.Module):
    def __init__(self, 
                 vocab_size: int, 
                 d_model: int,
                 num_heads: int, 
                 ff_dim: int, 
                 output_size: int, 
                 num_layers: int, 
                 dropout:float = 0.1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, 
                                      embedding_dim=d_model)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model=d_model, num_heads=num_heads, ff_dim=ff_dim) 
            for _ in range(num_layers)  
        ])

        self.fc = nn.Linear(in_features=d_model, out_features=output_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        # x: [N, seq_len]
        x = self.embedding(x) # [N, seq_len, embed_dim]
        for layer in self.encoder_layers:
            x  = layer(x, mask) # [N, seq_len, embed_dim]
        x = x.mean(dim=1) # (batch_size, d_model)
        x = self.fc(self.dropout(x)) # (batch_size, output_size)
        return x


model = TransformerModel(vocab_size=100000, d_model=256, 
                         num_heads=8, ff_dim=512, 
                         output_size=9, num_layers=2)

mock_data = torch.randint(0, 1, (2, 1000), dtype=torch.long)

output = model(mock_data)
print(output.shape)

torch.Size([2, 9])


In [9]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, 
                 dataframe: pd.DataFrame, 
                 tokenizer):
        self.titles = dataframe['title'].str.lower().values
        self.labels = dataframe['label_numeric'].values
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.titles)
    
    def __getitem__(self, index):
        title = self.titles[index]
        label = self.labels[index]

        title_ids = self.tokenizer.encode(title)
        return (
            torch.tensor(title_ids, dtype=torch.long), 
            torch.tensor(label, dtype=torch.long)
        ) 

In [10]:

# Collate function to pad sequences
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    max_length = max(len(ids) for ids in input_ids)
    input_ids = torch.stack([torch.cat([ids, torch.zeros(max_length - len(ids), dtype=torch.long)]) for ids in input_ids])
    labels = torch.tensor(labels, dtype=torch.long)
    return input_ids, labels

In [11]:
import tiktoken
from torch.utils.data import DataLoader


train_df = pd.read_csv("./data/train_set.csv")
val_df = pd.read_csv("./data/validation_set.csv")

tokenizer = tiktoken.get_encoding(encoding_name='gpt2')

train_dataset = TextDataset(dataframe=train_df, tokenizer=tokenizer)
val_dataset = TextDataset(dataframe=val_df, tokenizer=tokenizer)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, 
                              shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, 
                            shuffle=False, collate_fn=collate_fn)


In [12]:
# Initialize the Transformer model
vocab_size = tokenizer.n_vocab
d_model = 256
num_heads = 8
ff_dim = 512
output_size = len(train_df['label_numeric'].unique())
num_layers = 2
dropout = 0.1

model = TransformerModel(vocab_size=vocab_size, d_model=d_model, 
                         num_heads=num_heads, ff_dim=ff_dim, output_size=output_size, 
                         num_layers=num_layers, dropout=dropout)

# Training loop
num_epochs = 20
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
for epoch in range(30):
    model.train()
    for input_ids, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Validation step
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, labels in val_dataloader:
            outputs = model(input_ids)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {accuracy:.2f}%")

Epoch 1/20, Loss: 0.8510082364082336
Validation Accuracy after Epoch 1: 73.72%
Epoch 2/20, Loss: 0.7910841107368469
Validation Accuracy after Epoch 2: 78.13%


KeyboardInterrupt: 