# LSTM-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

In [None]:
!pip install seaborn
!pip install opencc
!pip install -U scikit-learn

In [2]:
# ! pip install seaborn
# ! pip install opencc
# ! pip install -U scikit-learn

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.utils.rnn
import torch.utils.data
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

# Directory that holds the arithmetic CSV files.
data_path = './data'

In [3]:
# Load the training and evaluation splits from `data_path`, then preview the
# first few training rows.
train_csv = os.path.join(data_path, 'arithmetic_train.csv')
eval_csv = os.path.join(data_path, 'arithmetic_eval.csv')
df_train = pd.read_csv(train_csv)
df_eval = pd.read_csv(eval_csv)
df_train.head()

Unnamed: 0,src,tgt
0,14*(43+20)=,882
1,(6+1)*5=,35
2,13+32+29=,74
3,31*(3-11)=,-248
4,24*49+1=,1177


In [4]:
# Cast the targets of both splits to strings so they can be treated as
# character sequences.
df_train['tgt'] = df_train['tgt'].apply(str)
df_eval['tgt'] = df_eval['tgt'].apply(str)

# The training text becomes the full equation ("question=answer"), and `len`
# is its character count.
# NOTE: `src` is modified in place, so re-running this cell appends the
# answer a second time.
df_train['src'] = df_train['src'] + df_train['tgt']
df_train['len'] = df_train['src'].apply(len)

# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.

建立一個將「字元 → 數值ID」與「ID → 字元」的字典。

（也就是把文字（字元）轉成模型可以理解的「數字 ID」。）

In [5]:
# Build the vocabulary: every character that occurs in the training
# equations, plus two special tokens.
#   <pad> - filler that brings every sequence in a batch to the same length
#   <eos> - end-of-sequence marker that tells the model when to stop generating
special_tokens = ['<pad>', '<eos>']

# Unique characters across all training strings (`src` holds both the
# question and the answer).
all_chars = set(''.join(df_train['src'].tolist()))

# Special tokens take the lowest ids; the remaining characters follow in
# sorted order so the mapping is the same on every run.
vocab = special_tokens + sorted(all_chars)

# char_to_id: character -> integer id; id_to_char: the inverse mapping.
char_to_id = {ch: idx for idx, ch in enumerate(vocab)}
id_to_char = dict(enumerate(vocab))

vocab_size = len(char_to_id)
print(f'Vocab size: {vocab_size}')

# Show the first ten entries of each mapping.
print("\nExample:")
print("char_to_id:", dict(list(char_to_id.items())[:10]))
print("id_to_char:", dict(list(id_to_char.items())[:10]))

Vocab size: 18

Example:
char_to_id: {'<pad>': 0, '<eos>': 1, '(': 2, ')': 3, '*': 4, '+': 5, '-': 6, '0': 7, '1': 8, '2': 9}
id_to_char: {0: '<pad>', 1: '<eos>', 2: '(', 3: ')', 4: '*', 5: '+', 6: '-', 7: '0', 8: '1', 9: '2'}


# Data Preprocessing
 - The data is processed into the format required for the model's input and output. (End with \<eos\> token)

目標：把每一列 src（已包含等式與答案，如 3+5=8）轉成 模型輸入序列 與 訓練標籤序列。

In [6]:
# Data Preprocessing
# Ids of the two special tokens, looked up once.
PAD_ID, EOS_ID = char_to_id['<pad>'], char_to_id['<eos>']

# Fixed sequence length: the longest training string plus one slot for <eos>.
max_len_with_eos = 1 + int(df_train['len'].max())

def encode_with_eos(s: str):
    """Return the id of every character in `s`, followed by the <eos> id.

    Raises:
        KeyError: if `s` contains a character that is not in `char_to_id`.
    """
    return [char_to_id[ch] for ch in s] + [EOS_ID]

def build_char_ids(row):
    """Encode `row['src']` (plus <eos>) as exactly `max_len_with_eos` ids.

    Sequences shorter than the fixed length are right-padded with PAD_ID;
    longer ones are cut off at the fixed length.
    """
    clipped = encode_with_eos(row['src'])[:max_len_with_eos]
    return clipped + [PAD_ID] * (max_len_with_eos - len(clipped))

def build_label_ids(row):
    """Build the next-character labels for `row['src']`, `max_len_with_eos` long.

    The encoded sequence (with <eos>) is shifted left by one position so that
    label[i] is the character following input[i]. Every label before the
    position of the last '=' is replaced by PAD_ID, leaving only the answer
    characters and <eos> as real targets. The result is then clipped or
    right-padded with PAD_ID to the fixed length.
    """
    text = row['src']

    # Next-token targets: drop the first id, keep the length with a trailing pad.
    targets = encode_with_eos(text)[1:] + [PAD_ID]

    # Mask the question part. rfind() returns -1 when there is no '=', in
    # which case nothing is masked.
    n_masked = max(text.rfind('='), 0)
    targets[:n_masked] = [PAD_ID] * n_masked

    # Clip or right-pad to the fixed length.
    targets = targets[:max_len_with_eos]
    return targets + [PAD_ID] * (max_len_with_eos - len(targets))

# Encode every training row into fixed-length model inputs and labels
# (inputs first, then labels).
for column, builder in (('char_id_list', build_char_ids),
                        ('label_id_list', build_label_ids)):
    df_train[column] = df_train.apply(builder, axis=1)

print(f"max_len_with_eos = {max_len_with_eos}")
display(df_train.head())

max_len_with_eos = 17


Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,882,14,"[8, 11, 4, 2, 11, 10, 5, 9, 7, 3, 17, 15, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 9, 1, 0..."
1,(6+1)*5=35,35,10,"[2, 13, 5, 8, 3, 4, 12, 17, 10, 12, 1, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 10, 12, 1, 0, 0, 0, 0, 0..."
2,13+32+29=74,74,11,"[8, 10, 5, 10, 9, 5, 9, 16, 17, 14, 11, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 1, 0, 0, 0, 0..."
3,31*(3-11)=-248,-248,14,"[10, 8, 4, 2, 10, 6, 8, 8, 3, 17, 6, 9, 11, 15...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 9, 11, 15, 1, 0..."
4,24*49+1=1177,1177,12,"[9, 11, 4, 11, 16, 5, 8, 17, 8, 8, 14, 14, 1, ...","[0, 0, 0, 0, 0, 0, 0, 8, 8, 14, 14, 1, 0, 0, 0..."


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|64|
|`epochs`|Total number of epochs to train|10|
|`embed_dim`|Dimension of the word embeddings|256|
|`hidden_dim`|Dimension of the hidden state in each timestep of the LSTM|256|
|`lr`|Learning Rate|0.001|
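|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|

> **Note:** the code cell below deviates from this table: it sets `epochs = 30` and `lr = 5e-4`, and adds `weight_decay = 1e-4` (passed to the AdamW optimizer).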

In [7]:
# Training configuration.
batch_size = 64        # samples per mini-batch
epochs = 30            # full passes over the training set
embed_dim = 256        # character-embedding width
hidden_dim = 256       # recurrent hidden-state width
lr = 5e-4              # optimizer learning rate
weight_decay = 1e-4    # weight-decay coefficient for the optimizer
grad_clip = 1          # gradient clipping threshold

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called  `dataset`.
- Then, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.

- Example: 1+2-3=0
    - Model input: 1 + 2 - 3 = 0
    - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
    - The key point about the model's output is that the model does not need to predict the characters of the question itself. What matters is that once the model sees '=', it starts generating the answer, which is '0'. After generating the answer, it should also generate &lt;eos&gt;.

In [8]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that yields `(x_ids, y_ids)` pairs.

    `sequences` is either a list/tuple of `(x_ids, y_ids)` pairs, or a table
    (such as a pandas DataFrame) with `char_id_list` and `label_id_list`
    columns. Items are addressed by position in both cases.
    """

    def __init__(self, sequences):
        self.sequences = sequences
        # Row-wise `.iloc[index]` builds a new Series on every access, which
        # is slow when it runs once per sample. For table input, pull the two
        # columns out as plain lists once so item access is a list lookup.
        if isinstance(sequences, (list, tuple)):
            self._columns = None
        else:
            self._columns = (
                list(sequences['char_id_list']),
                list(sequences['label_id_list']),
            )

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the `(x_ids, y_ids)` pair stored at position `index`."""
        if self._columns is None:
            x_ids, y_ids = self.sequences[index]
        else:
            x_column, y_column = self._columns
            x_ids, y_ids = x_column[index], y_column[index]
        return x_ids, y_ids


# Collate function for the DataLoader: turns a list of samples into batch
# tensors, right-padding with <pad> wherever sequence lengths differ.
def collate_fn(batch):
    """Stack a list of `(x_ids, y_ids)` samples into padded batch tensors.

    Returns:
        (pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens) where the first
        two are long tensors with the batch dimension first, and the last two
        hold the length of each incoming sequence. The lengths are taken from
        the samples as received, so samples that were padded upstream report
        their padded length.
    """
    pad_id = char_to_id['<pad>']

    xs, ys = [], []
    for x_ids, y_ids in batch:
        xs.append(torch.tensor(x_ids, dtype=torch.long))
        ys.append(torch.tensor(y_ids, dtype=torch.long))

    # Per-sample lengths (usable for packing or masking downstream).
    x_lens = torch.tensor([len(t) for t in xs], dtype=torch.long)
    y_lens = torch.tensor([len(t) for t in ys], dtype=torch.long)

    pad = torch.nn.utils.rnn.pad_sequence
    return (
        pad(xs, batch_first=True, padding_value=pad_id),
        pad(ys, batch_first=True, padding_value=pad_id),
        x_lens,
        y_lens,
    )

In [9]:
# Build model inputs and labels for the evaluation split.
# Unlike the training split, the evaluation `src` column is never joined with
# `tgt`, so encoding it directly would give labels that contain nothing but
# <eos> right after '='. The full "question=answer" text is therefore
# assembled in a temporary frame; `df_eval['src']` itself is left untouched
# because the evaluation loop reads the question from it.
eval_question = df_eval['src'].str.split('=').str[0] + '='
df_eval_full = df_eval.assign(src=eval_question + df_eval['tgt'].astype(str))
df_eval['char_id_list'] = df_eval_full.apply(build_char_ids, axis=1)
df_eval['label_id_list'] = df_eval_full.apply(build_label_ids, axis=1)

ds_train = Dataset(df_train[['char_id_list', 'label_id_list']])
ds_eval = Dataset(df_eval[['char_id_list', 'label_id_list']])

In [10]:
# Settings shared by both loaders.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True,
)

# Training batches are shuffled; evaluation keeps the original row order.
dl_train = torch.utils.data.DataLoader(ds_train, shuffle=True, **loader_kwargs)
dl_eval = torch.utils.data.DataLoader(ds_eval, shuffle=False, **loader_kwargs)

# Sanity check: pull one training batch and print the tensor shapes.
xb, yb, xlen, ylen = next(iter(dl_train))
print(xb.shape, yb.shape, xlen.shape, ylen.shape)

torch.Size([64, 17]) torch.Size([64, 17]) torch.Size([64]) torch.Size([64])


# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an LSTM sequentially.
3. The output of the LSTM is passed into another LSTM, and additional layers can be added.
4. The output from all time steps of the final LSTM is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [None]:
class CharRNN(nn.Module):
    """Two-layer character-level LSTM language model.

    Pipeline: embedding -> LSTM -> dropout -> LSTM -> linear projection to
    vocabulary logits. The projection shares its weight matrix with the
    embedding table (weight tying), so `hidden_dim` must equal `embed_dim`.

    Uses the module-level `char_to_id` / `id_to_char` vocabulary mappings.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2):
        super().__init__()
        # The tied projection reuses the (vocab_size, embed_dim) embedding
        # matrix as a (vocab_size, hidden_dim) weight, so the two widths must
        # agree. Fail here with a clear message instead of at the first
        # forward pass.
        if hidden_dim != embed_dim:
            raise ValueError(
                "weight tying requires hidden_dim == embed_dim "
                f"(got hidden_dim={hidden_dim}, embed_dim={embed_dim})"
            )

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=char_to_id['<pad>'])

        # Two stacked LSTM layers with dropout in between.
        self.rnn_layer1 = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.rnn_layer2 = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)

        self.proj = nn.Linear(hidden_dim, vocab_size, bias=False)
        self.proj.weight = self.embedding.weight  # weight tying

    def forward(self, batch_x, batch_x_lens):
        """Return logits for every time step; see `encoder`."""
        return self.encoder(batch_x, batch_x_lens)

    def encoder(self, batch_x, batch_x_lens):
        """Embed -> pack -> LSTM x2 -> unpack -> project.

        Args:
            batch_x: long tensor of token ids, batch dimension first.
            batch_x_lens: length of each sequence in `batch_x`.

        Returns:
            Logits with the same batch size and time dimension as `batch_x`
            and `vocab_size` entries per step. Steps past a sequence's length
            yield all-zero logits.
        """
        lens = batch_x_lens.cpu()       # pack_padded_sequence wants CPU lengths
        total_len = batch_x.size(1)     # keep the output as wide as the input

        x = self.embedding(batch_x)
        x = self._run_packed(self.rnn_layer1, x, lens, total_len)
        x = self.dropout(x)
        x = self._run_packed(self.rnn_layer2, x, lens, total_len)
        return self.proj(x)

    @staticmethod
    def _run_packed(layer, x, lens, total_len):
        """Run one recurrent layer over a packed batch and unpack the output."""
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lens, batch_first=True, enforce_sorted=False
        )
        out, _ = layer(packed)
        # total_length stops the time dimension from shrinking to the longest
        # sequence in the batch, so logits always line up with the labels.
        out, _ = nn.utils.rnn.pad_packed_sequence(
            out, batch_first=True, total_length=total_len
        )
        return out

    def _advance(self, token_ids, state1, state2):
        """Feed `token_ids` through both LSTM layers, resuming from the given states."""
        out1, state1 = self.rnn_layer1(self.embedding(token_ids), state1)
        out2, state2 = self.rnn_layer2(self.dropout(out1), state2)
        return out2, state1, state2

    def generator(self, start_char, max_len=200):
        """Greedily extend `start_char` one character at a time.

        The prompt is encoded once; afterwards only the newly generated
        character is fed in, carrying the LSTM states forward. Generation
        stops when <eos> is predicted (it is not included in the result) or
        when the sequence reaches `max_len` tokens. The model is switched to
        eval mode and left there.

        Args:
            start_char: prompt string; every character must be in `char_to_id`.
            max_len: maximum total number of tokens, prompt included.

        Returns:
            List of tokens (prompt followed by generated ones) as strings.

        Raises:
            ValueError: if `start_char` is empty.
            KeyError: if `start_char` contains an unknown character.
        """
        self.eval()
        eos_id = char_to_id['<eos>']

        char_list = [char_to_id[c] for c in start_char]
        if not char_list:
            # Without a prompt there is no time step to predict from.
            raise ValueError("start_char must contain at least one character")

        # Build inputs on the device that holds the weights rather than
        # relying on a global `device` that may not match the model.
        model_device = self.embedding.weight.device

        with torch.no_grad():
            prompt = torch.tensor([char_list], dtype=torch.long, device=model_device)
            out, state1, state2 = self._advance(prompt, None, None)

            while len(char_list) < max_len:
                # Predict from the hidden output of the last time step.
                logits = self.proj(out[:, -1, :])
                next_char_id = int(torch.argmax(logits, dim=-1).item())
                if next_char_id == eos_id:
                    break
                char_list.append(next_char_id)

                step = torch.tensor([[next_char_id]], dtype=torch.long, device=model_device)
                out, state1, state2 = self._advance(step, state1, state2)

        return [id_to_char[ch_id] for ch_id in char_list]

In [12]:
# Seed the RNG before constructing the model so the initial weights are
# reproducible.
torch.manual_seed(4321)
# Alternative model (currently disabled):
# model = CharRNN(vocab_size, embed_dim, hidden_dim).to(device)

# Active model: the GRU variant, moved to the selected device.
model = CharGRU(vocab_size, embed_dim, hidden_dim, dropout=0.2).to(device)

In [13]:
# Cross-entropy over the vocabulary. Label positions equal to the <pad> id
# are excluded from the loss, and label smoothing of 0.1 is applied.
criterion = nn.CrossEntropyLoss(
    ignore_index=char_to_id['<pad>'],
    label_smoothing=0.1,           
)

# AdamW over all model parameters, using the `lr` and `weight_decay` settings.
optimizer = optim.AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay
)

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to clamp every gradient value to the range [`-grad_clip`, `grad_clip`].
        5. Use `optimizer.step()` to update the model parameters.
2.  After every `1000` batches, output the current loss to monitor whether it is converging.


- Teacher Forcing：訓練時把「完整正解序列（含等式與答案）」當作輸入，

    loss 只在 label_id_list 非 <pad> 的位置（=答案區段 + <eos>）計算，

    等同「下一步的條件使用正確前綴」，這就是 teacher forcing。

- 忽略 <pad>：criterion = CrossEntropyLoss(ignore_index=PAD_ID) 已處理，所以攤平成一維後直接丟給 CE 即可。

- Exact Match (EM)：評估時必須使用 model.generator()，以「等號左邊 + '='」為起點生成；取等號右邊的整段字串與 tgt 完全相同才算 1 分。

### LSTM

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt

# Per-epoch history: mean training loss and exact-match accuracy.
train_losses = []
eval_accuracies = []

for epoch in range(1, epochs + 1):
    # ---- Training: one teacher-forced pass over the training set ----
    model.train()
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    total_loss = 0
    batch_count = 0

    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        optimizer.zero_grad()

        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        # The lengths stay on the CPU: the models hand them to
        # pack_padded_sequence via `.cpu()`, so moving them to the GPU first
        # only adds a device round trip per batch.

        # Forward pass: logits for every time step.
        logits = model(batch_x, batch_x_lens)
        B, T, V = logits.size()

        # Flatten batch and time; <pad> labels are ignored by the criterion.
        loss = criterion(logits.reshape(B * T, V), batch_y.reshape(B * T))

        loss.backward()

        # NOTE: gradient clipping is disabled, so `grad_clip` is unused.
        # To enable it, call
        #   torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        # here, before the optimizer step.

        optimizer.step()

        # Read the scalar once; every .item() call synchronizes with the device.
        batch_loss = loss.item()
        total_loss += batch_loss
        batch_count += 1
        bar.set_postfix(loss=batch_loss)

    # An empty loader yields NaN instead of a ZeroDivisionError.
    avg_train_loss = total_loss / batch_count if batch_count else float('nan')
    train_losses.append(avg_train_loss)

    # ---- Evaluation: exact match on greedily generated answers ----
    model.eval()
    matched = 0
    total = 0

    with torch.no_grad():
        bar_eval = tqdm(df_eval.iterrows(), total=len(df_eval), desc=f"Validation epoch {epoch}")
        for _, row in bar_eval:
            # Prompt = everything up to and including the first '='.
            start_str = row['src'].split('=')[0] + '='

            pred_chars = model.generator(start_str, max_len=max_len_with_eos)
            pred_str = ''.join(pred_chars)

            # The predicted answer is whatever follows the last '='.
            pred_ans = pred_str.split('=')[-1] if '=' in pred_str else ""

            gold_ans = str(row['tgt'])

            matched += int(pred_ans == gold_ans)
            total += 1

    # An empty evaluation set counts as 0 accuracy instead of crashing.
    acc = matched / total if total else 0.0
    eval_accuracies.append(acc)

    print(f"Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | EM: {acc:.4f}")

# Plot the training curves: loss on the left axis, accuracy on the right.
# The x-axes are derived from the recorded history rather than from `epochs`,
# so the figure can still be drawn when fewer (or more) epochs were recorded
# than `epochs` currently says; mismatched lengths would make plot() raise.
loss_epochs = range(1, len(train_losses) + 1)
acc_epochs = range(1, len(eval_accuracies) + 1)

fig, ax1 = plt.subplots(figsize=(8, 5))

ax1.set_xlabel('Epoch')
ax1.set_ylabel('Train Loss', color='tab:red')
ax1.plot(loss_epochs, train_losses, color='tab:red', marker='o', label='Train Loss')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
ax2.set_ylabel('Evaluation Accuracy (EM)', color='tab:blue')
ax2.plot(acc_epochs, eval_accuracies, color='tab:blue', marker='x', label='Eval Accuracy')
ax2.tick_params(axis='y', labelcolor='tab:blue')

plt.title('Training Loss & Evaluation Accuracy')
fig.tight_layout()

plt.savefig('GRU_training_curve.png', dpi=300, bbox_inches='tight')
plt.show()

Train epoch 1: 100%|██████████| 37020/37020 [04:43<00:00, 130.76it/s, loss=0.979]
Validation epoch 1: 100%|██████████| 263250/263250 [15:14<00:00, 287.77it/s]


Epoch 1 | Train Loss: 1.1632 | EM: 0.5103


Train epoch 2: 100%|██████████| 37020/37020 [04:43<00:00, 130.69it/s, loss=0.802]
Validation epoch 2: 100%|██████████| 263250/263250 [15:20<00:00, 286.10it/s]


Epoch 2 | Train Loss: 0.9082 | EM: 0.6268


Train epoch 3: 100%|██████████| 37020/37020 [04:43<00:00, 130.55it/s, loss=0.784]
Validation epoch 3: 100%|██████████| 263250/263250 [15:22<00:00, 285.23it/s]


Epoch 3 | Train Loss: 0.8346 | EM: 0.7376


Train epoch 4: 100%|██████████| 37020/37020 [04:45<00:00, 129.60it/s, loss=0.73] 
Validation epoch 4: 100%|██████████| 263250/263250 [15:35<00:00, 281.26it/s]


Epoch 4 | Train Loss: 0.7705 | EM: 0.7879


Train epoch 5: 100%|██████████| 37020/37020 [04:43<00:00, 130.41it/s, loss=0.775]
Validation epoch 5: 100%|██████████| 263250/263250 [15:12<00:00, 288.38it/s]


Epoch 5 | Train Loss: 0.7386 | EM: 0.8277


Train epoch 6: 100%|██████████| 37020/37020 [04:51<00:00, 126.86it/s, loss=0.743]
Validation epoch 6: 100%|██████████| 263250/263250 [15:18<00:00, 286.76it/s]


Epoch 6 | Train Loss: 0.7200 | EM: 0.8384


Train epoch 7: 100%|██████████| 37020/37020 [04:45<00:00, 129.52it/s, loss=0.718]
Validation epoch 7: 100%|██████████| 263250/263250 [15:04<00:00, 291.03it/s]


Epoch 7 | Train Loss: 0.7082 | EM: 0.8615


Train epoch 8: 100%|██████████| 37020/37020 [04:44<00:00, 130.14it/s, loss=0.714]
Validation epoch 8: 100%|██████████| 263250/263250 [15:06<00:00, 290.34it/s]


Epoch 8 | Train Loss: 0.6997 | EM: 0.8623


Train epoch 9: 100%|██████████| 37020/37020 [04:40<00:00, 131.85it/s, loss=0.701]
Validation epoch 9: 100%|██████████| 263250/263250 [15:05<00:00, 290.65it/s]


Epoch 9 | Train Loss: 0.6932 | EM: 0.8763


Train epoch 10: 100%|██████████| 37020/37020 [04:35<00:00, 134.40it/s, loss=0.745]
Validation epoch 10: 100%|██████████| 263250/263250 [14:53<00:00, 294.56it/s]


Epoch 10 | Train Loss: 0.6876 | EM: 0.8856


Train epoch 11: 100%|██████████| 37020/37020 [04:45<00:00, 129.48it/s, loss=0.653]
Validation epoch 11: 100%|██████████| 263250/263250 [14:56<00:00, 293.67it/s]


Epoch 11 | Train Loss: 0.6834 | EM: 0.8602


Train epoch 12: 100%|██████████| 37020/37020 [04:43<00:00, 130.46it/s, loss=0.676]
Validation epoch 12: 100%|██████████| 263250/263250 [15:09<00:00, 289.58it/s]


Epoch 12 | Train Loss: 0.6802 | EM: 0.8898


Train epoch 13: 100%|██████████| 37020/37020 [04:47<00:00, 128.69it/s, loss=0.687]
Validation epoch 13: 100%|██████████| 263250/263250 [15:16<00:00, 287.22it/s]


Epoch 13 | Train Loss: 0.6772 | EM: 0.9060


Train epoch 14: 100%|██████████| 37020/37020 [04:41<00:00, 131.39it/s, loss=0.664]
Validation epoch 14: 100%|██████████| 263250/263250 [15:12<00:00, 288.35it/s]


Epoch 14 | Train Loss: 0.6739 | EM: 0.8895


Train epoch 15: 100%|██████████| 37020/37020 [04:42<00:00, 131.03it/s, loss=0.65] 
Validation epoch 15: 100%|██████████| 263250/263250 [15:24<00:00, 284.78it/s]


Epoch 15 | Train Loss: 0.6713 | EM: 0.8972


Train epoch 16: 100%|██████████| 37020/37020 [04:43<00:00, 130.73it/s, loss=0.723]
Validation epoch 16: 100%|██████████| 263250/263250 [15:28<00:00, 283.47it/s]


Epoch 16 | Train Loss: 0.6691 | EM: 0.9099


Train epoch 17: 100%|██████████| 37020/37020 [04:48<00:00, 128.44it/s, loss=0.641]
Validation epoch 17: 100%|██████████| 263250/263250 [15:25<00:00, 284.35it/s]


Epoch 17 | Train Loss: 0.6669 | EM: 0.9059


Train epoch 18: 100%|██████████| 37020/37020 [04:46<00:00, 129.35it/s, loss=0.671]
Validation epoch 18: 100%|██████████| 263250/263250 [15:21<00:00, 285.71it/s]


Epoch 18 | Train Loss: 0.6647 | EM: 0.9155


Train epoch 19: 100%|██████████| 37020/37020 [04:42<00:00, 131.25it/s, loss=0.69] 
Validation epoch 19: 100%|██████████| 263250/263250 [15:20<00:00, 285.95it/s]


Epoch 19 | Train Loss: 0.6630 | EM: 0.9123


Train epoch 20: 100%|██████████| 37020/37020 [04:43<00:00, 130.73it/s, loss=0.698]
Validation epoch 20: 100%|██████████| 263250/263250 [15:03<00:00, 291.32it/s]


Epoch 20 | Train Loss: 0.6617 | EM: 0.9023


Train epoch 21: 100%|██████████| 37020/37020 [04:43<00:00, 130.62it/s, loss=0.668]
Validation epoch 21: 100%|██████████| 263250/263250 [15:26<00:00, 284.17it/s]


Epoch 21 | Train Loss: 0.6603 | EM: 0.9173


Train epoch 22: 100%|██████████| 37020/37020 [04:44<00:00, 130.17it/s, loss=0.639]
Validation epoch 22: 100%|██████████| 263250/263250 [15:19<00:00, 286.21it/s]


Epoch 22 | Train Loss: 0.6592 | EM: 0.9183


Train epoch 23: 100%|██████████| 37020/37020 [04:49<00:00, 128.07it/s, loss=0.621]
Validation epoch 23: 100%|██████████| 263250/263250 [14:45<00:00, 297.18it/s]


Epoch 23 | Train Loss: 0.6581 | EM: 0.9141


Train epoch 24: 100%|██████████| 37020/37020 [04:41<00:00, 131.72it/s, loss=0.662]
Validation epoch 24: 100%|██████████| 263250/263250 [15:20<00:00, 285.87it/s]


Epoch 24 | Train Loss: 0.6568 | EM: 0.9307


Train epoch 25: 100%|██████████| 37020/37020 [04:42<00:00, 130.99it/s, loss=0.684]
Validation epoch 25: 100%|██████████| 263250/263250 [15:02<00:00, 291.74it/s]


Epoch 25 | Train Loss: 0.6565 | EM: 0.9254


Train epoch 26: 100%|██████████| 37020/37020 [04:42<00:00, 131.19it/s, loss=0.751]
Validation epoch 26: 100%|██████████| 263250/263250 [14:57<00:00, 293.46it/s]


Epoch 26 | Train Loss: 0.6553 | EM: 0.9210


Train epoch 27: 100%|██████████| 37020/37020 [04:44<00:00, 130.14it/s, loss=0.616]
Validation epoch 27: 100%|██████████| 263250/263250 [14:59<00:00, 292.57it/s]


Epoch 27 | Train Loss: 0.6549 | EM: 0.9306


Train epoch 28:   4%|▎         | 1377/37020 [00:10<04:35, 129.30it/s, loss=0.642]

### RNN 

In [12]:
class CharRNN(nn.Module):
    """Two-layer character-level vanilla-RNN language model.

    Pipeline: embedding -> RNN -> dropout -> RNN -> linear projection to
    vocabulary logits. The projection shares its weight matrix with the
    embedding table (weight tying), so `hidden_dim` must equal `embed_dim`.

    Uses the module-level `char_to_id` / `id_to_char` vocabulary mappings.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2):
        super().__init__()
        # The tied projection reuses the (vocab_size, embed_dim) embedding
        # matrix as a (vocab_size, hidden_dim) weight, so the two widths must
        # agree. Fail here with a clear message instead of at the first
        # forward pass.
        if hidden_dim != embed_dim:
            raise ValueError(
                "weight tying requires hidden_dim == embed_dim "
                f"(got hidden_dim={hidden_dim}, embed_dim={embed_dim})"
            )

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=char_to_id['<pad>'])

        # Two stacked RNN layers with dropout in between.
        self.rnn_layer1 = nn.RNN(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.rnn_layer2 = nn.RNN(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)

        self.proj = nn.Linear(hidden_dim, vocab_size, bias=False)
        self.proj.weight = self.embedding.weight  # weight tying

    def forward(self, batch_x, batch_x_lens):
        """Return logits for every time step; see `encoder`."""
        return self.encoder(batch_x, batch_x_lens)

    def encoder(self, batch_x, batch_x_lens):
        """Embed -> pack -> RNN x2 -> unpack -> project.

        Args:
            batch_x: long tensor of token ids, batch dimension first.
            batch_x_lens: length of each sequence in `batch_x`.

        Returns:
            Logits with the same batch size and time dimension as `batch_x`
            and `vocab_size` entries per step. Steps past a sequence's length
            yield all-zero logits.
        """
        lens = batch_x_lens.cpu()       # pack_padded_sequence wants CPU lengths
        total_len = batch_x.size(1)     # keep the output as wide as the input

        x = self.embedding(batch_x)
        x = self._run_packed(self.rnn_layer1, x, lens, total_len)
        x = self.dropout(x)
        x = self._run_packed(self.rnn_layer2, x, lens, total_len)
        return self.proj(x)

    @staticmethod
    def _run_packed(layer, x, lens, total_len):
        """Run one recurrent layer over a packed batch and unpack the output."""
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lens, batch_first=True, enforce_sorted=False
        )
        out, _ = layer(packed)
        # total_length stops the time dimension from shrinking to the longest
        # sequence in the batch, so logits always line up with the labels.
        out, _ = nn.utils.rnn.pad_packed_sequence(
            out, batch_first=True, total_length=total_len
        )
        return out

    def _advance(self, token_ids, h1, h2):
        """Feed `token_ids` through both RNN layers, resuming from hidden states h1/h2."""
        out1, h1 = self.rnn_layer1(self.embedding(token_ids), h1)
        out2, h2 = self.rnn_layer2(self.dropout(out1), h2)
        return out2, h1, h2

    def generator(self, start_char, max_len=200):
        """Greedily extend `start_char` one character at a time.

        The prompt is encoded once; afterwards only the newly generated
        character is fed in, carrying the hidden states forward. Generation
        stops when <eos> is predicted (it is not included in the result) or
        when the sequence reaches `max_len` tokens. The model is switched to
        eval mode and left there.

        Args:
            start_char: prompt string; every character must be in `char_to_id`.
            max_len: maximum total number of tokens, prompt included.

        Returns:
            List of tokens (prompt followed by generated ones) as strings.

        Raises:
            ValueError: if `start_char` is empty.
            KeyError: if `start_char` contains an unknown character.
        """
        self.eval()
        eos_id = char_to_id['<eos>']

        char_list = [char_to_id[c] for c in start_char]
        if not char_list:
            # Without a prompt there is no time step to predict from.
            raise ValueError("start_char must contain at least one character")

        # Build inputs on the device that holds the weights rather than
        # relying on a global `device` that may not match the model.
        model_device = self.embedding.weight.device

        with torch.no_grad():
            prompt = torch.tensor([char_list], dtype=torch.long, device=model_device)
            out, h1, h2 = self._advance(prompt, None, None)

            while len(char_list) < max_len:
                # Predict from the hidden output of the last time step.
                logits = self.proj(out[:, -1, :])
                next_char_id = int(torch.argmax(logits, dim=-1).item())
                if next_char_id == eos_id:
                    break
                char_list.append(next_char_id)

                step = torch.tensor([[next_char_id]], dtype=torch.long, device=model_device)
                out, h1, h2 = self._advance(step, h1, h2)

        return [id_to_char[ch_id] for ch_id in char_list]


### GRU  

In [11]:
import torch
import torch.nn as nn

class CharGRU(nn.Module):
    """Two-layer character-level GRU language model.

    Pipeline: embedding -> GRU -> dropout -> GRU -> linear projection to
    vocabulary logits. The projection shares its weight matrix with the
    embedding table (weight tying), so `hidden_dim` must equal `embed_dim`.

    Uses the module-level `char_to_id` / `id_to_char` vocabulary mappings.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2):
        super().__init__()
        # Embedding table; <pad> is registered as the padding index.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=char_to_id['<pad>'])

        # Two stacked GRU layers with dropout in between.
        self.gru1 = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.gru2 = nn.GRU(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)

        # Output projection with weight tying (needs hidden_dim == embed_dim).
        assert hidden_dim == embed_dim, "使用權重綁定需 hidden_dim == embed_dim"
        self.proj = nn.Linear(hidden_dim, vocab_size, bias=False)
        self.proj.weight = self.embedding.weight  # weight tying

    def forward(self, batch_x, batch_x_lens):
        """Return logits for every time step; see `encoder`."""
        return self.encoder(batch_x, batch_x_lens)

    def encoder(self, batch_x, batch_x_lens):
        """Embed -> pack -> GRU x2 -> unpack -> project.

        Args:
            batch_x: long tensor of token ids, batch dimension first.
            batch_x_lens: length of each sequence in `batch_x`.

        Returns:
            Logits with the same batch size and time dimension as `batch_x`
            and `vocab_size` entries per step. Steps past a sequence's length
            yield all-zero logits.
        """
        lens = batch_x_lens.cpu()       # pack_padded_sequence wants CPU lengths
        total_len = batch_x.size(1)     # keep the output as wide as the input

        x = self.embedding(batch_x)
        x = self._run_packed(self.gru1, x, lens, total_len)
        x = self.dropout(x)
        x = self._run_packed(self.gru2, x, lens, total_len)
        return self.proj(x)

    @staticmethod
    def _run_packed(layer, x, lens, total_len):
        """Run one recurrent layer over a packed batch and unpack the output."""
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lens, batch_first=True, enforce_sorted=False
        )
        out, _ = layer(packed)  # the final hidden state is not needed here
        # total_length stops the time dimension from shrinking to the longest
        # sequence in the batch, so logits always line up with the labels.
        out, _ = nn.utils.rnn.pad_packed_sequence(
            out, batch_first=True, total_length=total_len
        )
        return out

    def _advance(self, token_ids, h1, h2):
        """Feed `token_ids` through both GRU layers, resuming from hidden states h1/h2."""
        out1, h1 = self.gru1(self.embedding(token_ids), h1)
        out2, h2 = self.gru2(self.dropout(out1), h2)
        return out2, h1, h2

    def generator(self, start_char, max_len=200):
        """Greedily extend `start_char` one character at a time (argmax decoding).

        The prompt is encoded once; afterwards only the newly generated
        character is fed in, carrying the GRU hidden states forward, instead
        of re-encoding the whole sequence at every step. Generation stops
        when <eos> is predicted (it is not included in the result) or when
        the sequence reaches `max_len` tokens. The model is switched to eval
        mode and left there.

        Args:
            start_char: prompt string; every character must be in `char_to_id`.
            max_len: maximum total number of tokens, prompt included.

        Returns:
            List of tokens (prompt followed by generated ones) as strings.

        Raises:
            ValueError: if `start_char` is empty while `max_len` leaves room
                to generate.
            KeyError: if `start_char` contains an unknown character.
        """
        self.eval()
        eos_id = char_to_id['<eos>']
        ids = [char_to_id[c] for c in start_char]

        # Nothing to generate: return the prompt without running the model.
        if len(ids) >= max_len:
            return [id_to_char[i] for i in ids]
        if not ids:
            # Without a prompt there is no time step to predict from.
            raise ValueError("start_char must contain at least one character")

        # Build inputs on the device that holds the weights rather than
        # relying on a global `device` that may not match the model.
        model_device = self.embedding.weight.device

        with torch.no_grad():
            prompt = torch.tensor([ids], dtype=torch.long, device=model_device)
            out, h1, h2 = self._advance(prompt, None, None)

            while len(ids) < max_len:
                # Predict from the hidden output of the last time step.
                logits = self.proj(out[:, -1, :])
                next_id = int(torch.argmax(logits, dim=-1).item())
                if next_id == eos_id:
                    break
                ids.append(next_id)

                step = torch.tensor([[next_id]], dtype=torch.long, device=model_device)
                out, h1, h2 = self._advance(step, h1, h2)

        return [id_to_char[i] for i in ids]
