<a href="https://colab.research.google.com/github/ORAzzQWQ/NLP_2024/blob/main/NLP_HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random

In [7]:
# Download the dataset archive from Google Drive (addressed by file id) as
# arithmetic_NLP.zip, then unpack it so the CSV files can be read below.
!gdown --id 1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE- -O arithmetic_NLP.zip
!unzip arithmetic_NLP.zip

Downloading...
From (original): https://drive.google.com/uc?id=1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-
From (redirected): https://drive.google.com/uc?id=1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-&confirm=t&uuid=e3ad78d0-d11a-471d-86fb-817862aa5e88
To: /content/arithmetic_NLP.zip
100% 27.3M/27.3M [00:00<00:00, 85.2MB/s]
Archive:  arithmetic_NLP.zip
  inflating: arithmetic_eval.csv     
  inflating: arithmetic_train.csv    


In [8]:
# Read the training and evaluation splits from the extracted CSV files
# (paths are relative to the notebook's working directory).
df_train, df_eval = (
    pd.read_csv(csv_path)
    for csv_path in ('arithmetic_train.csv', 'arithmetic_eval.csv')
)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [9]:
def _append_answer(df):
    """Rewrite ``df['src']`` from a prompt (``"1+2="``) to the full string (``"1+2=3"``).

    Steps, applied in place on ``df``:
      * ``tgt`` is converted to ``str``;
      * ``tgt`` is appended to every ``src`` that still ends with ``'='``;
      * ``len`` is set to the character length of the resulting ``src``.

    Rows whose ``src`` no longer ends with ``'='`` are assumed to have been
    joined already and are left untouched, so running this cell twice does not
    append the answer a second time.

    Returns the same DataFrame for convenient reassignment.
    """
    # Transform the output data to string
    df['tgt'] = df['tgt'].apply(str)

    # Only un-joined prompts end with '='; a joined string ends with the answer.
    needs_answer = df['src'].str.endswith('=')
    df['src'] = df['src'].where(~needs_answer, df['src'] + df['tgt'])

    df['len'] = df['src'].apply(len)
    return df


df_train = _append_answer(df_train)
df_eval = _append_answer(df_eval)

In [10]:
# Character-level vocabulary: the two special tokens first, then the digits
# and the arithmetic symbols. A token's id is its position in this list.
characters = ['<pad>', '<eos>', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '-', '*', '(', ')', '=']

# Forward (token -> id) and reverse (id -> token) lookup tables.
char_to_id = {token: position for position, token in enumerate(characters)}
id_to_char = dict(enumerate(characters))

vocab_size = len(char_to_id)

print('vocab_size: {}'.format(vocab_size))

vocab_size: 18


In [11]:
def char_id(expr, token_map):
    """Encode an expression string as a list of token ids ending in ``<eos>``.

    The string is scanned for single digits and the symbols ``+ - * / = ( )``;
    every other character is skipped. A scanned token that has no entry in
    ``token_map`` is dropped as well.

    Args:
        expr: expression text, e.g. ``"(6+1)*5=35"``.
        token_map: mapping from token string to integer id; must contain
            ``'<eos>'``.

    Returns:
        List of ids for the recognised tokens, followed by the ``<eos>`` id.
    """
    encoded = []
    for symbol in re.findall(r'\d|[+\-*/=()]', expr):
        if symbol in token_map:
            encoded.append(token_map[symbol])
    encoded.append(token_map['<eos>'])
    return encoded

def label_id(char_id_list, token_map):
    """Build the label sequence for one encoded expression.

    Every position up to and including the first ``'='`` is replaced by the
    padding id, so that a loss configured to ignore padding only scores the
    answer tokens (and the trailing ``<eos>``). Positions after ``'='`` keep
    their original ids. The result has the same length as ``char_id_list``.

    Args:
        char_id_list: list of token ids for the full expression.
        token_map: mapping from token string to id; must contain ``'='``.
            The mask value is ``token_map['<pad>']`` when that key exists and
            ``0`` otherwise.

    Raises:
        ValueError: if the id of ``'='`` does not occur in ``char_id_list``.
    """
    # Use the vocabulary's pad id rather than a literal 0 so the mask stays in
    # sync with whatever id the loss is told to ignore.
    pad_id = token_map.get('<pad>', 0)
    answer_start = char_id_list.index(token_map['=']) + 1
    return [pad_id] * answer_start + char_id_list[answer_start:]

# Encode each full expression as token ids, then derive the labels from those
# ids (label_id masks everything up to and including '=').
df_train['char_id_list'] = df_train['src'].apply(char_id, token_map=char_to_id)
df_train['label_id_list'] = df_train['char_id_list'].apply(lambda ids: label_id(ids, char_to_id))
# Keep only the columns used from here on.
df_train = df_train[['src', 'tgt', 'len', 'char_id_list', 'label_id_list']]

# Same processing for the evaluation split.
df_eval['char_id_list'] = df_eval['src'].apply(char_id, token_map=char_to_id)
df_eval['label_id_list'] = df_eval['char_id_list'].apply(lambda ids: label_id(ids, char_to_id))
df_eval = df_eval[['src', 'tgt', 'len', 'char_id_list', 'label_id_list']]

df_train.head()
# Optional: persist the processed frames.
# df_train.to_csv('df_train.csv', index=False)
# df_eval.to_csv('df_eval.csv', index=False)

Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,882,14,"[3, 6, 14, 15, 6, 5, 12, 4, 2, 16, 17, 10, 10,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 4, 1]"
1,(6+1)*5=35,35,10,"[15, 8, 12, 3, 16, 14, 7, 17, 5, 7, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 1]"
2,13+32+29=74,74,11,"[3, 5, 12, 5, 4, 12, 4, 11, 17, 9, 6, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 6, 1]"
3,31*(3-11)=-248,-248,14,"[5, 3, 14, 15, 5, 13, 3, 3, 16, 17, 13, 4, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 4, 6, 10, 1]"
4,24*49+1=1177,1177,12,"[4, 6, 14, 6, 11, 12, 3, 17, 3, 3, 9, 9, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 9, 9, 1]"


In [16]:
# Model (ASK Claude and TA example)
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a table of pre-tokenised expressions.

    Each row must provide a ``char_id_list`` entry (input ids) and a
    ``label_id_list`` entry (target ids).
    """

    def __init__(self, sequences):
        # Rows are fetched positionally via `.iloc`, so `sequences` is expected
        # to be a pandas DataFrame (or an object with the same interface).
        self.sequences = sequences

    def __len__(self):
        """Number of examples held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the ``(x, y)`` tensor pair for the row at position ``index``."""
        row = self.sequences.iloc[index]
        input_ids, label_ids = (
            torch.tensor(row[column])
            for column in ('char_id_list', 'label_id_list')
        )
        return input_ids, label_ids

# Model Implementation
class CharRNN(torch.nn.Module):
    """Character-level next-token model: embedding -> two LSTM layers -> MLP head.

    The class reads the notebook-level ``char_to_id`` / ``id_to_char`` tables:
    ``char_to_id['<pad>']`` is used as the embedding's ``padding_idx``, and
    ``generator`` uses both tables to convert between characters and ids.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        """
        Args:
            vocab_size: number of distinct token ids (embedding rows and
                output logits per position).
            embed_dim: size of each token embedding.
            hidden_dim: hidden size of both LSTM layers and of the
                intermediate linear layer.
        """
        super(CharRNN, self).__init__()

        # Embedding layer
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])

        # Two LSTM layers (kept as separate modules; batch is the first axis)
        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        # Output head: Linear -> ReLU -> Linear producing one logit per token id
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        )

    def forward(self, x, target=None):
        """Compute next-token logits for every position of ``x``.

        Args:
            x: integer id tensor of shape (batch_size, sequence_length).
            target: unused; accepted only so existing call sites keep working.

        Returns:
            Tensor of shape (batch_size, sequence_length, vocab_size).
        """
        # 1. Embed all input ids -> (batch_size, sequence_length, embed_dim)
        embedded = self.embedding(x)

        # 2. Run both LSTM layers -> (batch_size, sequence_length, hidden_dim)
        output1, _ = self.rnn_layer1(embedded)
        output2, _ = self.rnn_layer2(output1)

        # 3. Project to vocabulary logits
        return self.linear(output2)

    def generator(self, start_char, max_len=200):
        """Greedily extend ``start_char`` one token at a time.

        At each step the whole sequence so far is fed through the model and
        the arg-max token at the last position is appended. Generation stops
        when ``<eos>`` is predicted (it is not appended) or when the sequence
        reaches ``max_len`` tokens.

        Args:
            start_char: iterable of characters, each a key of ``char_to_id``.
            max_len: maximum total length (prompt included) of the result.

        Returns:
            List of characters: the prompt followed by the generated tokens.
        """
        # Convert input characters to IDs
        char_list = [char_to_id[c] for c in start_char]

        # Build inputs on whatever device the model's parameters live on.
        device = next(self.parameters()).device

        # Pure inference: disable autograd so no graph is built per step.
        with torch.no_grad():
            while len(char_list) < max_len:
                x = torch.tensor(char_list).unsqueeze(0).to(device)

                # Reuse forward() instead of repeating the layer stack here.
                logits = self.forward(x)

                # Greedy choice from the prediction at the last position
                next_char = torch.argmax(logits[:, -1, :], dim=-1).item()

                if next_char == char_to_id['<eos>']:
                    break

                char_list.append(next_char)

        # Convert IDs back to characters
        return [id_to_char[ch_id] for ch_id in char_list]

In [19]:
# (ASK Claude and ChatGPT)
def collate_fn(batch):
    """Collate ``(sequence, label)`` tensor pairs into two padded batch tensors.

    Both the sequences and the labels are right-padded with the ``<pad>`` id
    from the notebook-level ``char_to_id`` table, up to the longest item in
    the batch, and stacked with the batch as the first dimension.

    Returns:
        ``(padded_sequences, padded_labels)``.
    """
    # Split the list of pairs into all inputs and all labels.
    inputs, targets = zip(*batch)

    pad_id = char_to_id['<pad>']
    padded_inputs, padded_targets = (
        pad_sequence(group, batch_first=True, padding_value=pad_id)
        for group in (inputs, targets)
    )
    return padded_inputs, padded_targets

def train_step(model, optimizer, criterion, x, y):
    """Run a single optimisation step on one batch and return its loss.

    Args:
        model: callable mapping ``x`` to logits of shape (batch, seq, vocab).
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (N, vocab) logits and (N,) target ids.
        x: input id tensor of shape (batch, seq).
        y: label id tensor with the same shape as ``x``.

    Returns:
        The batch loss as a Python float.
    """
    logits = model(x)
    vocab_dim = logits.size(-1)

    # The logits at position t are scored against the label at position t+1,
    # so the last prediction and the first label are dropped.
    predictions = logits[:, :-1].reshape(-1, vocab_dim)
    next_tokens = y[:, 1:].reshape(-1)
    loss = criterion(predictions, next_tokens)

    # Back-propagate and update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

# Usage example: the training loop in the next cell calls train_step once per batch.


In [None]:
# (ASK Claude and ChatGPT)
# Train on the GPU when one is available; every batch is moved to the same
# device before it is handed to train_step.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CharRNN(vocab_size=len(char_to_id), embed_dim=64, hidden_dim=128).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])
# `verbose=True` is deprecated in recent PyTorch releases, so the current
# learning rate is printed explicitly at the end of each epoch instead.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

num_epochs = 10
train_dataset = Dataset(df_train)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,  # reduced batch size
    shuffle=True,
    collate_fn=collate_fn
)

# Training loop. Errors are deliberately NOT caught here: swallowing them
# would let the loop spin through the remaining epochs and then save a
# checkpoint of a model that never trained.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    batch_count = 0

    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        loss = train_step(model, optimizer, criterion, x, y)
        total_loss += loss
        batch_count += 1

        if (batch_idx + 1) % 1000 == 0:
            avg_loss = total_loss / batch_count
            print(f"Epoch {epoch+1}, Batch {batch_idx+1}, Loss: {loss:.4f}, Avg Loss: {avg_loss:.4f}")

    if batch_count == 0:
        raise RuntimeError("train_loader produced no batches; is df_train empty?")

    # Update the learning rate from the mean training loss of this epoch
    epoch_loss = total_loss / batch_count
    scheduler.step(epoch_loss)
    print(f"Epoch {epoch+1} completed, Average Loss: {epoch_loss:.4f}")
    print(f"Current learning rate: {optimizer.param_groups[0]['lr']:.6g}")

# Save the trained weights
torch.save(model.state_dict(), 'char_rnn_model.pth')

# # Load the model
# model.load_state_dict(torch.load('char_rnn_model.pth'))
# model.eval()


In [25]:
# (ASK Claude and ChatGPT)
def evaluate_model(model, eval_loader, device, char_to_id, id_to_char):
    """Evaluate next-token prediction on ``eval_loader``.

    For every batch the model output at position t is compared with the label
    at position t+1; label positions equal to the ``<pad>`` id are ignored for
    both the loss and the accuracy statistics.

    Args:
        model: module mapping an id tensor (batch, seq) to logits
            (batch, seq, len(char_to_id)).
        eval_loader: iterable of ``(input_seq, target_seq)`` tensor batches.
        device: device the batches are moved to before the forward pass.
        char_to_id: mapping from token string to id; must contain ``'<pad>'``
            and use the ids ``0 .. len(char_to_id) - 1``.
        id_to_char: reverse mapping. Kept for interface compatibility; the
            statistics are accumulated per id and keyed through ``char_to_id``.

    Returns:
        dict with
          * ``'accuracy'``: percentage of non-pad label tokens predicted
            correctly (0 when there are none),
          * ``'loss'``: mean of the per-batch losses (0.0 for an empty loader),
          * ``'char_accuracies'``: per-token accuracy in percent for every
            token that occurred at least once as a label.
    """
    model.eval()
    pad_id = char_to_id['<pad>']
    vocab_size = len(char_to_id)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    total_loss = 0.0
    num_batches = 0
    # Per-id counters, accumulated on the CPU with vectorised bincounts rather
    # than a Python loop that calls .item() on every single token.
    total_by_id = torch.zeros(vocab_size, dtype=torch.long)
    correct_by_id = torch.zeros(vocab_size, dtype=torch.long)

    with torch.no_grad():
        for input_seq, target_seq in eval_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            output = model(input_seq)

            # Drop the last prediction and the first label (next-token shift).
            logits = output[:, :-1].reshape(-1, vocab_size)
            targets = target_seq[:, 1:].reshape(-1)

            total_loss += criterion(logits, targets).item()
            num_batches += 1

            _, predicted = torch.max(logits, dim=1)

            # Only score positions whose label is not padding.
            mask = targets != pad_id
            predicted = predicted[mask]
            targets = targets[mask]

            total_by_id += torch.bincount(targets, minlength=vocab_size).cpu()
            correct_by_id += torch.bincount(
                targets[predicted == targets], minlength=vocab_size
            ).cpu()

    total = int(total_by_id.sum())
    correct = int(correct_by_id.sum())

    accuracy = (correct / total * 100) if total > 0 else 0
    # Guard the empty-loader case instead of dividing by zero.
    avg_loss = (total_loss / num_batches) if num_batches > 0 else 0.0

    char_accuracies = {}
    for char, idx in char_to_id.items():
        seen = int(total_by_id[idx])
        if seen > 0:
            char_accuracies[char] = (int(correct_by_id[idx]) / seen * 100)

    return {
        'accuracy': accuracy,
        'loss': avg_loss,
        'char_accuracies': char_accuracies
    }


def main(model_path='char_rnn_model_v3'):
    """Load a saved CharRNN checkpoint and print evaluation metrics on ``df_eval``.

    Args:
        model_path: path of the ``state_dict`` file to evaluate. The default
            keeps the checkpoint name this cell was originally run with; the
            training cell in this notebook saves to ``'char_rnn_model.pth'``,
            so pass that path to evaluate a freshly trained model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = CharRNN(vocab_size=len(char_to_id), embed_dim=64, hidden_dim=128)
    # weights_only=True limits unpickling to tensors and plain containers,
    # which is all a state_dict needs, and avoids the FutureWarning that
    # torch.load emits when the argument is left unspecified.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    eval_dataset = Dataset(df_eval)
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=16,
        shuffle=False,  # keep evaluation order deterministic
        collate_fn=collate_fn
    )

    results = evaluate_model(model, eval_loader, device, char_to_id, id_to_char)

    print("\nEvaluation Results:")
    print(f"Overall Accuracy: {results['accuracy']:.2f}%")
    print(f"Average Loss: {results['loss']:.4f}")
    print("\nPer-character Accuracy:")

    # Highest per-character accuracy first.
    sorted_chars = sorted(
        results['char_accuracies'].items(),
        key=lambda x: x[1],
        reverse=True
    )

    for char, acc in sorted_chars:
        # The special tokens are not reported individually.
        if char not in ['<pad>', '<eos>']:
            print(f"'{char}': {acc:.2f}%")

if __name__ == "__main__":
    main()

  model.load_state_dict(torch.load('char_rnn_model_v3', map_location=device))



Evaluation Results:
Overall Accuracy: 88.40%
Average Loss: 0.2694

Per-character Accuracy:
'-': 99.94%
'0': 93.45%
'1': 91.28%
'2': 86.78%
'5': 81.77%
'4': 80.64%
'6': 78.51%
'8': 77.92%
'9': 76.96%
'3': 76.89%
'7': 73.19%
