<a href="https://colab.research.google.com/github/ORAzzQWQ/NLP_2024/blob/main/NLP_HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
!gdown --id 1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE- -O arithmetic_NLP.zip
!unzip arithmetic_NLP.zip

Downloading...
From (original): https://drive.google.com/uc?id=1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-
From (redirected): https://drive.google.com/uc?id=1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-&confirm=t&uuid=57c26d67-1b57-41e4-9528-3de4881dd7a6
To: /content/arithmetic_NLP.zip
100% 27.3M/27.3M [00:01<00:00, 23.9MB/s]
Archive:  arithmetic_NLP.zip
  inflating: arithmetic_eval.csv     
  inflating: arithmetic_train.csv    


In [None]:
# Read the unpacked training and evaluation splits (in that order), then
# preview the first training rows.
df_train, df_eval = (
    pd.read_csv(csv_name)
    for csv_name in ('arithmetic_train.csv', 'arithmetic_eval.csv')
)
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [None]:
# Turn every answer into text and glue it onto its expression, so `src` holds
# the full "expression=answer" string and `len` holds its character count.
# NOTE: this cell is not idempotent - running it twice appends the answer to
# `src` a second time. Reload the CSV files before re-running it.
df_train['tgt'] = df_train['tgt'].map(str)
df_train['src'] = df_train['src'] + df_train['tgt']
df_train['len'] = df_train['src'].map(len)

df_eval['tgt'] = df_eval['tgt'].map(str)
df_eval['src'] = df_eval['src'] + df_eval['tgt']
df_eval['len'] = df_eval['src'].map(len)

In [None]:
# Character-level vocabulary: the ten digits, the two special tokens, then the
# arithmetic symbols. A token's position in this list is its integer id.
characters = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '<pad>', '<eos>', '+', '-', '*', '(', ')', '=']

# Forward (token -> id) and reverse (id -> token) lookup tables.
char_to_id = {token: position for position, token in enumerate(characters)}
id_to_char = {position: token for position, token in enumerate(characters)}

vocab_size = len(char_to_id)

print('vocab_size: {}'.format(vocab_size))

vocab_size: 8813


In [None]:
# Data processing
def char_id(expr, token_map):
    """Encode an arithmetic expression string as a list of token ids.

    The string is split into runs of digits and the single symbols
    ``+ - * / = ( )``; any other character is skipped. A token that is a key
    of ``token_map`` is encoded directly. A multi-digit number that is *not* a
    key (the case for a character-level vocabulary that only contains the
    digits ``0``-``9``) is encoded one digit at a time instead of raising
    ``KeyError``.

    Args:
        expr: Expression text, e.g. ``"14*(43+20)=882"``.
        token_map: Mapping from token string to integer id. Must contain
            ``'<eos>'``.

    Returns:
        The ids of the tokens in order, followed by the ``'<eos>'`` id.

    Raises:
        KeyError: If a symbol or digit has no entry in ``token_map``.
    """
    id_list = []
    for token in re.findall(r'\d+|[+\-*/=()]', expr):
        if token in token_map:
            id_list.append(token_map[token])
        else:
            # Number missing from the vocabulary: fall back to per-digit ids.
            id_list.extend(token_map[ch] for ch in token)
    id_list.append(token_map['<eos>'])
    return id_list

def label_id(char_id_list, token_map):
    """Build training labels that keep only the ids after the first ``'='``.

    Every position up to and including the first equals sign is replaced by
    the ``'<pad>'`` id from ``token_map`` (0 when the map has no ``'<pad>'``
    entry), so that a loss configured with ``ignore_index`` set to the padding
    id skips those positions. The ids after the equals sign are kept as-is.

    Args:
        char_id_list: Token ids of a full "expression=answer" sequence.
        token_map: Mapping from token string to integer id. Must contain
            ``'='``.

    Returns:
        A new list with the same length as ``char_id_list``.

    Raises:
        ValueError: If the id of ``'='`` does not occur in ``char_id_list``.
    """
    equal_pos = char_id_list.index(token_map['='])
    # Mask with the real padding id: a literal 0 may be a regular token
    # (it is the digit '0' in a vocabulary that lists the digits first).
    pad_id = token_map.get('<pad>', 0)
    return [pad_id] * (equal_pos + 1) + char_id_list[equal_pos + 1:]

# Encode each full "expression=answer" string as token ids ending in <eos>.
df_train['char_id_list'] = df_train['src'].map(lambda expr: char_id(expr, char_to_id))
# Labels keep only the ids after the equals sign; label_id overwrites the rest
# with a filler id.
df_train['label_id_list'] = df_train['char_id_list'].map(lambda ids: label_id(ids, char_to_id))
# Keep just the columns used from here on.
df_train = df_train.loc[:, ['src', 'tgt', 'len', 'char_id_list', 'label_id_list']]

df_train.head()

Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,882,14,"[1049, 4, 2, 5619, 5, 2429, 3, 8812, 8412, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 8412, 1]"
1,(6+1)*5=35,35,10,"[2, 6992, 5, 8, 3, 4, 6244, 8812, 4771, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 4771, 1]"
2,13+32+29=74,74,11,"[806, 5, 4410, 5, 4023, 8812, 7807, 1]","[0, 0, 0, 0, 0, 0, 7807, 1]"
3,31*(3-11)=-248,-248,14,"[4281, 4, 2, 4154, 6, 296, 3, 8812, 6, 3431, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 6, 3431, 1]"
4,24*49+1=1177,1177,12,"[3282, 4, 6160, 5, 8, 8812, 498, 1]","[0, 0, 0, 0, 0, 0, 498, 1]"


In [None]:
# Model
class DataSet(torch.utils.data.Dataset):
    """Map-style dataset over indexable (input ids, label ids) records."""

    def __init__(self, sequences):
        """Store ``sequences``; item ``i`` must expose its inputs at ``[0]`` and labels at ``[1]``."""
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored records."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return record ``idx`` as a pair of ``torch.long`` tensors ``(inputs, labels)``."""
        record = self.sequences[idx]
        inputs = torch.tensor(record[0], dtype=torch.long)
        labels = torch.tensor(record[1], dtype=torch.long)
        return inputs, labels

class CharRNN(torch.nn.Module):
    """Token embedding, two stacked LSTM layers, and a two-layer MLP head.

    The class reads the module-level ``char_to_id`` / ``id_to_char`` tables:
    ``char_to_id['<pad>']`` is used as the embedding's padding index, and
    ``generator`` uses both tables to encode its prompt and decode its output.
    """

    def __init__(self, vocab_size, hidden_size, embed_dim):
        """Create the layers.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and
                output logits).
            hidden_size: Hidden size of both LSTM layers and of the MLP's
                intermediate layer.
            embed_dim: Size of each token embedding vector.
        """
        super().__init__()

        # padding_idx: the embedding row of the padding token receives no
        # gradient updates.
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])

        # batch_first=True: both LSTMs take and return (batch, seq, feature).
        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_size,
                                        batch_first=True)

        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_size,
                                        hidden_size=hidden_size,
                                        batch_first=True)

        # Maps a hidden state to one logit per vocabulary entry.
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_size,
                                                          out_features=hidden_size),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_size,
                                                          out_features=vocab_size))

    def _hidden_states(self, x):
        """Embed token ids and run both LSTM layers; returns every time step's output."""
        output = self.embedding(x)
        output, _ = self.rnn_layer1(output)
        output, _ = self.rnn_layer2(output)
        return output

    def generator(self, start_char, max_len=200):
        """Greedily extend a prompt one token at a time.

        Each character of ``start_char`` is looked up in ``char_to_id``. The
        whole sequence so far is then fed through the network repeatedly, and
        the highest-scoring next token is appended until ``'<eos>'`` is
        predicted or the sequence holds ``max_len`` tokens.

        Args:
            start_char: Prompt whose items are keys of ``char_to_id``.
            max_len: Upper bound on the returned length, prompt included.

        Returns:
            List of token strings: the prompt followed by the generated
            tokens, without ``'<eos>'``.
        """
        char_list = [char_to_id[c] for c in start_char]
        # Build inputs on the same device as the weights; a CPU tensor fed to
        # a model that was moved to CUDA raises a device-mismatch error.
        device = next(self.parameters()).device

        # Inference only: no autograd graph is needed for decoding.
        with torch.no_grad():
            while len(char_list) < max_len:
                # Shape (1, current_length): a batch holding the one sequence.
                x = torch.tensor([char_list], dtype=torch.long, device=device)
                hidden = self._hidden_states(x)

                # Score the vocabulary from the last time step only.
                next_char_logits = self.linear(hidden[:, -1, :])
                next_char = torch.argmax(next_char_logits, dim=1).item()

                if next_char == char_to_id['<eos>']:
                    break
                char_list.append(next_char)
        return [id_to_char[c] for c in char_list]

    def forward(self, x):
        """Return next-token logits computed from the final time step.

        Args:
            x: Integer token ids laid out as (batch, seq), as required by the
                ``batch_first=True`` LSTMs.

        Returns:
            Logits of shape (batch, vocab_size) taken at the last position of
            every sequence.

        NOTE(review): only the last position is scored. Confirm this matches
        the training targets - per-position label sequences would need logits
        for every time step, and with right-padded batches the last position
        of a shorter sequence is padding.
        """
        output = self._hidden_states(x)
        output = self.linear(output[:, -1, :])
        return output


In [None]:
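# Load data
# TODO: define `train_loader` in this cell - the training cell below iterates over it,
# but no cell in this notebook creates it. The sequences have different lengths, so the
# DataLoader needs a `collate_fn` that pads every batch to a common length (for example
# torch.nn.utils.rnn.pad_sequence with batch_first=True and
# padding_value=char_to_id['<pad>']); the default collate function fails with
# "stack expects each tensor to be equal size".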

In [None]:
# Model hyper-parameters.
vocab_size = len(char_to_id)
hidden_size = 128
embed_dim = 64

model = CharRNN(vocab_size, hidden_size, embed_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Targets equal to the padding id are left out of the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# NOTE: `train_loader` is not created in this cell; it must already exist and
# yield (inputs, targets) batches.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        # Forward pass
        optimizer.zero_grad()
        output = model(x)

        # Compute the loss
        loss = criterion(output, y)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean over the epoch rather than the last batch's loss alone;
    # max(..., 1) keeps an empty loader from dividing by zero.
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / max(num_batches, 1):.4f}')

# Decode a sample in evaluation mode and without tracking gradients.
model.eval()
start_char = "1+2="
with torch.no_grad():
    generated_text = model.generator(start_char, max_len=200)
print("Generated text:", ''.join(generated_text))

# Save the trained weights
torch.save(model.state_dict(), 'char_rnn_model.pth')

# Reload the weights; map_location places them on the device selected above,
# so a checkpoint written on GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('char_rnn_model.pth', map_location=device))
model.eval()


RuntimeError: stack expects each tensor to be equal size, but got [10] at entry 0 and [9] at entry 1