## NER

### データ準備

#### データロード

In [1]:
import pickle
# pickle.load('ner_dataset_senteces.txt')
def load_dataset(sentences_file_name='ner_dataset_sentences.txt', labels_file_name='ner_dataset_labels.txt'):
    with open(sentences_file_name, 'rb') as fp:
        sentences = pickle.load(fp)
    with open(labels_file_name, 'rb') as fp:
        labels = pickle.load(fp)
    return sentences, labels    


In [2]:
sentences, labels = load_dataset()

In [3]:
sentences[:10]

['The cat sat on the mat .',
 'John lives in New York .',
 'I have two dogs .',
 'She works at Google .',
 'The Eiffel Tower is in Paris .',
 'He is from Spain .',
 'I visited the Great Wall of China .',
 'She is studying at Oxford University .',
 'He works for the United Nations .',
 'Berlin is the capital of Germany .']

In [4]:
labels[:10]

[['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-per', 'O', 'O', 'B-geo', 'I-geo', 'O'],
 ['O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'B-org', 'O'],
 ['O', 'B-geo', 'I-geo', 'O', 'O', 'B-geo', 'O'],
 ['O', 'O', 'O', 'B-geo', 'O'],
 ['O', 'O', 'O', 'B-geo', 'I-geo', 'I-geo', 'I-geo', 'O'],
 ['O', 'O', 'O', 'O', 'B-org', 'I-org', 'O'],
 ['O', 'O', 'O', 'O', 'B-org', 'I-org', 'O'],
 ['B-geo', 'O', 'O', 'O', 'O', 'B-geo', 'O']]

#### ラベルエンコード

In [5]:
from sklearn.preprocessing import LabelEncoder
all_labels = [label for sublist in labels for label in sublist]
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
encoded_labels = [label_encoder.transform(label)+1 for label in labels] # paddingのラベルが0であることを考慮し，ラベルを1ずらす

In [6]:
encoded_labels[:3]

[array([7, 7, 7, 7, 7, 7, 7]),
 array([3, 7, 7, 1, 4, 7]),
 array([7, 7, 7, 7, 7])]

#### 辞書作成とエンコード

In [7]:
# 辞書初期化
word2idx = {"<PAD>": 0}

In [8]:
encoded_sentences = []
for sentence in sentences:
    encoded_sentence = [word2idx.setdefault(word, len(word2idx)) for word in sentence.split()]
    encoded_sentences.append(encoded_sentence)

In [9]:
encoded_sentences[:3]

[[1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 7], [13, 14, 15, 16, 7]]

#### 学習データと検証データ分割

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val, train_sentences, val_sentences = train_test_split(encoded_sentences, encoded_labels, sentences, test_size=0.2, random_state=42)

In [11]:
X_train[:3]

[[94, 51, 95, 96, 97, 98, 7], [17, 18, 19, 87, 7], [1, 50, 51, 52, 10, 53, 7]]

In [12]:
train_sentences[:3]

['They are visiting London this summer .',
 'She works at Facebook .',
 'The Pyramids are located in Egypt .']

#### padding

In [13]:
import torch
from torch.nn.utils.rnn import pad_sequence

seq1 = torch.tensor([1, 2, 3])
seq2 = torch.tensor([4, 5])
seq3 = torch.tensor([6, 7, 8, 9])

sequences = [seq1, seq2, seq3]
padded_sequences = pad_sequence(sequences, batch_first=True)
print(padded_sequences)

tensor([[1, 2, 3, 0],
        [4, 5, 0, 0],
        [6, 7, 8, 9]])


In [14]:
X_train = pad_sequence([torch.tensor(x) for x in X_train], batch_first=True)
X_val = pad_sequence([torch.tensor(x) for x in X_val], batch_first=True)
y_train = pad_sequence([torch.tensor(y) for y in y_train], batch_first=True)
y_val = pad_sequence([torch.tensor(y) for y in y_val], batch_first=True)

In [15]:
X_train.shape

torch.Size([24, 10])

#### NERのモデル

In [16]:
import torch.nn as nn
class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, rnn_type='LSTM', bidirectional=False):

        super().__init__()
        self.num_directions = 2 if bidirectional else 1
        
        # embedding layer追加 (vocab_size x embedding_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        input_size = embedding_dim
        
        if rnn_type == 'RNN':
            self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        elif rnn_type == 'LSTM':
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        else:
            raise ValueError('Unsupported RNN type. Choose from ["LSTM", "RNN", "GRU", "UGRNN"]')
            
        self.fc = nn.Linear(hidden_size*self.num_directions, output_size)

    def forward(self, x):
        x = self.embedding(x)
        output_seq, _ = self.rnn(x)

        # many to many の場合は，
        # output_seq: [batch_size, seq_len, hidden_size*num_directions]
        out = self.fc(output_seq)
        return out

In [17]:
import torch
# BiRNNのテスト
# input_size = 10
vocab_size = 300
embedding_dim = 50
hidden_size = 3
batch_size = 24
seq_len = 10
output_size = 3
# サンプルのTensor
# input_tensor = torch.randn(batch_size, seq_len, vocab_size)
model = Model(vocab_size, embedding_dim, hidden_size, output_size, bidirectional=True, )
out = model(X_train)

In [18]:
out[0]

tensor([[ 0.2210, -0.3100,  0.0265],
        [ 0.0576, -0.1342,  0.1030],
        [ 0.4368,  0.0911,  0.2392],
        [ 0.6323,  0.0702,  0.0960],
        [ 0.2668, -0.0237,  0.0247],
        [-0.0732,  0.0372,  0.1260],
        [ 0.3789, -0.0255,  0.0600],
        [ 0.2555, -0.0246, -0.0072],
        [ 0.2557, -0.0593, -0.0093],
        [ 0.2613, -0.0771, -0.0071]], grad_fn=<SelectBackward0>)

### 学習ループ

In [19]:
from torch.utils.data import DataLoader, TensorDataset
vocab_size = len(word2idx)
num_classes = len(label_encoder.classes_) + 1 # padding用のクラスも含める
embedding_dim = 50
hidden_size = 40
output_size = num_classes
batch_size = 3
learning_rate = 0.003
num_epochs = 40

# モデル作成
model = Model(vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, rnn_type='LSTM', bidirectional=True)

# Data Loader作成
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 損失関数とOptimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# 学習ループ
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for i, (sentences, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        
        outputs = model(sentences)
        loss = criterion(outputs.view(-1, num_classes), labels.view(-1))
        
        loss.backward()
        
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

    # 検証データを使用して検証エラーを計算
    model.eval()
    val_loss = 0
    total_samples = 0
    total_correct = 0
    for sentences, labels in val_loader:
        
        outputs = model(sentences)
        # loss計算
        loss = criterion(outputs.view(-1, num_classes), labels.view(-1))
        val_loss += loss.item()

        # accuracy計算
        _, predicted = torch.max(outputs, dim=-1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.numel()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = total_correct / total_samples
    print(f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

Epoch 1/40, Average Training Loss: 1.9692
Val Loss: 1.7057, Val Accuracy: 0.5833
Epoch 2/40, Average Training Loss: 1.3873
Val Loss: 1.1822, Val Accuracy: 0.5833
Epoch 3/40, Average Training Loss: 0.9466
Val Loss: 1.1030, Val Accuracy: 0.5833
Epoch 4/40, Average Training Loss: 0.8277
Val Loss: 0.9434, Val Accuracy: 0.5833
Epoch 5/40, Average Training Loss: 0.6396
Val Loss: 0.8606, Val Accuracy: 0.5833
Epoch 6/40, Average Training Loss: 0.5389
Val Loss: 0.8200, Val Accuracy: 0.5833
Epoch 7/40, Average Training Loss: 0.4286
Val Loss: 0.8165, Val Accuracy: 0.5833
Epoch 8/40, Average Training Loss: 0.3184
Val Loss: 0.8276, Val Accuracy: 0.5833
Epoch 9/40, Average Training Loss: 0.2291
Val Loss: 0.8360, Val Accuracy: 0.5833
Epoch 10/40, Average Training Loss: 0.1611
Val Loss: 0.8449, Val Accuracy: 0.6042
Epoch 11/40, Average Training Loss: 0.1192
Val Loss: 0.8833, Val Accuracy: 0.6042
Epoch 12/40, Average Training Loss: 0.0849
Val Loss: 0.9146, Val Accuracy: 0.6042
Epoch 13/40, Average Trai

### 検証データの出力確認

In [20]:
model.eval()
val_outputs = model(X_val)

In [21]:
_, predicted_labels = torch.max(val_outputs, dim=-1)

In [22]:
for sentence, label in zip(val_sentences, predicted_labels):
    words = sentence.split()
    decoded_labels = label_encoder.inverse_transform(label[:len(words)]-1) # もとのラベルIDに戻す(padding分で+1していた分を戻す
    print(f'original sentence: {sentence}')
    print(f'predicted labels: {decoded_labels}')
    print()

original sentence: He lives in Los Angeles .
predicted labels: ['O' 'O' 'O' 'O' 'O' 'O']

original sentence: She is from Australia .
predicted labels: ['O' 'O' 'O' 'O' 'O']

original sentence: The Great Barrier Reef is in Australia .
predicted labels: ['O' 'B-geo' 'I-geo' 'O' 'O' 'O' 'O' 'O']

original sentence: The Amazon is the largest rainforest .
predicted labels: ['O' 'B-geo' 'O' 'O' 'O' 'B-geo' 'O']

original sentence: He works for the United Nations .
predicted labels: ['O' 'O' 'O' 'O' 'O' 'B-org' 'O']

original sentence: Berlin is the capital of Germany .
predicted labels: ['O' 'O' 'O' 'O' 'I-org' 'I-org' 'O']

