70. 単語埋め込みの読み込み




In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install gensim



In [4]:
import numpy as np
from gensim.models import KeyedVectors

word2id = {}
id2word = {}
embeddings = []

EMB_DIM = 300
word2id["<PAD>"] = 0
id2word[0] = "<PAD>"
embeddings.append(np.zeros(EMB_DIM))

file_path = '/content/drive/MyDrive/Colab Notebooks/chapter8/GoogleNews-vectors-negative300.bin'

model = KeyedVectors.load_word2vec_format(
    file_path,
    binary=True
)

MAX_VOCAB = 300000

for word in model.index_to_key:
    if len(embeddings) >= MAX_VOCAB:
        break

    token_id = len(embeddings)
    word2id[word] = token_id
    id2word[token_id] = word
    embeddings.append(model[word])


E = np.vstack(embeddings)
print(E.shape)
print(word2id["<PAD>"])
print(E[0].sum())

(300000, 300)
0
0.0


71. データセットの読み込み




In [5]:
!pip install datasets



In [6]:
import csv
import torch

def load_sst_tsv(file_path, word2id):
    data = []

    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')

        for row in reader:
            text = row['sentence']
            label = int(row['label'])

            tokens = text.split()

            input_ids = [
                word2id[token]
                for token in tokens
                if token in word2id
            ]

            if len(input_ids) == 0:
                continue

            example = {
                'text': text,
                'label': torch.tensor([label], dtype=torch.float),
                'input_ids': torch.tensor(input_ids, dtype=torch.long)
            }

            data.append(example)

    return data


train_path = '/content/drive/MyDrive/Colab Notebooks/chapter8/SST-2/train.tsv'
dev_path = '/content/drive/MyDrive/Colab Notebooks/chapter8/SST-2/dev.tsv'

train_data = load_sst_tsv(train_path, word2id)
dev_data = load_sst_tsv(dev_path, word2id)

print(len(train_data), len(dev_data))
print(train_data[0])


66396 872
{'text': 'hide new secretions from the parental units ', 'label': tensor([0.]), 'input_ids': tensor([  5785,     66, 113845,     18,     12,  15095,   1594])}


72. Bag of wordsモデルの構築

In [21]:
import torch
import torch.nn as nn

embedding_matrix = torch.tensor(E, dtype=torch.float)
class BoWClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()
        # Convert numpy array to torch tensor if it's not already one
        if isinstance(embedding_matrix, np.ndarray):
            embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)

        vocab_size, emb_dim = embedding_matrix.shape

        self.embedding = nn.Embedding(
            vocab_size,
            emb_dim,
            padding_idx=0
        )

        self.embedding.weight.data.copy_(embedding_matrix)
        self.embedding.weight.requires_grad = False

        self.fc = nn.Linear(emb_dim, 1)

    def forward(self, input_ids):
        """
        input_ids: (batch_size, seq_len)
        """
        embeds = self.embedding(input_ids)
        avg_embeds = embeds.mean(dim=1)
        logits = self.fc(avg_embeds)
        return logits


73. モデルの学習

In [13]:
from torch.utils.data import DataLoader
import torch.nn as nn # Ensure nn is imported
import torch.optim as optim # Ensure optim is imported

def collate_fn(batch):
    texts = [x['text'] for x in batch]
    labels = torch.stack([x['label'] for x in batch])

    input_ids = [x['input_ids'] for x in batch]
    max_len = max(len(ids) for ids in input_ids)

    padded = []
    for ids in input_ids:
        pad_len = max_len - len(ids)
        padded.append(
            torch.cat([ids, torch.zeros(pad_len, dtype=torch.long)])
        )

    return {
        'text': texts,
        'input_ids': torch.stack(padded),
        'label': labels
    }

train_loader = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

model = BoWClassifier(embedding_matrix)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=1e-3)

EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        logits = model(batch['input_ids'])
        loss = criterion(logits, batch['label'])

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (i + 1) % 100 == 0:
            print(
                f"Epoch [{epoch+1}/{EPOCHS}], "
                f"Step [{i+1}/{len(train_loader)}], "
                f"Loss: {loss.item():.4f}"
            )

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} finished. Avg Loss: {avg_loss:.4f}")

Epoch [1/5], Step [100/2075], Loss: 0.6875
Epoch [1/5], Step [200/2075], Loss: 0.6599
Epoch [1/5], Step [300/2075], Loss: 0.6768
Epoch [1/5], Step [400/2075], Loss: 0.6170
Epoch [1/5], Step [500/2075], Loss: 0.6393
Epoch [1/5], Step [600/2075], Loss: 0.6443
Epoch [1/5], Step [700/2075], Loss: 0.6387
Epoch [1/5], Step [800/2075], Loss: 0.6581
Epoch [1/5], Step [900/2075], Loss: 0.6307
Epoch [1/5], Step [1000/2075], Loss: 0.5545
Epoch [1/5], Step [1100/2075], Loss: 0.6460
Epoch [1/5], Step [1200/2075], Loss: 0.6502
Epoch [1/5], Step [1300/2075], Loss: 0.5625
Epoch [1/5], Step [1400/2075], Loss: 0.6101
Epoch [1/5], Step [1500/2075], Loss: 0.6074
Epoch [1/5], Step [1600/2075], Loss: 0.6184
Epoch [1/5], Step [1700/2075], Loss: 0.5808
Epoch [1/5], Step [1800/2075], Loss: 0.4945
Epoch [1/5], Step [1900/2075], Loss: 0.5950
Epoch [1/5], Step [2000/2075], Loss: 0.6088
Epoch 1 finished. Avg Loss: 0.6196
Epoch [2/5], Step [100/2075], Loss: 0.4839
Epoch [2/5], Step [200/2075], Loss: 0.5536
Epoch [2

74. モデルの評価
問題73で学習したモデルの開発セットにおける正解率を求めよ。

In [15]:
def evaluate_accuracy(model, data_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch['input_ids'])
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()

            correct += (preds == batch['label']).sum().item()
            total += batch['label'].size(0)

    return correct / total
from torch.utils.data import DataLoader

dev_loader = DataLoader(
    dev_data,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)
dev_acc = evaluate_accuracy(model, dev_loader)
print(f"Dev Accuracy: {dev_acc:.4f}")


Dev Accuracy: 0.7787


75. パディング

In [16]:
import torch

def collate(batch):
    # 1. 按 token 长度从长到短排序
    batch = sorted(batch, key=lambda x: len(x['input_ids']), reverse=True)

    # 2. 取最长序列长度
    max_len = len(batch[0]['input_ids'])

    padded_input_ids = []
    labels = []

    # 3. padding
    for example in batch:
        input_ids = example['input_ids']
        pad_len = max_len - len(input_ids)

        padded_ids = torch.cat(
            [input_ids, torch.zeros(pad_len, dtype=torch.long)]
        )

        padded_input_ids.append(padded_ids)
        labels.append(example['label'])

    # 4. stack 成 tensor
    return {
        'input_ids': torch.stack(padded_input_ids),
        'label': torch.stack(labels)
    }
batch = train_data[:4]
out = collate(batch)

print(out['input_ids'])
print(out['label'])


tensor([[     4,   5053,     45,   3305,  31647,    348,    904,   2815,     47,
           1276,   1964],
        [  5785,     66, 113845,     18,     12,  15095,   1594,      0,      0,
              0,      0],
        [   987,  14528,   4941,    873,     12,    208,    898,      0,      0,
              0,      0],
        [  3475,     87,  15888,     90,  27695,  42637,      0,      0,      0,
              0,      0]])
tensor([[1.],
        [0.],
        [0.],
        [0.]])



76. ミニバッチ学習

In [17]:

from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    collate_fn=collate
)

dev_loader = DataLoader(
    dev_data,
    batch_size=32,
    shuffle=False,
    collate_fn=collate
)
import torch.nn as nn
import torch.optim as optim

EPOCHS = 5

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        logits = model(batch['input_ids'])
        loss = criterion(logits, batch['label'])

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (i + 1) % 100 == 0:
            print(
                f"Epoch [{epoch+1}/{EPOCHS}] "
                f"Step [{i+1}/{len(train_loader)}] "
                f"Loss: {loss.item():.4f}"
            )

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} finished. Avg Loss: {avg_loss:.4f}")
def evaluate_accuracy(model, data_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch['input_ids'])
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()

            correct += (preds == batch['label']).sum().item()
            total += batch['label'].size(0)

    return correct / total
dev_acc = evaluate_accuracy(model, dev_loader)
print(f"Dev Accuracy: {dev_acc:.4f}")


Epoch [1/5] Step [100/2075] Loss: 0.3536
Epoch [1/5] Step [200/2075] Loss: 0.3945
Epoch [1/5] Step [300/2075] Loss: 0.5544
Epoch [1/5] Step [400/2075] Loss: 0.4982
Epoch [1/5] Step [500/2075] Loss: 0.3710
Epoch [1/5] Step [600/2075] Loss: 0.3271
Epoch [1/5] Step [700/2075] Loss: 0.4676
Epoch [1/5] Step [800/2075] Loss: 0.5052
Epoch [1/5] Step [900/2075] Loss: 0.3585
Epoch [1/5] Step [1000/2075] Loss: 0.4435
Epoch [1/5] Step [1100/2075] Loss: 0.4646
Epoch [1/5] Step [1200/2075] Loss: 0.5190
Epoch [1/5] Step [1300/2075] Loss: 0.3942
Epoch [1/5] Step [1400/2075] Loss: 0.4658
Epoch [1/5] Step [1500/2075] Loss: 0.4435
Epoch [1/5] Step [1600/2075] Loss: 0.3105
Epoch [1/5] Step [1700/2075] Loss: 0.4259
Epoch [1/5] Step [1800/2075] Loss: 0.4721
Epoch [1/5] Step [1900/2075] Loss: 0.3741
Epoch [1/5] Step [2000/2075] Loss: 0.4527
Epoch 1 finished. Avg Loss: 0.4403
Epoch [2/5] Step [100/2075] Loss: 0.4975
Epoch [2/5] Step [200/2075] Loss: 0.3764
Epoch [2/5] Step [300/2075] Loss: 0.4691
Epoch [2/5]

77. GPU上での学習


In [18]:
import torch

device = torch.device("cpu")

model = model.to(device)

EPOCHS = 5

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
def evaluate_accuracy_gpu(model, data_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

dev_acc = evaluate_accuracy_gpu(model, dev_loader, device)
print(f"Dev Accuracy (GPU): {dev_acc:.4f}")



Epoch 1, Loss: 0.4141
Epoch 2, Loss: 0.4106
Epoch 3, Loss: 0.4087
Epoch 4, Loss: 0.4062
Epoch 5, Loss: 0.4047
Dev Accuracy (GPU): 0.7913






78. 単語埋め込みのファインチューニング




In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"[Fine-tune] Epoch {epoch+1}, Loss: {avg_loss:.4f}")
dev_acc = evaluate_accuracy_gpu(model, dev_loader, device)
print(f"Dev Accuracy (Fine-tuning): {dev_acc:.4f}")


[Fine-tune] Epoch 1, Loss: 0.6655
[Fine-tune] Epoch 2, Loss: 0.5650
[Fine-tune] Epoch 3, Loss: 0.4601
[Fine-tune] Epoch 4, Loss: 0.3890
[Fine-tune] Epoch 5, Loss: 0.3422
Dev Accuracy (Fine-tuning): 0.8096


79. アーキテクチャの変更


In [35]:
class BoWMLPClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()
        V, D = embedding_matrix.shape

        self.embedding = nn.Embedding(V, D, padding_idx=0)
        self.embedding.weight.data.copy_(
    torch.from_numpy(embedding_matrix)
)

        self.embedding.weight.requires_grad = False  # 是否 fine-tune 可选

        self.fc1 = nn.Linear(D, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)

    def forward(self, input_ids):
        emb = self.embedding(input_ids)        # (B, L, D)
        avg_emb = emb.mean(dim=1)               # (B, D)

        h = self.relu(self.fc1(avg_emb))        # (B, 128)
        logits = self.fc2(h)                    # (B, 1)
        return logits
model = BoWMLPClassifier(E).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
dev_acc = evaluate_accuracy_gpu(model, dev_loader, device)
print(f"Dev Accuracy (MLP): {dev_acc:.4f}")


Dev Accuracy (MLP): 0.4908
