In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.functional import to_map_style_dataset
import random
from datasets import load_dataset

# Load the IMDb dataset with `datasets.load_dataset` and keep its two splits.
dataset = load_dataset("imdb")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Seed torch and Python's `random` module for reproducibility.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)

# Hyperparameters.
EMBED_DIM = 100
HIDDEN_DIM = 128
BATCH_SIZE = 32
NUM_EPOCHS = 5
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer used to build the vocabulary and to encode text.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Yield the token list of every example in ``data_iter``.

    Two example layouts are accepted:

    * dict rows carrying the review under the ``"text"`` key (the layout
      Hugging Face ``datasets`` uses when a split is iterated);
    * ``(label, text)`` pairs.

    Unpacking a dict row as ``label, line`` binds the dict's *keys*, not its
    values, so the review text would never reach the tokenizer; the text is
    therefore looked up by key for dict rows.
    """
    for example in data_iter:
        if isinstance(example, dict):
            line = example["text"]
        else:
            _, line = example
        yield tokenizer(line)

# Build the vocabulary from the training split, reserving "<pad>" and "<unk>"
# as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(train_dataset), specials=["<pad>", "<unk>"])
# Tokens missing from the vocabulary are looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])

# Encode text
def encode(text):
    """Tokenize ``text`` and map the resulting tokens through ``vocab``."""
    tokens = tokenizer(text)
    return vocab(tokens)

# Encode labels
def label_to_tensor(label):
    """Convert a sentiment label into a 0-dim class-index tensor.

    Args:
        label: Either a string label (``"pos"`` is the positive class, any
            other string is negative) or an integer-like label where ``1``
            is positive and anything else is negative (the 0/1 encoding of
            the Hugging Face IMDb dataset).

    Returns:
        ``torch.tensor(1)`` for a positive label, ``torch.tensor(0)``
        otherwise.
    """
    if isinstance(label, str):
        is_positive = label == "pos"
    else:
        # Integer labels never compare equal to "pos", which would silently
        # map every example to class 0; compare them numerically instead.
        is_positive = int(label) == 1
    return torch.tensor(1 if is_positive else 0)

# Batch preprocessing function
def collate_batch(batch):
    """Turn a list of examples into padded text and label tensors on DEVICE.

    Each example may be a dict row with ``"label"`` and ``"text"`` keys (the
    layout Hugging Face ``datasets`` returns when indexed) or a
    ``(label, text)`` pair. Unpacking a dict row directly would bind its keys
    instead of its values, so dict rows are read by key.

    Args:
        batch: List of examples as described above.

    Returns:
        A tuple ``(text, labels)``: ``text`` is the ``pad_sequence`` output
        (time-major, i.e. shape ``(max_len, batch_size)``), padded with the
        ``"<pad>"`` index; ``labels`` is a 1-D long tensor with one class
        index per example. Both are moved to ``DEVICE``.
    """
    text_list, label_list = [], []
    for example in batch:
        if isinstance(example, dict):
            label, text = example["label"], example["text"]
        else:
            label, text = example
        text_list.append(torch.tensor(encode(text), dtype=torch.long))
        label_list.append(label_to_tensor(label))
    padded_text = pad_sequence(text_list, padding_value=vocab["<pad>"])
    labels = torch.tensor(label_list, dtype=torch.long)
    return padded_text.to(DEVICE), labels.to(DEVICE)

# Batch iterators over the two splits; only the training split is shuffled.
# Batches are assembled by collate_batch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# LSTM sentiment classification model
class SentimentLSTM(nn.Module):
    """Embedding -> LSTM -> dropout -> linear classifier over two classes.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension (LSTM input size).
        hidden_dim: LSTM hidden size.
        pad_idx: Index of the padding token. When ``None`` (the default) it
            is looked up as ``vocab["<pad>"]`` from the module-level
            vocabulary.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            pad_idx = vocab["<pad>"]
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=False)
        self.fc = nn.Linear(hidden_dim, 2)  # binary classification
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        """Return class logits of shape ``(batch, 2)``.

        Args:
            text: Long tensor of token indices in time-major layout
                ``(seq_len, batch)`` (the LSTM is built with
                ``batch_first=False``), padded with ``pad_idx``.
                Padding is assumed to be trailing only.
        """
        embedded = self.embedding(text)
        # True length of each sequence = number of non-pad positions. Clamp to
        # 1 because packing rejects zero-length sequences (e.g. empty text).
        # pack_padded_sequence requires the lengths on the CPU.
        lengths = (text != self.pad_idx).sum(dim=0).clamp(min=1).cpu()
        # Pack so the LSTM stops at each sequence's last real token; without
        # this the final hidden state is computed after running over the
        # padding and depends on how long the batch's longest review is.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        return self.fc(self.dropout(hidden[-1]))

# Instantiate the classifier, sized to the vocabulary, and move it to DEVICE.
model = SentimentLSTM(len(vocab), EMBED_DIM, HIDDEN_DIM).to(DEVICE)

# Training configuration: cross-entropy loss and the Adam optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training pass
def train(model, dataloader):
    """Run one optimization pass over ``dataloader``.

    Uses the module-level ``optimizer`` and ``criterion``. Returns the sum of
    the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch_text, batch_labels in dataloader:
        optimizer.zero_grad()
        logits = model(batch_text)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    num_batches = len(dataloader)
    return running_loss / num_batches

# Evaluation pass
def evaluate(model, dataloader):
    """Return the fraction of examples in ``dataloader`` classified correctly.

    The model is switched to eval mode and run without gradient tracking; the
    predicted class of each example is the argmax over dimension 1 of the
    model output.
    """
    model.eval()
    n_hits, n_seen = 0, 0
    with torch.no_grad():
        for batch_text, batch_labels in dataloader:
            logits = model(batch_text)
            predicted = logits.argmax(dim=1)
            n_hits += predicted.eq(batch_labels).sum().item()
            n_seen += batch_labels.size(0)
    return n_hits / n_seen

# Train for NUM_EPOCHS epochs; after each one, print the mean training loss
# and the accuracy on the test split.
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_loader)
    acc = evaluate(model, test_loader)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Test Accuracy = {acc:.4f}")


OSError: [WinError 127] 找不到指定的程序。