In [6]:

"""
任务 1：多特征方式的电影评论分类
    1. one-hot + MLP
    2. TF-IDF + LogisticRegression
    3. Embedding + LSTM（可调 maxlen / embed_dim）
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# ===========================
# 参数
# ===========================
vocab_size = 20000
maxlen_list = [100, 200]
embed_dim_list = [64, 128]
batch_size = 64
epochs = 3

# ===========================
# 加载 Keras IMDB
# ===========================
print("Loading IMDB...")
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)


train_text = [" ".join(map(str, seq)) for seq in x_train]
test_text  = [" ".join(map(str, seq)) for seq in x_test]


Loading IMDB...


In [9]:
# ===========================
# 1. TF-IDF + Logistic Regression
# ===========================
def tfidf_classification():
    print("\n=== TF-IDF + Logistic Regression ===")
    vectorizer = TfidfVectorizer(max_features=20000)
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred))

In [10]:
# ===========================
# 2. One-hot + MLP
# ===========================
def one_hot_mlp():
    print("\n=== One-hot + MLP ===")
    # 多热编码
    X_train = np.zeros((len(x_train), vocab_size))
    X_test = np.zeros((len(x_test), vocab_size))

    for i, seq in enumerate(x_train):
        X_train[i, seq] = 1
    for i, seq in enumerate(x_test):
        X_test[i, seq] = 1

    X_train = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train)
    X_test = torch.FloatTensor(X_test)
    y_test_t = torch.FloatTensor(y_test)

    class MLP(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Sequential(
                nn.Linear(vocab_size, 256),
                nn.ReLU(),
                nn.Linear(256, 1),
                nn.Sigmoid()
            )

        def forward(self, x):
            return self.fc(x)

    model = MLP().to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    dataset = torch.utils.data.TensorDataset(X_train, y_train_t)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(2):  # one-hot 很慢，所以只训练 2 epoch
        for x_b, y_b in loader:
            x_b, y_b = x_b.to(device), y_b.to(device)

            pred = model(x_b).squeeze()
            loss = criterion(pred, y_b)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}, Loss = {loss.item():.4f}")

    with torch.no_grad():
        pred = model(X_test.to(device)).squeeze()
        pred_label = (pred >= 0.5).long().cpu().numpy()

    print("One-hot MLP Accuracy:", accuracy_score(y_test, pred_label))

In [11]:
# ===========================
# 3. Embedding + LSTM（可调 maxlen 和 embed_dim）
# ===========================
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        out, (h, c) = self.lstm(x)
        h = h[-1]
        return self.sigmoid(self.fc(h))

def train_lstm(maxlen, embed_dim):
    print(f"\n=== LSTM (maxlen={maxlen}, embed_dim={embed_dim}) ===")

    X_train = pad_sequences(x_train, maxlen=maxlen)
    X_test = pad_sequences(x_test, maxlen=maxlen)

    X_train = torch.LongTensor(X_train)
    X_test = torch.LongTensor(X_test)
    y_train_t = torch.FloatTensor(y_train)
    y_test_t = torch.LongTensor(y_test)

    loader = DataLoader(torch.utils.data.TensorDataset(X_train, y_train_t),
                        batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(torch.utils.data.TensorDataset(X_test, y_test_t),
                             batch_size=batch_size)

    model = LSTMClassifier(vocab_size, embed_dim).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for x_b, y_b in loader:
            x_b, y_b = x_b.to(device), y_b.to(device)
            pred = model(x_b).squeeze()
            loss = criterion(pred, y_b)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss={total_loss/len(loader):.4f}")

    # 评估
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x_b, y_b in test_loader:
            x_b, y_b = x_b.to(device), y_b.to(device)
            pred = model(x_b).squeeze()
            pred_label = (pred >= 0.5).long()
            correct += (pred_label == y_b).sum().item()
            total += y_b.size(0)

    print(f"LSTM Accuracy = {correct/total:.4f}")

In [12]:


# ===========================
# 主程序
# ===========================
if __name__ == "__main__":
    tfidf_classification()
    one_hot_mlp()

    for maxlen in maxlen_list:
        for emb in embed_dim_list:
            train_lstm(maxlen, emb)



=== TF-IDF + Logistic Regression ===
TF-IDF Accuracy: 0.8884

=== One-hot + MLP ===
Epoch 1, Loss = 0.1651
Epoch 2, Loss = 0.3243
One-hot MLP Accuracy: 0.87048

=== LSTM (maxlen=100, embed_dim=64) ===
Epoch 1, Loss=0.6175
Epoch 2, Loss=0.4636
Epoch 3, Loss=0.3626
LSTM Accuracy = 0.8223

=== LSTM (maxlen=100, embed_dim=128) ===
Epoch 1, Loss=0.5612
Epoch 2, Loss=0.3974
Epoch 3, Loss=0.2961
LSTM Accuracy = 0.8411

=== LSTM (maxlen=200, embed_dim=64) ===
Epoch 1, Loss=0.6144
Epoch 2, Loss=0.4694
Epoch 3, Loss=0.4150
LSTM Accuracy = 0.8342

=== LSTM (maxlen=200, embed_dim=128) ===
Epoch 1, Loss=0.6099
Epoch 2, Loss=0.5409
Epoch 3, Loss=0.3970
LSTM Accuracy = 0.8216
