In [None]:

import os
import sys
import math
import random
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM, Bidirectional, Dropout

# Report the interpreter / framework versions and the GPUs TensorFlow can see,
# so that the outputs below can be tied to a concrete environment.
for env_label, env_value in (
    ("Python version:", sys.version),
    ("TensorFlow version:", tf.__version__),
    ("GPU devices:", tf.config.list_physical_devices("GPU")),
):
    print(env_label, env_value)


In [None]:
# IMDB dataset configuration
MAX_FEATURES = 10000   # keep only the 10,000 most frequent words
MAXLEN_DEFAULT = 200   # default padded sequence length

# Only a subset of the samples is used to keep training time short
N_TRAIN_SAMPLES = 20000
N_TEST_SAMPLES  = 10000

BATCH_SIZE = 128

# Seed every random number generator the notebook relies on so that
# "Restart Kernel -> Run All" gives comparable numbers between runs.
# NOTE: some GPU kernels may still be non-deterministic.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Hyper-parameter candidates for the embedding experiment (experiment 1-1)
MAXLEN_LIST = [100, 200, 300]     # sequence lengths to compare
EMBED_DIM_LIST = [8, 16, 32]      # embedding dimensions to compare
EPOCHS_EMBED = 5                  # training epochs per combination

# Hyper-parameters for the RNN experiment (experiment 2)
MAXLEN_RNN = 200
EMBED_DIM_RNN = 32
EPOCHS_RNN = 5                    # training epochs for LSTM / stacked LSTM / BiLSTM

print("MAX_FEATURES:", MAX_FEATURES)
print("N_TRAIN_SAMPLES:", N_TRAIN_SAMPLES, "N_TEST_SAMPLES:", N_TEST_SAMPLES)


In [None]:
# num_words=MAX_FEATURES keeps only the MAX_FEATURES most frequent words
imdb_train, imdb_test = imdb.load_data(num_words=MAX_FEATURES)

print("原始训练集大小:", len(imdb_train[0]))
print("原始测试集大小:", len(imdb_test[0]))

# To save time, keep only the first N_TRAIN_SAMPLES / N_TEST_SAMPLES samples
# of each (sequences, labels) pair.
x_train_raw, y_train_raw = (part[:N_TRAIN_SAMPLES] for part in imdb_train)
x_test_raw, y_test_raw = (part[:N_TEST_SAMPLES] for part in imdb_test)

print("使用的训练集大小:", len(x_train_raw))
print("使用的测试集大小:", len(x_test_raw))
print("示例原始序列（前 1 条）:", x_train_raw[0][:20], "...")


In [None]:
def plot_history(history, title_prefix=""):
    """Plot the accuracy and loss curves recorded during training.

    One figure is drawn per metric (accuracy, then loss). A figure is skipped
    when the training series for that metric was not recorded, and the
    validation curve is skipped when no validation series is available, so the
    function also works for runs fitted without validation data.

    Args:
        history: object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (what ``model.fit`` returns in this notebook).
        title_prefix: text prepended to each figure title.
    """
    records = history.history

    def first_recorded(*keys):
        # Return the first series stored under any of ``keys``. Compare with
        # ``None`` explicitly so an empty list is not mistaken for "missing".
        for key in keys:
            series = records.get(key)
            if series is not None:
                return series
        return None

    # (short label, y-axis label, title word, training series, validation series)
    curves = (
        ("acc", "Accuracy", "accuracy",
         first_recorded("acc", "accuracy"),
         first_recorded("val_acc", "val_accuracy")),
        ("loss", "Loss", "loss",
         records.get("loss"),
         records.get("val_loss")),
    )

    for short_label, y_label, title_word, train_series, val_series in curves:
        if train_series is None:
            continue

        epochs = range(1, len(train_series) + 1)

        _, ax = plt.subplots()
        ax.plot(epochs, train_series, "bo", label=f"Training {short_label}")
        if val_series is not None:
            ax.plot(epochs, val_series, "b", label=f"Validation {short_label}")
        # strip() avoids a leading space when no prefix is given
        ax.set_title(f"{title_prefix} Training and validation {title_word}".strip())
        ax.set_xlabel("Epoch")
        ax.set_ylabel(y_label)
        ax.legend()

    plt.show()


## 词嵌入进行电影评论分类

In [None]:
def build_embedding_model(maxlen, embedding_dim):
    """Build and compile a simple Embedding -> Flatten -> Dense classifier.

    The input length is declared with an explicit ``Input`` layer instead of
    the ``input_length`` argument of ``Embedding``: that argument is deprecated
    in recent Keras releases, where it is ignored and leaves the model unbuilt
    (so ``model.summary()`` cannot report shapes or parameter counts).

    Args:
        maxlen: length of the (padded) integer sequences fed to the model.
        embedding_dim: size of each word vector.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, trained
        with RMSprop and binary cross-entropy, reporting the "acc" metric.
    """
    model = models.Sequential([
        layers.Input(shape=(maxlen,), dtype="int32"),
        Embedding(MAX_FEATURES, embedding_dim),
        Flatten(),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model


In [None]:
import json  # used only for the summary printed at the end of this cell

results_embed = []

for maxlen in MAXLEN_LIST:
    # Re-pad the raw sequences for the current maxlen
    x_train = sequence.pad_sequences(x_train_raw, maxlen=maxlen)
    x_test  = sequence.pad_sequences(x_test_raw,  maxlen=maxlen)
    print("\n=== 当前样本长度 maxlen =", maxlen, "===")
    print("x_train shape:", x_train.shape, "x_test shape:", x_test.shape)

    for embedding_dim in EMBED_DIM_LIST:
        print(f"\n--> 训练模型：maxlen={maxlen}, embedding_dim={embedding_dim}")

        # Several models are created in this loop; release the global Keras
        # state left by the previous one so memory use does not keep growing.
        tf.keras.backend.clear_session()

        model = build_embedding_model(maxlen=maxlen, embedding_dim=embedding_dim)
        model.summary()

        # validation_split=0.2 holds out part of the training data for validation
        history = model.fit(
            x_train, y_train_raw,
            epochs=EPOCHS_EMBED,
            batch_size=BATCH_SIZE,
            validation_split=0.2,
            verbose=2
        )

        test_loss, test_acc = model.evaluate(x_test, y_test_raw, verbose=0)
        print(f"[结果] maxlen={maxlen}, embedding_dim={embedding_dim}, test_acc={test_acc:.4f}")

        # Store plain Python floats so the results can be dumped as JSON below
        results_embed.append({
            "maxlen": maxlen,
            "embedding_dim": embedding_dim,
            "test_loss": float(test_loss),
            "test_acc": float(test_acc)
        })

        plot_history(history, title_prefix=f"[Embedding] len={maxlen}, dim={embedding_dim}")

print("\n=== 不同参数组合的测试集结果汇总 ===")
print(json.dumps(results_embed, indent=4))


### One-Hot（词袋）/TF-IDF/词嵌入的对比  


In [None]:
import sys
!{sys.executable} -m pip install scikit-learn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def sequences_to_texts(sequences):
    """Turn integer sequences into space-separated token strings.

    Each inner sequence becomes one string whose tokens are the decimal
    representations of its items, which is the text format the sklearn
    vectorizers in this notebook consume.
    """
    texts = []
    for token_ids in sequences:
        texts.append(" ".join(map(str, token_ids)))
    return texts

train_texts = sequences_to_texts(x_train_raw)
test_texts  = sequences_to_texts(x_test_raw)

print("示例文本：", train_texts[0][:200], "...")


def _vectorize_texts(vectorizer):
    """Fit ``vectorizer`` on the training texts and transform both splits."""
    return vectorizer.fit_transform(train_texts), vectorizer.transform(test_texts)


def _fit_logreg(features_train, features_test):
    """Fit a logistic regression and return (classifier, predictions, test accuracy)."""
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features_train, y_train_raw)
    predictions = classifier.predict(features_test)
    return classifier, predictions, accuracy_score(y_test_raw, predictions)


# Options shared by both vectorizers; the token pattern keeps every token,
# including single-digit ones.
vectorizer_options = dict(
    max_features=MAX_FEATURES,
    token_pattern=r"(?u)\b\w+\b",
)

# --- Bag-of-words (token counts) + logistic regression ---
print("\n=== 词袋模型（One-Hot / 计数） ===")
vectorizer_bow = CountVectorizer(**vectorizer_options)
X_train_bow, X_test_bow = _vectorize_texts(vectorizer_bow)
print("BOW 特征矩阵形状：", X_train_bow.shape)

clf_bow, y_pred_bow, acc_bow = _fit_logreg(X_train_bow, X_test_bow)
print("BOW + LogisticRegression 测试集准确率: {:.4f}".format(acc_bow))

# --- TF-IDF + logistic regression ---
print("\n=== TF-IDF 模型 ===")
vectorizer_tfidf = TfidfVectorizer(**vectorizer_options)
X_train_tfidf, X_test_tfidf = _vectorize_texts(vectorizer_tfidf)
print("TF-IDF 特征矩阵形状：", X_train_tfidf.shape)

clf_tfidf, y_pred_tfidf, acc_tfidf = _fit_logreg(X_train_tfidf, X_test_tfidf)
print("TF-IDF + LogisticRegression 测试集准确率: {:.4f}".format(acc_tfidf))

# --- Embedding network trained on the padded integer sequences ---
print("\n=== Embedding 神经网络 ===")
maxlen_embed = MAXLEN_DEFAULT
embedding_dim_embed = 32

x_train_embed = sequence.pad_sequences(x_train_raw, maxlen=maxlen_embed)
x_test_embed  = sequence.pad_sequences(x_test_raw,  maxlen=maxlen_embed)

model_embed = build_embedding_model(maxlen_embed, embedding_dim_embed)
model_embed.summary()

history_embed = model_embed.fit(
    x_train_embed, y_train_raw,
    epochs=EPOCHS_EMBED,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    verbose=2
)

test_loss_embed, test_acc_embed = model_embed.evaluate(x_test_embed, y_test_raw, verbose=0)
print("Embedding + NN 测试集准确率: {:.4f}".format(test_acc_embed))

# Collect the three test accuracies side by side
results_compare = {
    "bow_logreg": float(acc_bow),
    "tfidf_logreg": float(acc_tfidf),
    "embedding_nn": float(test_acc_embed)
}
print("\n=== 结果汇总 ===")
print(results_compare)

plot_history(history_embed, title_prefix="[Embedding vs BOW/TF-IDF] 选定参数")


### 实验 1 小结（请在此写上你自己的结论）

你可以围绕以下几点写总结（示例）：

- 在不同 **maxlen** 与 **embedding_dim** 组合下，模型的验证/测试准确率如何变化？  
  - 例如：序列过短是否会丢失关键信息？序列过长是否带来过拟合或训练变慢？  
- 对比 **One-Hot(BOW)**、**TF-IDF** 与 **Embedding 网络** 的性能：  
  - 哪种方法效果最好？  
  - 传统的 BOW / TF-IDF 与神经网络 Embedding 各自的优缺点？  
- 结合 PPT 中对 **文本向量化方法** 的介绍，谈谈你对词嵌入优势的理解。

> 请在这里用自然语言写 5~10 句自己的分析，而不是简单贴数字。


## LSTM / 堆叠 LSTM / BiLSTM 电影评论分类


In [None]:
# Pad / truncate both splits to MAXLEN_RNN for the recurrent models
x_train_rnn, x_test_rnn = (
    sequence.pad_sequences(raw_split, maxlen=MAXLEN_RNN)
    for raw_split in (x_train_raw, x_test_raw)
)

print("x_train_rnn shape:", x_train_rnn.shape)
print("x_test_rnn shape:", x_test_rnn.shape)


In [None]:
def build_lstm_model():
    """Build and compile a single-layer LSTM sentiment classifier.

    Architecture: Embedding(MAX_FEATURES, EMBED_DIM_RNN) -> LSTM(32) ->
    Dense(1, sigmoid). The input length (MAXLEN_RNN) is declared with an
    explicit ``Input`` layer because the ``input_length`` argument of
    ``Embedding`` is deprecated in recent Keras releases.

    Returns:
        A compiled ``Sequential`` model (RMSprop, binary cross-entropy, "acc").
    """
    model = models.Sequential([
        layers.Input(shape=(MAXLEN_RNN,), dtype="int32"),
        Embedding(MAX_FEATURES, EMBED_DIM_RNN),
        LSTM(32),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model

def build_stacked_lstm_model():
    """Build and compile a three-layer stacked LSTM sentiment classifier.

    Architecture: Embedding -> LSTM(32) x 3 (each with dropout=0.1) ->
    Dense(1, sigmoid). The first two LSTM layers return their full output
    sequence so the next LSTM receives one vector per time step. The input
    length (MAXLEN_RNN) is declared with an explicit ``Input`` layer because
    the ``input_length`` argument of ``Embedding`` is deprecated in recent
    Keras releases.

    Returns:
        A compiled ``Sequential`` model (RMSprop, binary cross-entropy, "acc").
    """
    model = models.Sequential([
        layers.Input(shape=(MAXLEN_RNN,), dtype="int32"),
        Embedding(MAX_FEATURES, EMBED_DIM_RNN),
        LSTM(32, dropout=0.1, return_sequences=True),
        LSTM(32, dropout=0.1, return_sequences=True),
        LSTM(32, dropout=0.1),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model

def build_bilstm_model():
    """Build and compile a bidirectional LSTM sentiment classifier.

    Architecture: Embedding(MAX_FEATURES, EMBED_DIM_RNN) ->
    Bidirectional(LSTM(32)) -> Dense(1, sigmoid). The input length
    (MAXLEN_RNN) is declared with an explicit ``Input`` layer because the
    ``input_length`` argument of ``Embedding`` is deprecated in recent Keras
    releases.

    Returns:
        A compiled ``Sequential`` model (RMSprop, binary cross-entropy, "acc").
    """
    model = models.Sequential([
        layers.Input(shape=(MAXLEN_RNN,), dtype="int32"),
        Embedding(MAX_FEATURES, EMBED_DIM_RNN),
        Bidirectional(LSTM(32)),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model


In [None]:
rnn_results = {}


def _run_rnn_experiment(build_fn, result_key, banner, report_template, plot_title):
    """Train, evaluate and plot one recurrent model.

    Prints ``banner``, builds the model with ``build_fn``, fits it on the
    padded RNN training data, evaluates it on the padded test data, stores the
    test accuracy in ``rnn_results[result_key]``, prints it through
    ``report_template`` and plots the training curves.

    Returns:
        (model, history, test_loss, test_acc)
    """
    print(banner)
    net = build_fn()
    net.summary()
    fit_history = net.fit(
        x_train_rnn, y_train_raw,
        epochs=EPOCHS_RNN,
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        verbose=2
    )
    loss_value, acc_value = net.evaluate(x_test_rnn, y_test_raw, verbose=0)
    rnn_results[result_key] = float(acc_value)
    print(report_template.format(acc_value))
    plot_history(fit_history, title_prefix=plot_title)
    return net, fit_history, loss_value, acc_value


# Single-layer LSTM
model_lstm, history_lstm, test_loss_lstm, test_acc_lstm = _run_rnn_experiment(
    build_lstm_model,
    "lstm",
    "\n=== 单层 LSTM ===",
    "LSTM 测试集准确率: {:.4f}",
    "[RNN] 单层 LSTM",
)

# Stacked LSTM
model_stacked, history_stacked, test_loss_stacked, test_acc_stacked = _run_rnn_experiment(
    build_stacked_lstm_model,
    "stacked_lstm",
    "\n=== 堆叠 LSTM ===",
    "堆叠 LSTM 测试集准确率: {:.4f}",
    "[RNN] 堆叠 LSTM",
)

# Bidirectional LSTM
model_bilstm, history_bilstm, test_loss_bilstm, test_acc_bilstm = _run_rnn_experiment(
    build_bilstm_model,
    "bilstm",
    "\n=== BiLSTM ===",
    "BiLSTM 测试集准确率: {:.4f}",
    "[RNN] BiLSTM",
)

print("\n=== RNN 模型结果汇总 ===")
print(rnn_results)


### 实验 2 小结（请在此写上你自己的结论）

你可以围绕以下问题进行总结（示例）：

- 单层 LSTM、堆叠 LSTM、BiLSTM 三种结构在验证集 / 测试集上的准确率分别是多少？  
- 堆叠 LSTM 是否一定比单层 LSTM 好？在本实验中的结果如何？  
- BiLSTM 在本任务上是否带来了明显收益？结合 PPT 中对 **双向 RNN / 堆叠 RNN** 的解释谈谈你的理解。  
- 模型深度、双向结构会带来怎样的计算代价？在小数据任务中是否总是值得？

> 在完成实验后，请写出你自己的分析和感想（5~10 句即可），将其作为作业报告的一部分。
