In [3]:
import os
from sklearn.model_selection import train_test_split

# 读取IMDb评论数据集
def read_imdb(data_dir, is_train):
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test', label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

# 加载数据
data_dir = "D:/Data/aclImdb/"
train_texts, train_labels = read_imdb(data_dir, is_train=True)
test_texts, test_labels = read_imdb(data_dir, is_train=False)

# 分割训练集和验证集（可选）
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

print("Tamaño del conjunto de entrenamiento:", len(train_texts))
print("Tamaño del conjunto de validación:", len(val_texts))
print("Tamaño del conjunto de pruebas:", len(test_texts))

Tamaño del conjunto de entrenamiento: 20000
Tamaño del conjunto de validación: 5000
Tamaño del conjunto de pruebas: 25000


In [11]:
# -*- coding: utf-8 -*-
import re
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# 初始化VADER分析器
sid = SentimentIntensityAnalyzer()

# === 1. 改进的预处理函数 ===
def preprocess(text):
    """保留情感关键标点，移除HTML标签和无关符号"""
    text = text.lower()
    text = re.sub(r'<br\s*/?>', ' ', text)       # 优化HTML标签处理
    text = re.sub(r'([!?])\1+', r'\1', text)    # 合并重复标点（如"!!!"→"!"）
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)   # 保留字母和关键标点
    return text.strip()

# === 2. 数据清洗 ===
# 清洗验证集和测试集
cleaned_val_texts = [preprocess(text) for text in val_texts]
cleaned_test_texts = [preprocess(text) for text in test_texts]

# === 3. 改进的阈值搜索（基于验证集） ===
def find_best_threshold(y_true, texts):
    thresholds = np.arange(-0.5, 0.5, 0.05)  # 步长0.05
    best_f1 = 0
    best_thresh = 0
    for t in thresholds:
        predictions = [
            1 if sid.polarity_scores(text)['compound'] >= t else 0 
            for text in texts
        ]
        current_f1 = f1_score(y_true, predictions)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_thresh = t
    return best_thresh, best_f1  # 返回阈值和对应的F1分数

# 使用验证集寻找最佳阈值
best_threshold, best_f1 = find_best_threshold(val_labels, cleaned_val_texts)
print(f"Mejor umbral en validación: {best_threshold:.2f} (F1-Score: {best_f1:.4f})")

# === 4. 最终预测（测试集） ===
vader_labels = [
    1 if sid.polarity_scores(text)['compound'] >= best_threshold else 0
    for text in cleaned_test_texts
]

# === 5. 计算指标 ===
accuracy = accuracy_score(test_labels, vader_labels)
precision = precision_score(test_labels, vader_labels)
recall = recall_score(test_labels, vader_labels)
f1 = f1_score(test_labels, vader_labels)

print("\n=== Rendimiento final de VADER en IMDb ===")
print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

Mejor umbral en validación: 0.45 (F1-Score: 0.7385)

=== Rendimiento final de VADER en IMDb ===
Accuracy: 0.7117 | Precision: 0.6737 | Recall: 0.8211 | F1-Score: 0.7401


In [12]:
# === 6. Análisis de casos de error ===
def analyze_errors(true_labels, pred_labels, texts):
    errors = []
    for i in range(len(true_labels)):
        if pred_labels[i] != true_labels[i]:
            errors.append({
                'Texto': texts[i],  # Traducido de '文本'
                'Texto limpiado': cleaned_test_texts[i],  # Traducido de '清洗后文本'
                'Etiqueta real': true_labels[i],  # Traducido de '真实标签'
                'Etiqueta predicha': pred_labels[i],  # Traducido de '预测标签'
                'Puntuación de sentimiento': sid.polarity_scores(cleaned_test_texts[i])['compound']  # Traducido de '情感分数'
            })
    return errors

# Obtener casos de error
error_cases = analyze_errors(test_labels, vader_labels, test_texts)

# Mostrar los primeros 5 casos de error
print(f"\nCantidad de casos de error: {len(error_cases)} (Tasa de error: {len(error_cases)/len(test_labels):.2%})")  # Traducido de "错误案例数量" y "错误率"
print("Primeros 5 casos de error:")  # Traducido de "前5个错误案例"
for idx, err in enumerate(error_cases[:5], 1):
    print(f"\nCaso {idx}:")
    print(f"Real: {err['Etiqueta real']}, Predicción: {err['Etiqueta predicha']} (Puntuación: {err['Puntuación de sentimiento']:.2f})")  # Traducido de "真实", "预测", "分数"
    print(f"Texto original: {err['Texto'][:100]}...")  # Traducido de "原始文本"
    print(f"Texto limpiado: {err['Texto limpiado'][:100]}...")  # Traducido de "清洗后文本"


Cantidad de casos de error: 7207 (Tasa de error: 28.83%)
Primeros 5 casos de error:

Caso 1:
Real: 1, Predicción: 0 (Puntuación: -0.12)
Texto original: I felt this film did have many good qualities. The cinematography was certainly different exposing t...
Texto limpiado: i felt this film did have many good qualities the cinematography was certainly different exposing th...

Caso 2:
Real: 1, Predicción: 0 (Puntuación: -0.30)
Texto original: This movie is amazing because the fact that the real people portray themselves and their real life e...
Texto limpiado: this movie is amazing because the fact that the real people portray themselves and their real life e...

Caso 3:
Real: 1, Predicción: 0 (Puntuación: -0.78)
Texto original: "Night of the Hunted" stars French porn star Brigitte Lahaie.In fact,many of the cast members in thi...
Texto limpiado: night of the hunted stars french porn star brigitte lahaiein factmany of the cast members in this sl...

Caso 4:
Real: 1, Predicción: 0 (Puntua

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# === 优化参数 ===
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    stop_words='english',
    min_df=5
)
nb = MultinomialNB(alpha=0.1)

# 构建管道
bow_nb_model = make_pipeline(count_vectorizer, nb)

# 训练模型
bow_nb_model.fit(train_texts, train_labels)

# 预测与评估
bow_nb_labels = bow_nb_model.predict(test_texts)

accuracy = accuracy_score(test_labels, bow_nb_labels)
precision = precision_score(test_labels, bow_nb_labels)
recall = recall_score(test_labels, bow_nb_labels)
f1 = f1_score(test_labels, bow_nb_labels)

print(f"BoW+NB - Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

BoW+NB - Accuracy: 0.8413 | Precision: 0.8632 | Recall: 0.8112 | F1-Score: 0.8364


In [6]:
# === 错误案例分析 ===
errors = []
for i in range(len(test_labels)):
    if bow_nb_labels[i] != test_labels[i]:
        errors.append({
            'Text': test_texts[i],
            'True': test_labels[i],
            'Predicted': bow_nb_labels[i]
        })

# 输出总错误数和错误率
total_errors = len(errors)
error_rate = total_errors / len(test_labels) * 100
print(f"\nNúmero total de errores: {total_errors} (tasa de error: {error_rate:.2f}%)")

# 显示前5个错误案例
print("\nEjemplos de casos de error:")
for idx, err in enumerate(errors[:5], 1):
    print(f"Caso {idx}:")
    print(f"Etiqueta real: {err['True']}, Prediction: {err['Predicted']}")
    print(f"Texto original: {err['Text'][:100]!r}\n")


Número total de errores: 3967 (tasa de error: 15.87%)

Ejemplos de casos de error:
Caso 1:
Etiqueta real: 1, Prediction: 0
Texto original: 'My wife is a mental health therapist and we watched it from beginning to end. I am the typical man a'

Caso 2:
Etiqueta real: 1, Prediction: 0
Texto original: "While I can't say whether or not Larry Hama ever saw any of the old cartoons, I would think that wri"

Caso 3:
Etiqueta real: 1, Prediction: 0
Texto original: 'Why does everyone feel they have to constantly put this movie down? It is cute and funny (exactly wh'

Caso 4:
Etiqueta real: 1, Prediction: 0
Texto original: "Madonna gets into action, again and she fails again! Who's That Girl was released just one year afte"

Caso 5:
Etiqueta real: 1, Prediction: 0
Texto original: "So, Madonna isn't Meryl Streep. Still, this is one of her first films and a comedy at that. Give her"


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ====== 关键参数设置 ======
# 1. TF-IDF参数
tfidf_params = {
    "ngram_range": (1, 2),
    "max_features": 20000,  # 放宽至10,000
    "stop_words": "english",  # 基础停用词过滤
    "min_df": 5,            # 允许出现2次的词
    "max_df": 0.8           # 保留更多常见词
}

# 2. SVM参数
svm_params = {
    "C": 0.1,                 # 正则化强度（值越小正则化越强）
    "class_weight": "balanced", # 自动平衡类别权重（IMDB数据均衡可省略）
}

# ====== 修改后的管道构建 ======
tfidf_svm_model = make_pipeline(
    TfidfVectorizer(**tfidf_params),  # 注入TF-IDF参数
    LinearSVC(**svm_params)           # 注入SVM参数
)

# ====== 训练与测试（保持不变） ======
tfidf_svm_model.fit(train_texts, train_labels)
tfidf_svm_labels = tfidf_svm_model.predict(test_texts)

# ====== 计算指标 ======
accuracy_tfidf_svm = accuracy_score(test_labels, tfidf_svm_labels)
precision_tfidf_svm = precision_score(test_labels, tfidf_svm_labels)
recall_tfidf_svm = recall_score(test_labels, tfidf_svm_labels)
f1_tfidf_svm = f1_score(test_labels, tfidf_svm_labels)

print(f"TF-IDF+SVM - Accuracy: {accuracy_tfidf_svm:.4f}, Precision: {precision_tfidf_svm:.4f}, Recall: {recall_tfidf_svm:.4f}, F1 Score: {f1_tfidf_svm:.4f}")

TF-IDF+SVM - Accuracy: 0.8805, Precision: 0.8770, Recall: 0.8851, F1 Score: 0.8810


In [7]:
# === 错误案例分析 ===
errors = []
for i in range(len(test_labels)):
    if tfidf_svm_labels[i] != test_labels[i]:
        errors.append({
            'Text': test_texts[i],
            'True': test_labels[i],
            'Predicted': tfidf_svm_labels[i]
        })

# 输出总错误数和错误率
total_errors = len(errors)
error_rate = total_errors / len(test_labels) * 100
print(f"\nNúmero total de errores: {total_errors} (tasa de error: {error_rate:.2f}%)")

# 显示前5个错误案例
print("\nEjemplos de casos de error:")
for idx, err in enumerate(errors[:5], 1):
    print(f"Caso {idx}:")
    print(f"Etiqueta real: {err['True']}, Prediction: {err['Predicted']}")
    print(f"Texto original: {err['Text'][:100]!r}\n")


Número total de errores: 2988 (tasa de error: 11.95%)

Ejemplos de casos de error:
Caso 1:
Etiqueta real: 1, Prediction: 0
Texto original: 'My wife is a mental health therapist and we watched it from beginning to end. I am the typical man a'

Caso 2:
Etiqueta real: 1, Prediction: 0
Texto original: "I have certainly not seen all of Jean Rollin's films, but they mostly seem to be bloody vampire nake"

Caso 3:
Etiqueta real: 1, Prediction: 0
Texto original: "While I can't say whether or not Larry Hama ever saw any of the old cartoons, I would think that wri"

Caso 4:
Etiqueta real: 1, Prediction: 0
Texto original: 'Naturally, along with everyone else, I was primed to expect a lot of Hollywood fantasy revisionism i'

Caso 5:
Etiqueta real: 1, Prediction: 0
Texto original: "For late-80s cheese, this really isn't so bad. There are a lot of pretty funny throwaway one-liners "
