In [1]:
import os
from sklearn.model_selection import train_test_split

# 读取IMDb评论数据集
def read_imdb(data_dir, is_train):
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test', label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

# 加载数据
data_dir = "D:/Data1/aclImdb/"
train_texts, train_labels = read_imdb(data_dir, is_train=True)
test_texts, test_labels = read_imdb(data_dir, is_train=False)

# 分割训练集和验证集（可选）
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

print("Tamaño del conjunto de entrenamiento:", len(train_texts))
print("Tamaño del conjunto de validación:", len(val_texts))
print("Tamaño del conjunto de pruebas:", len(test_texts))

Tamaño del conjunto de entrenamiento: 20000
Tamaño del conjunto de validación: 5000
Tamaño del conjunto de pruebas: 25000


In [2]:
# -*- coding: utf-8 -*-
import re
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# 初始化VADER分析器
sid = SentimentIntensityAnalyzer()

# === 1. 改进的预处理函数 ===
def preprocess(text):
    """保留情感关键标点，移除HTML标签和无关符号"""
    text = text.lower()
    text = re.sub(r'<br\s*/?>', ' ', text)       # 优化HTML标签处理
    text = re.sub(r'([!?])\1+', r'\1', text)    # 合并重复标点（如"!!!"→"!"）
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)   # 保留字母和关键标点
    return text.strip()

# === 2. 数据清洗 ===
# 清洗验证集和测试集（保持原有逻辑）
cleaned_val_texts = [preprocess(text) for text in val_texts]
cleaned_test_texts = [preprocess(text) for text in test_texts]

# === 移除阈值搜索逻辑 ===
# 已删除 find_best_threshold 函数和验证集搜索代码

# === 3. 最终预测（测试集） ===
vader_labels = []
for text in cleaned_test_texts:
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        vader_labels.append(1)   # 正面
    elif compound <= -0.05:
        vader_labels.append(0)   # 负面
    else:
        vader_labels.append(0)   # 中间区域强制负面

# === 4. 计算指标 ===
accuracy = accuracy_score(test_labels, vader_labels)
precision = precision_score(test_labels, vader_labels)
recall = recall_score(test_labels, vader_labels)
f1 = f1_score(test_labels, vader_labels)

print("\n=== Rendimiento final de VADER en IMDb ===")
print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")


=== Rendimiento final de VADER en IMDb ===
Accuracy: 0.6997 | Precision: 0.6526 | Recall: 0.8540 | F1-Score: 0.7398


In [11]:
# -*- coding: utf-8 -*-
import re
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# 初始化VADER分析器
sid = SentimentIntensityAnalyzer()

# === 1. 改进的预处理函数 ===
def preprocess(text):
    """保留情感关键标点，移除HTML标签和无关符号"""
    text = text.lower()
    text = re.sub(r'<br\s*/?>', ' ', text)       # 优化HTML标签处理
    text = re.sub(r'([!?])\1+', r'\1', text)    # 合并重复标点（如"!!!"→"!"）
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)   # 保留字母和关键标点
    return text.strip()

# === 2. 数据清洗 ===
# 清洗验证集和测试集
cleaned_val_texts = [preprocess(text) for text in val_texts]
cleaned_test_texts = [preprocess(text) for text in test_texts]

# === 3. 改进的阈值搜索（基于验证集） ===
def find_best_threshold(y_true, texts):
    thresholds = np.arange(-0.5, 0.5, 0.05)  # 步长0.05
    best_f1 = 0
    best_thresh = 0
    for t in thresholds:
        predictions = [
            1 if sid.polarity_scores(text)['compound'] >= t else 0 
            for text in texts
        ]
        current_f1 = f1_score(y_true, predictions)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_thresh = t
    return best_thresh, best_f1  # 返回阈值和对应的F1分数

# 使用验证集寻找最佳阈值
best_threshold, best_f1 = find_best_threshold(val_labels, cleaned_val_texts)
print(f"Mejor umbral en validación: {best_threshold:.2f} (F1-Score: {best_f1:.4f})")

# === 4. 最终预测（测试集） ===
vader_labels = [
    1 if sid.polarity_scores(text)['compound'] >= best_threshold else 0
    for text in cleaned_test_texts
]

# === 5. 计算指标 ===
accuracy = accuracy_score(test_labels, vader_labels)
precision = precision_score(test_labels, vader_labels)
recall = recall_score(test_labels, vader_labels)
f1 = f1_score(test_labels, vader_labels)

print("\n=== Rendimiento final de VADER en IMDb ===")
print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

Mejor umbral en validación: 0.45 (F1-Score: 0.7385)

=== Rendimiento final de VADER en IMDb ===
Accuracy: 0.7117 | Precision: 0.6737 | Recall: 0.8211 | F1-Score: 0.7401


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# === 优化参数 ===
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    stop_words='english',
    min_df=5
)
nb = MultinomialNB(alpha=0.1)

# 构建管道
bow_nb_model = make_pipeline(count_vectorizer, nb)

# 训练模型
bow_nb_model.fit(train_texts, train_labels)

# 预测与评估
bow_nb_labels = bow_nb_model.predict(test_texts)

accuracy = accuracy_score(test_labels, bow_nb_labels)
precision = precision_score(test_labels, bow_nb_labels)
recall = recall_score(test_labels, bow_nb_labels)
f1 = f1_score(test_labels, bow_nb_labels)

print(f"BoW+NB: Cuando: binary=False  - Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

BoW+NB: Cuando: binary=False  - Accuracy: 0.8416 | Precision: 0.8637 | Recall: 0.8114 | F1-Score: 0.8367


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# === 优化参数 ===
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    stop_words='english',
    min_df=5,
    binary=True   # 添加参数
)
nb = MultinomialNB(alpha=0.1)

# 构建管道
bow_nb_model = make_pipeline(count_vectorizer, nb)

# 训练模型
bow_nb_model.fit(train_texts, train_labels)

# 预测与评估
bow_nb_labels = bow_nb_model.predict(test_texts)

accuracy = accuracy_score(test_labels, bow_nb_labels)
precision = precision_score(test_labels, bow_nb_labels)
recall = recall_score(test_labels, bow_nb_labels)
f1 = f1_score(test_labels, bow_nb_labels)

print(f"BoW+NB: Cuando: binary=True - Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

BoW+NB: Cuando: binary=True - Accuracy: 0.8557 | Precision: 0.8655 | Recall: 0.8424 | F1-Score: 0.8538


In [2]:
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split

# === 1. 数据预处理优化 ===
def imdb_preprocess(text):
    # 保留否定词处理（对影评情感分析至关重要）
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)            # 移除IMDB特有的HTML标签
    text = re.sub(r'(http|https)://\S+', '', text) # 移除URL
    text = re.sub(r"[^a-zA-Z\d\s'\-]", " ", text)  # 保留必要符号
    text = re.sub(r"\s'(s|t|ve|re|d|m|ll)\b", " \\1", text)  # 处理缩写
    
    # 强化否定词处理（适用于影评场景）
    text = re.sub(
        r'\b(not|no|never|none|neither|nor)\s+([a-zA-Z]+\b)',
        lambda m: f"{m.group(1)}_{m.group(2)}",
        text
    )
    return text.strip()

# 数据清洗（假设已加载train_texts/test_texts）
cleaned_train = [imdb_preprocess(t) for t in train_texts]
cleaned_test = [imdb_preprocess(t) for t in test_texts]

# === 2. 特征工程优化 ===
vectorizer = CountVectorizer(
    ngram_range=(1, 2),       # 包含unigram和bigram
    max_features=30000,       # IMDB需要更多特征（根据数据集规模调整）
    stop_words='english',
    min_df=3,                 # 过滤低频词
    binary=True               # 二进制特征提升速度
)

# === 3. 超参数调优 ===
# 划分验证集（如果尚未划分）
if not hasattr(train_labels, 'val_labels'):
    train_texts_split, val_texts_split, train_labels_split, val_labels_split = train_test_split(
        cleaned_train, train_labels, test_size=0.15, random_state=42
    )

param_grid = {
    'countvectorizer__max_features': [20000, 30000],  # 适应IMDB数据规模
    'countvectorizer__ngram_range': [(1,1), (1,2)],
    'countvectorizer__min_df': [2, 3],
    'multinomialnb__alpha': [0.01, 0.1, 0.5],         # 优化平滑参数
    'countvectorizer__binary': [True, False]                 # 固定二进制特征
}

# 创建管道
pipeline = make_pipeline(vectorizer, MultinomialNB())

# 使用验证集进行网格搜索
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,                     # 3折交叉验证
    scoring='accuracy',       # IMDB通常以准确率为主要指标
    n_jobs=-1,
    verbose=1
)

# 在验证集上搜索
grid.fit(val_texts_split, val_labels_split)

# 输出最优参数
print(f"Best parameters for IMDB: {grid.best_params_}")

# === 4. 最终训练与评估 ===
best_model = grid.best_estimator_.fit(train_texts_split, train_labels_split)

# 测试集预测
predictions = best_model.predict(cleaned_test)

# 评估指标
# === 最终评估（保持原有打印格式） ===


accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)

print(f"\nBoW+NB: Cuando: binary=True - Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters for IMDB: {'countvectorizer__binary': True, 'countvectorizer__max_features': 20000, 'countvectorizer__min_df': 2, 'countvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha': 0.5}

BoW+NB: Cuando: binary=True - Accuracy: 0.8587 | Precision: 0.8704 | Recall: 0.8430 | F1-Score: 0.8565


In [2]:
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split

# === 1. 数据预处理优化 ===
def imdb_preprocess(text):
    # 保留否定词处理（对影评情感分析至关重要）
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)            # 移除IMDB特有的HTML标签
    text = re.sub(r'(http|https)://\S+', '', text) # 移除URL
    text = re.sub(r"[^a-zA-Z\d\s'\-]", " ", text)  # 保留必要符号
    text = re.sub(r"\s'(s|t|ve|re|d|m|ll)\b", " \\1", text)  # 处理缩写
    
    # 强化否定词处理（适用于影评场景）
    text = re.sub(
        r'\b(not|no|never|none|neither|nor)\s+([a-zA-Z]+\b)',
        lambda m: f"{m.group(1)}_{m.group(2)}",
        text
    )
    return text.strip()

# 数据清洗（假设已加载train_texts/test_texts）
cleaned_train = [imdb_preprocess(t) for t in train_texts]
cleaned_test = [imdb_preprocess(t) for t in test_texts]

# === 2. 特征工程优化 ===
vectorizer = CountVectorizer(
    ngram_range=(1, 2),       # 包含unigram和bigram
    max_features=30000,       # IMDB需要更多特征（根据数据集规模调整）
    stop_words='english',
    min_df=3,                 # 过滤低频词
    binary=True               # 二进制特征提升速度
)

# === 3. 超参数调优 ===
# 划分验证集（如果尚未划分）
if not hasattr(train_labels, 'val_labels'):
    train_texts_split, val_texts_split, train_labels_split, val_labels_split = train_test_split(
        cleaned_train, train_labels, test_size=0.15, random_state=42
    )

param_grid = {
    'countvectorizer__max_features': [10000, 20000, 30000, 40000],  # 适应IMDB数据规模
    'countvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2)],
    'countvectorizer__min_df': [1, 2, 3, 4],
    'multinomialnb__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],         # 优化平滑参数
    'countvectorizer__binary': [True, False]                 # 固定二进制特征
}

# 创建管道
pipeline = make_pipeline(vectorizer, MultinomialNB())

# 使用验证集进行网格搜索
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,                     # 3折交叉验证
    scoring='accuracy',       # IMDB通常以准确率为主要指标
    n_jobs=-1,
    verbose=1
)

# 在验证集上搜索
grid.fit(val_texts_split, val_labels_split)

# 输出最优参数
print(f"Best parameters for IMDB: {grid.best_params_}")

# === 4. 最终训练与评估 ===
best_model = grid.best_estimator_.fit(train_texts_split, train_labels_split)

# 测试集预测
predictions = best_model.predict(cleaned_test)

# 评估指标
# === 最终评估（保持原有打印格式） ===


accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)

print(f"\nBoW+NB: Cuando: binary=True - Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

Fitting 3 folds for each of 896 candidates, totalling 2688 fits
Best parameters for IMDB: {'countvectorizer__binary': True, 'countvectorizer__max_features': 10000, 'countvectorizer__min_df': 1, 'countvectorizer__ngram_range': (1, 3), 'multinomialnb__alpha': 1.0}

BoW+NB: Cuando: binary=True - Accuracy: 0.8561 | Precision: 0.8606 | Recall: 0.8498 | F1-Score: 0.8552


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ====== 关键参数设置 ======
# 1. TF-IDF参数
tfidf_params = {
    "ngram_range": (1, 2),
    "max_features": 20000,  # 放宽至10,000
    "stop_words": "english",  # 基础停用词过滤
    "min_df": 5,            # 允许出现2次的词
    "max_df": 0.8           # 保留更多常见词
}

# 2. SVM参数
svm_params = {
    "C": 0.1,                 # 正则化强度（值越小正则化越强）
    "class_weight": "balanced", # 自动平衡类别权重（IMDB数据均衡可省略）
}

# ====== 修改后的管道构建 ======
tfidf_svm_model = make_pipeline(
    TfidfVectorizer(**tfidf_params),  # 注入TF-IDF参数
    LinearSVC(**svm_params)           # 注入SVM参数
)

# ====== 训练与测试（保持不变） ======
tfidf_svm_model.fit(train_texts, train_labels)
tfidf_svm_labels = tfidf_svm_model.predict(test_texts)

# ====== 计算指标 ======
accuracy_tfidf_svm = accuracy_score(test_labels, tfidf_svm_labels)
precision_tfidf_svm = precision_score(test_labels, tfidf_svm_labels)
recall_tfidf_svm = recall_score(test_labels, tfidf_svm_labels)
f1_tfidf_svm = f1_score(test_labels, tfidf_svm_labels)

print(f"TF-IDF+SVM - Accuracy: {accuracy_tfidf_svm:.4f}, Precision: {precision_tfidf_svm:.4f}, Recall: {recall_tfidf_svm:.4f}, F1 Score: {f1_tfidf_svm:.4f}")

TF-IDF+SVM - Accuracy: 0.8801, Precision: 0.8764, Recall: 0.8851, F1 Score: 0.8807


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ====== 关键参数设置 ======
# 1. TF-IDF参数
tfidf_params = {
     "ngram_range": (1, 3),       # 尝试包含三元词组 (trigrams)
     "max_features": 25000,       # 稍微增加最大特征数
     "stop_words": "english",
     "min_df": 3,                 # 词语至少在3个文档中出现 (更宽松一些)
     "max_df": 0.75,              # 排除出现在超过75%文档中的词语 (更严格一些)
     "use_idf": True,             # 明确开启 IDF (通常默认开启)
     "smooth_idf": True,          # 明确开启 IDF 平滑 (通常默认开启)
     "norm": 'l2'                 # 明确使用 l2 归一化 (通常默认)
}

# 2. SVM参数
svm_params = {
     "C": 0.5,                    # 尝试一个稍大的 C 值 (减弱正则化强度)
     "class_weight": "balanced",  # 保持类别权重平衡
     "max_iter": 2000,            # 增加迭代次数以帮助收敛
     "dual": "auto"               # 自动选择 dual 参数，推荐
}

# ====== 修改后的管道构建 ======
tfidf_svm_model = make_pipeline(
    TfidfVectorizer(**tfidf_params),  # 关键修正：使用 ** 解包参数字典
    LinearSVC(**svm_params)            # 注入SVM参数
)

print(f"Parámetros TF-IDF: {tfidf_params}")
print(f"Parámetros SVM: {svm_params}")

# ====== 训练与测试（保持不变） ======
tfidf_svm_model.fit(train_texts, train_labels)
tfidf_svm_labels = tfidf_svm_model.predict(test_texts)

# ====== 计算指标 ======
accuracy_tfidf_svm = accuracy_score(test_labels, tfidf_svm_labels)
precision_tfidf_svm = precision_score(test_labels, tfidf_svm_labels)
recall_tfidf_svm = recall_score(test_labels, tfidf_svm_labels)
f1_tfidf_svm = f1_score(test_labels, tfidf_svm_labels)

print(f"TF-IDF+SVM - Accuracy: {accuracy_tfidf_svm:.4f}, Precision: {precision_tfidf_svm:.4f}, Recall: {recall_tfidf_svm:.4f}, F1 Score: {f1_tfidf_svm:.4f}")

Parámetros TF-IDF: {'ngram_range': (1, 3), 'max_features': 25000, 'stop_words': 'english', 'min_df': 3, 'max_df': 0.75, 'use_idf': True, 'smooth_idf': True, 'norm': 'l2'}
Parámetros SVM: {'C': 0.5, 'class_weight': 'balanced', 'max_iter': 2000, 'dual': 'auto'}
TF-IDF+SVM - Accuracy: 0.8742, Precision: 0.8775, Recall: 0.8697, F1 Score: 0.8736


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ====== 关键参数设置 ======
# 1. TF-IDF参数
tfidf_params = {
     "ngram_range": (1, 1),       # 尝试包含三元词组 (trigrams)
     "max_features": 25000,       # 稍微增加最大特征数
     "stop_words": "english",
     "min_df": 1,                 # 词语至少在3个文档中出现 (更宽松一些)
     "max_df": 0.85,              # 排除出现在超过75%文档中的词语 (更严格一些)
     "use_idf": True,             # 明确开启 IDF (通常默认开启)
     "smooth_idf": True,          # 明确开启 IDF 平滑 (通常默认开启)
     "norm": 'l2'                 # 明确使用 l2 归一化 (通常默认)
}

# 2. SVM参数
svm_params = {
     "C": 1.0,                    # 尝试一个稍大的 C 值 (减弱正则化强度)
     "class_weight": "balanced",  # 保持类别权重平衡
     "max_iter": 2000,            # 增加迭代次数以帮助收敛
     "dual": "auto"               # 自动选择 dual 参数，推荐
}

# ====== 修改后的管道构建 ======
tfidf_svm_model = make_pipeline(
    TfidfVectorizer(**tfidf_params),  # 关键修正：使用 ** 解包参数字典
    LinearSVC(**svm_params)            # 注入SVM参数
)

print(f"Parámetros TF-IDF: {tfidf_params}")
print(f"Parámetros SVM: {svm_params}")

# ====== 训练与测试（保持不变） ======
tfidf_svm_model.fit(train_texts, train_labels)
tfidf_svm_labels = tfidf_svm_model.predict(test_texts)

# ====== 计算指标 ======
accuracy_tfidf_svm = accuracy_score(test_labels, tfidf_svm_labels)
precision_tfidf_svm = precision_score(test_labels, tfidf_svm_labels)
recall_tfidf_svm = recall_score(test_labels, tfidf_svm_labels)
f1_tfidf_svm = f1_score(test_labels, tfidf_svm_labels)

print(f"TF-IDF+SVM - Accuracy: {accuracy_tfidf_svm:.4f}, Precision: {precision_tfidf_svm:.4f}, Recall: {recall_tfidf_svm:.4f}, F1 Score: {f1_tfidf_svm:.4f}")

Parámetros TF-IDF: {'ngram_range': (1, 1), 'max_features': 25000, 'stop_words': 'english', 'min_df': 1, 'max_df': 0.85, 'use_idf': True, 'smooth_idf': True, 'norm': 'l2'}
Parámetros SVM: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 2000, 'dual': 'auto'}
TF-IDF+SVM - Accuracy: 0.8611, Precision: 0.8701, Recall: 0.8490, F1 Score: 0.8594
