In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 读取讽刺数据集
def read_sarcasm_data(file_path):
    """Load a sarcasm CSV and return its texts and labels as plain lists.

    Args:
        file_path: Path of a CSV file readable by ``pd.read_csv`` that
            contains a 'text' column and a 'Y' column.

    Returns:
        A ``(texts, labels)`` tuple of two lists taken from the 'text' and
        'Y' columns. Per the original author's note, label 0 means
        "no sarcasm" and label 1 means "sarcasm".
    """
    frame = pd.read_csv(file_path)
    return frame['text'].tolist(), frame['Y'].tolist()

# Dataset paths
train_file = "D:/Data/sarcasm/sarcasm_train.csv"
test_file = "D:/Data/sarcasm/sarcasm_test.csv"

# Load the raw train / test data
train_texts, train_labels = read_sarcasm_data(train_file)
test_texts, test_labels = read_sarcasm_data(test_file)

# Hold out 20% of the training data as a validation split (fixed seed so the split is repeatable)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Report the size and per-class counts of every split; each line is labelled with its own split name
print("\nMapeo de etiquetas: {0: 'sin sarcasm', 1: 'sarcasm'}")
print(f"Conjunto de entrenamiento: {len(train_texts)} (0: {train_labels.count(0)}, 1: {train_labels.count(1)})")
print(f"Conjunto de validación: {len(val_texts)} (0: {val_labels.count(0)}, 1: {val_labels.count(1)})")
print(f"Conjunto de prueba: {len(test_texts)} (0: {test_labels.count(0)}, 1: {test_labels.count(1)})")



Mapeo de etiquetas: {0: 'sin sarcasm', 1: 'sarcasm'}
Conjunto de entrenamiento: 16026 (0: 8363, 1: 7663)
Conjunto de entrenamiento: 4007 (0: 2116, 1: 1891)
Conjunto de entrenamiento: 8586 (0: 4506, 1: 4080)


In [2]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import mark_negation

# Initialise VADER; this shared analyzer's lexicon is extended further down in this cell
sid = SentimentIntensityAnalyzer()

# === 1. Custom sarcasm lexicon (extended) ===
## Dynamic lexicon update (based on the training data)
def update_vader_lexicon(train_texts, train_labels):
    """Raise the VADER valence of tokens that are much more common in sarcastic texts.

    Token frequencies are collected separately for texts labelled 1 and for
    texts with any other label. A token is boosted when its label-1 count
    exceeds twice its other count plus 5. The boost, added to the token's
    current valence (0 when the token is not in the lexicon yet), is
    ``0.8 * ln(label-1 count + 1)``.

    Args:
        train_texts: Iterable of raw text strings.
        train_labels: Iterable of labels aligned with ``train_texts``.

    Returns:
        None. The module-level analyzer ``sid`` is modified in place.
    """
    from collections import Counter

    sarcastic_freq = Counter()
    other_freq = Counter()

    # Route each text's lower-cased tokens into the counter matching its label
    for text, label in zip(train_texts, train_labels):
        bucket = sarcastic_freq if label == 1 else other_freq
        bucket.update(word_tokenize(text.lower()))

    for word, hits in sarcastic_freq.items():
        if hits > 2*other_freq[word] + 5:
            boost = np.log(hits+1)*0.8
            sid.lexicon[word] = sid.lexicon.get(word, 0) + boost

# Run the data-driven update first: the hand-picked valences applied afterwards
# overwrite any value it may have set for the same word.
update_vader_lexicon(train_texts, train_labels)

# Hand-picked sarcasm cue words and their valences.
# NOTE(review): 'of_course' and 'as_if' contain an underscore, while the preprocess()
# in this cell removes every character other than letters, '!', '?' and whitespace
# before scoring - these two keys presumably never match any scored token; confirm.
extra_sarcasm_words = {
    'sarcasm': 3.5,
    'sarcastic': 3.0,
    'irony': 2.5,
    'mock': 2.0,
    'pretend': 1.8,
    'brilliant': 2.5,
    'genius': 3.0,
    'totally': 2.2,
    'sure': 2.0,
    'obviously': 2.8,
    'yeah': 1.5,
    'of_course': 3.0,
    'as_if': 2.5,
}
sid.lexicon.update(extra_sarcasm_words)

# === 2. 文本预处理 ===
def preprocess(text):
    """Normalise a raw text before it is scored with VADER.

    Steps: lower-case, remove URLs, surround every '!' and '?' with spaces so
    they become separate tokens, drop every character that is not a letter,
    '!', '?' or whitespace, tokenize with NLTK, apply NLTK's
    ``mark_negation`` to the tokens, and re-join them with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, space-joined token string.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)       # remove URLs
    text = re.sub(r'([!?])', r' \1 ', text)   # keep ! and ? and isolate them as tokens
    # Keep letters, '!' / '?' and ALL whitespace. Whitelisting only the literal
    # space would delete tabs and newlines and glue the neighbouring words
    # together (e.g. "so\nfunny" -> "sofunny") before tokenization.
    text = re.sub(r'[^a-zA-Z!?\s]', '', text)
    tokens = word_tokenize(text)
    tokens = mark_negation(tokens)            # NLTK negation marking
    return ' '.join(tokens)

# Re-clean every split with the preprocess() defined in this cell
cleaned_train = list(map(preprocess, train_texts))
cleaned_val = list(map(preprocess, val_texts))
cleaned_test = list(map(preprocess, test_texts))

# === 3. 计算讽刺得分（优化版）===
def calculate_sarcasm_score(text):
    """Combine VADER polarity scores and punctuation counts into one score.

    The score is ``gap * (1 - neu) + bonus`` where ``gap`` is the larger of
    the 'pos' / 'neg' scores minus the smaller one, ``neu`` is the 'neu'
    score, and ``bonus`` adds 0.15 per '!' and 0.1 per '?' found in ``text``.

    Args:
        text: The (already preprocessed) string to score.

    Returns:
        The sarcasm score as a float.
    """
    polarity = sid.polarity_scores(text)
    pos, neg = polarity['pos'], polarity['neg']
    # Distance between the two polarity shares, whichever one is larger
    if pos >= neg:
        gap = pos - neg
    else:
        gap = neg - pos
    emphasis = text.count('!')*0.15 + text.count('?')*0.1
    return gap * (1 - polarity['neu']) + emphasis

# === 4. 动态优化阈值 ===
def find_best_threshold(y_true, texts, thresholds=None):
    """Pick the score threshold that maximises F1 on the given labelled texts.

    A text is predicted as class 1 when its ``calculate_sarcasm_score`` is
    greater than or equal to the threshold.

    Args:
        y_true: Ground-truth labels aligned with ``texts``.
        texts: Preprocessed strings to score.
        thresholds: Optional iterable of candidate thresholds. Defaults to
            ``np.arange(0.1, 1.0, 0.05)``.

    Returns:
        The first candidate threshold that reaches the highest F1, or 0.5
        when no candidate achieves an F1 above 0.
    """
    if thresholds is None:
        thresholds = np.arange(0.1, 1.0, 0.05)

    # The scores do not depend on the threshold, so compute them once instead
    # of re-running the sentiment analyzer on every text for every candidate.
    scores = np.asarray([calculate_sarcasm_score(text) for text in texts], dtype=float)

    best_f1, best_thresh = 0, 0.5
    for t in thresholds:
        preds = (scores >= t).astype(int)
        current_f1 = f1_score(y_true, preds)
        # Strict '>' keeps the earliest threshold when several tie on F1
        if current_f1 > best_f1:
            best_f1, best_thresh = current_f1, t
    # NOTE(review): if the returned value sits on the edge of the grid, the true
    # optimum may lie outside it - pass a wider `thresholds` grid to check.
    return best_thresh

# Tune the decision threshold on the validation split
best_threshold = find_best_threshold(val_labels, cleaned_val)

print(f"Optimal threshold found: {best_threshold:.2f}")
# === 5. Predict on the test split ===
vader_preds = [
    int(calculate_sarcasm_score(doc) >= best_threshold)
    for doc in cleaned_test
]

# === 6. Compute the metrics ===
accuracy, precision, recall, f1 = (
    metric(test_labels, vader_preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Optimized VADER Results (threshold={best_threshold:.2f}):")
print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

Optimal threshold found: 0.10
Optimized VADER Results (threshold=0.10):
Accuracy: 0.6358 | Precision: 0.6180 | Recall: 0.6118 | F1-Score: 0.6149


In [2]:
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# === 1. 预处理优化 ===
def preprocess(text):
    """Clean a raw text for the bag-of-words model.

    Lower-cases the text, removes URLs, drops every character that is not a
    letter or whitespace, appends an underscore to the standalone words
    'not', 'no' and 'never', and strips surrounding whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'http\S+', ''),                  # remove URLs
        (r'[^a-zA-Z\s]', ''),              # keep letters and whitespace only
        (r'\b(not|no|never)\b', r'\1_'),   # tag negation words with a trailing '_'
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean the train / test texts with the preprocess() defined in this cell
cleaned_train = list(map(preprocess, train_texts))
cleaned_test = list(map(preprocess, test_texts))

# === 2. Feature extraction ===
# ngram_range and max_features set here are starting values only; the grid
# search below tries its own candidates for both.
vectorizer = CountVectorizer(
    binary=True,
    stop_words='english',
    max_features=15000,
    ngram_range=(1, 2),
)

# === 3. Hyper-parameter tuning ===
param_grid = {
    # smoothing strength of the Naive Bayes classifier
    'multinomialnb__alpha': [0.01, 0.1, 0.5, 1.0],
    # unigrams only vs. unigrams + bigrams
    'countvectorizer__ngram_range': [(1,1), (1,2)],
    # vocabulary size cap
    'countvectorizer__max_features': [10000, 15000],
}

grid = GridSearchCV(
    estimator=make_pipeline(vectorizer, MultinomialNB()),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# Fit the search on the cleaned training texts
grid.fit(cleaned_train, train_labels)

# Best parameter combination found by the search
print(f"Mejores parámetros: {grid.best_params_}")

# === 4. Final evaluation on the test split ===
best_model = grid.best_estimator_
bow_nb_labels = best_model.predict(cleaned_test)

accuracy_bow, precision_bow, recall_bow, f1_bow = (
    metric(test_labels, bow_nb_labels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nOptimized BoW+NB Results:")
print(f"Accuracy: {accuracy_bow:.4f} | Precision: {precision_bow:.4f} | Recall: {recall_bow:.4f} | F1-Score: {f1_bow:.4f}")

Mejores parámetros: {'countvectorizer__max_features': 15000, 'countvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha': 1.0}

Optimized BoW+NB Results:
Accuracy: 0.7939 | Precision: 0.7829 | Recall: 0.7833 | F1-Score: 0.7831


In [None]:
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# === 1. 预处理优化 ===
def preprocess(text):
    """Clean a raw text for the bag-of-words model.

    The text is lower-cased, URLs are removed, every character other than
    letters and whitespace is dropped, the standalone words 'not', 'no' and
    'never' get a trailing underscore, and outer whitespace is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    without_urls = re.sub(r'http\S+', '', text.lower())               # remove URLs
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)           # letters and whitespace only
    negation_tagged = re.sub(r'\b(not|no|never)\b', r'\1_', letters_only)  # tag negation words
    return negation_tagged.strip()

# Clean the train / test texts with the preprocess() defined in this cell
cleaned_train = list(map(preprocess, train_texts))
cleaned_test = list(map(preprocess, test_texts))

# === 2. Feature extraction ===
# ngram_range and max_features set here are starting values only; the grid
# search below tries its own candidates for both.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=15000,
    ngram_range=(1, 2),
)

# === 3. Hyper-parameter tuning ===
param_grid = {
    # smoothing strength of the Naive Bayes classifier
    'multinomialnb__alpha': [0.01, 0.1, 0.5, 1.0],
    # unigrams only vs. unigrams + bigrams
    'countvectorizer__ngram_range': [(1,1), (1,2)],
    # vocabulary size cap
    'countvectorizer__max_features': [10000, 15000],
}

grid = GridSearchCV(
    estimator=make_pipeline(vectorizer, MultinomialNB()),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# Fit the search on the cleaned training texts
grid.fit(cleaned_train, train_labels)

# Best parameter combination found by the search
print(f"Mejores parámetros: {grid.best_params_}")

# === 4. Final evaluation on the test split ===
best_model = grid.best_estimator_
bow_nb_labels = best_model.predict(cleaned_test)

accuracy_bow, precision_bow, recall_bow, f1_bow = (
    metric(test_labels, bow_nb_labels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nOptimized BoW+NB Results:")
print(f"Accuracy: {accuracy_bow:.4f} | Precision: {precision_bow:.4f} | Recall: {recall_bow:.4f} | F1-Score: {f1_bow:.4f}")

Mejores parámetros: {'countvectorizer__max_features': 15000, 'countvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha': 1.0}

Optimized BoW+NB Results:
Accuracy: 0.7935 | Precision: 0.7833 | Recall: 0.7816 | F1-Score: 0.7825


In [15]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# === 1. 预处理优化 ===
def preprocess(text):
    """Clean a raw text for the TF-IDF model.

    Lower-cases the text, removes URLs, drops every character that is not a
    letter or whitespace, and fuses each standalone 'not' / 'no' / 'never'
    with whatever follows it by replacing the word plus any trailing
    whitespace with the word and an underscore (e.g. "not good" becomes
    "not_good"). Surrounding whitespace is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'http\S+', ''),                      # remove URLs
        (r'[^a-zA-Z\s]', ''),                  # keep letters and whitespace only
        (r'\b(not|no|never)\b\s*', r'\1_'),    # glue negation word to the next token
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean the train / test texts with the preprocess() defined in this cell
cleaned_train_texts = list(map(preprocess, train_texts))
cleaned_test_texts = list(map(preprocess, test_texts))

# === 2. Build the TF-IDF + SVM pipeline ===
# ngram_range and max_features set here are starting values only; the grid
# search below tries its own candidates for both.
tfidf = TfidfVectorizer(
    stop_words=None,       # keep stop words so negation words stay in the vocabulary
    max_features=20000,
    ngram_range=(1, 3),
)

svm = LinearSVC(
    random_state=42,
    class_weight='balanced',
)

# Assemble the pipeline
tfidf_svm_model = make_pipeline(tfidf, svm)

# === 3. Hyper-parameter tuning ===
param_grid = {
    # up to bigrams vs. up to trigrams
    'tfidfvectorizer__ngram_range': [(1,2), (1,3)],
    # vocabulary size cap
    'tfidfvectorizer__max_features': [10000, 20000],
    # regularisation parameter C of the linear SVM
    'linearsvc__C': [0.1, 1, 10],
}

grid = GridSearchCV(
    estimator=tfidf_svm_model,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Fit the search on the cleaned training texts
grid.fit(cleaned_train_texts, train_labels)

# Best parameter combination found by the search
print(f"Mejores parámetros encontrados: {grid.best_params_}")

# === 4. Predict with the best estimator ===
best_model = grid.best_estimator_
tfidf_svm_labels = best_model.predict(cleaned_test_texts)

# === 5. Performance evaluation on the test split ===
accuracy, precision, recall, f1 = (
    metric(test_labels, tfidf_svm_labels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nTF-IDF+SVM Optimizado para Detección de Ironía:")
print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

Mejores parámetros encontrados: {'linearsvc__C': 0.1, 'tfidfvectorizer__max_features': 20000, 'tfidfvectorizer__ngram_range': (1, 2)}

TF-IDF+SVM Optimizado para Detección de Ironía:
Accuracy: 0.8403 | Precision: 0.8151 | Recall: 0.8588 | F1-Score: 0.8364
