在 code 的基础上，通过未标记数据和 Word2Vec 增强模型的泛化能力

In [4]:
import pandas as pd
import nltk
from gensim.models import Word2Vec
import re
import numpy as np

# Fetch the Punkt tokenizer data only when it is not already installed, so the
# notebook does not need network access when the data is available locally.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Load the unlabeled reviews (tab-separated). quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the reviews are kept as literal text.
unlabeled_data = pd.read_csv('unlabeledTrainData.tsv', delimiter='\t', quoting=3)


[nltk_data] Error loading punkt: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>


In [2]:
# Text cleaning: strip HTML tags and non-letter characters.
def clean_review(text):
    """Return a lower-cased copy of ``text`` containing only letters and spaces.

    Args:
        text: Raw review string, possibly containing HTML tags.

    Returns:
        The cleaned string: HTML tags and every character outside ``a-z`` /
        ``A-Z`` are replaced by a space, and the result is lower-cased.
    """
    # Replace each tag with a space (not an empty string) so that words
    # separated only by markup, e.g. "good<br />movie", are not fused together.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower()

def preprocess_reviews(reviews):
    """Clean and tokenize every review.

    Args:
        reviews: Iterable of raw review strings.

    Returns:
        A list with one entry per review, each entry being the list of tokens
        produced by ``nltk.word_tokenize`` on the cleaned review text.
    """
    return [nltk.word_tokenize(clean_review(raw_text)) for raw_text in reviews]

# Tokenize the unlabeled reviews; these token lists are the Word2Vec training corpus.
unlabeled_review_texts = unlabeled_data['review']
tokenized_reviews = preprocess_reviews(unlabeled_review_texts)

In [3]:
# Word2Vec hyper-parameters, gathered in one place.
word2vec_params = {
    "vector_size": 100,  # dimensionality of the word vectors
    "window": 5,         # context window size
    "min_count": 5,      # minimum word frequency
    "sg": 1,             # 1 = Skip-Gram, 0 = CBOW
    "workers": 4,        # number of training threads
}

# Train on the tokenized unlabeled reviews, then persist the model to disk.
word2vec_model = Word2Vec(sentences=tokenized_reviews, **word2vec_params)
word2vec_model.save("word2vec_model.model")


In [5]:
def get_average_word_vectors(reviews, model, vector_size):
    """Represent each review by the mean of its in-vocabulary word vectors.

    Args:
        reviews: Iterable of tokenized reviews (each an iterable of words).
        model: Object whose ``wv`` attribute supports ``word in model.wv`` and
            ``model.wv[word]`` (e.g. a trained gensim ``Word2Vec`` model).
        vector_size: Length of the zero vector used for reviews that contain
            no in-vocabulary word; should match the word-vector length.

    Returns:
        A 2-D ``numpy`` array with one row per review. When ``reviews`` is
        empty the result has shape ``(0, vector_size)``.
    """
    keyed_vectors = model.wv  # look the attribute up once, not per word
    review_vectors = []
    for review in reviews:
        word_vectors = [keyed_vectors[word] for word in review if word in keyed_vectors]
        if word_vectors:
            review_vectors.append(np.mean(word_vectors, axis=0))
        else:
            # No known word in this review: fall back to an all-zero vector.
            review_vectors.append(np.zeros(vector_size))

    if not review_vectors:
        # np.array([]) would be 1-D; keep the (n_reviews, vector_size) layout
        # so downstream estimators still receive a 2-D feature matrix.
        return np.empty((0, vector_size))
    return np.array(review_vectors)

# Embed every unlabeled review. The vector length is read from the trained
# model rather than repeated as a literal, so it cannot drift out of sync with
# the Word2Vec training settings.
train_review_vectors = get_average_word_vectors(
    tokenized_reviews, word2vec_model, vector_size=word2vec_model.wv.vector_size
)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the labeled reviews (tab-separated; quoting=3 is csv.QUOTE_NONE) and
# tokenize them with the same preprocessing used for the Word2Vec corpus.
labeled_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t', quoting=3)
labeled_reviews = preprocess_reviews(labeled_data['review'])

# Turn each labeled review into an averaged word vector. The vector length is
# taken from the trained model so it always matches the embeddings.
labeled_review_vectors = get_average_word_vectors(
    labeled_reviews, word2vec_model, vector_size=word2vec_model.wv.vector_size
)

# Hold out 20% of the labeled data for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    labeled_review_vectors,
    labeled_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the averaged vectors.
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the classifier on the held-out validation split.
y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'验证集准确率: {accuracy:.4f}')


验证集准确率: 0.8656


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Split the labeled vectors into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(labeled_review_vectors, labeled_data['sentiment'], test_size=0.2, random_state=42)

# Train a random forest. random_state is fixed (matching the split seed) so
# the reported accuracy is reproducible from run to run.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

# Predict on the validation set.
y_pred_rf = model_rf.predict(X_val)

# Compute the validation accuracy.
accuracy_rf = accuracy_score(y_val, y_pred_rf)
print(f'验证集准确率: {accuracy_rf:.4f}')

验证集准确率: 0.8422
