# 一、数据处理

## 1. 读取数据

In [1]:
import pandas as pd

# Each file is a tab-separated table whose first row is the header.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews stay literal.
train = pd.read_csv(
    "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3
)
test = pd.read_csv(
    "testData.tsv", header=0, delimiter="\t", quoting=3
)
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3
)

# Report how many reviews each table holds, one count per line.
print(
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
    sep="\n",
)

25000
25000
50000


## 2. 数据清洗

In [2]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
def review_to_wordlist(raw_review, remove_stopwords=False):
    """Convert one raw review (or a single sentence) into lowercase words.

    Args:
        raw_review: Text that may contain HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words. Leave it
            False when preparing Word2Vec training input, where stop words
            are best kept.

    Returns:
        list[str]: Lowercase alphabetic tokens in their original order.
    """
    # Strip HTML tags and keep only the text content.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Replace every character that is not an ASCII letter (punctuation,
    # digits, ...) with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Lowercase, then split on whitespace.
    words = letters_only.lower().split()
    if remove_stopwords:
        # Build the stop-word set once and cache it on the function object;
        # re-reading the NLTK word list for every review is wasted work.
        stop_words = getattr(review_to_wordlist, "_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words("english"))
            review_to_wordlist._stop_words = stop_words
        words = [w for w in words if w not in stop_words]
    return words

## 3. 划分句子

In [3]:
import nltk

# Load NLTK's pre-trained Punkt sentence tokenizer for English.
# NOTE(review): presumably requires the "punkt" data package to have been
# downloaded beforehand — confirm for the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Args:
        review: Raw review text.
        tokenizer: Sentence tokenizer exposing ``tokenize(text)``.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        list[list[str]]: One word list per non-empty sentence, in order.
    """
    # Cut the stripped review into sentences, skip empty ones, and clean
    # each remaining sentence into its list of words.
    return [
        review_to_wordlist(chunk, remove_stopwords=remove_stopwords)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0
    ]

# Build the Word2Vec training input: every sentence of every labeled and
# unlabeled training review, with stop words kept.
sentences = []
for corpus in (train, unlabeled_train):
    for review in corpus["review"]:
        sentences.extend(
            review_to_sentences(review, tokenizer, remove_stopwords=False)
        )
# Show the number of sentences followed by the first two tokenized sentences.
print(len(sentences), sentences[0], sentences[1], sep="\n")

  review_text = BeautifulSoup(raw_review, "html.parser").get_text()
  review_text = BeautifulSoup(raw_review, "html.parser").get_text()


796172
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']
['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


# 二、模型训练

In [14]:
import logging
from gensim.models import word2vec
# Show INFO-level log messages (gensim reports training progress there),
# each prefixed with a timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Model hyper-parameters
num_features = 300     # dimensionality of each word vector
min_word_count = 40    # keep only words that occur at least 40 times
num_workers = 4        # number of worker threads used for training
context = 10           # context window size
down_sampling = 1e-3   # down-sampling threshold for frequent words

# gensim's default architecture is CBOW (sg=0), not skip-gram. sg is spelled
# out here so the choice is visible; pass sg=1 to train skip-gram instead.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=down_sampling,
    sg=0,
)

# Save the trained model under a name that records its main settings.
model_name = "300features_40minwords_10context"
model.save(model_name)

2025-10-15 11:07:58,933 : INFO : collecting all words and their counts
2025-10-15 11:07:58,934 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-10-15 11:07:58,966 : INFO : PROGRESS: at sentence #10000, processed 225664 words, keeping 17775 word types
2025-10-15 11:07:58,996 : INFO : PROGRESS: at sentence #20000, processed 451738 words, keeping 24945 word types
2025-10-15 11:07:59,022 : INFO : PROGRESS: at sentence #30000, processed 670858 words, keeping 30027 word types
2025-10-15 11:07:59,052 : INFO : PROGRESS: at sentence #40000, processed 896840 words, keeping 34335 word types
2025-10-15 11:07:59,092 : INFO : PROGRESS: at sentence #50000, processed 1116081 words, keeping 37751 word types
2025-10-15 11:07:59,124 : INFO : PROGRESS: at sentence #60000, processed 1337543 words, keeping 40711 word types
2025-10-15 11:07:59,156 : INFO : PROGRESS: at sentence #70000, processed 1560306 words, keeping 43311 word types
2025-10-15 11:07:59,196 : INFO : PROGRESS: 

# 三、模型效果测试

In [20]:
# For each group, print the word that fits least with the others.
# Expected answers, in order: "kitchen", "berlin", "austria".
for word_group in (
    "man woman child kitchen",
    "france england germany berlin",
    "paris berlin london austria",
):
    print(model.wv.doesnt_match(word_group.split()))

kitchen
berlin
austria


In [24]:
# Semantic-similarity check: print the most similar words for each query.
for query in ("man", "awful"):
    print(model.wv.most_similar(query))

[('woman', 0.597867488861084), ('lady', 0.5709412693977356), ('lad', 0.5448217988014221), ('monk', 0.5355395674705505), ('soldier', 0.5192042589187622), ('farmer', 0.5134194493293762), ('businessman', 0.5132336020469666), ('guy', 0.510189950466156), ('men', 0.5049229264259338), ('millionaire', 0.5029137134552002)]
[('terrible', 0.7625946998596191), ('horrible', 0.7238616347312927), ('atrocious', 0.7134142518043518), ('dreadful', 0.7055865526199341), ('abysmal', 0.6959513425827026), ('horrendous', 0.6813711524009705), ('appalling', 0.6618828773498535), ('horrid', 0.6542140245437622), ('lousy', 0.617429792881012), ('laughable', 0.6039625406265259)]


# 四、模型应用

## 1. 加载模型

In [4]:
from gensim.models import Word2Vec
# Reload the model saved by the training step.
model = Word2Vec.load("300features_40minwords_10context")
# model.wv.vectors stores the feature vector of every word; show the
# container type and its shape.
print(type(model.wv.vectors), model.wv.vectors.shape, sep="\n")
# Vector representation of the word "flower".
print(model.wv["flower"])

<class 'numpy.ndarray'>
(16490, 300)
[-1.19871274e-01  5.72454214e-01 -8.74414593e-02 -8.96268617e-03
  2.58887142e-01  2.08539784e-01  8.73589069e-02  1.18361644e-01
  3.18020821e-01 -1.92725658e-01 -1.57137558e-01  3.02065950e-04
  3.02821137e-02  2.56524414e-01 -9.65133682e-02  8.24111402e-02
 -1.54083237e-01 -4.33911949e-01  2.14469917e-02  1.19003229e-01
 -1.69126526e-01  9.99797136e-02  2.73753405e-01  2.07810163e-01
  6.10518694e-01 -1.43236015e-02 -5.21949649e-01  1.14180721e-01
  1.56046227e-01  9.39961821e-02  2.79568464e-01 -1.41498566e-01
 -9.50638875e-02 -3.09604704e-01 -3.21064085e-01 -1.11517929e-01
  4.28073317e-01 -2.57438987e-01 -1.33117929e-01  3.76034528e-01
 -2.19613284e-01  2.04722106e-01  3.61238539e-01 -2.50527591e-01
 -3.57289404e-01  4.88126129e-01  1.85666725e-01  4.15443391e-01
 -4.11319762e-01  4.53914672e-01  6.38770103e-01  8.83016810e-02
 -1.62773967e-01  1.17148414e-01 -2.22932115e-01  4.46077794e-01
  1.67725056e-01  1.65523648e-01 -3.94457310e-01 -1.3

## 2. 向量平均法
使用 Word2Vec 模型将评论转换为数值向量(300维)，然后用随机森林进行情感分析

In [10]:
import numpy as np
def makeFeatureVec(words, model, num_features):
    """Average the Word2Vec vectors of the in-vocabulary words of one review.

    Args:
        words: Iterable of tokens belonging to a single review.
        model: Trained Word2Vec model; its ``wv`` attribute supplies the
            vocabulary and the word vectors.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: Mean vector of length ``num_features``. A zero vector
        is returned when none of the words is in the vocabulary.
    """
    featureVec = np.zeros(num_features, dtype="float32")
    # key_to_index is the model's word -> row mapping, so membership tests
    # are O(1) without rebuilding a set of the whole vocabulary per call.
    vocabulary = model.wv.key_to_index
    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])
    if nwords == 0:
        # Dividing by zero here would yield a NaN vector, which downstream
        # scikit-learn estimators reject; fall back to the zero vector.
        return featureVec
    return np.divide(featureVec, nwords)

def getAvgFeatureVec(reviews, model, num_features):
    """Build a 2-D matrix holding the averaged word vector of every review.

    Args:
        reviews: List whose elements are word lists (one per review).
        model: Trained Word2Vec model handed on to ``makeFeatureVec``.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: float32 array of shape ``(len(reviews), num_features)``;
        row ``i`` is the averaged vector of ``reviews[i]``.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    for row, tokens in enumerate(reviews):
        # Print a progress line every 1000 reviews.
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))
        reviewFeatureVecs[row] = makeFeatureVec(tokens, model, num_features)
    return reviewFeatureVecs

# Averaged word-vector features for the training and test reviews.
num_features = 300
# Word list of every training review, with stop words removed.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]
trainDataVecs = getAvgFeatureVec(clean_train_reviews, model, num_features)
# Same processing for the test reviews.
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]
testDataVecs = getAvgFeatureVec(clean_test_reviews, model, num_features)


  review_text = BeautifulSoup(raw_review, "html.parser").get_text()


Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 o

In [11]:
# 构造随机森林
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. random_state fixes the estimator's internal
# randomness so that re-running the notebook yields the same predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# Train on the averaged Word2Vec vectors of the labeled reviews.
forest.fit(trainDataVecs, train["sentiment"])
# Predict the sentiment of every test review.
result = forest.predict(testDataVecs)
# Pair each test id with its predicted sentiment.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# Write the predictions to CSV; quoting=3 is csv.QUOTE_NONE, so no quote
# characters are added around the values.
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)

## 3. 聚类法 k-means

In [12]:
from sklearn.cluster import KMeans

word_vectors = model.wv.vectors
# Use one fifth of the vocabulary size as k, i.e. about five words per
# cluster; integer division keeps k an int and max() keeps it at least 1.
num_clusters = max(1, word_vectors.shape[0] // 5)
# random_state fixes the centroid initialisation so that cluster ids are
# reproducible across runs.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=42)
# Cluster id assigned to each word vector.
idx = kmeans_clustering.fit_predict(word_vectors)
# Map every vocabulary word to the id of the cluster it belongs to.
word_centroid_map = dict(zip(model.wv.index_to_key, idx))

In [13]:
# Print the member words of the first ten clusters.
for cluster in range(10):
    print("Cluster %d of %d" % (cluster, num_clusters))
    # Collect every word whose assigned cluster id equals this one.
    words = [
        word
        for word, word_cluster in word_centroid_map.items()
        if word_cluster == cluster
    ]
    print(words)

Cluster 0 of 3298
['franco']
Cluster 1 of 3298
['magazine', 'reunion', 'nyc', 'brooklyn', 'attend', 'boston', 'studying', 'gallery', 'attended', 'graduate', 'attending', 'prom', 'boarding', 'elementary', 'annual', 'freshman', 'grad']
Cluster 2 of 3298
['fest']
Cluster 3 of 3298
['harrison', 'wallace']
Cluster 4 of 3298
['ridiculous', 'laughable', 'absurd', 'ludicrous']
Cluster 5 of 3298
['pity', 'producing', 'flop', 'showcase', 'caliber', 'contribution', 'breakthrough']
Cluster 6 of 3298
['sandra', 'dee', 'jolie', 'angelina', 'bullock']
Cluster 7 of 3298
['doomed', 'distant', 'raising', 'shared', 'challenges', 'sharing', 'eternal', 'immediate', 'continuing', 'dilemma', 'mutual', 'sadako', 'youthful', 'questioning', 'experiencing', 'trials', 'realization', 'enduring', 'eventual', 'sacrifices', 'embrace', 'gandhi', 'fragile', 'impending', 'keane', 'asoka', 'devotion', 'torment', 'maturity', 'endured', 'denial', 'forgiveness', 'fulfilling', 'marital', 'temptation', 'sorrow', 'existential'

In [14]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    """Count how many words of a review fall into each cluster.

    Args:
        wordlist: Words of a single review.
        word_centroid_map: Mapping from word to its cluster id.

    Returns:
        numpy.ndarray: float32 vector with one entry per cluster (largest
        cluster id + 1 entries); entry ``i`` is the number of words in
        ``wordlist`` that belong to cluster ``i``. Words missing from the
        mapping are ignored.
    """
    # The number of clusters is one more than the highest cluster id.
    size = max(word_centroid_map.values()) + 1
    histogram = np.zeros(size, dtype="float32")
    # Look up the cluster id of every known word, then tally occurrences.
    cluster_ids = (
        word_centroid_map[token]
        for token in wordlist
        if token in word_centroid_map
    )
    for cluster_id in cluster_ids:
        histogram[cluster_id] += 1
    return histogram

In [15]:
import numpy as np

# Bag-of-centroids matrix for the training set: one row per review and one
# column per cluster.
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)
# Same layout for the test set.
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [16]:
# Random forest with 100 trees. random_state fixes the estimator's internal
# randomness so that re-running the notebook yields the same predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# Train on the bag-of-centroids features of the labeled reviews.
forest.fit(train_centroids, train["sentiment"])
# Predict the sentiment of every test review.
result = forest.predict(test_centroids)
# Save the predictions; quoting=3 is csv.QUOTE_NONE, so no quote characters
# are added around the values.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)