# 一、数据处理


## 1. 读取数据

In [32]:
import pandas as pd

# Labeled training reviews. quoting=3 is csv.QUOTE_NONE: double quotes inside
# the review text are kept as literal characters instead of being parsed.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
# Unlabeled test reviews, read with the same settings.
test = pd.read_csv(
    "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

# Preview the first 5 rows of the training set
print(train.head(5))
# Dimensions of both data sets
print(train.shape)
print(test.shape)
# Column names of the training set
print(train.columns.values)
# Full text of the first review
print(train.loc[0, "review"])

         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...
(25000, 3)
(25000, 2)
['id' 'sentiment' 'review']
"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages a

## 2. 数据清洗

In [37]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# Filled on first use by _english_stop_words(); stays None until then.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # A set gives constant-time membership tests in the filter below.
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def review_to_words(raw_review):
    """Turn one raw review into a list of cleaned, lowercase words.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lowercase, split on whitespace, and drop English
    stop words.

    Args:
        raw_review: The review text, possibly containing HTML markup.

    Returns:
        A list of the remaining words, in their original order.
    """
    # Remove HTML tags
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Replace punctuation and digits with spaces
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)
    # Lowercase and tokenize
    words = letters_only.lower().split()
    # The stop-word set is loaded once and reused across calls rather than
    # being rebuilt for every review.
    stop_words = _english_stop_words()
    return [w for w in words if w not in stop_words]


# train["review_cleaned"] = train["review"].apply(review_to_words)
# print(train["review_cleaned"].head())

def _clean_reviews(reviews, label):
    """Clean every review in a pandas Series and return the results as strings.

    Each review is passed through review_to_words() and the resulting words are
    re-joined with single spaces. A progress line is printed after every 1000
    reviews.

    Args:
        reviews: pandas Series holding the raw review texts.
        label: Prefix for the progress message (e.g. "train" or "test").

    Returns:
        A list of cleaned review strings, in the same order as `reviews`.
    """
    total = reviews.size
    cleaned = []
    # Iterate over the values directly: this does not depend on the Series
    # having a 0..n-1 index and avoids a label lookup per row.
    for position, raw_review in enumerate(reviews, start=1):
        if position % 1000 == 0:
            print("%s review %d of %d\n" % (label, position, total))
        cleaned.append(" ".join(review_to_words(raw_review)))
    return cleaned


# Clean the training set
train_num_reviews = train["review"].size
clean_train_review = _clean_reviews(train["review"], "train")

# Clean the test set
test_num_reviews = test["review"].size
clean_test_review = _clean_reviews(test["review"], "test")

print(clean_train_review[0])

  review_text = BeautifulSoup(raw_review, "html.parser").get_text()


train review 1000 of 25000

train review 2000 of 25000

train review 3000 of 25000

train review 4000 of 25000

train review 5000 of 25000

train review 6000 of 25000

train review 7000 of 25000

train review 8000 of 25000

train review 9000 of 25000

train review 10000 of 25000

train review 11000 of 25000

train review 12000 of 25000

train review 13000 of 25000

train review 14000 of 25000

train review 15000 of 25000

train review 16000 of 25000

train review 17000 of 25000

train review 18000 of 25000

train review 19000 of 25000

train review 20000 of 25000

train review 21000 of 25000

train review 22000 of 25000

train review 23000 of 25000

train review 24000 of 25000

train review 25000 of 25000

test review 1000 of 25000

test review 2000 of 25000

test review 3000 of 25000

test review 4000 of 25000

test review 5000 of 25000

test review 6000 of 25000

test review 7000 of 25000

test review 8000 of 25000

test review 9000 of 25000

test review 10000 of 25000

test review 1

## 3. 创建特征向量

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer capped at 5000 vocabulary entries. The reviews in
# clean_train_review / clean_test_review are already cleaned, so no tokenizer,
# preprocessor or stop-word list is configured here.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary on the training reviews and convert the counts to a
# dense array.
train_data_features = vectorizer.fit_transform(clean_train_review).toarray()
# Encode the test reviews with the vocabulary learned above.
test_data_features = vectorizer.transform(clean_test_review).toarray()

# Vocabulary terms
vocab = vectorizer.get_feature_names_out()

print(vocab)
print(train_data_features.shape)

['abandoned' 'abc' 'abilities' ... 'zombie' 'zombies' 'zone']
(25000, 5000)


# 二、模型训练

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, and therefore the predictions, are identical on
# every run of the notebook.
RANDOM_SEED = 42

# Random forest with 100 trees
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
# Fit on the bag-of-words features against the sentiment labels; fit() trains
# the estimator in place, so no reassignment is needed.
forest.fit(train_data_features, train["sentiment"])

# 三、模型预测

In [40]:
# Predict a sentiment label for every test review
result = forest.predict(test_data_features)

# Two-column frame: the review id followed by its predicted sentiment
output = test[["id"]].assign(sentiment=result)

# Save the predictions to a CSV file. quoting=3 is csv.QUOTE_NONE, so the
# values are written exactly as stored, with no quote characters added.
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)