In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.models import Word2Vec
import jieba
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
label_train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
unlabel_train = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3)

In [26]:
label_train

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
...,...,...,...
24995,"""3453_3""",0,"""It seems like more consideration has gone int..."
24996,"""5064_1""",0,"""I don't believe they made this film. Complete..."
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui..."
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the..."


In [27]:
# 文本清洗函数：去除HTML标签和非字母字符
def clean_review(text):
    text = re.sub(r'<.*?>', '', text)  # 移除HTML标签
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # 只保留字母
    return text.lower()

# 对每个评论进行清洗
label_train['review'] = label_train['review'].apply(clean_review)
test['review'] = test['review'].apply(clean_review)

In [28]:
len(label_train['review'][0])

2256

In [29]:
#文本向量化
vectorizer = TfidfVectorizer(max_features=5000)  # 设置最大词汇数量
train_vectors = vectorizer.fit_transform(label_train['review'])
test_vectors = vectorizer.transform(test['review'])

In [30]:
test

Unnamed: 0,id,review
0,"""12311_10""",naturally in a film who s main themes are of ...
1,"""8348_2""",this movie is a disaster within a disaster fi...
2,"""5828_4""",all in all this is a movie for kids we saw ...
3,"""7186_2""",afraid of the dark left me with the impressio...
4,"""12128_7""",a very accurate depiction of small time mob l...
...,...,...
24995,"""2155_10""",sony pictures classics i m looking at you s...
24996,"""59_10""",i always felt that ms merkerson had never go...
24997,"""2531_1""",i was so disappointed in this movie i am ver...
24998,"""7772_8""",from the opening sequence filled with black ...


模型

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 拆分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(train_vectors, label_train['sentiment'], test_size=0.2, random_state=42)

# 使用逻辑回归模型训练
model = LogisticRegression()
model.fit(X_train, y_train)

# 验证集预测
y_pred = model.predict(X_val)

# 计算准确率
accuracy = accuracy_score(y_val, y_pred)
print(f'验证集准确率: {accuracy:.4f}')


验证集准确率: 0.8884


In [37]:
# 拆分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(train_vectors, label_train['sentiment'], test_size=0.2, random_state=42)

# 使用随机森林模型训练
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

# 验证集预测
y_pred_rf = model_rf.predict(X_val)

# 计算准确率
accuracy_rf = accuracy_score(y_val, y_pred_rf)
print(f'验证集准确率: {accuracy_rf:.4f}')

验证集准确率: 0.8354


In [45]:
import csv
#选逻辑回归
test_pred = model.predict(test_vectors)

submission = pd.DataFrame({'id': test['id'].str.strip('"'), 'sentiment': test_pred})
submission.to_csv('submission.csv', index=False, quoting=csv.QUOTE_ALL)


In [46]:
submission

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1
...,...,...
24995,2155_10,1
24996,59_10,1
24997,2531_1,0
24998,7772_8,1
