In [2]:
import sklearn
import pandas as pd
import numpy as np
from sklearn import metrics
import nltk
from nltk.tokenize import word_tokenize

In [3]:
train_file = "./sentiment-analysis-on-movie-reviews/train.tsv"
test_file = "./sentiment-analysis-on-movie-reviews/test.tsv"
train = pd.read_csv(train_file,sep = '\t')
test = pd.read_csv(test_file, sep = '\t')

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [5]:
#tf-idf中的idf是逆文档频率（inverse document frequency），idf = log(总文档数/包含有该词的文档数+1)，对热门的词起到一个惩罚作用；越热门idf值越小，tf*idf后的结果也就越小      
count_vec = CountVectorizer(ngram_range=(1,3), analyzer='word', stop_words='english', min_df=0.001)#提取特征向量,包括ngram1,2,3
tfidf_vec = TfidfVectorizer(ngram_range=(1,3), analyzer='word', stop_words='english', min_df=0.001)#提取特征向量
train['Phrase'] = train['Phrase'].str.lower()#先变成小写字母
train['tokenize'] = train.apply(lambda x: nltk.word_tokenize(x['Phrase']),axis=1)#分词，为下一步提词干作准备

In [6]:
#nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")#英文停用词有内置的
train['tokenize'] = train['tokenize'].apply(lambda x: [stemmer.stem(y) for y in x])#提词干，相当于一个降噪
train['tokenize'] = train['tokenize'].apply(lambda x: ' '.join(x))

In [7]:
x = train.tokenize
y = train.Sentiment
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [8]:
train_feature = count_vec.fit_transform(x_train)#124848x749
test_feature = count_vec.transform(x_test)#这里不是fit_transform,而是transform;这里得到的是词在文本中的出现频率的稀疏矩阵，例如test_feature就是一个31212x749的矩阵，31212是总文本长度，749是特征数
#train_feature = tfidf_vec.fit_transform(x_train)
#test_feature = tfidf_vec.transform(x_test)

In [9]:
#词汇表
print('\nvocabulary list:\n\n',count_vec.get_feature_names())
print( '\nvocabulary dic :\n\n',count_vec.vocabulary_)#{word:id}
print ('vocabulary:\n\n')
for key,value in count_vec.vocabulary_.items():
    print(key,value)
# print('\nvocabulary list:\n\n',tfidf_vec.get_feature_names())
# print( '\nvocabulary dic :\n\n',tfidf_vec.vocabulary_)
# print ('vocabulary:\n\n')
# for key,value in tfidf_vec.vocabulary_.items():
#     print(key,value)


vocabulary list:

 ['10', '90', '90 minut', 'abl', 'abov', 'achiev', 'act', 'action', 'actor', 'actress', 'actual', 'adapt', 'add', 'admir', 'adult', 'adventur', 'affect', 'ag', 'age', 'air', 'alien', 'allow', 'alreadi', 'altern', 'alway', 'ambiti', 'america', 'american', 'amus', 'ani', 'anim', 'anoth', 'anyon', 'anyth', 'appeal', 'appear', 'approach', 'art', 'artist', 'ask', 'atmospher', 'attempt', 'attent', 'attract', 'audienc', 'aw', 'away', 'bad', 'balanc', 'bare', 'battl', 'bear', 'beat', 'beauti', 'becaus', 'becom', 'befor', 'begin', 'believ', 'best', 'better', 'big', 'big screen', 'bit', 'bite', 'black', 'bland', 'blood', 'bodi', 'book', 'bore', 'boy', 'brain', 'break', 'brilliant', 'bring', 'british', 'brother', 'budget', 'burn', 'busi', 'ca', 'camera', 'captur', 'car', 'care', 'career', 'carri', 'cartoon', 'case', 'cast', 'celebr', 'centuri', 'certain', 'challeng', 'chang', 'charact', 'character', 'charm', 'chase', 'cheap', 'children', 'chill', 'cinema', 'cinemat', 'citi', 'c

In [10]:
print(train_feature.toarray()[2])
#print(len(count_vec.get_feature_names()))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [11]:
#print("Number of features:",count_vec.get_feature_names())
#朴素贝叶斯
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_feature, y_train)
y_pred = nb.predict(test_feature)
print('NB:', metrics.accuracy_score(y_test, y_pred))
#SVM
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

#SVM = SGDClassifier(loss='hinge')#SGD用minibatch做梯度下降
SVM = SVC(kernel='rbf', class_weight='balanced')#Radial Based Function
SVM.fit(train_feature, y_train)
y_pred_class = SVM.predict(test_feature)
print('SVM:', metrics.accuracy_score(y_test, y_pred))
#随机森林
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 20)
rfc.fit(train_feature, y_train)
y_pred = rfc.predict(test_feature)
print('RF:',metrics.accuracy_score(y_test, y_pred))
#这里用tfidf的结果反而不如直接用词频

NB: 0.5689798795335127




SVM: 0.5689798795335127
RF: 0.6146674356016917




RF: 0.6124567474048442
