In [19]:
import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [20]:
# Build one (tokens, label) pair per review file. The files are read category
# by category, so the list is shuffled before the positional train/test split
# taken further down.
SHUFFLE_SEED = 42  # fixed so the split, and the accuracies reported below, can be reproduced
cats = movie_reviews.categories()
reviews = [
    (list(movie_reviews.words(fid)), cat)
    for cat in cats
    for fid in movie_reviews.fileids(cat)
]
# A dedicated Random instance keeps the shuffle deterministic without
# reseeding the module-level generator.
random.Random(SHUFFLE_SEED).shuffle(reviews)

In [21]:
# Case-folded token counts over the whole corpus; the most frequent tokens
# become the feature vocabulary.
all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
# most_common() yields (word, count) pairs in rank order; keep only the words.
# (Direct unpacking replaces the zip-transpose-then-[0] form, which built an
# unused list of counts and raised IndexError on an empty distribution.)
top_wd_in_reviews = [wd for wd, _count in all_wd_in_reviews.most_common(2000)]

In [22]:
def ext_ft(review,top_words):
    """Build a bag-of-words presence feature dict for one review.

    Args:
        review: iterable of tokens making up the review.
        top_words: iterable of vocabulary words to test for.

    Returns:
        dict mapping ``'word_present(<word>)'`` to ``True`` when ``<word>``
        occurs in ``review`` and ``False`` otherwise, one entry per word in
        ``top_words`` (in that order).
    """
    # Deduplicate once so each membership test below is a set lookup.
    tokens_seen = set(review)
    return {
        'word_present({})'.format(candidate): (candidate in tokens_seen)
        for candidate in top_words
    }

In [23]:
# Turn every (tokens, label) review into a (feature dict, label) pair.
featuresets = [
    (ext_ft(tokens, top_wd_in_reviews), label)
    for tokens, label in reviews
]
# Positional split: the first 200 pairs are held out for testing, the rest train.
test_set = featuresets[:200]
train_set = featuresets[200:]

In [24]:
# Fit NLTK's Naive Bayes classifier on the training pairs and report its
# accuracy on the held-out pairs.
classifier = NaiveBayesClassifier.train(train_set)
nb_accuracy = nltk.classify.accuracy(classifier, test_set)
print(nb_accuracy)

0.84


In [25]:
# Print the 20 features the Naive Bayes model ranks as most informative.
classifier.show_most_informative_features(n=20)

Most Informative Features
word_present(outstanding) = True              pos : neg    =     10.2 : 1.0
    word_present(seagal) = True              neg : pos    =      7.1 : 1.0
     word_present(mulan) = True              pos : neg    =      6.9 : 1.0
word_present(wonderfully) = True              pos : neg    =      6.7 : 1.0
    word_present(poorly) = True              neg : pos    =      5.7 : 1.0
     word_present(flynt) = True              pos : neg    =      5.6 : 1.0
      word_present(lame) = True              neg : pos    =      5.5 : 1.0
    word_present(wasted) = True              neg : pos    =      5.4 : 1.0
     word_present(damon) = True              pos : neg    =      5.2 : 1.0
word_present(ridiculous) = True              neg : pos    =      5.2 : 1.0
     word_present(awful) = True              neg : pos    =      5.1 : 1.0
     word_present(waste) = True              neg : pos    =      5.0 : 1.0
     word_present(worst) = True              neg : pos    =      4.7 : 1

In [26]:
# Module-level handle to the most recently fitted vectoriser; it is rebound on
# every call to get_train_test().
dict_vectorizer=None
def get_train_test(train_set,test_set):
    """Vectorise (feature dict, label) pairs into arrays for scikit-learn.

    A fresh ``DictVectorizer(sparse=False)`` is fitted on the training feature
    dicts and then applied to the test feature dicts. The fitted vectoriser is
    stored in the module-level ``dict_vectorizer``.

    Args:
        train_set: sequence of ``(feature_dict, label)`` pairs used for fitting.
        test_set: sequence of ``(feature_dict, label)`` pairs to transform only.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the two vectorised feature
        matrices followed by the two label tuples.
    """
    global dict_vectorizer
    dict_vectorizer = DictVectorizer(sparse=False)

    # Training side: split pairs into parallel tuples, then fit + transform.
    train_dicts, train_labels = zip(*train_set)
    train_matrix = dict_vectorizer.fit_transform(train_dicts)

    # Test side: reuse the vocabulary learned from the training dicts.
    test_dicts, test_labels = zip(*test_set)
    test_matrix = dict_vectorizer.transform(test_dicts)

    return train_matrix, test_matrix, train_labels, test_labels

In [27]:
# Vectorise the current split, then fit a random forest on the training part.
X_train, X_test, y_train, y_test = get_train_test(train_set, test_set)
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=4,
    random_state=10,  # fixed seed for the forest itself
)
rf.fit(X_train, y_train)

RandomForestClassifier(n_jobs=4, random_state=10)

In [28]:
# Score the random forest on the held-out reviews.
preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, preds)
print(rf_accuracy)

0.795


In [29]:
from nltk.corpus import stopwords

# Rebuild the vocabulary with English stop words removed.
stopwords_list = stopwords.words('english')
# Set lookup is O(1); scanning the list once per corpus token is needlessly slow.
stopword_set = set(stopwords_list)
# Lower-case first so the stop-word test sees the same form that is counted
# (the previous filter checked the raw token, letting capitalised stop words through).
lowered_tokens = (word.lower() for word in movie_reviews.words())
all_words_in_reviews = nltk.FreqDist(
    token for token in lowered_tokens if token not in stopword_set
)
# most_common() yields (word, count) pairs in rank order; keep only the words.
top_words_in_reviews = [token for token, _count in all_words_in_reviews.most_common(2000)]

In [30]:
# Re-extract features against the stop-word-filtered vocabulary and redo the
# same positional split (first 200 pairs held out) and vectorisation.
featuresets = [
    (ext_ft(tokens, top_words_in_reviews), label)
    for tokens, label in reviews
]
test_set = featuresets[:200]
train_set = featuresets[200:]
X_train, X_test, y_train, y_test = get_train_test(train_set, test_set)

In [31]:
# Refit a random forest with the same hyper-parameters on the new feature matrix.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=4,
    random_state=10,
)
rf.fit(X_train, y_train)

RandomForestClassifier(n_jobs=4, random_state=10)

In [32]:
# Accuracy of the refitted forest on the held-out reviews.
preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, preds)
print(rf_accuracy)

0.78


In [33]:
# Pair each vectorised feature name with the forest's importance score and
# show the 20 highest-scoring features.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on older
# releases that lack it.
_feature_names_fn = (
    getattr(dict_vectorizer, 'get_feature_names_out', None)
    or dict_vectorizer.get_feature_names
)
features_list = sorted(
    zip(_feature_names_fn(), rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)
print(features_list[0:20])

[('word_present(bad)', 0.016739520639103028), ('word_present(mess)', 0.006629632281019339), ('word_present(stupid)', 0.006534607470282351), ('word_present(worst)', 0.006487818472443368), ('word_present(boring)', 0.005631611763307368), ('word_present(lame)', 0.005081177157544614), ('word_present(waste)', 0.004954570648885522), ('word_present(wasted)', 0.004256251522246547), ('word_present(awful)', 0.004213821004719616), ('word_present(dull)', 0.004163115309051399), ('word_present(supposed)', 0.004126325952489547), ('word_present(outstanding)', 0.0040604778013627486), ('word_present(ridiculous)', 0.0038293804751775074), ('word_present(excellent)', 0.0037160673869480604), ('word_present(memorable)', 0.003542931435602152), ('word_present(also)', 0.0033486414991687388), ('word_present(perfectly)', 0.0033066324754177324), ('word_present(life)', 0.0030930714572762635), ('word_present(best)', 0.0030499798779878517), ('word_present(plot)', 0.003039673355764874)]
