<h1> Movie review classification with NLTK </h1>

In [4]:
import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [5]:
cats = movie_reviews.categories()
reviews = []
for cat in cats:
    for fid in movie_reviews.fileids(cat):
        review = (list(movie_reviews.words(fid)),cat)
        reviews.append(review)
random.shuffle(reviews)

In [8]:
all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
top_wd_in_reviews = [list(wds) for wds in zip(*all_wd_in_reviews.most_common(2000))][0]

In [14]:
def ext_ft(review,top_words):
    review_wds = set(review)
    ft = {}
    for wd in top_words:
        ft['word_present({})'.format(wd)] = (wd in review_wds)
    return ft

In [15]:
featuresets = [(ext_ft(d,top_wd_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]

In [16]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.79


In [17]:
classifier.show_most_informative_features(20)

Most Informative Features
    word_present(seagal) = True              neg : pos    =     12.5 : 1.0
word_present(outstanding) = True              pos : neg    =     10.1 : 1.0
     word_present(mulan) = True              pos : neg    =      8.9 : 1.0
word_present(wonderfully) = True              pos : neg    =      7.3 : 1.0
    word_present(wasted) = True              neg : pos    =      6.1 : 1.0
     word_present(awful) = True              neg : pos    =      6.1 : 1.0
    word_present(poorly) = True              neg : pos    =      5.8 : 1.0
     word_present(flynt) = True              pos : neg    =      5.6 : 1.0
      word_present(lame) = True              neg : pos    =      5.5 : 1.0
     word_present(damon) = True              pos : neg    =      5.4 : 1.0
word_present(ridiculous) = True              neg : pos    =      5.3 : 1.0
       word_present(era) = True              pos : neg    =      5.1 : 1.0
      word_present(jedi) = True              pos : neg    =      4.9 : 1

In [18]:
dict_vectorizer=None
def get_train_test(train_set,test_set):
    global dict_vectorizer
    dict_vectorizer = DictVectorizer(sparse=False)
    X_train, y_train = zip(*train_set)
    X_train = dict_vectorizer.fit_transform(X_train)
    X_test,y_test = zip(*test_set)
    X_test = dict_vectorizer.transform(X_test)
    return X_train,X_test,y_train,y_test

In [19]:
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [20]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.785


In [21]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
all_words_in_reviews = nltk.FreqDist(word.lower() for word in movie_reviews.words() if word not in stopwords_list)
top_words_in_reviews = [list(words) for words in zip(*all_words_in_reviews.most_common(2000))][0]

In [22]:
featuresets = [(ext_ft(d,top_words_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)

In [23]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [24]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.825


In [25]:
features_list = zip(dict_vectorizer.get_feature_names(),rf.feature_importances_)
features_list = sorted(features_list, key=lambda x: x[1], reverse=True)
print(features_list[0:20])

[('word_present(bad)', 0.012904816953952729), ('word_present(boring)', 0.006797056379259946), ('word_present(stupid)', 0.006742453545126172), ('word_present(awful)', 0.00605732124427093), ('word_present(worst)', 0.005618499631730539), ('word_present(waste)', 0.005091242651240423), ('word_present(supposed)', 0.005019844359438753), ('word_present(excellent)', 0.005002846831984908), ('word_present(mess)', 0.004735341799753426), ('word_present(wasted)', 0.004477280752464545), ('word_present(ridiculous)', 0.00435578373608493), ('word_present(lame)', 0.00404257877140679), ('word_present(also)', 0.003663095965733155), ('word_present(others)', 0.0035194019538410553), ('word_present(dull)', 0.003464806019875671), ('word_present(plot)', 0.0034406946286116035), ('word_present(nothing)', 0.0033285487918061265), ('word_present(performances)', 0.003286015291474251), ('word_present(outstanding)', 0.0032708132090801516), ('word_present(memorable)', 0.003265718932501386)]
