In [1]:
import nltk

# nltk.download('movie_reviews')

In [2]:
from nltk import pos_tag, NaiveBayesClassifier
from nltk.corpus import movie_reviews, wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import random
import string

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
len(movie_reviews.fileids())

2000

In [5]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
        
documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
random.shuffle(documents)

documents[:5]

[(['i', 'feel', 'sorry', 'for', 'the', 'financial', ...], 'neg'),
 (['you', 'know', ',', 'i', 'never', 'really', ...], 'neg'),
 (['can', 'you', 'say', '"', 'dated', '"', '?', 'you', ...], 'neg'),
 (['reflecting', 'on', '"', 'bedazzled', ',', '"', 'a', ...], 'pos'),
 (['a', 'group', 'of', 'high', 'school', 'kids', 'mix', ...], 'pos')]

In [8]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)

stops.update(punctuations)

In [10]:
lemmatizer = WordNetLemmatizer()

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [11]:
documents = [(clean_review(document), category) for document, category in documents]

In [12]:
# the data is randomly shuffled already

training_documents = documents[:1500]
testing_documents = documents[1500:]

In [13]:
all_words = []

for doc in training_documents:
    all_words += doc[0]

In [14]:
freq = nltk.FreqDist(all_words) # returns frequency distribution of words

common_words = freq.most_common(3000)

features = [i[0] for i in common_words]

In [15]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [16]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [17]:
clf = NaiveBayesClassifier.train(training_data)

In [18]:
nltk.classify.accuracy(clf, testing_data)

0.782

In [19]:
clf.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     18.4 : 1.0
               ludicrous = True              neg : pos    =     13.0 : 1.0
                  seagal = True              neg : pos    =     12.6 : 1.0
               stupidity = True              neg : pos    =     10.3 : 1.0
              schumacher = True              neg : pos    =     10.0 : 1.0
                   mulan = True              pos : neg    =      7.9 : 1.0
             beautifully = True              pos : neg    =      7.2 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                   anger = True              pos : neg    =      7.0 : 1.0
                 refresh = True              pos : neg    =      6.8 : 1.0
              ridiculous = True              neg : pos    =      6.5 : 1.0
                   perry = True              neg : pos    =      6.1 : 1.0
                   squad = True              neg : pos    =      6.1 : 1.0

# Sklearn using NLTK

In [27]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [24]:
svc = SVC()

classifier_sklearn = SklearnClassifier(svc)

In [25]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))>

In [26]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.822

In [28]:
rfc = RandomForestClassifier()

classifier_sklearn1 = SklearnClassifier(rfc)

In [29]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False))>

In [30]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.81

# Count Vectorization

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection

In [33]:
train_set = {'the sky is blue', "the sun is bright"}

count_vec = CountVectorizer(max_features=3)
a = count_vec.fit_transform(train_set)

a.todense()

matrix([[0, 1, 1],
        [1, 1, 1]], dtype=int64)

In [34]:
count_vec.get_feature_names()

['blue', 'is', 'the']

In [35]:
categories = [category for document, category in documents]

In [38]:
text_document = [" ".join(document) for document, category in documents]

In [63]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(text_document, categories)

In [84]:
count_vec = CountVectorizer(max_features=2000, ngram_range=(1,3))
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 7, 0, ..., 0, 3, 3],
        ...,
        [0, 0, 0, ..., 3, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [85]:
# print(count_vec.get_feature_names())

x_test_features = count_vec.transform(x_test)

In [86]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 85891 stored elements in Compressed Sparse Row format>

In [87]:
svc.fit(x_train_features, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [88]:
svc.score(x_test_features, y_test)

0.814