In [1]:
from nltk.corpus import movie_reviews

In [2]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\tushant\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
# movie_reviews.fileids()

## Cleaning the dataset

In [5]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
# We are shuffling the dataset
import random
random.shuffle(documents)
documents[0:5]

[(['man', ',', 'this', 'was', 'one', 'wierd', 'movie', ...], 'neg'),
 (['it', 'may', 'seem', 'weird', 'to', 'begin', 'a', ...], 'pos'),
 (['charlie', 'sheen', 'stars', 'as', 'zane', ',', 'a', ...], 'pos'),
 (['finding', 'the', 'courage', 'to', 'face', 'life', ...], 'pos'),
 (['director', 'jan', 'de', 'bont', 'certainly', 'knows', ...], 'pos')]

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [8]:
from nltk.corpus import stopwords
import string 
stop = stopwords.words("english")
punctuation = list(string.punctuation)
stop += punctuation

In [9]:
from nltk.corpus import wordnet
from nltk import pos_tag
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN

In [10]:
# Now we have to aplly the Lemmatization to each word and also removing the stop words.
# To do that we are writing a function
def clean(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [None]:
documents_clean = [(clean(document), category) for document, category in documents]

In [None]:
documents_clean[0]

In [None]:
train_documents = documents_clean[0:1500]
test_documents = documents_clean[1500:]

In [None]:
all_words = []
for document in train_documents:
    all_words += document[0] 

In [None]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [x[0] for x in common]

In [None]:
features

In [None]:
def get_feature_dict(word):
    current_feature = {}
    word_set = set(word)
    for w in features:
        current_feature[w] = w in word_set
    return current_feature

In [None]:
get_feature_dict(train_documents[0][0])

In [None]:
training_data = [ [get_feature_dict(document),category] for document,category in train_documents]

In [None]:
test_data = [ [get_feature_dict(document),category] for document,category in test_documents]

In [None]:
# Applying the classifier
from nltk import NaiveBayesClassifier

In [None]:
clg = NaiveBayesClassifier.train(training_data)

In [None]:
nltk.classify.accuracy(clg, test_data)

In [None]:
clg.show_most_informative_features(15)

## Using sklearn classifier to classify the the text in nltk.

In [None]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier