In [None]:
## Loading the data set from nltk itself

In [1]:
from nltk.corpus import movie_reviews

## exploring data

In [2]:
movie_reviews.categories() ## seeing the cat

['neg', 'pos']

In [3]:
len(movie_reviews.fileids())

2000

In [4]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
## storing all data into list and then shuffling it randomly

In [6]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
import random 
random.shuffle(documents)

In [8]:
documents[0:5]

[(['note', ':', 'some', 'may', 'consider', 'portions', ...], 'neg'),
 (['stendhal', "'", 's', 'syndrome', ':', 'a', ...], 'pos'),
 (['what', 'do', 'you', 'get', 'when', 'you', 'rip', '-', ...], 'neg'),
 (['plot', ':', 'a', 'group', 'of', 'asbestos', ...], 'pos'),
 (['here', "'", 's', 'a', 'word', 'analogy', ':', ...], 'pos')]

## removing the stop words and punctuations then Lemmatizing the remaing data

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [18]:
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))
import string
punctuations=list(string.punctuation)
stops.update(punctuations)


In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
from nltk import pos_tag
w='better'
pos_tag([w])#as it excepts data as array of words not a string or directly words

[('better', 'RBR')]

## Making a function to convert post_tag to general terminology that lemmatizer could understand

In [21]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [22]:
## making a function to get post tag, lemmatize then make a list to append it

In [23]:
def clean_review(words):
    output_words = []
    
    for w in words:
        if w.lower() not in stops: 
            pos = pos_tag([w])  # not lowering here since it might reduce the info
            clean_word = lemmatizer.lemmatize(w,pos= get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [24]:
documents = [(clean_review(document), category) for document, category  in documents ] 
## calling function for all data points

## Method 1 : building features for NLTK built in classifier

In [106]:
training_documents = documents[0:1500]  ## first doing train test split
test_documents = documents[1500:]
## since data is shuffled therefore can do it directly 
## also we make features from training data only

## making all the features(words) present in a single list

In [107]:
all_words = [] 
for doc in training_documents:
    all_words+=doc[0]  ## since doc is a tuple so first element will be added


In [108]:
import nltk

In [112]:
freq = nltk.FreqDist(all_words)  ## finds the freq distribution of each word
freq

FreqDist({'film': 8512, 'movie': 5142, 'one': 4510, 'make': 3292, 'like': 2996, 'character': 2912, 'get': 2713, 'see': 2380, 'go': 2306, 'time': 2284, ...})

In [113]:
common = freq.most_common(3000) ## to get top 3k most used words as features
features = [i[0] for i in common]

In [115]:
## now for each document we want to create a dict where it tell 
## for every feature 'film' True 
            #        'movie' false   ....

In [None]:
## function to check where the text contains the feature (top words) or not

In [120]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w]= w in words_set  ## return T/F
    return current_features

In [180]:
output = get_feature_dict(training_documents[0][0])


In [153]:
## for nltk training & testing data should be in form of
## array --> containg tuple + category
## tuple --> contain dict that features(top word) present in text or not [i.e feature + feature value]

In [122]:
training_data = [ (get_feature_dict(doc),category) for doc,category in training_documents] 

In [123]:
test_data = [ (get_feature_dict(doc),category) for doc,category in test_documents] 

In [None]:
## using inbuilt naive bayes classifier in nltk

In [127]:
from nltk import NaiveBayesClassifier

In [128]:
classifier = NaiveBayesClassifier.train(training_data)

In [131]:
nltk.classify.accuracy(classifier, test_data)

0.782

In [133]:
classifier.show_most_informative_features(15) ## top most effective words

Most Informative Features
               ludicrous = True              neg : pos    =     12.8 : 1.0
             outstanding = True              pos : neg    =     10.5 : 1.0
                   jolie = True              neg : pos    =      8.9 : 1.0
              schumacher = True              neg : pos    =      8.9 : 1.0
                  sinise = True              neg : pos    =      7.6 : 1.0
               stupidity = True              neg : pos    =      7.2 : 1.0
                  poorly = True              neg : pos    =      6.9 : 1.0
                   anger = True              pos : neg    =      6.5 : 1.0
                   ideal = True              pos : neg    =      6.5 : 1.0
                   damon = True              pos : neg    =      6.5 : 1.0
                lifeless = True              neg : pos    =      6.5 : 1.0
               criticism = True              pos : neg    =      6.2 : 1.0
             beautifully = True              pos : neg    =      6.2 : 1.0

## using Sklearn models with NLTK Training data

In [172]:
from sklearn.tree import DecisionTreeClassifier

In [173]:
dt = DecisionTreeClassifier()
classifier_sklearn = SklearnClassifier(dt)

In [174]:
classifier_sklearn.train(training_data)

<SklearnClassifier(DecisionTreeClassifier())>

In [175]:
nltk.classify.accuracy(classifier_sklearn, test_data)

0.594

In [168]:
from sklearn.ensemble import RandomForestClassifier

In [169]:
rf = RandomForestClassifier()
classifier_sklearn2 = SklearnClassifier(rf)

In [170]:
classifier_sklearn2.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [171]:
nltk.classify.accuracy(classifier_sklearn2, test_data)

0.814

## Method 2:  using count vectorization method

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
## in code below:
## 1. joining text
a = ['ab','cd']
' '.join(a)
## 2. getting y as categories

'ab cd'

In [61]:
text_data = [' '.join(document) for document, category in documents]
category = [category for document,category in documents]

In [62]:
from sklearn.model_selection import train_test_split
x_train , x_test, y_train , y_test = train_test_split(text_data , category)

### using max_features and Ngram

In [81]:
count_vec = CountVectorizer(max_features =2000, ngram_range = (1,2))

In [82]:
x_training = count_vec.fit_transform(x_train)
x_training

<1500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 262347 stored elements in Compressed Sparse Row format>

In [83]:
count_vec.get_feature_names()[0:30] ## seeing top 30 features



['000',
 '10',
 '100',
 '13',
 '15',
 '17',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act']

In [84]:
## x_training.todense()

In [85]:
x_testing = count_vec.transform(x_test)
x_testing
## x_testing.todense()

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 85457 stored elements in Compressed Sparse Row format>

In [86]:
from sklearn.ensemble import RandomForestClassifier

In [87]:
rf = RandomForestClassifier()

In [88]:
rf.fit(x_training , y_train)

RandomForestClassifier()

In [89]:
rf.score(x_testing , y_test)

0.824