## NLTK’s movie_reviews Dataset is used here.

Seperate out the positive reviews data in pos_reviews and negative reviews data in neg_reviews

In [1]:
from nltk.corpus import movie_reviews 
import nltk

nltk.download('movie_reviews') # downloading the data set

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [2]:
pos_reviews = [] 
for ids in movie_reviews.fileids('pos'):
    words = movie_reviews.words(ids)
    pos_reviews.append(words)
 
neg_reviews = []
for ids in movie_reviews.fileids('neg'):
    words = movie_reviews.words(ids)
    neg_reviews.append(words)
 
# print first positive review item from the pos_reviews list
print ("pos_reviews[0] :", pos_reviews[0])
print()
# print first negative review item from the neg_reviews list
print ("neg_reviews[0] :", neg_reviews[0])
print()
# print first 20 items of the first item of positive review
print ("pos_reviews[0][:20] :", pos_reviews[0][:20])
print()
# print first 20 items of the first item of negative review
print ("neg_reviews[0][:20] :", neg_reviews[0][:20])

pos_reviews[0] : ['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

neg_reviews[0] : ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

pos_reviews[0][:20] : ['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superheroes', '(', 'batman', ',']

neg_reviews[0][:20] : ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']



## Now, remove stop words and punctuations.
 > Because, Stop words are those frequently words which do not carry any significant meaning in text analysis. For example, I, me, my, the, a, and, is, are, he, she, we, and etc. Punctuation marks like comma(,), fullstop(.) inverted comma, etc. occur highly in any text data.


In [3]:
from nltk.corpus import stopwords 
import string 
nltk.download('stopwords')
# stopwords of English language because our model will work on English language only
_stopwords = stopwords.words('english') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Create Feature Set
Now, lets write a function that will be used to create feature set. The feature set is used to train the classifier.

> Bag-of-words feature is used.


In [4]:
# feature extractor function
def bag_of_words(words):
    words_clean = []
 
    for word in words:
        word = word.lower()
        if (word not in _stopwords) and (word not in string.punctuation): 
            words_clean.append(word)                                    
            
    words_dictionary = dict([word, True] for word in words_clean)
    
    return words_dictionary
 
# using dict will remove duplicate words from the words list
# note the output: stopword 'the' is also removed
print ("output:\nbag_of_words(['the', 'the', 'bad', 'bad', 'the', 'good']) =", 
       bag_of_words(['the', 'the', 'bad', 'bad', 'the', 'good']))

output:
bag_of_words(['the', 'the', 'bad', 'bad', 'the', 'good']) = {'bad': True, 'good': True}


The ***pos_reviews_set*** and ***neg_reviews_set*** contains the positive and negative reviews with label/sentiment **'pos'** and **'neg'** respectively.


Which is use to create train and test set.
> **Train Set** : is used to train the classifier.<br>
**Test Set** : is used to test the classifier to check how accurately it classifies the given text.


In [6]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))
 
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

## Creating Train and Test Set
There are 1000 positive reviews set and 1000 negative reviews set.<br> We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as a test set. The remaining negative and positive reviews will be taken as a training set.

In [17]:
len(pos_reviews_set), len(neg_reviews_set)

(1000, 1000)

In [18]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle 
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)
 
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
X_count_vect =  pos_reviews_set + neg_reviews_set

In [19]:
len(test_set), len(train_set)

(400, 1600)

## Training Classifier and Calculating Accuracy
We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.
> More on Naive Bayes Classifier:<br>
https://en.wikipedia.org/wiki/Naive_Bayes_classifier


In [21]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print("accuracy :- ", accuracy)
 
print (classifier.show_most_informative_features(10)) # show most 10 informative features

accuracy :-  0.6825
Most Informative Features
                  finest = True              pos : neg    =     14.2 : 1.0
             outstanding = True              pos : neg    =     13.9 : 1.0
               maintains = True              pos : neg    =     13.7 : 1.0
               ludicrous = True              neg : pos    =     13.4 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
               stupidity = True              neg : pos    =     11.0 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
               atrocious = True              neg : pos    =     10.3 : 1.0
                   sucks = True              neg : pos    =      9.8 : 1.0
              astounding = True              pos : neg    =      9.7 : 1.0
None


## Observations:
Let’s see the most informative features among the entire features in the feature set.<br>
The result shows that the word outstanding is used in positive reviews 13.9 times more often than it is used in negative reviews, the word ludicrous is used in negative reviews 13.4 times more often than it is used in positive reviews. Similarly, for other letters. These ratios are also called likelihood ratios.

Therefore, a review has a high chance to be classified as positive if it contains words like outstanding and finest, etc. Similarly, a review has a high chance of being classified as negative if it contains words like ludicrous, stupidity, sucks, etc.


## Testing Classifier with Custom Review

In [25]:
from nltk.tokenize import word_tokenize
nltk.download('punkt') 

custom_review = "Worst movie i have ever seen. It is worst then the corona virus, disasteres."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
print ("Predicition on,\n'Worst movie i have ever seen. It is worst then the corona virus, disasteres.' custom reviews : ", classifier.classify(custom_review_set))
# Negative review correctly classified as negative
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result)
print (prob_result.max())
print ("probability to classify as 'neg':", prob_result.prob("neg"))
print ("probability to classigy as 'pos':", prob_result.prob("pos"))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Predicition on,
'Worst movie i have ever seen. It is worst then the corona virus, disasteres.' custom reviews :  neg
<ProbDist with 2 samples>
neg
probability to classify as 'neg': 0.6302168701132754
probability to classigy as 'pos': 0.36978312988672457


In [26]:
custom_review = "Better then Corona, its good to watch movie like this, wonderful."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
 
print ("Predicition on,\n'Better then Corona, its good to watch movie like this, wonderful.' custom reviews : ", classifier.classify(custom_review_set))
# Positive review correctly classified as positive
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result)
print (prob_result.max())
print ("probability to classify as 'neg':", prob_result.prob("neg"))
print ("probability to classigy as 'pos':", prob_result.prob("pos"))

Predicition on,
'Better then Corona, its good to watch movie like this, wonderful.' custom reviews :  pos
<ProbDist with 2 samples>
pos
probability to classify as 'neg': 0.1406255106492366
probability to classigy as 'pos': 0.8593744893507645
