In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
# Define a function to extract features
def extract_features(word_list):
    return dict([(word, True) for word in word_list])

In [4]:
# We need training data for this, so we will use movie reviews in NLTK
if __name__=='__main__':
    # Load positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')

In [5]:
# Let’s separate these into positive and negative reviews
features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in negative_fileids]

In [6]:
# Divide the data into training and testing datasets
# Split the data into train and test (80/20)
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))

In [7]:
# Extract the features
features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
print("Number of training datapoints:", len(features_train))
print("Number of test datapoints:", len(features_test))

Number of training datapoints: 1600
Number of test datapoints: 400


In [8]:
# We will use a Naive Bayes classifier. Define the object and train it
# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features_train)
print ("\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test))


Accuracy of the classifier: 0.735


In [9]:
# The classifier object contains the most informative words that it obtained during analysis. These words basically have a strong say in what’s classified as a positive or a negative review
print ("\nTop 10 most informative words:")
for item in classifier.most_informative_features()[:10]:
    print (item[0])


Top 10 most informative words:
outstanding
insulting
vulnerable
ludicrous
uninvolving
avoids
astounding
fascination
darker
animators


In [10]:
# Creating a couple of random input sentences
# Sample input1 reviews
input_reviews = ["It is an amazing movie.", 
                 "This is a good movie. I would recommend it.",
                 "The cinematography is pretty great in this movie.", 
                 "The direction was terrible and the story was all over the place." ]

In [11]:
# Runing the classifier on those input sentences and obtain the predictions
print ("Predictions:")
for review in input_reviews:
    print ("Review:", review)
    probdist = classifier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
    # Printing the output
    print ("Predicted sentiment:", pred_sentiment) 
    print ("Probability:", round(probdist.prob(pred_sentiment), 2))

Predictions:
Review: It is an amazing movie.
Predicted sentiment: Positive
Probability: 0.63
Review: This is a good movie. I would recommend it.
Predicted sentiment: Positive
Probability: 0.51
Review: The cinematography is pretty great in this movie.
Predicted sentiment: Positive
Probability: 0.69
Review: The direction was terrible and the story was all over the place.
Predicted sentiment: Negative
Probability: 0.67


In [12]:
# Creating a couple of random input sentences
# Sample input2 reviews
input_reviews = ["Actors acted upto the mark.",
                 "Extremely Terrible Movie and story line was bad.", 
                 "The cinematography was pretty great but acting was okay.", 
                 "The direction was terrible but the story was good." 
                ]

In [13]:
# Runing the classifier on those input sentences and obtain the predictions
print ("Predictions:")
for review in input_reviews:
    print ("Review:", review)
    probdist = classifier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
    # Printing the output
    print ("Predicted sentiment:", pred_sentiment) 
    print ("Probability:", round(probdist.prob(pred_sentiment), 2))

Predictions:
Review: Actors acted upto the mark.
Predicted sentiment: Positive
Probability: 0.53
Review: Extremely Terrible Movie and story line was bad.
Predicted sentiment: Negative
Probability: 0.51
Review: The cinematography was pretty great but acting was okay.
Predicted sentiment: Positive
Probability: 0.65
Review: The direction was terrible but the story was good.
Predicted sentiment: Negative
Probability: 0.69
