# Movie Review Sentiment Analysis

### In this project, I will analyze the sentiments behind the review and comments of viewers in perticular movie

In [2]:
import nltk
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [3]:
# Downloading Dataset
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\SACHIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [4]:
def extract_features(word_list):
    return dict([(word, True) for word in word_list])

In [6]:
if __name__ == '__main__':
    # Loading positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')

In [7]:
features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 'positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 'negative') for f in negative_fileids]

In [8]:
# Split the data into train and test dataset (80 by 20)
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))

In [12]:
features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]

In [13]:
print(f"Number of Training Datapoints: {len(features_train)}")
print(f"Number of Testing Datapoints: {len(features_test)}")

Number of Training Datapoints: 1600
Number of Testing Datapoints: 400


In [15]:
# Train the NaiveBayes Classifier
classifier = NaiveBayesClassifier.train(features_train)
print("Accuracy of the classifier on Training Dataset: ", nltk.classify.util.accuracy(classifier, features_train))
print("Accuracy of the classifier on Testing Dataset: ", nltk.classify.util.accuracy(classifier, features_test))

Accuracy of the classifier on Training Dataset:  0.9825
Accuracy of the classifier on Testing Dataset:  0.735


In [17]:
# Most important words in our dataset that do movie review sentiment analysis
print('\n Top 10 most Important Words')
for item in classifier.most_informative_features()[:10]:
    print(item[0])


 Top 10 most Important Words
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
affecting
animators


In [18]:
# Sample input review
input_reviews = [
    "It is an amazing movie",
    "This is a dull moviue. I would never recommend it to anyone",
    "The cinematography is preety great in this movie",
    "The movie is pathetic!",
    "The direction was terrible and the story was all over the place", 
    "I loved the movie!"
]

In [23]:
print("\nPredictions...")
for review in input_reviews:
    print('\nReview: ', review)
    probdist = classifier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
    print("Predicted Sentiments: ", pred_sentiment)
    print("Probability: ", round(probdist.prob(pred_sentiment), 2))


Predictions...

Review:  The movie is pathetic!
Predicted Sentiments:  negative
Probability:  0.52

Review:  The cinematography is preety great in this movie
Predicted Sentiments:  positive
Probability:  0.73

Review:  The direction was terrible and the story was all over the place
Predicted Sentiments:  negative
Probability:  0.63

Review:  This is a dull moviue. I would never recommend it to anyone
Predicted Sentiments:  negative
Probability:  0.79

Review:  It is an amazing movie
Predicted Sentiments:  positive
Probability:  0.61
