
# Basic Sentiment Analysis on text data using NLTK



Import required libraries

In [16]:
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
import random
from sklearn.model_selection import train_test_split

Download the NLTK data files

In [24]:

# Corpora and tokenizer models this notebook relies on.
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9+) load the Punkt tokenizer from the 'punkt_tab'
# resource instead of the pickled 'punkt' package; fetch both so that
# nltk.word_tokenize works on old and new installations alike.
nltk.download('punkt_tab')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Description of movie_reviews dataset

In [8]:
# Every token in 'movie_reviews'; as the output shows, punctuation such as ':' is included
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [9]:
# Total number of tokens in 'movie_reviews' (punctuation tokens are counted as well)
len(movie_reviews.words())

1583820

In [10]:
# Category labels of the corpus; these become the class labels of the documents below
movie_reviews.categories()

['neg', 'pos']

In [11]:
# Frequency of each token in 'movie_reviews'; the most common entries are punctuation and stopwords
nltk.FreqDist(movie_reviews.words())

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

Preprocess the dataset and extract features

In [5]:
def extract_features(word_list):
    """Build a bag-of-words feature dict that maps each word to True.

    Repeated words collapse into a single key, so the result records only
    which words are present, not how often they occur.
    """
    return dict.fromkeys(word_list, True)

Load the movie_reviews from NLTK

In [6]:
# One (token list, label) pair per review file, grouped by category
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

In [7]:
# Shuffle so reviews of both categories are interleaved (they are loaded
# category by category). A dedicated, seeded generator gives the same order
# on every Restart & Run All -- an unseeded shuffle changes which reviews
# land in the train/test split, and therefore the reported accuracy, from
# run to run -- and leaves the global `random` state untouched.
random.Random(42).shuffle(documents)

Prepare the dataset for training and testing

In [14]:
# Convert every (tokens, label) pair into a (feature dict, label) pair
featuresets = [(extract_features(tokens), label) for tokens, label in documents]
# Hold out 20% of the examples for testing; the fixed random_state pins the split
train_set, test_set = train_test_split(
    featuresets,
    test_size=0.2,
    random_state=42,
)


Train Naive Bayes classifier

In [17]:
# Fit NLTK's Naive Bayes classifier on the training feature sets
clf = NaiveBayesClassifier.train(train_set)


Evaluate the classifier on the test set

In [18]:
# Score the trained classifier on the held-out test set and report the result
accuracy = nltk_accuracy(clf, test_set)
print('Accuracy:', accuracy)

Accuracy: 0.7225


Show the most informative features

In [19]:
# Show the 10 features whose presence most strongly favours one label over the other
clf.show_most_informative_features(10)

Most Informative Features
               ludicrous = True              neg : pos    =     12.5 : 1.0
                  regard = True              pos : neg    =     11.1 : 1.0
                 idiotic = True              neg : pos    =     10.5 : 1.0
                thematic = True              pos : neg    =     10.4 : 1.0
             outstanding = True              pos : neg    =     10.2 : 1.0
               marvelous = True              pos : neg    =      9.9 : 1.0
               affecting = True              pos : neg    =      9.7 : 1.0
                  avoids = True              pos : neg    =      9.7 : 1.0
                    owes = True              pos : neg    =      9.7 : 1.0
                    taxi = True              pos : neg    =      9.7 : 1.0


Test on new input sentences

In [25]:
def analyze_sentiment(text):
    """Classify the sentiment of a piece of free text with the trained model.

    The text is tokenized, reduced to lower-cased alphabetic tokens, stripped
    of English stopwords, converted to a feature dict with
    ``extract_features`` and passed to the module-level classifier ``clf``.

    Args:
        text: The raw text (e.g. a one-sentence review) to classify.

    Returns:
        The label predicted by ``clf`` (e.g. 'neg' or 'pos').
    """
    # Build the stopword set once per call instead of re-evaluating
    # stopwords.words('english') for every token; a set also gives constant-time
    # membership checks where the list needed a linear scan.
    english_stopwords = set(stopwords.words('english'))

    # Keep purely alphabetic tokens only (drops punctuation and numbers), lower-cased
    tokens = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    words = [token for token in tokens if token not in english_stopwords]

    # Predict sentiment
    return clf.classify(extract_features(words))

In [28]:
# Hand-written sample reviews for exercising analyze_sentiment; the trailing
# comment on each line is the author's expected sentiment, not the model output.
sentences = [
    "This movie was absolutely awful, the worst I've ever seen.",  # Negative
    "I thoroughly enjoyed this film, it was captivating from beginning to end.",  # Positive
    "The plot was predictable and the acting mediocre.",  # Negative
    "A truly stunning cinematic masterpiece, a must-watch!",  # Positive
    "I found the dialogue confusing and the characters unrelatable.",  # Negative
    "Visually impressive, but the story lacked depth.",  # Mixed (leaning negative)
    "A great film with excellent performances and a gripping narrative.",  # Positive
    "The special effects were amazing, but the story was weak.",  # Mixed (leaning positive)
    "Not a good movie. Very boring",  # Negative
    "A fantastic movie that deserves all the praise it's getting!",  # Positive
]

# Classify each sample and print it next to the predicted label
for sentence in sentences:
    sentiment = analyze_sentiment(sentence)
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {sentiment}")
    print("-" * 20)


Sentence: This movie was absolutely awful, the worst I've ever seen.
Sentiment: neg
--------------------
Sentence: I thoroughly enjoyed this film, it was captivating from beginning to end.
Sentiment: pos
--------------------
Sentence: The plot was predictable and the acting mediocre.
Sentiment: neg
--------------------
Sentence: A truly stunning cinematic masterpiece, a must-watch!
Sentiment: pos
--------------------
Sentence: I found the dialogue confusing and the characters unrelatable.
Sentiment: neg
--------------------
Sentence: Visually impressive, but the story lacked depth.
Sentiment: pos
--------------------
Sentence: A great film with excellent performances and a gripping narrative.
Sentiment: pos
--------------------
Sentence: The special effects were amazing, but the story was weak.
Sentiment: pos
--------------------
Sentence: Not a good movie. Very boring
Sentiment: neg
--------------------
Sentence: A fantastic movie that deserves all the praise it's getting!
Sentiment: 