# Sentiment Analysis

Let's build a sentiment analysis model that can detect whether a given phrase or sentence is positive or negative.

In [None]:
## Import Libraries
import numpy as np
import pandas as pd
import urllib.request
import nltk
import random
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.probability import FreqDist


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Dataset
Sentiment Analysis is the process of computationally identifying and categorizing opinions expressed in a piece of text, especially in order to determine whether the writer’s attitude towards a particular topic, product, etc. is positive, negative, or neutral.

This is a simple project that classifies movie reviews as either positive or negative. We will be working with the ‘movie_reviews’ dataset from the nltk.corpus package.



In [None]:
# Download the NLTK data packages used by the cells below. The call is the
# cell's last expression, so its return value is shown as the cell output.
nltk.download(
    [
        "stopwords",
        "movie_reviews",
        "punkt",
    ]
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Download the tagger model package; pos_tag() is called in a later cell.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:


# Explore the movie_reviews corpus: sizes, categories, one sample review with
# its tokens and POS tags, and finally a list of (raw_text, category) pairs.
#
# Only bounded previews are printed. A full review, its complete token list
# and its complete tag list are long enough to drown the notebook output.
preview_chars = 500   # characters of raw text shown per review
preview_items = 20    # tokens / (token, tag) pairs shown

# size of the corpus
corpus_fileids = movie_reviews.fileids()
print("Total number of reviews in the corpus:", len(corpus_fileids))

# categories in the corpus
print("Categories in the corpus:", movie_reviews.categories())

# number of positive and negative reviews
num_pos_reviews = len(movie_reviews.fileids('pos'))
num_neg_reviews = len(movie_reviews.fileids('neg'))
print("Number of positive reviews:", num_pos_reviews)
print("Number of negative reviews:", num_neg_reviews)

# view (the beginning of) a specific review
review_text = movie_reviews.raw('neg/cv000_29416.txt')
print(review_text[:preview_chars])

# tokenization of review (the full list is kept in `tokens`)
tokens = word_tokenize(review_text)
print(tokens[:preview_items])

# part-of-speech (POS) tagging (the full list is kept in `pos_tags`)
pos_tags = pos_tag(tokens)
print(pos_tags[:preview_items])

# create a list of all reviews in the corpus with their respective categories;
# the first category reported for a file is used as its label
reviews = [
    (movie_reviews.raw(fileid), movie_reviews.categories(fileid)[0])
    for fileid in corpus_fileids
]

# show a truncated preview of the first two (text, category) pairs
for review, category in reviews[:2]:
    print((review[:preview_chars], category))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Total number of reviews in the corpus: 2000
Categories in the corpus: ['neg', 'pos']
Number of positive reviews: 1000
Number of negative reviews: 1000
plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . 
they seem to have taken this pretty neat concept , but executed it terribly . 
so what are the problems with the movie ? 
well , its main problem is tha

In [None]:
from nltk.classify import NaiveBayesClassifier

# create list of stopwords
stop_words = stopwords.words('english')

# Immutable snapshot of the stopwords for O(1) membership tests. preprocess()
# runs over every token of every review, so scanning the list each time is
# needlessly slow. Rebuild this set if `stop_words` is modified afterwards.
_stop_word_set = frozenset(stop_words)


# remove stopwords and convert to lowercase
def preprocess(text):
    """Tokenize ``text`` and return its cleaned, lower-cased tokens.

    A token is kept only if it is purely alphabetic (``str.isalpha``) and its
    lower-cased form is not in the English stopword list.

    Parameters
    ----------
    text : str
        Raw text to tokenize with ``word_tokenize``.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order.
    """
    filtered_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()  # lower-case once, reuse for test and output
        if token.isalpha() and lowered not in _stop_word_set:
            filtered_tokens.append(lowered)
    return filtered_tokens

# Tokenize every review exactly once and remember its label. The same tokens
# feed both the corpus-wide frequency distribution and the per-review
# features, so the (slow) tokenization is not repeated.
labeled_tokens = []
all_words = []
for fileid in movie_reviews.fileids():
    category = movie_reviews.categories(fileid)[0]
    words = preprocess(movie_reviews.raw(fileid))
    labeled_tokens.append((words, category))
    all_words.extend(words)

# create frequency distribution of most common words
fdist = FreqDist(all_words)
print(fdist.most_common(10))

# create feature set of most common words:
# each review becomes {word: occurrences in the review} over the top words
num_features = 2000
top_words = [word for word, freq in fdist.most_common(num_features)]
feature_set = []
for words, category in labeled_tokens:
    # Count the review's tokens in one pass; indexing a FreqDist with an
    # absent word yields 0, matching what list.count() returned before
    # without rescanning the token list for each of the top words.
    word_counts = FreqDist(words)
    features = {word: word_counts[word] for word in top_words}
    feature_set.append((features, category))

# shuffle feature set with a dedicated, seeded generator so the train/test
# split (and therefore the reported accuracy) is reproducible across runs
# and does not depend on the state of the global `random` module
shuffle_seed = 42
random.Random(shuffle_seed).shuffle(feature_set)

# split feature set into training (80%) and test (20%) sets
split_point = int(0.8 * len(feature_set))
train_set = feature_set[:split_point]
test_set = feature_set[split_point:]

# train naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# test classifier on test set
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

# show most informative features
classifier.show_most_informative_features(10)


[('film', 9443), ('movie', 5671), ('one', 5582), ('like', 3547), ('even', 2556), ('good', 2316), ('time', 2282), ('would', 2264), ('story', 2146), ('much', 2024)]
Accuracy: 0.7925
Most Informative Features
                  stupid = 2                 neg : pos    =     18.8 : 1.0
               wonderful = 2                 pos : neg    =     14.0 : 1.0
                  boring = 2                 neg : pos    =     11.3 : 1.0
             outstanding = 1                 pos : neg    =      9.1 : 1.0
                   ideas = 2                 neg : pos    =      7.9 : 1.0
                   great = 4                 pos : neg    =      7.6 : 1.0
                 country = 2                 pos : neg    =      7.5 : 1.0
                   theme = 2                 pos : neg    =      7.5 : 1.0
                     bad = 4                 neg : pos    =      7.2 : 1.0
                  murphy = 1                 pos : neg    =      6.8 : 1.0


In [None]:
# Re-run of the feature extraction / training pipeline, printing the 30 most
# common words and the 30 most informative features.
# Relies on `all_words` and `preprocess` defined in an earlier cell.
fdist = FreqDist(all_words)
print(fdist.most_common(30))

# create feature set of most common words:
# each review becomes {word: occurrences in the review} over the top words
num_features = 2000
top_words = [word for word, freq in fdist.most_common(num_features)]
feature_set = []
for fileid in movie_reviews.fileids():
    category = movie_reviews.categories(fileid)[0]
    words = preprocess(movie_reviews.raw(fileid))
    # Count the review's tokens in one pass; indexing a FreqDist with an
    # absent word yields 0, matching what list.count() returned before
    # without rescanning the token list for each of the top words.
    word_counts = FreqDist(words)
    features = {word: word_counts[word] for word in top_words}
    feature_set.append((features, category))

# shuffle feature set with a dedicated, seeded generator so this cell's
# train/test split (and its accuracy) is reproducible across runs and does
# not depend on the state of the global `random` module
shuffle_seed = 1
random.Random(shuffle_seed).shuffle(feature_set)

# split feature set into training (80%) and test (20%) sets
split_point = int(0.8 * len(feature_set))
train_set = feature_set[:split_point]
test_set = feature_set[split_point:]

# train naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# test classifier on test set
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

# show most informative features
classifier.show_most_informative_features(30)

[('film', 9443), ('movie', 5671), ('one', 5582), ('like', 3547), ('even', 2556), ('good', 2316), ('time', 2282), ('would', 2264), ('story', 2146), ('much', 2024), ('character', 1996), ('also', 1965), ('get', 1925), ('characters', 1858), ('two', 1827), ('first', 1769), ('see', 1731), ('way', 1669), ('well', 1656), ('could', 1609), ('make', 1593), ('really', 1556), ('films', 1520), ('little', 1490), ('life', 1483), ('plot', 1460), ('people', 1448), ('scene', 1377), ('bad', 1375), ('never', 1361)]
Accuracy: 0.8
Most Informative Features
                   worst = 2                 neg : pos    =     19.8 : 1.0
                   great = 4                 pos : neg    =     14.0 : 1.0
              apparently = 2                 neg : pos    =     11.4 : 1.0
               wonderful = 2                 pos : neg    =     11.3 : 1.0
                    best = 4                 pos : neg    =     11.3 : 1.0
                  stupid = 2                 neg : pos    =      9.9 : 1.0
          

In [None]:
# Another run of the feature extraction / training pipeline on a different
# shuffle of the data, printing the 30 most common words and the 10 most
# informative features.
# Relies on `all_words` and `preprocess` defined in an earlier cell.
fdist = FreqDist(all_words)
print(fdist.most_common(30))

# create feature set of most common words:
# each review becomes {word: occurrences in the review} over the top words
num_features = 2000
top_words = [word for word, freq in fdist.most_common(num_features)]
feature_set = []
for fileid in movie_reviews.fileids():
    category = movie_reviews.categories(fileid)[0]
    words = preprocess(movie_reviews.raw(fileid))
    # Count the review's tokens in one pass; indexing a FreqDist with an
    # absent word yields 0, matching what list.count() returned before
    # without rescanning the token list for each of the top words.
    word_counts = FreqDist(words)
    features = {word: word_counts[word] for word in top_words}
    feature_set.append((features, category))

# shuffle feature set with a dedicated, seeded generator so this cell's
# train/test split (and its accuracy) is reproducible across runs and does
# not depend on the state of the global `random` module
shuffle_seed = 2
random.Random(shuffle_seed).shuffle(feature_set)

# split feature set into training (80%) and test (20%) sets
split_point = int(0.8 * len(feature_set))
train_set = feature_set[:split_point]
test_set = feature_set[split_point:]

# train naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# test classifier on test set
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

# show most informative features
classifier.show_most_informative_features(10)

[('film', 9443), ('movie', 5671), ('one', 5582), ('like', 3547), ('even', 2556), ('good', 2316), ('time', 2282), ('would', 2264), ('story', 2146), ('much', 2024), ('character', 1996), ('also', 1965), ('get', 1925), ('characters', 1858), ('two', 1827), ('first', 1769), ('see', 1731), ('way', 1669), ('well', 1656), ('could', 1609), ('make', 1593), ('really', 1556), ('films', 1520), ('little', 1490), ('life', 1483), ('plot', 1460), ('people', 1448), ('scene', 1377), ('bad', 1375), ('never', 1361)]
Accuracy: 0.765
Most Informative Features
               wonderful = 2                 pos : neg    =     11.1 : 1.0
                    best = 4                 pos : neg    =     11.1 : 1.0
                  boring = 2                 neg : pos    =     10.1 : 1.0
                  stupid = 2                 neg : pos    =     10.1 : 1.0
                   worst = 2                 neg : pos    =      9.8 : 1.0
             outstanding = 1                 pos : neg    =      9.7 : 1.0
        