# Sentiment Analysis with Python

## Import and read data

In [1]:
## Importing the necessary libraries along with the standard import

import numpy as np
import pandas as pd
import nltk # this is the Natural Language Tool Kit which contains a lot of functionalities for text analytics
import random
import string # this is used for string manipulations

from nltk.corpus import movie_reviews

#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('movie_reviews')

In [2]:
len(movie_reviews.fileids()) # Number of file ids in the movie_reviews corpus

2000

In [3]:
movie_reviews.raw(movie_reviews.fileids()[0]) # Raw text of the first file id in the corpus

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [4]:
nltk.FreqDist(movie_reviews.words()).most_common(10) # 10 most frequent tokens, before lower-casing or stopword removal

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [5]:
# Tokens to discard: the English stopword list from nltk plus every punctuation character
# from the string library.
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation)
# Hash-based copy used only for membership tests below; `stopwords` itself stays a list.
# Scanning the list for every corpus token would be a linear search per token.
stopwords_lookup = frozenset(stopwords)

# Converting all the words to lower case (lazy generator, consumed once by the filter below)
all_words = (w.lower() for w in movie_reviews.words())
# Only keeping the words which are not in the stopword/punctuation collection
all_words_clean = [word for word in all_words if word not in stopwords_lookup]


# Creating a frequency distribution of the lower-cased words with the stopwords removed
all_words_freq = nltk.FreqDist(all_words_clean)

# Extracting the 2000 most common words after lower-casing and stopword removal;
# these words are the vocabulary used to build the classifier features
word_features = [item[0] for item in all_words_freq.most_common(2000)]


In [6]:
word_features[0:15] # looking at the first 15 word_features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well']

In [7]:
# Build one (token_list, category) pair per review. The outer loop walks the corpus
# categories and the inner loop walks the file ids belonging to that category, so the
# inner loop depends on the value produced by the outer one.
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# The pairs come out grouped by category, so they are shuffled before a test set is sliced off.
# A dedicated, seeded generator keeps the order (and every result derived from it) identical
# on each fresh run, without touching the global `random` state.
random.Random(42).shuffle(documents)


In [8]:
documents[0][0][0:15] # First 15 tokens of the first document after shuffling

['oh',
 'god',
 'how',
 'many',
 'john',
 'grisham',
 'lawyer',
 'films',
 'we',
 'have',
 'been',
 'munundated',
 'with',
 '!',
 'in']

In [9]:
## We are defining a function to appropriately process the text document

def document_features(document, vocabulary=None):
    """Build a word-presence feature dictionary for a single document.

    Parameters
    ----------
    document : iterable of hashable
        The tokens of one document (for example a list of words).
    vocabulary : iterable, optional
        The words to test for. When omitted, the notebook-level
        ``word_features`` list is used, which is the original behaviour.

    Returns
    -------
    dict
        One entry per vocabulary word, keyed ``'contains(<word>)'``, whose
        value is True when that word occurs in ``document`` and False otherwise.
    """
    if vocabulary is None:
        # Resolved at call time so the function picks up the current word_features.
        vocabulary = word_features
    # A set gives constant-time membership checks and ignores repeated tokens.
    document_words = set(document)
    return {
        'contains({})'.format(word): (word in document_words)
        for word in vocabulary
    }

In [10]:

## Build the combined list of labelled examples, which is split into training and test
## sets before fitting a classifier

# Each entry is a tuple: the feature dictionary returned by document_features for one
# document, paired with the category label attached to that document.
featuresets = [(document_features(words), label) for words, label in documents]


In [11]:
# Hold out the first 100 feature sets for testing and train Naive Bayes on the remainder
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
print(nltk.classify.accuracy(classifier, test_set)) # Accuracy of the trained classifier on the held-out test_set

0.81


In [13]:
classifier.show_most_informative_features(10) # Print the 10 most informative features of the classifier

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
         contains(damon) = True              pos : neg    =      9.9 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.5 : 1.0
         contains(awful) = True              neg : pos    =      6.0 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
          contains(lame) = True              neg : pos    =      5.6 : 1.0
        contains(wasted) = True              neg : pos    =      5.6 : 1.0
        contains(poorly) = True              neg : pos    =      5.5 : 1.0
         contains(waste) = True              neg : pos    =      5.2 : 1.0


In [14]:
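# Note: a re-run can give slightly different results from the outputs shown above whenever the shuffle order of `documents` changes, because that order decides which reviews land in the train and test sets.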