### Naive Bayes Classifier for Sentiment Analysis with POS tagging

In [559]:
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
import pandas as pd
from nltk.classify import ClassifierI
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re
import random

In [560]:
all_words = []
documents = []

In [561]:
train = pd.read_csv('amazon_cells_labelled.csv')
train_positive = train.loc[train['Label']==1]
#dataset of all positive reviews
train_positive.head()
train_negative = train.loc[train['Label']==0]
#dataset of all negative reviews
train_negative.head()

Unnamed: 0,Comment,Label
0,So there is no way for me to plug it in here i...,0
3,Tied to charger for conversations lasting more...,0
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
8,Needless to say I wasted my money.,0


Preprocessing the words - cleaning, converting to tokens, removing stop words
<br>Then tagging each word with the part of speech tag
<br>For sentiment analysis, the adjectives convey the positive or negative sentiment. So we are creating a dictionary of positive adjectives(from positive dataset) and negative adjectives(from negative data)

In [562]:
stop_words = list(set(stopwords.words('english')))
st = WordNetLemmatizer()
freq = pd.Series(' '.join(train['Comment']).split()).value_counts()[:10]

for index, row in train_positive.iterrows():
    documents.append( (row["Comment"], row["Label"]) )
    comment = row["Comment"]
    
    #cleaning the comment by removing special characters and numbers
    cleaned = re.sub(r'[^(a-zA-Z)\s]',' ', comment)
    
    #lemmatize the comment
    lemmatized = st.lemmatize(cleaned)

    #convert the lemmatized comment into tokens
    tokenized = word_tokenize(lemmatized)
    
    #tagging the tokens
    pos = nltk.pos_tag(tokenized)

    #Removing stop words
    stopped = [w for w in pos if not w in stop_words]
    
    #remove the high frequency words since they dont contribute to the classification 
    highfreq =  [w for w in stopped if not w in freq]
    
    #for sentiment classification, forming a list of adjectives from positive reviews
    for w in highfreq:
         if w[1][0] in ('JJ'):
            all_words.append(w[0].lower())


In [563]:
for index, row in train_negative.iterrows():
    documents.append( (row["Comment"], row["Label"]) )
    comment = row["Comment"]
    cleaned = re.sub(r'[^(a-zA-Z)\s]',' ', comment)
    
    lemmatized = st.lemmatize(cleaned)
    
    tokenized = word_tokenize(lemmatized)
    
    pos = nltk.pos_tag(tokenized)

    stopped = [w for w in pos if not w in stop_words]
    
    #remove the high frequency words since they dont contribute to the classification 
    highfreq =  [w for w in stopped if not w in freq]

    #for sentiment classification, forming a list of adjectives from negative reviews
    for w in highfreq:
         if w[1][0] in ('JJ'):
            all_words.append(w[0].lower())

In [564]:
print(all_words)
print(len(all_words))

['good', 'great', 'razr', 'great', 'impressed', 'original', 'extended', 'good', 'blue', 'good', 'great', 'new', 'best', 'mobile', 'ideal', 'sensitive', 'sure', 'great', 'fine', 'great', 'nice', 'clear', 'great', 'excellent', 'good', 'bulky', 'usable', 'real', 'useful', 'neat', 'pretty', 'sturdy', 'large', 'fine', 'reasonable', 'great', 'happy', 'sound', 'nice', 'great', 'best', 'last', 'several', 'comfortable', 'most', 'great', 'several', 'beautiful', 'great', 'little', 'handy', 'everyday', 'easy', 'great', 'excellent', 'cheaper', 'super', 'sturdy', 'great', 'best', 'tried', 'other', 'best', 'good', 'free', 'good', 'good', 'nice', 'cool', 'black', 'white', 'great', 'good', 'ear', 'comfortable', 'excellent', 'slim', 'light', 'beautiful', 'little', 'i', 'sleek', 'great', 'nice', 'full', 'basic', 'comfortable', 'tried', 'several', 'different', 'first', 'few', 'small', 'accompanied', 'brilliant', 'great', 'good', 'peachy', 'tremendous', 'relative', 'glad', 'funny', 'sketchy', 't', 'great',

In [565]:
all_words = nltk.FreqDist(all_words)
# listing the 50 most frequent words
word_features = list(all_words.keys())[:50]
print(word_features)


['good', 'great', 'razr', 'impressed', 'original', 'extended', 'blue', 'new', 'best', 'mobile', 'ideal', 'sensitive', 'sure', 'fine', 'nice', 'clear', 'excellent', 'bulky', 'usable', 'real', 'useful', 'neat', 'pretty', 'sturdy', 'large', 'reasonable', 'happy', 'sound', 'last', 'several', 'comfortable', 'most', 'beautiful', 'little', 'handy', 'everyday', 'easy', 'cheaper', 'super', 'tried', 'other', 'free', 'cool', 'black', 'white', 'ear', 'slim', 'light', 'i', 'sleek']


In [566]:
# function to create a dictionary of features for each comment in the list document.
# The keys are the words in word_features 
# The values of each key are either true or false for whether that feature appears in the review or not
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

# Creating features for each review
featuresets = [(find_features(comment), label) for (comment, label) in documents]
print(len(featuresets))

1000


In [567]:
# Shuffling the documents 
random.shuffle(featuresets)
print(len(featuresets))
training_set = featuresets[:500]
testing_set = featuresets[500:]

1000


In [568]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

classifier.show_most_informative_features(5)


Classifier accuracy percent: 63.800000000000004
Most Informative Features
                   great = True                1 : 0      =     10.0 : 1.0
                    best = True                1 : 0      =      5.4 : 1.0
                    good = True                1 : 0      =      4.8 : 1.0
                    fine = True                1 : 0      =      4.7 : 1.0
                    easy = True                1 : 0      =      4.1 : 1.0
