In [17]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
import string
from collections import Counter, defaultdict
import math
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sravanthimalepati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Train a Naïve Bayes classifier for sentiment analysis of Amazon users reviews for refurbished iPhone 7. You are given:

1)	Training set that consists of 6 positive reviews and 4 negative reviews 

2)	A set of 4 keywords: Great, Happy, Bad, Return. Those keywords should define 4-dimensional feature vectors for reviews (bag-of-words)

3)	Test set that consists of 2 reviews


So I have stored the given reviews in a excel file and reading it as follows.


In [3]:
df = pd.read_excel('data.xlsx')
vocab = Counter()

In [4]:
df

Unnamed: 0,y,review
0,1,I’ve seen a lot of bad reviews for this phone ...
1,1,This phone looks and performs great like it's ...
2,1,Don't listen to bad reviews! My phone arrived ...
3,1,Love this phone! I am so glad I bought a refur...
4,1,"First, seller did a great job and I think I go..."
5,1,Received prompt delivery of the phone. I inser...
6,0,"Overall, the phone isn't too bad for the price..."
7,0,"The iPhone 7 I purchased was ""certified refurb..."
8,0,Initially I was happy with the phone. It looke...
9,0,"Be cautious - if you have ANY issues at all, r..."


# Here "1" is for positive reviews and "0" is for negative reviews.

Removing stopwords process is as follows

In [5]:
dc = df['review'][0]
tokens = dc.split()
#print(tokens)
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
#print(table)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]
print(tokens)

['seen', 'lot', 'bad', 'reviews', 'phone', 'based', 'issues', 'seller', 'Granted', 'reviews', 'say', 'took', 'weeks', 'problems', 'appear', 'edit', 'happens', 'wow', 'happy', 'got', 'Not', 'come', 'charger', 'debate', 'reviews', 'even', 'clear', 'bumper', 'case', 'That', 'neither', 'expected', 'necessary', 'appreciated', 'bought', 'Unicorn', 'Beetle', 'case', 'used', 'loved', 'phones', 'There', 'scratch', 'phone', 'started', 'working', 'right', 'away', 'The', 'battery', 'seems', 'holding', 'fine', 'All', 'say', 'seems', 'like', 'steal', 'If', 'self', 'destructs', 'next', 'weeks', 'update', 'UPDATE', 'Its', 'months', 'trip', 'overseas', 'since', 'wrote', 'initial', 'review', 'remains', 'solid', 'decision', 'Im', 'happy']


In [9]:
class funa:

    def __init__(self, y):
        self.classes = y

    def group_by_class(self, X, y):
        data = dict()
        for c in self.classes:
            data[c] = X[np.where(y == c)]
        return data

    
    def tokenize(self, text):
        tokens = text.split()
        # remove punctuation from each token
        table = str.maketrans('', '', punctuation)
        tokens = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        # filter out short tokens
        tokens = [word for word in tokens if len(word) > 1]
        return tokens


    def laplace_smoothing(self, word, text_class):
        num = self.word_counts[text_class][word] + 1
        denom = self.n_class_items[text_class] + len(self.vocab)
        return math.log(num / denom)

    def fit(self, X, y):
        self.n_class_items = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        n = len(X)
        grouped_data = self.group_by_class(X, y)
        for c, data in grouped_data.items():
            self.n_class_items[c] = len(data)
            self.log_class_priors[c] = math.log(self.n_class_items[c] / n)
            self.word_counts[c] = defaultdict(lambda: 0)

            for text in data:
                counts = Counter(self.tokenize(text))
                for word, count in counts.items():
                    if word not in self.vocab:
                        self.vocab.add(word)

                    self.word_counts[c][word] += count
    def predict(self, X):
        result = []
        for text in X:
            class_scores = {c: self.log_class_priors[c] for c in self.classes}
            words = set(self.tokenize(text))

            for word in words:
                if word not in self.vocab: continue

                for c in self.classes:

                    log_w_given_c = self.laplace_smoothing(word, c)
                    class_scores[c] += log_w_given_c
    

            result.append(max(class_scores, key=class_scores.get))
        print(result)
        return result

In [10]:
v = funa(np.unique(df['y']))
v.fit(df['review'].values,df['y'].values)

# Predicting the given 2 test reviews with the trained naive bayes model

In [11]:
gd= ["""The phone arrived in pretty decent condition. The front screen was scratch-free and the display is great, but there is a long scratch on the back of the phone. This doesn't bother me much because I always have a case on my phone. However, the issue with this phone is that the cellular signal won't work; the device detects the sim but the signal is bad. Apparently this is an issue with some iPhone 7 models, but the any free of charge repair is not valid because the phone is coming from a third party seller. After speaking with Apple, Verizon (my mobile carrier), AND Amazon, I've reached the conclusion that the issue is with the phone. I've tried everything to troubleshoot, but I will unfortunately have to return the item and get another one.""", """iPhone 7 Black came in excellent condition. Like new. No scratches or scuffs. Works great. Was happy for couple months until phone started to develop issues with hearing callers and vs versa. Callers can’t hear me and I can’t hear callers, the sound is bad. Checked settings . Disabled WiFi calling. Hard reset phone. Updated iOS. Happens randomly. Suspect possible known defects on iPhone 7 with audio IC chips. I want to return the phone but I’m waiting to se for a month"""]

In [12]:
y_hat = v.predict(gd)
print(y_hat)

[1, 1]
[1, 1]
