In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [2]:
# Load the tab-separated file; it has no header row, so name the two columns here.
# NOTE: despite the `sms_raw` name, this frame is read from the Amazon reviews file.
sms_raw = pd.read_csv(
    "amazon_cells_labelled.txt",
    sep='\t',
    header=None,
)
sms_raw.columns = ['review', 'positive_review']

In [3]:
# Normalise the review text: cast to str, then lower-case it.
# The cast has to be assigned back -- `astype` returns a new Series, so calling
# it on its own line and discarding the result (as before) changed nothing.
sms_raw['review'] = sms_raw['review'].astype(str).str.lower()

In [4]:
# Words whose presence in a review is used as a signal of a positive rating.
keywords = [
    'good', 'great', 'impressed', 'excellent', 'positive',
    'best', 'wonderful', 'perfect', 'love', 'like',
]

# Add one boolean column per keyword: True when the keyword occurs anywhere in
# the review text (substring match, so e.g. 'like' also matches 'dislike').
for word in keywords:
    sms_raw[word] = sms_raw['review'].str.contains(word, case=False)

In [5]:
# Turn the 0/1 label column into booleans.
# Going through int (instead of str and comparing with '1') keeps this cell
# idempotent: if it is re-run after the column is already boolean, True -> 1
# still compares equal to 1, whereas str(True) == '1' is False and would
# silently relabel every review as negative.
sms_raw['positive_review'] = sms_raw['positive_review'].astype(int) == 1

In [6]:
# Labels: the boolean positive/negative flag for each review.
target = sms_raw['positive_review']
# Features: one boolean keyword-indicator column per entry in `keywords`.
data = sms_raw.loc[:, keywords]

In [7]:
# Bernoulli Naive Bayes classifier. This single instance is re-fit later in the
# notebook with different keyword feature sets, so each fit replaces the last.
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

In [8]:
# Fit the model to the data, then classify the very same rows it was trained on
# (in-sample evaluation; there is no hold-out split here).
y_pred = bnb.fit(data, target).predict(data)

# Report how many predictions disagree with the known labels.
print("Number of mislabeled points out of a total {} points : {}".format(
    len(data),
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 301


In [9]:
# Compute the accuracy from the predictions rather than hard-coding the counts
# copied from the previous cell's output, so the figure stays correct whenever
# the features, data, or model change.
accuracy_pct = (target == y_pred).mean() * 100
print("Accuracy of Model:")
print("{0:.0f}%".format(accuracy_pct))

Accuracy of Model:
70%


In [10]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=target, y_pred=y_pred)

array([[478,  22],
       [279, 221]], dtype=int64)

In [11]:
# Derive the per-class counts from the confusion matrix instead of typing in
# the numbers by hand, so this summary cannot drift from the model's output.
# `labels=[False, True]` fixes the layout to
#   [[true negatives, false positives],
#    [false negatives, true positives]]
(tn, fp), (fn, tp) = confusion_matrix(target, y_pred, labels=[False, True])
print("Negative reviews correctly identified: {} of {}".format(tn, tn + fp))
print("Positive reviews correctly identified: {} of {}".format(tp, fn + tp))

Negative reviews correctly identified: 478 of 500
Positive reviews correctly identified: 221 of 500


The confusion matrix shows that we had 22 false positives and 279 false negatives. Thus, our greater problem is identifying positive reviews (which is what our classifier was attempting to achieve). Perhaps we could have more success if we created a classifier based on negative reviews (rather than positive).

In [12]:
# Words/phrases whose presence in a review is used as a signal of a negative rating.
# Each keyword appears exactly once: 'hate' used to be listed twice, which made
# selecting `sms_raw[keywords2]` return the same indicator column twice and so
# double-counted that feature in the model.
keywords2 = [
    'bad', 'hate', 'stupid', 'poor', 'terrible', 'not good', 'negative',
    'worst', 'trash', 'junk', 'dont buy', "don't buy", 'dislike', 'waste',
]

# Add one boolean column per keyword: True when it occurs anywhere in the review.
for key in keywords2:
    sms_raw[key] = sms_raw['review'].str.contains(key, case=False)

In [13]:
# Feature matrix built from the negative-keyword indicator columns.
data2 = sms_raw.loc[:, keywords2]

In [14]:
# Re-fit the same classifier on the negative-keyword features and classify the
# rows it was trained on; this overwrites the earlier fit and `y_pred`.
y_pred = bnb.fit(data2, target).predict(data2)

# Report how many predictions disagree with the known labels.
print("Number of mislabeled points out of a total {} points : {}".format(
    len(data2),
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 405


In [15]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=target, y_pred=y_pred)

array([[ 96, 404],
       [  1, 499]], dtype=int64)

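Now there is an even higher ratio of mislabelled points (405 of 1000, versus 301 before): the confusion matrix shows the model labels almost every review as positive, so 404 of the 500 negative reviews are misclassified. Perhaps we can instead improve our original classifier by including more keywords.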

In [16]:
# Extended list of positive keywords (replaces the earlier `keywords` list).
# The two emoticon entries are regular-expression patterns: the ')' is escaped
# because `str.contains` interprets its pattern as a regex by default. They are
# written as raw strings so that '\)' is not an invalid escape sequence in a
# normal string literal; the resulting string values are unchanged.
keywords = ['good', 'great', 'impressed', 'excellent', 'positive', 'best', 'wonderful',
            'perfect', 'love', r':-\)', r':\)', 'like', 'amazing', 'clever', 'delight',
            'sweet', 'sweetest', 'endorse', 'enjoy',
            'excited', 'happy', 'helpful', 'proud', 'relax', 'safe',
            'smooth', 'thrive', 'yippee', 'wonder', 'wonders', 'thrilled']

# Add (or overwrite) one boolean column per keyword: True when the pattern
# occurs anywhere in the review text.
for key in keywords:
    sms_raw[key] = sms_raw['review'].str.contains(key, case=False)

In [17]:
# Rebuild the labels and the feature matrix from the extended keyword columns.
target = sms_raw['positive_review']
data = sms_raw.loc[:, keywords]

# Re-fit the classifier on the new features and classify the same rows it was
# trained on (in-sample evaluation).
y_pred = bnb.fit(data, target).predict(data)

# Report how many predictions disagree with the known labels.
print("Number of mislabeled points out of a total {} points : {}".format(
    len(data),
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 287


In [18]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=target, y_pred=y_pred)

array([[474,  26],
       [261, 239]], dtype=int64)

No significant improvement: 287 mislabeled points with the extended keyword list, versus 301 with the original ten keywords.

Do any of your classifiers seem to overfit?:
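Overfitting doesn't seem to be as much of an issue as underfitting: even when evaluated on the same data it was trained on, the best classifier still mislabels 287 of 1000 reviews. Note that no hold-out set was used, so overfitting cannot be measured directly here.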

Which seem to perform the best? Why?:
So far, the classifiers built on positive keywords perform better (301 and 287 mislabeled points) than the one built on negative keywords (405 mislabeled points).

What features seemed to be most impactful to performance?:
The positive-review keywords were the most impactful features.