In [1]:
import numpy as np
import pandas as pd

In [2]:
vg_reviews = pd.read_json('vg_reviews.json', lines=True)

In [3]:
vg_reviews.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400
2,700099867,"[0, 0]",1,1st shipment received a book instead of the ga...,"06 28, 2014",A1INA0F5CWW3J4,"Amazon Shopper ""Mr.Repsol""",Wrong key,1403913600
3,700099867,"[7, 10]",3,"I got this version instead of the PS3 version,...","09 14, 2011",A1DLMTOTHQ4AST,ampgreen,"awesome game, if it did not crash frequently !!",1315958400
4,700099867,"[2, 2]",4,I had Dirt 2 on Xbox 360 and it was an okay ga...,"06 14, 2011",A361M14PU2GUEG,"Angry Ryan ""Ryan A. Forrest""",DIRT 3,1308009600


In [19]:
vg_reviews.describe()

Unnamed: 0,helpful,score,unix_time,not_helpful,positive_review
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,3.6762,4.0678,1274692000.0,5.9464,0.748
std,17.661227,1.20975,116081300.0,23.533778,0.434205
min,0.0,1.0,944438400.0,0.0,0.0
25%,0.0,3.0,1208153000.0,0.0,0.0
50%,1.0,5.0,1314706000.0,1.0,1.0
75%,2.0,5.0,1367971000.0,4.0,1.0
max,640.0,5.0,1405987000.0,674.0,1.0


In [4]:
# remove reviewer ID/name & asin (product ID), since data is arbitrary.
vg_reviews = vg_reviews.drop(columns = ['asin', 'reviewerID', 'reviewerName'])

In [5]:
# Simplify column names
vg_reviews = vg_reviews.rename(index=str, columns={'overall': 'score', 'reviewText': 'review', 
                                                   'reviewTime': 'time', 'unixReviewTime': 'unix_time'})

In [6]:
# Check for missing data
vg_reviews.isnull().sum()

asin         0
helpful      0
score        0
review       0
time         0
summary      0
unix_time    0
dtype: int64

In [7]:
# let's see how many rows we have
vg_reviews['helpful'].count()

231780

In [8]:
# Unfortunately, that amount of data cannot be handled by my laptop
# And I'll be rerunning the kernel many times, as well
# We'll have to take a tiny sample of the 230,000+ reviews
vg_reviews = vg_reviews.sample(n=5000)

In [9]:
vg_reviews.head()

Unnamed: 0,asin,helpful,score,review,time,summary,unix_time
17967,B00005C4HM,"[0, 0]",5,THis is a much beter buy the just then they pl...,"04 1, 2009",FUN FUN,1238544000
98482,B000ZKBJXC,"[3, 4]",4,This is a game I just knew was going to suck. ...,"06 25, 2009",This is how a game based on a movie should be ...,1245888000
136178,B002I0GEXM,"[5, 7]",3,"When I first read the setup for Epic Mickey, I...","12 10, 2010",Mediocrity snatched from the jaws of greatness,1291939200
41220,B0000A09EI,"[0, 0]",5,I had this game years ago and just this week I...,"06 21, 2012",A worthwhile Yu-gi-oh! game,1340236800
116935,B001IVXI7C,"[1, 1]",5,"I have a lot of fun playing this game, and I d...","08 29, 2011",Endless possibilities,1314576000


In [10]:
# let's turn the "helpful" column into two seperate integer columns
# what dtype is "helpful"?
vg_reviews.dtypes

asin         object
helpful      object
score         int64
review       object
time         object
summary      object
unix_time     int64
dtype: object

In [11]:
# change to string
vg_reviews['helpful'] = vg_reviews['helpful'].astype(str)

# remove brackets
vg_reviews['helpful'] = vg_reviews['helpful'].str.strip('[]')

# split into two columns 
vg_reviews['helpful'], vg_reviews['not_helpful'] = vg_reviews['helpful'].str.split(', ', 1).str

# convert into floats
vg_reviews['helpful'] = vg_reviews['helpful'].astype(int)
vg_reviews['not_helpful'] = vg_reviews['not_helpful'].astype(int)

In [12]:
# I just realized that we're only interested in predicting whether
# a review is positive or negative based only on text
# But I am going to keep these changes for future reference
vg_reviews.head()

Unnamed: 0,asin,helpful,score,review,time,summary,unix_time,not_helpful
17967,B00005C4HM,0,5,THis is a much beter buy the just then they pl...,"04 1, 2009",FUN FUN,1238544000,0
98482,B000ZKBJXC,3,4,This is a game I just knew was going to suck. ...,"06 25, 2009",This is how a game based on a movie should be ...,1245888000,4
136178,B002I0GEXM,5,3,"When I first read the setup for Epic Mickey, I...","12 10, 2010",Mediocrity snatched from the jaws of greatness,1291939200,7
41220,B0000A09EI,0,5,I had this game years ago and just this week I...,"06 21, 2012",A worthwhile Yu-gi-oh! game,1340236800,0
116935,B001IVXI7C,1,5,"I have a lot of fun playing this game, and I d...","08 29, 2011",Endless possibilities,1314576000,1


In [13]:
pos_keywords = ['good', 'great', 'impressed', 'excellent', 'positive',
            'best', 'wonderful', 'perfect', 'love', 'like']
neg_keywords = ['bad', 'hate', 'stupid', 'poor', 'terrible',
             'not good', 'negative', 'worst', 'trash', 'junk',
             'dont buy', 'don\'t buy', 'dislike', 'hate', 'waste']

for key in pos_keywords:
    vg_reviews[str(key)] = vg_reviews.review.str.contains(
        str(key),
        case=False
    )

In [14]:
vg_reviews['positive_review'] = np.where(vg_reviews['score']>3, 1, 0)

In [15]:
pos_data = vg_reviews[pos_keywords]
target = vg_reviews['positive_review']

In [21]:
#import the Bernoulli classifier since data is binary/boolean
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(pos_data, target)

#Classify storing the result in a new variable.
y_pred = bnb.predict(pos_data)

print("Number of Mislabeled reviews out of a total of {} points : {}".format(
     pos_data.shape[0],
     (target != y_pred).sum()
))

Number of Mislabeled reviews out of a total of 5000 points : 1260


In [22]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
X = pos_data
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4000, 10) (4000,)
(1000, 10) (1000,)


In [23]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)
bnb = BernoulliNB() 
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bnb = bnb.fit(X_train, y_train)
    print("Score:", bnb.score(X_test, y_test))

Score: 0.754
Score: 0.753
Score: 0.762
Score: 0.762
Score: 0.712
