# Project Amazon

In [1]:
# Import Lib
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [6]:
# load Amazon data: Digital Music
with open('reviews_Digital_Music_5.json', 'r') as f:
   data = f.readlines()

df = pd.read_json("[" + ",".join(data) + "]")

In [10]:
df.shape

(64706, 9)

In [12]:
df.dtypes

asin              object
helpful           object
overall            int64
reviewText        object
reviewTime        object
reviewerID        object
reviewerName      object
summary           object
unixReviewTime     int64
dtype: object

In [14]:
df.iloc[:,0:5].head(5)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime
0,5555991584,"[3, 3]",5,"It's hard to believe ""Memory of Trees"" came ou...","09 12, 2006"
1,5555991584,"[0, 0]",5,"A clasically-styled and introverted album, Mem...","06 3, 2001"
2,5555991584,"[2, 2]",5,I never thought Enya would reach the sublime h...,"07 14, 2003"
3,5555991584,"[1, 1]",5,This is the third review of an irish album I w...,"05 3, 2000"
4,5555991584,"[1, 1]",4,"Enya, despite being a successful recording art...","01 17, 2008"


In [15]:
df.iloc[:,5:9].head(5)

Unnamed: 0,reviewerID,reviewerName,summary,unixReviewTime
0,A3EBHHCZO6V2A4,"Amaranth ""music fan""",Enya's last great album,1158019200
1,AZPWAXJG9OJXV,bethtexas,Enya at her most elegant,991526400
2,A38IRL0X2T4DPF,bob turnley,The best so far,1058140800
3,A22IK3I6U76GX0,Calle,Ireland produces good music.,957312000
4,A1AISPOIIHTHXX,"Cloud ""...""",4.5; music to dream to,1200528000


In [16]:
df.overall.describe()

count    64706.000000
mean         4.222514
std          1.086081
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: overall, dtype: float64

In [18]:
# create 'c' column (N=1,2, P=4,5, M=3)
df['c'] = df.overall.apply(lambda x: 'N' if x < 3 else 'P' if x>3 else 'M' )

df1 = df[['c','summary']]

In [19]:
# get small data
amz_x1 = df1.summary.values[:10000]
amz_y1 = df1.c.values[:10000]

In [20]:
# 1.3 chia Train-Test
testCnt = int(amz_y1.size / 10)
train_x1, test_x1, train_y1, test_y1 = train_test_split(amz_x1, amz_y1, test_size=testCnt)

In [21]:
# ----------------------------------------------------
# Run Model
# Vong 1 - MultinomialNB
# Text feature extraction
bag = CountVectorizer()
x1 = bag.fit_transform(train_x1)
x1.shape

(9000, 5758)

In [22]:
# 2. Apply the model for prediction
# 2.1. Naive Bayes MultinomialNB
clf = MultinomialNB()
clf.fit(x1, train_y1)

prd_y1 = clf.predict(bag.transform(test_x1).toarray())
accuracy_score(test_y1, prd_y1)

0.878

In [23]:
# ----------------------------------------------------
# Vong 2 - Bo sung stopword
# word start number
wstnb = {k for k,v in bag.vocabulary_.items() if k[0].isdigit()}
# common english stopword
engstw = {'the','a','if','in','it','of','or'}

bag = CountVectorizer(stop_words=wstnb.union(engstw))
x1 = bag.fit_transform(train_x1)
x1.shape

(9000, 5622)

In [24]:
clf.fit(x1, train_y1)
prd_y1 = clf.predict(bag.transform(test_x1).toarray())
accuracy_score(test_y1, prd_y1)

0.879

In [25]:
# ----------------------------------------------------
# Vong 3 - Tfid Transform
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x1 = vectorizer.fit_transform(train_x1)
clf.fit(x1, train_y1)
prd_y1 = clf.predict(vectorizer.transform(test_x1).toarray())
accuracy_score(test_y1, prd_y1)

0.879

In [28]:
# ----------------------------------------------------
# Vong 4 - Parameter tuning using grid search
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer

text_clf = Pipeline([('vect', CountVectorizer(stop_words=wstnb.union(engstw))),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

gs_clf = gs_clf.fit(train_x1, train_y1)

prd_y1 = gs_clf.predict(test_x1)
accuracy_score(test_y1, prd_y1)

0.883

In [29]:
# ----------------------------------------------------
# Vong 5 - KNN
bag = CountVectorizer(stop_words=wstnb.union(engstw))
x1 = bag.fit_transform(train_x1)
x1.shape

(9000, 5622)

In [30]:
# 1
knn_clf = neighbors.KNeighborsClassifier(n_neighbors = 1, p = 2)
knn_clf.fit(x1, train_y1)
y_pred = knn_clf.predict(bag.transform(test_x1).toarray())
accuracy_score(test_y1, y_pred)

0.715

In [31]:
# 10
knn_clf = neighbors.KNeighborsClassifier(n_neighbors = 10, p = 2)
knn_clf.fit(x1, train_y1)
y_pred = knn_clf.predict(bag.transform(test_x1).toarray())
accuracy_score(test_y1, y_pred)

0.884

In [32]:
# 20
knn_clf = neighbors.KNeighborsClassifier(n_neighbors = 20, p = 2)
knn_clf.fit(x1, train_y1)
y_pred = knn_clf.predict(bag.transform(test_x1).toarray())
accuracy_score(test_y1, y_pred)

0.879

In [33]:
# 100
knn_clf = neighbors.KNeighborsClassifier(n_neighbors = 100, p = 2)
knn_clf.fit(x1, train_y1)
y_pred = knn_clf.predict(bag.transform(test_x1).toarray())
accuracy_score(test_y1, y_pred)

0.879