## Data Loading

In [96]:
from data_handler import *
from submodular_pick import *
import random
import numpy as np
import nltk
import re
from sklearn.pipeline import make_pipeline
from gensim.models import Doc2Vec
from gensim import corpora
from gensim.models.doc2vec import LabeledSentence
from gensim import summarization
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from lime.lime_text import LimeTextExplainer
import operator
from tqdm import tqdm

In [97]:
# Input files, given as paths relative to the working directory.
reviewData_filepath = 'reviews_Electronics.json'
metaData_filepath = 'meta_Electronics.json'
# load_metadata is not defined in this notebook; it arrives through one of the
# star imports in the first cell (presumably data_handler -- TODO confirm).
metadata_df = load_metadata(metaData_filepath)

# Exception occured : 40


In [98]:
# Number of metadata rows per distinct value of the 'categories' column.
category_counts = metadata_df['categories'].value_counts()

In [99]:
# Product group whose reviews are requested from get_reviewdata_of_product_group.
product_group_name = 'Electronics, Computers & Accessories, Laptops'

In [100]:
# Reviews for the selected product group. The helper comes from a star import,
# so its exact semantics are assumed from its name -- TODO confirm.
product_group_review_data = get_reviewdata_of_product_group(
    product_group_name, metadata_df, reviewData_filepath
)
# Binary label: 1 when the 'overall' rating is 4 or higher, otherwise 0.
product_group_review_data['label'] = (
    product_group_review_data['overall'] >= 4
).astype('int64')

In [101]:
# Value matched against the 'asin' column to select a single product's reviews.
product_id = 'B009LL9VDG'

In [105]:
# Rows of the product-group reviews whose 'asin' equals product_id.
product_review = product_group_review_data.loc[product_group_review_data['asin'] == product_id]

In [110]:
def clean_punctuation(text):
    """Collapse every run of '!', '?' and '.' characters into a single '.'.

    Mixed runs such as '?!..' are treated as one run, so the result never
    contains two consecutive periods.

    Args:
        text: the string to normalise.

    Returns:
        The string with each run of sentence-ending punctuation replaced by '.'.
    """
    return re.sub(r'[!?.]+', '.', text)

In [205]:
# Take an explicit copy of the selection: assigning a column on a .loc slice is
# what triggers pandas' SettingWithCopyWarning, and the write may not land on
# the object that is used afterwards.
product_review = product_group_review_data.loc[product_group_review_data['asin'] == product_id].copy()

# Normalise sentence-ending punctuation so that '.' is the only separator.
product_review['reviewText'] = product_review['reviewText'].apply(clean_punctuation)

# Keep reviews that split into 4 to 6 '.'-separated segments (count computed once).
segment_counts = product_review['reviewText'].apply(lambda x: len(x.split('.')))
product_review = product_review.loc[(segment_counts >= 4) & (segment_counts <= 6)]

# Balance the classes by drawing as many positive reviews as there are negatives.
# NOTE(review): random.sample raises ValueError if negatives outnumber positives,
# and `random` is not seeded in the visible cells, so the draw differs per run.
pos_rows = product_review.loc[product_review['label'] == 1]
neg_rows = product_review.loc[product_review['label'] == 0]
num_pos = len(pos_rows)
num_neg = len(neg_rows)
resampled_pos = pos_rows.iloc[random.sample(range(0, num_pos), num_neg)].reset_index(drop=True)
# Index is reset to 0..n-1; later cells address rows with .loc[i, ...] by position.
product_review = resampled_pos.append(neg_rows).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [206]:
# Number of reviews left after the segment-count filter and class balancing.
len(product_review)

780

## Embedding

In [207]:
def clean_text(text, stopwords=None):
    """Lower-case a review, keep only letters and spaces, and drop stop words.

    Args:
        text: raw review string.
        stopwords: optional collection of words to remove. When omitted, the
            notebook-level ``stoplist`` is used, so existing one-argument calls
            behave exactly as before; passing it explicitly removes the hidden
            dependency on a global defined in a later cell.

    Returns:
        The remaining words joined by single spaces.

    Note:
        Characters outside a-z and space are deleted, not replaced by a space,
        so text without a space after punctuation is merged
        (e.g. "wifi.Everything" becomes "wifieverything").
    """
    if stopwords is None:
        stopwords = stoplist
    letters_only = re.sub('[^a-z ]', '', text.strip().lower())
    # split() already discards repeated spaces, so no separate collapse step is needed.
    return ' '.join(word for word in letters_only.split() if word not in stopwords)

In [208]:
class Custom_pipeline:
    """Minimal text pipeline that exposes ``predict_proba`` on raw strings.

    Wraps an embedder (any object with ``infer_vector(list_of_tokens)``) and a
    predictor (any object with ``fit`` and ``predict_proba``).
    """

    def __init__(self, embedder, predictor):
        """Store the embedder and the predictor; nothing is trained here."""
        self.embedder = embedder
        self.predictor = predictor

    def predict_proba(self, x):
        """Embed each string in ``x`` and return the predictor's probabilities.

        Each string is tokenised with ``str.split()`` before being passed to
        ``embedder.infer_vector``.
        """
        embedded_x = [self.embedder.infer_vector(doc.split()) for doc in x]
        return self.predictor.predict_proba(embedded_x)

    def fit(self, x, y):
        """Fit the wrapped predictor on features ``x`` and labels ``y``.

        ``x`` is forwarded to the predictor unchanged (it is not embedded here).
        The method previously lacked ``self``, so calling it on an instance
        raised TypeError. Returns the pipeline to allow call chaining.
        """
        self.predictor.fit(x, y)
        return self
        

In [209]:
# English stop words; clean_text reads this module-level set.
stoplist = set(nltk.corpus.stopwords.words('english'))

product_review['clean_text'] = product_review['reviewText'].apply(clean_text)

# One tagged document per review, tagged with the row label as a string.
# Relies on product_review having a 0..n-1 index.
instances = [
    LabeledSentence(words=product_review.loc[row_idx, 'clean_text'].split(), tags=[str(row_idx)])
    for row_idx in range(len(product_review))
]

In [210]:
# for size in [30,50,70,100,200,300,400,500,700]:
    
#     for window in [6,8,10,12,14]:

#         model = Doc2Vec(size = size, window = window, workers = 3)
#         model.build_vocab(instances)
#         for epoch in range(20):
#             random.shuffle(instances)
#             model.train(instances)
#         embedded_text = [model.docvecs[str(x)] for x in range(len(product_review))]

#         clf = LogisticRegression(penalty='l1')
#         scores = cross_val_score(clf, embedded_text, product_review['label'], cv=10)
#         print(size,window,epoch,np.mean(scores))

In [211]:
# Vector size and window used for the final model; these are the same two
# settings the commented-out grid search in the previous cell iterates over.
best_vec_size = 50
best_window_size = 14

model = Doc2Vec(size = best_vec_size, window = best_window_size, workers=10)
model.build_vocab(instances)

# 20 training passes, reshuffling the documents before each one.
# NOTE(review): `random` is not seeded in the visible cells, so runs are not repeatable.
for epoch in range(20):
    random.shuffle(instances)
    model.train(instances)
    
# Look up each review's trained vector by its tag (str(row index), set when `instances` was built).
embedded_text = [model.docvecs[str(x)] for x in range(len(product_review))]

# L1-penalised logistic regression on the document vectors, scored with 3-fold cross-validation.
clf = LogisticRegression(penalty='l1')
scores = cross_val_score(clf, embedded_text, product_review['label'], cv=3)
print(np.mean(scores))

# Refit on all reviews, then wrap embedder + classifier so raw strings can be scored.
clf.fit(embedded_text, product_review['label'])
p_comp = Custom_pipeline(embedder = model, predictor = clf)

0.741025641026


In [212]:
# Spot check of the trained model: entries most similar to the word 'good'.
model.most_similar('good')

[('excellent', 0.42109382152557373),
 ('great', 0.412241131067276),
 ('perfectly', 0.4026727080345154),
 ('way', 0.40017855167388916),
 ('nice', 0.3994280695915222),
 ('sound', 0.3843111991882324),
 ('kind', 0.3583163619041443),
 ('functional', 0.3424456715583801),
 ('stream', 0.34095680713653564),
 ('gaming', 0.32450658082962036)]

## LIME

In [213]:
# Display names for the two classes, listed in the order of the 0/1 values of
# the 'label' column (0 -> 'negative', 1 -> 'positive').
# NOTE(review): assumes the classifier's probability columns follow the same order -- confirm.
class_names = ['negative', 'positive']
explainer = LimeTextExplainer(class_names = class_names)

In [246]:
% time product_review['explanations'] = product_review['clean_text'].map(lambda x : get_explanation(explainer=explainer,\
                                                                    pipeline = p_comp, instance = x))

Wall time: 1min 7s


In [247]:
# Aggregate the per-review explanations into one importance value per feature,
# then order the (feature, importance) pairs from highest to lowest importance.
feature_importances = get_feature_importances(list(product_review['explanations']))
features = sorted(
    feature_importances.items(),
    key=lambda name_and_weight: name_and_weight[1],
    reverse=True,
)

In [248]:
# Show the (feature, importance) pairs, highest importance first.
features

[('get pay', 1.0282845110228394),
 ('samsung chromebook', 0.98617679714108974),
 ('light weight', 0.95102829015860557),
 ('great product', 0.91740637979659501),
 ('highly recommend', 0.87898669162667864),
 ('great computer', 0.8046045002811375),
 ('nice little', 0.71318965507677845),
 ('little laptop', 0.66976043969992993),
 ('cant beat', 0.54422319491550597),
 ('read reviews', 0.54397485837575665),
 ('battery life', 0.51861198204402159),
 ('definitely recommend', 0.50647388151164352),
 ('little disappointed', 0.49653782160153287),
 ('love samsung', 0.47555381398482749),
 ('return policy', 0.45630384165718674),
 ('stay away', 0.45089493621882681),
 ('everything else', 0.44729040406685666),
 ('product definitely', 0.44180975678626655),
 ('child could', 0.4235000328438161),
 ('product exactly', 0.41023354078877688),
 ('piece equipment', 0.40938974865942668),
 ('sent back', 0.40249849049544523),
 ('exactly wantedit', 0.39889158425393928),
 ('flimsy returned', 0.39018340122441214),
 ('disa

## Pos / Neg Feature Extraction

In [370]:
# Split features by the label distribution of the reviews that contain them:
# mean label >= 0.8 -> positive, mean label < 0.2 -> negative. A feature must
# occur (as a substring of clean_text) in more than 2 reviews to be kept.
pos_features, neg_features = {}, {}

for feature in tqdm(list(features)):
    phrase, weight = feature
    f_exist = product_review.loc[product_review['clean_text'].apply(lambda doc: phrase in doc)]
    prob = np.mean(f_exist['label'])
    if len(f_exist) <= 2:
        continue
    if prob >= 0.8:
        pos_features[phrase] = weight
    elif prob < 0.2:
        neg_features[phrase] = weight

100%|████████████████████████████████████████████████████████████████████████████████████████| 1409/1409 [00:01<00:00, 1234.77it/s]


In [371]:
# Positive features, highest importance first.
sorted(pos_features.items(), key=lambda name_and_weight: name_and_weight[1], reverse=True)

[('highly recommend', 0.87898669162667864),
 ('great computer', 0.8046045002811375),
 ('little laptop', 0.66976043969992993),
 ('cant beat', 0.54422319491550597),
 ('battery life', 0.51861198204402159),
 ('definitely recommend', 0.50647388151164352),
 ('love samsung', 0.47555381398482749),
 ('little computer', 0.38755039203963332),
 ('price cant', 0.38066664218957519),
 ('love chromebook', 0.34315464599151851),
 ('great machine', 0.28919535335538654),
 ('dont want', 0.28814633830916364),
 ('lightweight fast', 0.2782414208656207),
 ('great little', 0.25819498963119836),
 ('love little', 0.25376303973669823),
 ('lightweight compact', 0.24962186072143275),
 ('fast efficient', 0.24696321462542439),
 ('love thing', 0.23100965241563437),
 ('cloud storage', 0.223922493382086),
 ('love chrome', 0.2181894176402894),
 ('chromebook great', 0.19484945476175652),
 ('little chromebook', 0.18336612115296402),
 ('easy use', 0.17120024824403274),
 ('use love', 0.16944056413880537),
 ('everything need',

In [372]:
# Negative features, highest importance first.
sorted(neg_features.items(), key=lambda name_and_weight: name_and_weight[1], reverse=True)

[('get pay', 1.0282845110228394),
 ('return policy', 0.45630384165718674),
 ('sent back', 0.40249849049544523),
 ('ended returning', 0.3663350291530123),
 ('real laptop', 0.31767334136549552),
 ('internet connection', 0.19701883558495237),
 ('doesnt work', 0.19138926842503767),
 ('quality product', 0.17651254192688387),
 ('thought would', 0.16843233096711244),
 ('without internet', 0.16805414088108059),
 ('sending back', 0.16608176029626806),
 ('money back', 0.15866125592581604),
 ('waste money', 0.14566744071139426),
 ('even though', 0.13077939123439516),
 ('buy another', 0.12481274928357125),
 ('able anything', 0.12316562864858735),
 ('back amazon', 0.11332689964824028),
 ('google samsung', 0.083678548436804334),
 ('get much', 0.083659617613309978),
 ('use itunes', 0.075261489392127678),
 ('connect internet', 0.070823537591412494),
 ('wi fi', 0.066827639419370755),
 ('things like', 0.064505258802215915),
 ('worst computer', 0.063945044085714442),
 ('download itunes', 0.06215796148861

## Instance Pick

In [357]:
# Explanations grouped by label, each passed to submodular_pick together with
# the matching feature dictionary; k = 10 for both classes.
pos_reviews_explanations = list(product_review.loc[product_review['label'] == 1, 'explanations'])
neg_reviews_explanations = list(product_review.loc[product_review['label'] == 0, 'explanations'])

# Only the first element of each returned pair is kept.
pos_chosen, _ = submodular_pick(pos_reviews_explanations, pos_features, k=10)
neg_chosen, _ = submodular_pick(neg_reviews_explanations, neg_features, k=10)

In [358]:
# For each picked item, find the review whose cleaned text equals review[1]
# and print the first match's punctuation-normalised text and label.
for review in pos_chosen:
    is_match = product_review['clean_text'] == review[1]
    row = product_review.loc[is_match]
    print(row['reviewText'].iloc[0], row['label'].iloc[0], '\n')

This is a great little chromebook.  I havent had any problems with it so far, and i've dropped it quite a few times.  Great for streaming video online, watching netflix and hulu, and playing games.  Very lightweight and portable.  There is no CD/DVD drive, which can be kind of lame sometimes, but otherwise a great computer. 1 

Sper easy to use, lightweight, great for homework, scheduling, and internet based projects or work. Very easy to set up multiple users so we use this computer for our 4 kids. Love this little laptop. 1 

Great for travel or the coffee shop etc. But remember it needs wifi.Everything that I expected. I would highly recommend it. 1 

Simple to use. No viruses, very good speed, light to carry. Pretty much everything you need. Sound and volume average. Would definitely recommend. 1 

easy to use - love the size of the keyboard. the book is lightweight and easy to carry. i am still learning about all the things it does.  also like watching tv on it at night in bed whi

In [359]:
# For each picked item, find the review whose cleaned text equals review[1]
# and print the first match's punctuation-normalised text and label.
for review in neg_chosen:
    is_match = product_review['clean_text'] == review[1]
    row = product_review.loc[is_match]
    print(row['reviewText'].iloc[0], row['label'].iloc[0], '\n')

Everything was awesome with with notebook, perfect size  , starts up fast as heck from a cold start, looks good  has all the features, but all the sudden mine just shut off-.  and  a white light  blinked, i tried messing with the power cable and differnt oulets ect. but no luck , so good luck if u get one , looks like you get what you pay for. 0 

I sent it back. Amazon deserves 5 stars for its swift, efficient return policy that had UPS pick up this defective product at my door free of charge and then issued a full refund. My cromebook would not stay connected to the internet. Every time you click on a new web page it has to search for a connection again and reconnect making this the slowest internet browser on the market which is a serious charge for a product that does not work without an internet connection. 0 

Would be cool if it functioned but now it's an expensive paperweight. Very disappointed. If you buy, get it somewhere with a better return policy. 0 

I loved the look and 

## TextRank

In [223]:
# Cleaned review texts per class, positives first in the combined list.
pos_reviews = list(product_review.loc[product_review['label'] == 1, 'clean_text'])
neg_reviews = list(product_review.loc[product_review['label'] == 0, 'clean_text'])
reviews = pos_reviews + neg_reviews

# Whitespace tokenisation, then a gensim dictionary and bag-of-words corpus.
tokens = list(map(str.split, reviews))
dictionary = corpora.Dictionary(tokens)
corpus = list(map(dictionary.doc2bow, tokens))

In [360]:
# 20 divided by the total number of reviews; this value is passed as `ratio`
# to both summarize calls.
ratio = 20 / len(reviews)
print(ratio)

0.02564102564102564


In [361]:
# Each class's cleaned reviews are joined with '. ' into one text and
# summarised separately with the same ratio.
# NOTE(review): presumably each review then acts as a single sentence for the
# summariser, since clean_text keeps only letters and spaces -- confirm.
textrank_summa_pos = summarization.summarize('. '.join(pos_reviews), ratio=ratio)
textrank_summa_neg = summarization.summarize('. '.join(neg_reviews), ratio=ratio)

In [362]:
# Split the summary on '.', drop the last piece, and map each remaining piece
# back to the review whose cleaned text equals it; print that review and label.
for summa in textrank_summa_pos.split('.')[:-1]:
    is_match = product_review['clean_text'] == summa.strip()
    row = product_review.loc[is_match]
    print(row['reviewText'].iloc[0], row['label'].iloc[0], '\n')

I bought two of these for my kids aged 8 and 10 so they could play Flash games without taking my MacBook Pro. The older child has also started using Google Drive apps for school reports and presentations as well. One obvious shortcoming I discovered when setting up their accounts is that Google requires that users be 13 years old to get an account so I had to set myself up as admin on each of their devices and then use my birthday when setting up their individual accounts in order to get the free cloud storage for each device (since the free storage is one per account, they had to be the registered users). Google's commercials seemed to advertise a laptop for everyone (including kids) so I'm not sure how I was &#34;supposed&#34; to resolve the issue other than lying about their age (and accepting responsibility for monitoring their internet usage). The Chromebooks are plastic and not as solid as a unibody MBP, as you might expect - picking one up by the front corner while it is open re

In [363]:
# Split the summary on '.', drop the last piece, and map each remaining piece
# back to the review whose cleaned text equals it; print that review and label.
for summa in textrank_summa_neg.split('.')[:-1]:
    is_match = product_review['clean_text'] == summa.strip()
    row = product_review.loc[is_match]
    print(row['reviewText'].iloc[0], row['label'].iloc[0], '\n')

My old Samsung Chromebook, Model 500c June/2011, works just fine anywhere in the house.  But my two tries for the new Model 303C cannot pick up WiFi quickly or even at all sometimes.  I tried to contact a person regarding this problem, but that's impossible so the only way to communicate a problem is to return the item.  I love ChromeOS, even use it on my MacAir.  It's too bad, I like the style and keyboard of the new 303C but I can't stand the WiFi connect problem. 0 

This laptop does everything it says it will do. Don't expect to get much else use out of it other than basic internet browsing. Also, mine only lasted a year before I began having issues with it turning on (only a white, pixelated screen would show up). If you need to just check e-mail, edit documents on Google Drive (and Google Drive only), and surf the web, then you'd be pretty satisfied with this little thing. Otherwise, look elsewhere. 0 

A problem with WEP router incryption has rendered my new Chromebook almost us