## Data Loading

In [61]:
from data_handler import *
from submodular_pick import *
import random
import numpy as np
import nltk
import re
from gensim.models import Doc2Vec
from gensim import corpora
from gensim.models.doc2vec import LabeledSentence
from gensim import summarization
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from lime.lime_text import LimeTextExplainer
import operator
from tqdm import tqdm

In [62]:
reviewData_filepath = 'reviews_Electronics.json'
metaData_filepath = 'meta_Electronics.json'
metadata_df = load_metadata(metaData_filepath)

# Exception occured : 40


In [63]:
category_counts = metadata_df['categories'].value_counts()

In [64]:
product_group_name = 'Electronics, Computers & Accessories, Cables & Accessories, Keyboards'

In [65]:
product_group_review_data = get_reviewdata_of_product_group(product_group_name, metadata_df, reviewData_filepath)
product_group_review_data['label'] = product_group_review_data['overall'].apply(lambda x : 1 if x >= 4 else 0)

In [66]:
# product_group_review_data['asin'].value_counts()

In [67]:
product_id = 'B005DKZTMG'

In [68]:
def clean_punctuation(text):
    text = re.sub('\\!+', '.',text)
    text = re.sub('\\?+', '.',text)
    text = re.sub('\\.+','.',text)
    return text

In [120]:
product_review = product_group_review_data.loc[product_group_review_data['asin'] == product_id]

product_review['reviewText'] = product_review['reviewText'].apply(lambda x : clean_punctuation(x))

product_review = product_review.loc[product_review['reviewText'].apply(lambda x : len(x.split('.'))>=4)]
product_review = product_review.loc[product_review['reviewText'].apply(lambda x : len(x.split('.'))<=6)]
num_pos = len(product_review.loc[product_review['label'] == 1])
num_neg = len(product_review.loc[product_review['label'] == 0])
resampled_pos = product_review.loc[product_review['label'] == 1].iloc[random.sample(range(0,num_pos),num_neg)].reset_index(drop=True)
product_review = resampled_pos.append(product_review.loc[product_review['label'] == 0]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [121]:
len(product_review)

558

In [122]:
list(product_review.loc[:3,'reviewText'])

['The wireless Logitech K400 keyboard works great with my Samsung 6100 series tv when I am brousing the internet.  I love the size of the keyboard and the built-in mouse.  It will only work when using the internet and will not work when using netflix or the other apps on the smart tv, but I knew that before I bought it.',
 'Works fine. not too much for gaming but for a living room pc its great to use. I have used it alot.',
 "I love the size of the Logitech wireless keyboard. It is easy to use with the multi touch touchpad being a plus.  It's the perfect size and weight to carry from my desk to bed for easy access to what I need. It comes with two AA batteries which was nice and it is the best wireless keyboard I have ever used. I give Logitech Wireless Touch keyboard K400 with built-in Multi-touchTouchpad a five star rating and highly recommend it.",
 "We have 2 of these and they are great. Batteries haven't been changed since purchased 6 months ago and still good. I even have a pc ho

## Embedding

In [123]:
def clean_text(text):
    text = text.strip()
    text = text.lower()
    text = re.sub('[^a-z ]', '', text)
    text = re.sub(' +', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stoplist])
    return text

In [124]:
class Custom_pipeline:
    
    def __init__(self, embedder, predictor):
        self.embedder = embedder
        self.predictor = predictor
        
    def predict_proba(self, x):
        embedded_x = [self.embedder.infer_vector(i.split()) for i in x]
        return self.predictor.predict_proba(embedded_x)
    
    def fit(x,y):
        self.predictor.fit(x,y)
        

In [125]:
stoplist = set(nltk.corpus.stopwords.words('english'))

product_review['clean_text'] = product_review['reviewText'].apply(lambda x : clean_text(x))

instances = []
for i in range(len(product_review)):
    instance = LabeledSentence(words=product_review.loc[i,'clean_text'].split(), tags=[str(i)])
    instances.append(instance)

In [126]:
# for size in [30,50,70,100]:
    
#     for window in [6,8,10,12,14]:

#         model = Doc2Vec(size = size, window = window, workers = 3)
#         model.build_vocab(instances)
#         for epoch in range(20):
#             random.shuffle(instances)
#             model.train(instances)
#         embedded_text = [model.docvecs[str(x)] for x in range(len(product_review))]

#         clf = LogisticRegression(penalty='l1')
#         scores = cross_val_score(clf, embedded_text, product_review['label'], cv=10)
#         print(size,window,epoch,np.mean(scores))

In [127]:
best_vec_size = 300
best_window_size = 10

model = Doc2Vec(size = best_vec_size, window = best_window_size, workers=10)
model.build_vocab(instances)

for epoch in range(20):
    random.shuffle(instances)
    model.train(instances)
    
embedded_text = [model.docvecs[str(x)] for x in range(len(product_review))]

clf = LogisticRegression(penalty='l1')
scores = cross_val_score(clf, embedded_text, product_review['label'], cv=3)
print(np.mean(scores))

clf.fit(embedded_text, product_review['label'])
p_comp = Custom_pipeline(embedder = model, predictor = clf)

0.666666666667


In [128]:
model.most_similar('arrow')

[('enter', 0.45364534854888916),
 ('press', 0.3771659731864929),
 ('uparrow', 0.3199821412563324),
 ('shift', 0.31649765372276306),
 ('poor', 0.3055460453033447),
 ('f', 0.2981566786766052),
 ('pressing', 0.2854941487312317),
 ('hard', 0.2852557897567749),
 ('enough', 0.2799328565597534),
 ('fn', 0.27977943420410156)]

## LIME

In [129]:
class_names = ['negative', 'positive']
explainer = LimeTextExplainer(class_names = class_names)

In [161]:
% time product_review['explanations'] = product_review['clean_text'].map(lambda x : get_explanation(explainer=explainer,\
                                                                    pipeline = p_comp, instance = x))

Wall time: 51.6 s


In [162]:
feature_importances = get_feature_importances(list(product_review['explanations']))
features = sorted(feature_importances.items(), key=operator.itemgetter(1), reverse=True)

In [163]:
features

[('great keyboard', 2.5184846593912829),
 ('works great', 2.3913055046529514),
 ('keyboard great', 1.2658916967338401),
 ('touch pad', 1.26451618423822),
 ('great product', 1.2368592280101187),
 ('nice keyboard', 0.97987250207338272),
 ('key board', 0.8021050905805911),
 ('keyboard works', 0.62203821787674096),
 ('keyboard little', 0.61110137621197724),
 ('bought use', 0.54441564978750534),
 ('works well', 0.53949160299795174),
 ('product great', 0.51189674905224603),
 ('smart tv', 0.47912786336333635),
 ('purchased use', 0.45450569997386403),
 ('work great', 0.45185549732099112),
 ('great little', 0.4442857883459892),
 ('keyboard perfect', 0.40347247868940517),
 ('mouse pad', 0.36896264344307117),
 ('like keyboard', 0.36302648038915375),
 ('using keyboard', 0.35617054916753377),
 ('wireless keyboard', 0.34322235848974586),
 ('keyboard small', 0.33159308687962241),
 ('great comfy', 0.32708121510334071),
 ('highly recommended', 0.32024886614280818),
 ('second one', 0.31973660563053519),

## Pos / Neg Feature Extraction

#### 제품에 따라서, 긍/부정적인 요소가 뻔한 경우 확률 cutoff를 낮춰잡아야 다양한 요소 반영 가능

In [227]:
pos_features= {}
neg_features= {}
for feature in tqdm(list(features)):
    f_exist = product_review.loc[product_review['clean_text'].apply(lambda x : feature[0] in x)]
    prob = np.mean(f_exist['label'])
    if (prob >= 0.8) and (len(f_exist) > 2):
        pos_features[feature[0]] = feature[1]
    if (prob < 0.2) and (len(f_exist) > 2):
        neg_features[feature[0]] = feature[1]

100%|██████████████████████████████████████████████████████████████████████████████████████████| 946/946 [00:00<00:00, 1377.69it/s]


In [228]:
sorted(pos_features.items(), key=operator.itemgetter(1), reverse=True)

[('works great', 2.3913055046529514),
 ('purchased use', 0.45450569997386403),
 ('great little', 0.4442857883459892),
 ('highly recommended', 0.32024886614280818),
 ('little keyboard', 0.29045903332282863),
 ('works perfectly', 0.21108722789450485),
 ('really like', 0.18371702414131011),
 ('get another', 0.16673186850612198),
 ('one thing', 0.1489539648190922),
 ('keyboard setup', 0.13303418449087714),
 ('well purpose', 0.12882251260965905),
 ('light weight', 0.12756121438659795),
 ('great range', 0.1206467595912),
 ('great buy', 0.11705138868743598),
 ('anything else', 0.10646019487333806),
 ('big deal', 0.083829020012941388),
 ('great use', 0.077120601951484902),
 ('laptop tv', 0.06967225453791287),
 ('plug usb', 0.062537186164363587),
 ('definitely recommend', 0.061814768875848955),
 ('highly recommend', 0.052541906218076771),
 ('compact easy', 0.0485182789771543),
 ('easy use', 0.046293867564577831),
 ('laptop connected', 0.045914443149487186),
 ('touch keyboard', 0.044313282318711

In [229]:
sorted(neg_features.items(), key=operator.itemgetter(1), reverse=True)

[('shift key', 0.17996038131816716),
 ('waste time', 0.1757067915087567),
 ('keyboard nice', 0.15555781792672635),
 ('look elsewhere', 0.14821233936821854),
 ('keyboard good', 0.13544712943826717),
 ('sent back', 0.12963398364169815),
 ('builtin touchpad', 0.094572538473906786),
 ('works ok', 0.091104849036882207),
 ('doesnt work', 0.088337967204838705),
 ('waste money', 0.06952333204539482),
 ('logitech products', 0.065103048843139624),
 ('logitech product', 0.058200773584276611),
 ('never bought', 0.058118087804388908),
 ('enter key', 0.057534434489210494),
 ('cheap plastic', 0.050208164932718173),
 ('arrow instead', 0.047782948126314277),
 ('product work', 0.044456391018406323),
 ('im used', 0.028792909263165811),
 ('cant type', 0.02769987318290856)]

## Instance Pick

In [232]:
pos_reviews_explanations = list(product_review.loc[product_review['label'] == 1]['explanations'])
neg_reviews_explanations = list(product_review.loc[product_review['label'] == 0]['explanations'])
pos_chosen, _ = submodular_pick(pos_reviews_explanations, pos_features, k = 10, l_penalty=True)
neg_chosen, _ = submodular_pick(neg_reviews_explanations, neg_features, k = 10, l_penalty=True)

In [233]:
for review in pos_chosen:
    row = product_review.loc[product_review['clean_text'] == review[1]]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

I would have rated it a 5, but it doesn't have a number pad.  Otherwise it works great.  A side number pad will have to do. 1 

A great little keyboard/touchpad combo. After reading all the complaints about other keyboards with a built-in trackball that can't be cleaned, I went with the touchpad instead. Very happy with it. 1 

Love it. We have many Logitech keyboard/mice at work and no issues, so I am sure the quality is fine too. Highly recommended. 1 

Purchased to use with Samsung 3D Smart HDTV and it works great.  Was easy to setup and is a bigger improvement in moving around the screen to make choices with the Touch Pad.  Trying to use the remote with the TV to type in a website was very difficult and time consuming.  Now it's a breeze. 1 

I really like this wireless touch keyboard. It serves well my purpose. I will certainly recommend to my associates and friends. 1 

Compact but easy to use. Touchpad gives some of the same abilities for wiewing as smart phone screen. Can be us

In [222]:
for review in neg_chosen:
    row = product_review.loc[product_review['clean_text'] == review[1]]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

The lefthand shift key is in the right place, and note it's a double-sized key. Which it should be, considering how often you use it. The righthand shift key, however, is half this size, and to the right of the up arrow key. Many other compact keyboards have the Shift key to the left of the up arrow key, and pretty much all regular sized keyboards do as well. Before you purchase this keyboard, LOOK AT WHERE THE RIGHTHAND SHIFT KEY IS ON YOUR KEYBOARD, especially if you plan to alternate between keyboards, on different machines. 0 

It was not obvious to me from the advertising that this product only works with certain versions of Windows. Further, it comes with a tiny, tiny chip of some sort which either was not included or fell out while I was opening the package. Poor design, poor advertising. Waste of my time. 0 

This keyboard was nice for about the first day. It then started to cause Firefox to open new windows and tabs spontaneously, first with every 30-40 keystrokes, then in inc

## TextRank

In [176]:
pos_reviews = list(product_review.loc[product_review['label'] == 1]['clean_text'])
neg_reviews = list(product_review.loc[product_review['label'] == 0]['clean_text'])
reviews = pos_reviews + neg_reviews
tokens = [review.split() for review in reviews]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

In [223]:
ratio = 20 / len(reviews)
print(ratio)

0.035842293906810034


In [224]:
textrank_summa_pos = summarization.summarize('. '.join(pos_reviews), ratio=ratio)
textrank_summa_neg = summarization.summarize('. '.join(neg_reviews), ratio=ratio)

In [225]:
for summa in textrank_summa_pos.split('.')[:-1]:
    row = product_review.loc[product_review['clean_text'] == summa.strip()]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

I use this on my Chromebox which I have connected to my TV, mainly to watch Hulu (I am not a Hulu+ subscriber).  The media and other special keys don't typically work well in that scenario, but otherwise this is a great single device to wirelessly have mouse and keyboard functionality.  The mouse movement lags just enough to be slightly annoying, but for my usage scenario it's far from a major complaint.  One thing that is very annoying though is the two-finger scroll gesture--it jumps from too slow to way too fast with little difference in how I'm moving my fingers between--it's very difficult to make it scroll a usable speed; this makes the gesture almost useless, which is sad because I use that gesture all the time on my Chromebook and have become very used to doing it. 1 

This little keyboard is great. I plugged the transponder in to my computer and boom it was ready to go I didn't even need to install any drivers but other customers might have a different experience, I bought thi

In [226]:
for summa in textrank_summa_neg.split('.')[:-1]:
    row = product_review.loc[product_review['clean_text'] == summa.strip()]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

I bought the black one, Its back was white which I love, its buttons are nice to press and dont make a lot of sound.mousepad is good and easy to use. I like that the F1-F12 are pressed with the Fn button and the volume and playback keys are put in front. in spite of that the Alt+F4 combination is still there and you dont need to use the Fn to get it.A big disadvantage is that the right shift is so small and put behind the up/pageup button, that annoys me alot (hence -2 stars)  because each time I press the shift key while typing I end up pressing the up button, this happened to me while writing this alot.Oh, it has no umber lock, so bye bye special characters (Alt+number) like &#9786; ( I used my old keyboard for that) 0 

I was excited about buying a keyboard with a touchpad so I could ditch my wireless mouse, however, I do a lot of actual office work with it, and it drives me absolutely CRAZY that the right shift key is not full size. It's the size of  a number key and it's too far o