## Data loading

In [61]:
from data_handler import *
from submodular_pick import *
import random
import numpy as np
import nltk
import re
from gensim.models import Doc2Vec
from gensim import corpora
from gensim.models.doc2vec import LabeledSentence
from gensim import summarization
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from lime.lime_text import LimeTextExplainer
import operator
from tqdm import tqdm

In [62]:
reviewData_filepath = 'reviews_Electronics.json'
metaData_filepath = 'meta_Electronics.json'
metadata_df = load_metadata(metaData_filepath)

# Exception occured : 40


In [63]:
category_counts = metadata_df['categories'].value_counts()

In [64]:
product_group_name = 'Electronics, Accessories & Supplies, Audio & Video Accessories, Headphones'

In [65]:
product_group_review_data = get_reviewdata_of_product_group(product_group_name, metadata_df, reviewData_filepath)
product_group_review_data['label'] = product_group_review_data['overall'].apply(lambda x : 1 if x >= 4 else 0)

In [66]:
# product_group_review_data['asin'].value_counts()

In [67]:
product_id = 'B003ELYQGG'

In [68]:
def clean_punctuation(text):
    text = re.sub('\\!+', '.',text)
    text = re.sub('\\?+', '.',text)
    text = re.sub('\\.+','.',text)
    return text

In [117]:
product_review = product_group_review_data.loc[product_group_review_data['asin'] == product_id]

product_review['reviewText'] = product_review['reviewText'].apply(lambda x : clean_punctuation(x))

product_review = product_review.loc[product_review['reviewText'].apply(lambda x : len(x.split('.'))>=4)]
product_review = product_review.loc[product_review['reviewText'].apply(lambda x : len(x.split('.'))<=6)]
num_pos = len(product_review.loc[product_review['label'] == 1])
num_neg = len(product_review.loc[product_review['label'] == 0])
resampled_pos = product_review.loc[product_review['label'] == 1].iloc[random.sample(range(0,num_pos),num_neg)].reset_index(drop=True)
product_review = resampled_pos.append(product_review.loc[product_review['label'] == 0]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [118]:
len(product_review)

1634

In [119]:
list(product_review.loc[:10,'reviewText'])

["I am not picky about quality when it comes to headphones. They get the job done, and they come in a variety of colors. They are dependable, I've never had any problems with them and I've had some pairs for years. The only reason I rebuy them is because I want a new color or I've lost a pair. I recommend these if you just want some headphones that get the job done and aren't obnoxious (people are you can't hear what you're listening to).",
 "i'm no audio expert. a little big to fit comfortably into my smaller than average ear canals. three more words.",
 'product is fine. packaging is murder.  Took me 15 minutes and a broken pair of scissors to extract the earphones from the oyster pack.  really annoying.',
 'I use them when I am running and love them.  The  cord is long enough, but not too long to be annoying.  Great sound too.',
 "The only issue is that the cord is very fragile. Unfortunately, I had to through away the 1st pair I bought because the cord split and caused only one ear

## Embedding

In [120]:
def clean_text(text):
    text = text.strip()
    text = text.lower()
    text = re.sub('[^a-z ]', '', text)
    text = re.sub(' +', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stoplist])
    return text

In [121]:
class Custom_pipeline:
    
    def __init__(self, embedder, predictor):
        self.embedder = embedder
        self.predictor = predictor
        
    def predict_proba(self, x):
        embedded_x = [self.embedder.infer_vector(i.split()) for i in x]
        return self.predictor.predict_proba(embedded_x)
    
    def fit(x,y):
        self.predictor.fit(x,y)
        

In [122]:
stoplist = set(nltk.corpus.stopwords.words('english'))

product_review['clean_text'] = product_review['reviewText'].apply(lambda x : clean_text(x))

instances = []
for i in range(len(product_review)):
    instance = LabeledSentence(words=product_review.loc[i,'clean_text'].split(), tags=[str(i)])
    instances.append(instance)

In [123]:
# for size in [700,800]:
    
#     for window in [12,14]:

#         model = Doc2Vec(size = size, window = window, workers = 10)
#         model.build_vocab(instances)
#         for epoch in range(20):
#             random.shuffle(instances)
#             model.train(instances)
#         embedded_text = [model.docvecs[str(x)] for x in range(len(product_review))]

#         clf = LogisticRegression(penalty='l1')
#         scores = cross_val_score(clf, embedded_text, product_review['label'], cv=10)
#         print(size,window,epoch,np.mean(scores))

In [124]:
best_vec_size = 700
best_window_size = 14

model = Doc2Vec(size = best_vec_size, window = best_window_size, workers=3)
model.build_vocab(instances)

for epoch in range(20):
    random.shuffle(instances)
    model.train(instances)
    
embedded_text = [model.docvecs[str(x)] for x in range(len(product_review))]

clf = LogisticRegression(penalty='l1')
scores = cross_val_score(clf, embedded_text, product_review['label'], cv=3)
print(np.mean(scores))

clf.fit(embedded_text, product_review['label'])
p_comp = Custom_pipeline(embedder = model, predictor = clf)

0.750343406593


In [125]:
model.most_similar('good')

[('decent', 0.5083191394805908),
 ('amazing', 0.4081023037433624),
 ('rich', 0.3641355633735657),
 ('great', 0.3578459322452545),
 ('acceptable', 0.33184152841567993),
 ('awesome', 0.3000887632369995),
 ('okay', 0.2936441898345947),
 ('nonexistent', 0.2934083938598633),
 ('superb', 0.2906610667705536),
 ('deliver', 0.2880054712295532)]

## LIME

In [126]:
class_names = ['negative', 'positive']
explainer = LimeTextExplainer(class_names = class_names)

In [127]:
% time product_review['explanations'] = product_review['clean_text'].map(lambda x : get_explanation(explainer=explainer,\
                                                                    pipeline = p_comp, instance = x))

Wall time: 2min 58s


In [128]:
feature_importances = get_feature_importances(list(product_review['explanations']))
features = sorted(feature_importances.items(), key=operator.itemgetter(1), reverse=True)

In [129]:
features

[('ear buds', 4.1755426839883114),
 ('great sound', 3.2912938556117521),
 ('great price', 2.7474583678033047),
 ('great headphones', 2.7458269302914506),
 ('get pay', 2.7130834878686962),
 ('sound great', 2.5341240261131981),
 ('sound quality', 2.417869879128753),
 ('love headphones', 2.2038803226128691),
 ('beat price', 2.2037426853125006),
 ('poor quality', 2.1679368827742715),
 ('headphones great', 2.1185320035161479),
 ('waste money', 2.017125000919652),
 ('would recommend', 1.8000637386890128),
 ('last long', 1.7866309385030932),
 ('highly recommend', 1.7040527864472814),
 ('cant beat', 1.5834375415373128),
 ('highly recommended', 1.5715159908655634),
 ('great product', 1.4770065323373514),
 ('definitely recommend', 1.4159244780913001),
 ('inear headphones', 1.1308428425901906),
 ('something else', 1.0973521182827572),
 ('ear piece', 1.0838876159187767),
 ('good price', 0.98555338802733883),
 ('recommend anyone', 0.98435704273307523),
 ('sound amazing', 0.90396883288757834),
 ('gr

## Pos / Neg Feature Extraction

In [249]:
pos_features= {}
neg_features= {}
for feature in tqdm(list(features)):
    f_exist = product_review.loc[product_review['clean_text'].apply(lambda x : feature[0] in x)]
    prob = np.mean(f_exist['label'])
    if (prob >= 0.8) and (2 < len(f_exist)):
        pos_features[feature[0]] = feature[1]
    if (prob < 0.2) and (2 < len(f_exist)):
        neg_features[feature[0]] = feature[1]

100%|█████████████████████████████████████████████████████████████████████████████████████████| 2563/2563 [00:02<00:00, 887.84it/s]


In [250]:
sorted(pos_features.items(), key=operator.itemgetter(1), reverse=True)

[('great sound', 3.2912938556117521),
 ('great price', 2.7474583678033047),
 ('sound great', 2.5341240261131981),
 ('love headphones', 2.2038803226128691),
 ('beat price', 2.2037426853125006),
 ('highly recommend', 1.7040527864472814),
 ('cant beat', 1.5834375415373128),
 ('highly recommended', 1.5715159908655634),
 ('great product', 1.4770065323373514),
 ('definitely recommend', 1.4159244780913001),
 ('inear headphones', 1.1308428425901906),
 ('great pair', 0.88925154977025567),
 ('great bass', 0.73228218903836828),
 ('works great', 0.63470938875259475),
 ('wont regret', 0.63111978093355281),
 ('earbuds fit', 0.62031078255335481),
 ('work great', 0.60614313298375611),
 ('outside noise', 0.55353254351494574),
 ('excellent price', 0.53933377355644996),
 ('happy purchase', 0.52005251569542366),
 ('price perfect', 0.50900011296280812),
 ('great quality', 0.49786463481970344),
 ('far best', 0.4931018621353207),
 ('inear headphone', 0.49222129796298564),
 ('great buy', 0.48827397349304869),

In [251]:
sorted(neg_features.items(), key=operator.itemgetter(1), reverse=True)

[('get pay', 2.7130834878686962),
 ('poor quality', 2.1679368827742715),
 ('waste money', 2.017125000919652),
 ('something else', 1.0973521182827572),
 ('look elsewhere', 0.56201641608279651),
 ('work fine', 0.52969674233914843),
 ('guess got', 0.5291599590591507),
 ('give stars', 0.39632259357986016),
 ('price bad', 0.39530844996286885),
 ('sound ok', 0.37535512114030534),
 ('received headphones', 0.36959209438863705),
 ('better buy', 0.35386624959045632),
 ('spend money', 0.34489406419643304),
 ('better quality', 0.34436644950173956),
 ('got paid', 0.32424825785871236),
 ('lot better', 0.31979958905988315),
 ('disappointed sound', 0.31203333094993352),
 ('great reviews', 0.29446703310487365),
 ('quality product', 0.29369479686704025),
 ('wouldnt buy', 0.2896440469291513),
 ('pretty disappointed', 0.2775776386788058),
 ('bought based', 0.26421170721078557),
 ('well price', 0.26394847300983065),
 ('bad sound', 0.25076411470052695),
 ('quality awful', 0.23677423405563358),
 ('dont buy',

## Instance Pick

In [209]:
pos_reviews_explanations = list(product_review.loc[product_review['label'] == 1]['explanations'])
neg_reviews_explanations = list(product_review.loc[product_review['label'] == 0]['explanations'])
pos_chosen, _ = submodular_pick(pos_reviews_explanations, pos_features, k = 10)
neg_chosen, _ = submodular_pick(neg_reviews_explanations, neg_features, k = 10)

In [210]:
for review in pos_chosen:
    row = product_review.loc[product_review['clean_text'] == review[1]]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

This was a great purchase. I needed some more ear buds desperately, and I wasn't sure what I would be getting for the price. Great headphones, great price. You won't regret it. 1 

At &#60;$10 these are by far the best in-ear headphones I've had. Very high quality. Little added noise when touching the cord, and 3 ear-bud sizes. 1 

Very comfortable, great sound, cheap . what more could you want.  I have tried a bunch, and these are better than the rest, regardless of cost.  Try them.  You won't be disappointed. 1 

Can't go wrong for the price. I've had several pairs of these headphones and I love them. They won't last forever if you are hard on headphones like me, but they hold up pretty well. The sound is excellent for the price. 1 

Great pair. Fits in ear, looks good , good sound. Seems pretty durable too. Great price, only downside is it does not have a volume up/down button 1 

Great bass response and warm sound - that's what I like and that's what I got.  They also fit well and 

In [211]:
for review in neg_chosen:
    row = product_review.loc[product_review['clean_text'] == review[1]]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

I have small ears - I can't wear the apple headphones, and have to change other earbuds to the smallest size. But even the smallest size of these don't help, because the plastic is shaped in a way that hurts my ears, so I can't wear them for more than an hour or so at a time. The sound quality is great, but if you have small ears, I'd look for something else. 0 

These earphones don't have decent bass so music sounds a little tinny.  They are comfortable but what you get in comfort, you lose in sound.  I guess I got what I paid for. 0 

I should have known better than to buy headphones that had little tiny covers that fit on each headphone.  Of course one of them disappeared right away.  I drag them on the subway, I shove them in my pocket and flip them into a car, having little covers will never be a tenable situation.  Then one of the ear things broke completely.  So it's currently one eared. 0 

The volume is very low and distorted like you are listening a song for a distant.  Worke

## TextRank

In [144]:
pos_reviews = list(product_review.loc[product_review['label'] == 1]['clean_text'])
neg_reviews = list(product_review.loc[product_review['label'] == 0]['clean_text'])
reviews = pos_reviews + neg_reviews
tokens = [review.split() for review in reviews]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

In [152]:
ratio = 20 / len(reviews)
print(ratio)

0.012239902080783354


In [153]:
textrank_summa_pos = summarization.summarize('. '.join(pos_reviews), ratio=ratio)
textrank_summa_neg = summarization.summarize('. '.join(neg_reviews), ratio=ratio)

In [154]:
for summa in textrank_summa_pos.split('.')[:-1]:
    row = product_review.loc[product_review['clean_text'] == summa.strip()]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

I tend to need a new pair of headphones every six months (I'm sure a lot of you are the same way), and thought that this pair, for under six bucks, would be a cheap holdover until my next pair.  I've had these now for a week, and I'm blown away at how great the sound is.  The buds are incredibly comfortable and fit perfectly in your ear (they provide three sizes of soft, malleable earpads), and, most surprisingly, they block out external noise incredibly well.  Highly recommended for anyone looking for an affordable, reliable pair of headphones. 1 

You can't tell from the picture, but the white base is wider than the clear plug, this lets it grip the curves of my outer ear so these plugs don't fall out like all the other buds and phone. The sound quality is really good, even got some nice thumping bass out of them, much better than the orginal ipod earphones, which were a tinny sounding joke. These earphones were so nice all my kids and family wanted a pair so I had to buy a bunch mor

In [155]:
for summa in textrank_summa_neg.split('.')[:-1]:
    row = product_review.loc[product_review['clean_text'] == summa.strip()]
    text = row['reviewText'].iloc[0]
    label = row['label'].iloc[0]
    print(text,label,'\n')

Just got these today. They aren't bad, aren't great.Pros:Seem like decent qualityOne of the cheapest availableMulti-sizeSeem to stay in ear wellCons:Don't block out exterior sound like comparable ear buds (Don't fit snug in ear)Bass/lows suck, basically non existent (really important to me)Thin, easily tangled wireI usually buy the SkullCandy Ink'd and decided to try a new set, because Ink'd headphones only last about 3-4 months on average.  I will be going back to the Ink'd, mainly for the much better bass and the better blockage of outside sound.These will be my backups. 0 

I have used the cheap J-Lab headphones for years, purchasing a new pair every 6 months and not thinking about it. I recently decided to try out a few other brands, one of which was this Panasonic model. The sound quality wasn't even as good as the J-Lab, the cord was shorter (short enough to not be long enough to go from my ears to my phone in my pocket) and they don't stay in my years great. I still keep them ar