# Topic Modelling for a Single Product

In [7]:
import pandas as pd
import pickle
import numpy as np

In [11]:
# Read the preprocessed data file
df_huge = pd.read_pickle('../Clean_data')
df_huge.head()
df_huge.shape

(19525627, 5)

In [12]:
# Selection of product - pick the product with the most reviews.
# Per asin: all review texts joined with ' ** ', plus the number of reviews.
reviews_per_product = (
    df_huge
    .groupby(['asin'])
    .agg({'reviewText': ' ** '.join, 'asin': 'size'})
    .rename(columns={'asin': 'count'})
    .reset_index()
)
# Most-reviewed products first
df_grouped = reviews_per_product.sort_values(by='count', ascending=False)
df_grouped.head(30)


Unnamed: 0,asin,reviewText,count
580956,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,22704
836347,B00Q7EV29G,This mattress shipped free via 2 day Prime Shi...,15201
551006,B00EINBSJ2,I received my pillow on time as stated on Frid...,14856
409335,B009HVH4XO,i have 4 of these and i didn't think they coul...,14649
742237,B00LV4W8BI,The sheets are very soft and comfortable right...,13507
790907,B00NX47YP4,"I love this little scale. It's simple, easy to...",12971
1102682,B019D9HESO,LOVE this cup! Keeps my drink from getting wat...,11770
420105,B009ZJ2M7G,My unreliable Bissell 2x scrubber recently bro...,10518
391650,B00902X68W,The sheets are some of the softest and most c...,10026
503326,B00COK3FD8,The tupperware was exactly as described. I lov...,9171


In [13]:
df_B00FLYWNYQ = df_huge.loc[df_huge['asin'] == "B00FLYWNYQ"]

In [15]:
df_B00FLYWNYQ

Unnamed: 0,overall,reviewerID,asin,reviewText,rev_date
2290534,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04
2321609,5.0,A33BHZZ03JPSQ2,B00FLYWNYQ,I was excited to try this so as soon as I got ...,2014-01-08
2331312,5.0,AN5GO0DDAQNSW,B00FLYWNYQ,Got it yesterday and it seems to be defective....,2014-01-09
2349417,5.0,A7JD2EUAVIPYD,B00FLYWNYQ,8/22/15 ** UPDATE **\n\nSTILL LOVING THIS BABY...,2014-01-12
2375015,5.0,A1U8VI6I2MFEU8,B00FLYWNYQ,Some of the positive reviews here have express...,2014-01-15
...,...,...,...,...,...
19286662,5.0,A2UYXOX0E4RCUU,B00FLYWNYQ,I understand why there is a craze for this pro...,2018-07-21
19286668,5.0,A15FO6FUAIWX4M,B00FLYWNYQ,Love this! //Five Stars,2018-07-21
19286678,1.0,AJHC8VUSCAFLI,B00FLYWNYQ,This is the second Instant Pot Mini I purchase...,2018-07-21
19286681,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21


In [16]:
df_B00FLYWNYQ['overall'].describe()

count    22704.000000
mean         4.599850
std          1.015574
min          1.000000
25%          5.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: overall, dtype: float64

## Preparing the dataframe with review sentences


In [18]:
# Prepare a data frame by splitting reviews into sentences.
# Each review text is split on every '.', the pieces are laid out one column per piece
# (rows indexed by reviewerID), and stack() turns that into one entry per (reviewerID, piece).
# Note that splitting on '.' also breaks text at decimals and abbreviations.
df_sentences = pd.DataFrame(df_B00FLYWNYQ.reviewText.str.split('.').tolist(), index=df_B00FLYWNYQ.reviewerID).stack()
# NOTE(review): 0 and 'reviewerID' appear to name the same index level; presumably this is what
# leaves exactly the two columns renamed on the next line - confirm against the pandas version in use.
df_sentences = df_sentences.reset_index([0, 'reviewerID'])
df_sentences.columns = ['reviewerID', 'sentences']

df_sentences


Unnamed: 0,reviewerID,sentences
0,A38QLFND44V8QG,This is my first pressure cooker so I wasn't r...
1,A38QLFND44V8QG,"That being said, I found it very intuitive a..."
2,A38QLFND44V8QG,To try it out I decided to start with someth...
3,A38QLFND44V8QG,This product comes with recommended cooking ...
4,A38QLFND44V8QG,I would have been lost without it
...,...,...
116451,A27HMTTTL9X72A,I bought the smaller one for everyday since ...
116452,A27HMTTTL9X72A,"It's just as wonderful as the bigger one, bu..."
116453,A27HMTTTL9X72A,//Great for 2
116454,A2KXZ2A335QDI8,I love my Due Mini! It is fun to use--and it ...


In [19]:
# Attach every sentence to the full review record (all columns) of the reviewer who wrote it.
# NOTE(review): the join key is reviewerID only, so a reviewer with more than one review of this
# product would be matched many-to-many - confirm reviewerID is unique in df_B00FLYWNYQ.
df_full = df_B00FLYWNYQ.merge(df_sentences, how='inner', on='reviewerID')
# Independent copy taken from the result instead of running the identical merge a second time
df_full_copy = df_full.copy()
df_full_copy

# #  Removed unneccesary columns 
# df_50p_5800r = df_50p_5800r.drop(["reviewText", "rev_date", "product_ID"], axis=1)
# df_50p_5800r = df_50p_5800r

# df_B00ETP7D3E_full.head()

Unnamed: 0,overall,reviewerID,asin,reviewText,rev_date,sentences
0,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This is my first pressure cooker so I wasn't r...
1,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,"That being said, I found it very intuitive a..."
2,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,To try it out I decided to start with someth...
3,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This product comes with recommended cooking ...
4,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,I would have been lost without it
...,...,...,...,...,...,...
116451,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,I bought the smaller one for everyday since ...
116452,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,"It's just as wonderful as the bigger one, bu..."
116453,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,//Great for 2
116454,5.0,A2KXZ2A335QDI8,B00FLYWNYQ,I love my Due Mini! It is fun to use--and it ...,2018-07-21,I love my Due Mini! It is fun to use--and it ...


## LDA Modeling

In [20]:
import time

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

unable to import 'smart_open.gcs', disabling that module


In [21]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [22]:
# Get sentences from the data frame
list_sentences = df_sentences.sentences.values.tolist()
print(list_sentences[:1], len(list_sentences))

["This is my first pressure cooker so I wasn't really sure what to expect"] 116456


In [23]:
def sent_to_words(sentences):
    """Lazily tokenize sentences, yielding one token list per input sentence.

    Each item is converted with str() and passed to gensim's simple_preprocess.
    deacc=True asks simple_preprocess to strip accent marks from the tokens.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

# Tokenize every sentence exactly once
list_sentences_tokenized = list(sent_to_words(list_sentences))
# Independent copy (fresh inner lists) instead of a second full tokenization pass
list_sentences_tokenized_copy = [tokens[:] for tokens in list_sentences_tokenized]
print(list_sentences_tokenized_copy[:1])

[['this', 'is', 'my', 'first', 'pressure', 'cooker', 'so', 'wasn', 'really', 'sure', 'what', 'to', 'expect']]


In [25]:
# Build the bigram and trigram models

start_time = time.time()

bigram = gensim.models.Phrases(list_sentences_tokenized, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[list_sentences_tokenized], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[list_sentences_tokenized[0]]])

print("--- %s seconds ---" % (time.time() - start_time))

['this', 'is', 'my', 'first', 'pressure', 'cooker', 'so', 'wasn', 'really', 'sure', 'what', 'to', 'expect']
--- 13.008661985397339 seconds ---


In [24]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token listed in stop_words.

    texts: iterable of documents; each one is passed through str() and then
        tokenized with simple_preprocess.
    Returns a list with one list of remaining tokens per document.
    """
    # Set membership is constant-time; scanning the stop_words list for every token is not
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Run every document through bigram_mod and return the results as a list."""
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Run every document through bigram_mod, then trigram_mod, and return the results as a list."""
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed part of speech.

    See https://spacy.io/api/annotation for the tag names.

    texts: iterable of token lists; each list is joined with spaces before parsing.
    allowed_postags: coarse POS tags (token.pos_) to keep. The default list is
        only read, never modified.
    Returns a list with one list of lemmas per input document, in input order.

    Uses the module-level `nlp` pipeline, which must be loaded before this is called.
    """
    joined_sentences = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the pipeline once per sentence
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_sentences)
    ]

In [26]:
# Remove Stop Words
data_words_nostops = remove_stopwords(list_sentences_tokenized)
data_words_nostops[:1]

[['first', 'pressure', 'cooker', 'really', 'sure', 'expect']]

In [27]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_bigrams[:1]


[['first', 'pressure', 'cooker', 'really', 'sure', 'expect']]

In [28]:
start_time = time.time()
# # Remove Stop Words 
# data_words_nostops = remove_stopwords(list_sentences_tokenized)

# # Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# # Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
print("--- %s seconds ---" % (time.time() - start_time))

[['first', 'pressure', 'cooker', 'really', 'sure', 'expect']]


In [29]:
#  For Text Blob: re-join each lemmatized token list into a single string
from nltk.tokenize.treebank import TreebankWordDetokenizer
# One detokenizer reused for every sentence instead of constructing a new one per iteration
detokenizer = TreebankWordDetokenizer()
list_sentences_lemetised = [detokenizer.detokenize(tokens) for tokens in data_lemmatized]

In [31]:
list_sentences_lemetised

['first pressure cooker really sure expect',
 'say find intuitive easy make sure read instruction though',
 'try decide start simple bake chicken',
 'product come recommend cooking time various food come handy',
 'would lose',
 'follow instruction chicken come perfect lot less time use',
 'also take much time',
 'build quality also top notch',
 'wait try new recipe',
 'love',
 'excited try soon get follow manual set brown rice recipe include cookbook instead use rice program button',
 'end good texture ever brown jasmine rice',
 'aroma stay pot rice really fragrant read manual figure rice program see work super simple set manually',
 'pressure cooker ever see own separate one combine cooker',
 'quiet musical beep opening closing lid tiny whisper steam whole cooking time',
 'tell display front metal part lid hot touch side',
 'could tell cooking time end chime pressure cycle end',
 'automatic warm hold nice timer continue know long food holding',
 'back need',
 'stay pot',
 'people post

In [32]:
import pickle
with open('list_sentences_lemetised_B00FLYWNYQ.pkl', 'wb') as f:
    pickle.dump(list_sentences_lemetised, f)

In [33]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [34]:
id2word[0]


'cooker'

In [35]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [36]:
# Print the keywords of each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.127*"love" + 0.122*"pot" + 0.075*"star" + 0.068*"instant" + 0.056*"great" '
  '+ 0.049*"easy" + 0.032*"work" + 0.021*"size" + 0.020*"small" + '
  '0.016*"little"'),
 (1,
  '0.056*"much" + 0.053*"well" + 0.046*"really" + 0.040*"meat" + '
  '0.033*"amazing" + 0.021*"item" + 0.019*"sure" + 0.018*"crock" + '
  '0.016*"worth" + 0.016*"instruction"'),
 (2,
  '0.069*"cook" + 0.068*"make" + 0.043*"good" + 0.037*"buy" + 0.030*"thing" + '
  '0.029*"get" + 0.029*"product" + 0.026*"meal" + 0.021*"food" + 0.016*"come"'),
 (3,
  '0.077*"pressure" + 0.073*"cooker" + 0.047*"recipe" + 0.036*"take" + '
  '0.034*"perfect" + 0.026*"fast" + 0.022*"first" + 0.021*"lot" + 0.019*"give" '
  '+ 0.019*"people"'),
 (4,
  '0.091*"time" + 0.070*"use" + 0.042*"cooking" + 0.033*"would" + '
  '0.029*"purchase" + 0.025*"slow" + 0.023*"try" + 0.021*"ever" + '
  '0.021*"minute" + 0.021*"rice"')]


In [37]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.686973335694573

Coherence Score:  0.289322429787681


In [45]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis


## LDA Mallet Model

In [39]:
mallet_path = 'mallet-2.0.8/bin/mallet' # update this path


In [40]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word)

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
# Topic labels: 0 - quality of cooking, 1 - good appliance, 2 - functionalities, 3 - product design, 4 - how likely to recommend

[(0,
  [('cook', 0.11880355478782645),
   ('make', 0.11879219037661659),
   ('minute', 0.03695706525444917),
   ('rice', 0.030945291724435758),
   ('meat', 0.024717594381435098),
   ('chicken', 0.0238538991294861),
   ('fast', 0.021115076027910993),
   ('hour', 0.020637770757097076),
   ('soup', 0.02012637225265359),
   ('perfect', 0.019569516103370686)]),
 (1,
  [('time', 0.10477318103728467),
   ('great', 0.08600315735710637),
   ('work', 0.051933881227654737),
   ('recipe', 0.0431234619492037),
   ('meal', 0.04132423271579143),
   ('appliance', 0.02143984770395134),
   ('good', 0.02127733667641733),
   ('week', 0.02097553048242559),
   ('kitchen', 0.020244230858522543),
   ('amazing', 0.020093327761526676)]),
 (2,
  [('cooker', 0.12109807905430366),
   ('pressure', 0.11424085703731067),
   ('cooking', 0.04832379017362394),
   ('slow', 0.032704562246028815),
   ('year', 0.017766438862209088),
   ('steam', 0.01565386036202438),
   ('set', 0.014730328777244181),
   ('start', 0.01389915

In [41]:
ldamallet_3topics = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=3, id2word=id2word)

# Show Topics
pprint(ldamallet_3topics.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet_3topics, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('great', 0.052320490367775835),
   ('easy', 0.046183831421953564),
   ('star', 0.04617676967402971),
   ('work', 0.031580136715439806),
   ('thing', 0.03024546635783289),
   ('product', 0.0286848200666629),
   ('recipe', 0.0262343935370883),
   ('meal', 0.025139822608892155),
   ('clean', 0.01494265860685837),
   ('fast', 0.01310660414665838)]),
 (1,
  [('cook', 0.07052257737189244),
   ('time', 0.061031625232538474),
   ('make', 0.059793674953492304),
   ('minute', 0.02199898528665652),
   ('food', 0.021735159817351597),
   ('rice', 0.018420429561982074),
   ('cooking', 0.015646879756468796),
   ('meat', 0.014713343480466767),
   ('chicken', 0.014199222053103332),
   ('hour', 0.012284796211736851)]),
 (2,
  [('pot', 0.11167940055987034),
   ('love', 0.09637066503897343),
   ('cooker', 0.07359699157388114),
   ('pressure', 0.06944356745455438),
   ('instant', 0.05475223281626009),
   ('good', 0.03619512674257891),
   ('buy', 0.033732539131288895),
   ('slow', 0.019876098868332

In [47]:
ldamallet_4topics = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=4, id2word=id2word)

# Show Topics
pprint(ldamallet_4topics.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet_4topics, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
# Topic labels: 0 - great product, 1 - features and functions, 2 - review after a few days, 3 - things to cook

[(0,
  [('great', 0.06978783592644978),
   ('easy', 0.06166902404526167),
   ('good', 0.05798208392267798),
   ('star', 0.04776991984912777),
   ('work', 0.04218764733616219),
   ('product', 0.038302687411598306),
   ('recipe', 0.03503064592173503),
   ('thing', 0.026053748231966052),
   ('clean', 0.01995285242809995),
   ('appliance', 0.017416313059877415)]),
 (1,
  [('cooker', 0.09646677889082061),
   ('pressure', 0.09104874391735887),
   ('time', 0.058411752260580804),
   ('cooking', 0.03851496168740974),
   ('slow', 0.0260599203392481),
   ('steam', 0.012473438750448436),
   ('set', 0.011737542658976552),
   ('function', 0.01105683877436506),
   ('start', 0.010790076441206501),
   ('long', 0.01029334657946298)]),
 (2,
  [('pot', 0.14967841426260955),
   ('love', 0.1291138526347463),
   ('instant', 0.07338172791213751),
   ('buy', 0.04698724940760522),
   ('purchase', 0.025313122954827547),
   ('day', 0.017329898070485576),
   ('thing', 0.01700078985970587),
   ('week', 0.0169913867

In [43]:
ldamallet_6topics = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=6, id2word=id2word)



[(0,
  [('great', 0.052320490367775835),
   ('easy', 0.046183831421953564),
   ('star', 0.04617676967402971),
   ('work', 0.031580136715439806),
   ('thing', 0.03024546635783289),
   ('product', 0.0286848200666629),
   ('recipe', 0.0262343935370883),
   ('meal', 0.025139822608892155),
   ('clean', 0.01494265860685837),
   ('fast', 0.01310660414665838)]),
 (1,
  [('cook', 0.07052257737189244),
   ('time', 0.061031625232538474),
   ('make', 0.059793674953492304),
   ('minute', 0.02199898528665652),
   ('food', 0.021735159817351597),
   ('rice', 0.018420429561982074),
   ('cooking', 0.015646879756468796),
   ('meat', 0.014713343480466767),
   ('chicken', 0.014199222053103332),
   ('hour', 0.012284796211736851)]),
 (2,
  [('pot', 0.11167940055987034),
   ('love', 0.09637066503897343),
   ('cooker', 0.07359699157388114),
   ('pressure', 0.06944356745455438),
   ('instant', 0.05475223281626009),
   ('good', 0.03619512674257891),
   ('buy', 0.033732539131288895),
   ('slow', 0.019876098868332

In [44]:
# Show Topics
pprint(ldamallet_6topics.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet_6topics, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('pot', 0.2254961680667507),
   ('love', 0.194585712059611),
   ('instant', 0.11055233670014591),
   ('easy', 0.09264637135045543),
   ('clean', 0.029975492626538792),
   ('crock', 0.013344477341304133),
   ('size', 0.011616211698375147),
   ('stainless_steel', 0.011460384468274992),
   ('big', 0.01143205224462042),
   ('large', 0.011262058902692978)]),
 (1,
  [('make', 0.14264075762124395),
   ('minute', 0.04437651810813024),
   ('rice', 0.0371578286618815),
   ('meat', 0.029679866815862013),
   ('chicken', 0.02864277721677902),
   ('cook', 0.026773286755274146),
   ('hour', 0.024780983051772603),
   ('soup', 0.024166916841789253),
   ('perfect', 0.02349826696869627),
   ('bean', 0.02098741846565323)]),
 (2,
  [('cooker', 0.14178358067742547),
   ('pressure', 0.13378206687751737),
   ('slow', 0.03829102803233044),
   ('cooking', 0.028099910793933987),
   ('steam', 0.01832779174438407),
   ('function', 0.015962479387992324),
   ('top', 0.01452977590354932),
   ('lid', 0.0134214

In [410]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for each topic count in range(start, limit, step),
    using the module-level mallet_path.

    Parameters:
    ----------
    dictionary : Gensim dictionary (used both to train each model and to score it)
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in range())
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in, not the notebook-global id2word,
        # so the model and its coherence score always share one vocabulary
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

[(0,
  [('hot', 0.1900320123633955),
   ('coffee', 0.17303234352577548),
   ('hour', 0.09222872281708798),
   ('drink', 0.05571807042719947),
   ('cold', 0.03896677337454465),
   ('warm', 0.038828789049563966),
   ('long', 0.036317474334915556),
   ('day', 0.033916547080251686),
   ('stay', 0.024781984766530523),
   ('tea', 0.02100121426205983)]),
 (1,
  [('lid', 0.04567095750308588),
   ('clean', 0.038382413448539356),
   ('easy', 0.027067536589666725),
   ('put', 0.026538529359901253),
   ('top', 0.025216011285487568),
   ('make', 0.021542349967671782),
   ('seal', 0.020837006994651148),
   ('paint', 0.018074413683653676),
   ('hand', 0.014430141656380416),
   ('color', 0.011843884088638101)]),
 (2,
  [('good', 0.08141759632649456),
   ('travel', 0.06013599034527419),
   ('great', 0.057104170959291205),
   ('buy', 0.04998086715921467),
   ('love', 0.04950990492449887),
   ('star', 0.04653695581785536),
   ('product', 0.037824154475612985),
   ('thermo', 0.034497983692932624),
   ('mu

In [None]:
## Finding the Dominant Topic in each Sentence

In [51]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized):
    """Build a per-document table of the dominant topic.

    Parameters:
    ----------
    ldamodel : topic model supporting ldamodel[corpus] and show_topic(topic_id)
    corpus : bag-of-words corpus the model is applied to
    texts : the documents, in the same order as corpus

    Returns:
    -------
    DataFrame with columns Dominant_Topic, Perc_Contribution and Topic_Keywords,
    followed by one unnamed column holding `texts`. A document for which the model
    returns no topics gets a row of missing values, so rows stay aligned with `texts`.
    """
    # Models built with per_word_topics=True return (topic_dist, word_topics, phi) per
    # document; only the first element is the (topic_id, probability) list needed here
    unwrap_topics = bool(getattr(ldamodel, 'per_word_topics', False))
    keywords_by_topic = {}  # topic_id -> keyword string, computed once per topic
    records = []

    for row in ldamodel[corpus]:
        if unwrap_topics:
            row = row[0]
        if not row:
            # Placeholder keeps this document's position instead of shifting later rows
            records.append((np.nan, np.nan, None))
            continue
        # Dominant topic = highest probability; on ties the earliest entry wins
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        # Topic id stored as float so the column can also hold the missing-value placeholder
        records.append((float(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Built in one go; appending to a DataFrame row by row copies the frame every time
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [52]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet_6topics, corpus=corpus, texts=data_lemmatized)

In [53]:

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

(116456, 5)

In [55]:
# Show
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.1950,"cooker, pressure, slow, cooking, steam, functi...","[first, pressure, cooker, really, sure, expect]"
1,1,3.0,0.2063,"good, thing, recipe, star, appliance, purchase...","[say, find, intuitive, easy, make, sure, read,..."
2,2,1.0,0.1879,"make, minute, rice, meat, chicken, cook, hour,...","[try, decide, start, simple, bake, chicken]"
3,3,5.0,0.1944,"time, cook, meal, food, cooking, fast, day, we...","[product, come, recommend, cooking, time, vari..."
4,4,2.0,0.1830,"cooker, pressure, slow, cooking, steam, functi...","[would, lose]"
...,...,...,...,...,...
116451,116451,3.0,0.1950,"good, thing, recipe, star, appliance, purchase...","[buy, small, everyday]"
116452,116452,0.0,0.1914,"pot, love, instant, easy, clean, crock, size, ...","[wonderful, big, perfect, size]"
116453,116453,4.0,0.1830,"great, star, work, product, buy, year, recomme...",[great]
116454,116454,0.0,0.2241,"pot, love, instant, easy, clean, crock, size, ...","[love, due, mini, fun, perfect, size, single, ..."


## Sentiment Analysis using Text Blob

In [57]:
from textblob import TextBlob
# Get the polarity score using below function
def get_textBlob_score(sent):
    """Return the TextBlob sentiment polarity of `sent` (a score between -1 and 1)."""
    return TextBlob(sent).sentiment.polarity

In [58]:
len(list_sentences_lemetised)

116456

In [59]:
# Prepare sentiment scores for all sentences from text blob, in the order of list_sentences_lemetised
# NOTE(review): these sentences have already had stop words removed (the first one lost 'wasn'),
# which may strip negations before scoring - confirm this is intended.
texblog_senti_scores = [get_textBlob_score(lemmatised) for lemmatised in list_sentences_lemetised]

In [60]:
len(texblog_senti_scores)

116456

In [61]:
texblog_senti_scores[20000], list_sentences_lemetised[20000]

(0.0, 'also prepare meal pot night meat custom broth mix')

## Combine Sentiment Analysis with LDA

In [65]:
df_full['topics'] = df_dominant_topic['Dominant_Topic'].astype(float)

In [62]:
# One score per sentence row. np.resize would silently repeat or truncate the scores when the
# lengths differ, hiding a misalignment, so the lengths are checked and the list assigned as is.
if len(texblog_senti_scores) != len(df_full):
    raise ValueError("got %d sentiment scores for %d rows in df_full" % (len(texblog_senti_scores), len(df_full)))
df_full['sentiment_scores'] = texblog_senti_scores



Unnamed: 0,overall,reviewerID,asin,reviewText,rev_date,sentences,sentiment_scores
0,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This is my first pressure cooker so I wasn't r...,0.375000
1,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,"That being said, I found it very intuitive a...",0.466667
2,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,To try it out I decided to start with someth...,-0.300000
3,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This product comes with recommended cooking ...,0.300000
4,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,I would have been lost without it,0.000000
...,...,...,...,...,...,...,...
116451,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,I bought the smaller one for everyday since ...,-0.225000
116452,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,"It's just as wonderful as the bigger one, bu...",0.666667
116453,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,//Great for 2,0.800000
116454,5.0,A2KXZ2A335QDI8,B00FLYWNYQ,I love my Due Mini! It is fun to use--and it ...,2018-07-21,I love my Due Mini! It is fun to use--and it ...,0.225595


In [66]:
df_full

Unnamed: 0,overall,reviewerID,asin,reviewText,rev_date,sentences,sentiment_scores,topics
0,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This is my first pressure cooker so I wasn't r...,0.375000,2.0
1,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,"That being said, I found it very intuitive a...",0.466667,3.0
2,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,To try it out I decided to start with someth...,-0.300000,1.0
3,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This product comes with recommended cooking ...,0.300000,5.0
4,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,I would have been lost without it,0.000000,2.0
...,...,...,...,...,...,...,...,...
116451,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,I bought the smaller one for everyday since ...,-0.225000,3.0
116452,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,"It's just as wonderful as the bigger one, bu...",0.666667,0.0
116453,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,//Great for 2,0.800000,4.0
116454,5.0,A2KXZ2A335QDI8,B00FLYWNYQ,I love my Due Mini! It is fun to use--and it ...,2018-07-21,I love my Due Mini! It is fun to use--and it ...,0.225595,0.0


In [64]:
df_full['sentiment_scores'].describe()

count    116456.000000
mean          0.179585
std           0.301609
min          -1.000000
25%           0.000000
50%           0.000000
75%           0.400000
max           1.000000
Name: sentiment_scores, dtype: float64

In [68]:
# Binary sentiment flag: 1 when the polarity is at least 0.4, otherwise 0
sentiments_textblob = [1 if polarity >= 0.4 else 0 for polarity in texblog_senti_scores]

In [69]:
# One flag per sentence row. np.resize would silently repeat or truncate the flags when the
# lengths differ, hiding a misalignment, so the lengths are checked and the list assigned as is.
if len(sentiments_textblob) != len(df_full):
    raise ValueError("got %d sentiment flags for %d rows in df_full" % (len(sentiments_textblob), len(df_full)))
df_full['sentiment'] = sentiments_textblob


In [70]:
df_full

Unnamed: 0,overall,reviewerID,asin,reviewText,rev_date,sentences,sentiment_scores,topics,sentiment
0,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This is my first pressure cooker so I wasn't r...,0.375000,2.0,0
1,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,"That being said, I found it very intuitive a...",0.466667,3.0,1
2,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,To try it out I decided to start with someth...,-0.300000,1.0,0
3,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,This product comes with recommended cooking ...,0.300000,5.0,0
4,5.0,A38QLFND44V8QG,B00FLYWNYQ,This is my first pressure cooker so I wasn't r...,2014-01-04,I would have been lost without it,0.000000,2.0,0
...,...,...,...,...,...,...,...,...,...
116451,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,I bought the smaller one for everyday since ...,-0.225000,3.0,0
116452,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,"It's just as wonderful as the bigger one, bu...",0.666667,0.0,1
116453,5.0,A27HMTTTL9X72A,B00FLYWNYQ,I bought the 8 Qt first and use it a lot for m...,2018-07-21,//Great for 2,0.800000,4.0,1
116454,5.0,A2KXZ2A335QDI8,B00FLYWNYQ,I love my Due Mini! It is fun to use--and it ...,2018-07-21,I love my Due Mini! It is fun to use--and it ...,0.225595,0.0,0


In [99]:
topic_wise_sentiment_counts = df_full.groupby(['topics'])['sentiment'].value_counts()
topic_wise_sentiment_counts

topics  sentiment
0.0     0            33481
        1             9182
1.0     0            10889
        1             3858
2.0     0            11749
        1             3004
3.0     0            10042
        1             5700
4.0     0            11639
        1             5502
5.0     0             9128
        1             2282
Name: sentiment, dtype: int64

In [100]:
total_sentiment = df_full.groupby(['topics'])['sentiment'].count()
total_sentiment

topics
0.0    42663
1.0    14747
2.0    14753
3.0    15742
4.0    17141
5.0    11410
Name: sentiment, dtype: int64

In [96]:
total_sentiment.loc[0], topic_wise_sentiment_counts[0][1]

(42663, 9182)

In [105]:
# Per topic: ratio of all sentences to sentences flagged positive, then a 1-5 rating from that ratio.
# NOTE(review): the ratio is total / positive, so it grows as the share of positive sentences
# shrinks, yet larger ratios are given higher ratings - this looks inverted; confirm the intended mapping.
# NOTE(review): the 6 below is hard-coded and presumably matches the 6-topic model - confirm.
results = []
ratings = []
for topic_id in range(6):
    ratio = total_sentiment.loc[topic_id] / topic_wise_sentiment_counts[topic_id][1]
    results.append(ratio)
    # ratio > 4 -> 5, > 3 -> 4, > 2 -> 3, > 1 -> 2, otherwise 1
    cutoffs_passed = sum(1 for cutoff in (1, 2, 3, 4) if ratio > cutoff)
    ratings.append(1 + cutoffs_passed)
results, ratings

([4.646373339141799,
  3.8224468636599274,
  4.911118508655126,
  2.7617543859649123,
  3.1154125772446384,
  5.0],
 [5, 4, 5, 3, 4, 5])

## Results

### Topic wise product rating

In [107]:
# One row per topic: product id, a hand-written topic label and the rating computed for that topic
topic_labels = ['Product Design', 'What to Cook', 'Product Features', 'Likely Recommendation', 'recommendation', 'Time to cook']
Product_Summary = pd.DataFrame({
    'productID': ['B00FLYWNYQ'] * len(topic_labels),
    'topic': topic_labels,
    'rating': ratings,
})
Product_Summary

Unnamed: 0,productID,topic,rating
0,B00FLYWNYQ,Product Design,5
1,B00FLYWNYQ,What to Cook,4
2,B00FLYWNYQ,Product Features,5
3,B00FLYWNYQ,Likely Recommendation,3
4,B00FLYWNYQ,recommendation,4
5,B00FLYWNYQ,Time to cook,5


### Most Positive sentences for this product

In [114]:
df_full.nlargest(5, 'sentiment_scores')['sentences']

90     \n\nI'm really impressed with the quality of t...
94       Both dishes I tested which were  brown rice ...
98                            The brown rice was perfect
149                                            Wonderful
245                    Thank you for a wonderful product
Name: sentences, dtype: object

In [115]:
df_full.iloc[90]['sentences']

"\n\nI'm really impressed with the quality of this cooker!  It feels and looks like it was very well thought out and constructed"

In [116]:
df_full.iloc[94]['sentences']

'  Both dishes I tested which were  brown rice and Ossobuco turned out delicious'

In [117]:
df_full.iloc[98]['sentences']

'  The brown rice was perfect'

In [118]:
df_full.iloc[149]['sentences']

'Wonderful'

In [119]:
df_full.iloc[245]['sentences']

'  Thank you for a wonderful product'

### Most negative sentences for this product

In [120]:
df_full.nsmallest(5, 'sentiment_scores')['sentences']

1153     Tried to make yogurt 3 times and it turned out...
3006       Anybody who has worked with plastic before k...
7640      I ended up discarding the rice pudding, it wa...
9565      Only reason not a 5 star\nThe manual's cookin...
10951    \nWhy? The terrible silicone ring that absorbe...
Name: sentences, dtype: object

In [121]:
df_full.iloc[1153]['sentences']

'Tried to make yogurt 3 times and it turned out awful, grainey and watery'

In [122]:
df_full.iloc[3006]['sentences']

'  Anybody who has worked with plastic before knows that plastic has a nasty habit of letting things penetrate inside'

In [123]:
df_full.iloc[7640]['sentences']

' I ended up discarding the rice pudding, it was that awful'

In [124]:
df_full.iloc[9565]['sentences']

" Only reason not a 5 star\nThe manual's cooking times and water mix are terrible"

In [125]:
df_full.iloc[10951]['sentences']

"\nWhy? The terrible silicone ring that absorbed food smell, and even the detergent's smell when I tried to wash it, and the mixture made me ditch the pot"