#### This notebook generates abstract topics from the large collection of review comments. Review comments are grouped by product id and preprocessed then fed to LDAmodel to get topics.
#### To reproduce  the results, load the saved LDA model and run last 7 cells

In [2]:
#Import all the necessary packages
import pandas as pd
import gensim
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from stop_words import get_stop_words
from gensim import corpora, models
from gensim.models import Phrases
from nltk.stem.porter import *
import numpy as np
import operator
import nltk
# Compute bigrams.
from gensim.models import Phrases
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import pickle

In [2]:
#Load data
df = pd.read_pickle('Clean_data')

In [3]:
#Filter required columns
df = df.filter(items=['asin','reviewText'])

In [4]:
df

Unnamed: 0,asin,reviewText
0,B00004R940,I'll admit it -- I'm a bit of a snob when it c...
1,B00004S4TZ,Maybe when Mom cooked for a big family every d...
2,B00004RDAZ,"This was an impulse buy, I was looking for a b..."
3,B00004S576,I bought this rice cooker based on the first t...
4,B00004SBIB,I went mad when I first saw this amazing cassa...
...,...,...
6053291,B01HCJCM52,These are exactly what I was looking for to so...
6053292,B01HIAZ9BY,Looks cheap but its sturdy to my surprise. //G...
6053293,B01HIAZ9BY,Great look and quality //Great product
6053294,B01HHAW9HW,This is a great lunchbox. My daughter loves th...


In [5]:
#Check the no.of unique product id
n_products = df.asin.unique().shape[0]
print('Number of Products:', n_products)

Number of Products: 188138


In [6]:
#Group reviews by product id and add a column count
df_grouped = df.groupby(['asin']).agg({'reviewText': ' ** '.join,'asin':'size'}).rename(columns={'asin':'count'}).reset_index()

In [7]:
#Sort dataframe by count in descending order
df_grouped.sort_values(by = 'count',ascending=False,inplace=True)

In [8]:
#reset index
df_grouped.reset_index(inplace=True,drop=True)

In [9]:
df_grouped

Unnamed: 0,asin,reviewText,count
0,B00FLYWNYQ,I was excited to try this so as soon as I got ...,7433
1,B00COK3FD8,The tupperware was exactly as described. I lov...,4563
2,B009HVH4XO,i have 4 of these and i didn't think they coul...,4460
3,B00NX47YP4,Pros:\nSeems to weigh accurately\nEasy to use ...,4379
4,B00902X68W,I love these sheets. They are so soft and coz...,3725
...,...,...,...
188133,B00176NOZO,I'm not sure if the one I received was defecti...,1
188134,B00KLFM0ZW,They are really nice quality and they look rea...,1
188135,B00FYHDHT0,Luxury look at a reasonable price. Very elega...,1
188136,B01D1AI06Q,Good luck cutting anything with this junk. Nic...,1


In [26]:
#Print a random review text
#df_grouped.reviewText[0]

### Tokenize

In [11]:
tokenizer = RegexpTokenizer(r'\w+')

In [12]:
doc_1 = df_grouped.reviewText[0]

In [13]:
# Using the sample reviewtext
tokens = tokenizer.tokenize(doc_1.lower())
print('{} characters in string vs {} words in a list'.format(len(doc_1), len(tokens)))

2854393 characters in string vs 541632 words in a list


In [14]:
# Remove numbers, but not words that contain numbers.
docs = [token for token in tokens if not token.isnumeric()]
print('{} words in a list after removing numbers'.format(len(docs)))

535183 words in a list after removing numbers


In [15]:
# Remove words that are less than 4 characters only
docs = [token for token in docs if len(token) > 3]
print('{} words in a list after words that are less than 4 chars'.format(len(docs)))

291878 words in a list after words that are less than 4 chars


### Remove stop words

In [17]:
#create a merged list of stop words
nltk_stpwd = stopwords.words('english')
#Extend stopwords with commonly found tokens in review texts
nltk_stpwd.extend(['generally', 'used', 'personally', 'review', 'honestly','truly','whatever','done','star','one','two','three','four','five','since','ever','even','much','thing','also','go','come','must'])
stop_words_stpwd = get_stop_words('en')
merged_stopwords = list(set(nltk_stpwd + stop_words_stpwd))

print(len(set(merged_stopwords)))
print(merged_stopwords[:10])

234
['hadn', 'is', 'do', 'my', 'you', 'd', 'to', 'shouldn', "can't", "you're"]


In [18]:
stopped_tokens = [token for token in docs if not token in merged_stopwords]
print('{} words in a list after removing stop words'.format(len(stopped_tokens)))

223868 words in a list after removing stop words


### Lemmatization
##### https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

In [19]:
# Instantiate a WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 

In [None]:
lemm_tokens = [lemmatizer.lemmatize(token) for token in stopped_tokens]
print(lemm_tokens[:10])

#### Do all the preprocessing steps for the entire dataset

In [22]:
num_reviews = df_grouped.shape[0]

doc_set = [df_grouped.reviewText[i] for i in range(num_reviews)]

texts = []

for doc in doc_set:
    # putting our 5 steps together
    tokens = tokenizer.tokenize(doc.lower())
    tokens_alp = [token for token in tokens if not token.isnumeric()]
    token_gr_3 = [token for token in tokens_alp if len(token) > 3]
    stopped_tokens = [token for token in token_gr_3 if not token in merged_stopwords]
    lemm_tokens = [lemmatizer.lemmatize(token) for token in stopped_tokens]
    # add tokens to list
    texts.append(lemm_tokens)

#### Add bigrams and trigrams to docs (only ones that appear 30 times or more).

In [23]:

bigram = Phrases(texts, min_count=30)
for idx in range(len(texts)):
    for token in bigram[texts[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            texts[idx].append(token)

In [24]:
#Save preprocessed pickle file for later use
with open('texts.pkl', 'wb') as f:
    pickle.dump(texts, f)

In [25]:
# Gensim's Dictionary encapsulates the mapping between normalized words and their integer ids.
texts_dict = corpora.Dictionary(texts)
texts_dict.save('auto_review.dict') # lets save to disk for later use
# Examine each token’s unique id
print(texts_dict)

Dictionary(292059 unique tokens: ['00pm', '02atm', '1000w', '100v', '10min']...)


In [26]:
print("IDs 1 through 10: {}".format(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1), reverse = False)[:10]))

IDs 1 through 10: [('00pm', 0), ('02atm', 1), ('1000w', 2), ('100v', 3), ('10min', 4), ('10mins', 5), ('10minutes', 6), ('10oz', 7), ('10qt', 8), ('10th', 9)]


In [17]:
#Filter tokens that appear too rare are too frequent
texts_dict.filter_extremes(no_below = 20, no_above = 0.15) # inplace filter
print(texts_dict)
print("top terms:")
print(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1), reverse = False)[:100])

Dictionary(39976 unique tokens: ['00pm', '1000w', '10min', '10mins', '10minutes']...)
top terms:
[('00pm', 0), ('1000w', 1), ('10min', 2), ('10mins', 3), ('10minutes', 4), ('10oz', 5), ('10qt', 6), ('10th', 7), ('110f', 8), ('11lb', 9), ('11pm', 10), ('120v', 11), ('120vac', 12), ('12oz', 13), ('12qt', 14), ('150f', 15), ('15lbs', 16), ('15mins', 17), ('15minutes', 18), ('165f', 19), ('180f', 20), ('195f', 21), ('1960s', 22), ('1970s', 23), ('1cup', 24), ('1min', 25), ('1tbsp', 26), ('1tsp', 27), ('20ish', 28), ('20min', 29), ('20minutes', 30), ('21st', 31), ('21st_century', 32), ('220v', 33), ('22nd', 34), ('2hrs', 35), ('2lbs', 36), ('30mins', 37), ('30pm', 38), ('30sec', 39), ('3hrs', 40), ('3rds', 41), ('3times', 42), ('3yrs', 43), ('45mins', 44), ('4hrs', 45), ('4lbs', 46), ('4min', 47), ('50lb', 48), ('50th', 49), ('50th_anniversary', 50), ('5lbs', 51), ('60hz', 52), ('6lbs', 53), ('6months', 54), ('75lbs', 55), ('75th', 56), ('75th_birthday', 57), ('7lbs', 58), ('7min', 59), ('7

In [22]:
#Create bag of words corpus
corpus = [texts_dict.doc2bow(text) for text in texts]
len(corpus)

188138

In [23]:
#Save corpus for later use
gensim.corpora.MmCorpus.serialize('amzn_h_k_review.mm', corpus)

In [21]:
#To load texts 
#with open('texts.pkl', 'rb') as f:
    #texts = pickle.load(f)

In [16]:
#Load dictionary
#texts_dict =  corpora.Dictionary.load('auto_review.dict')

In [9]:
#Load corpus
#corpus = corpora.MmCorpus('amzn_h_k_review.mm')

In [54]:
#Build LDA model with num_topics = 7
lda_model = gensim.models.LdaModel(corpus=corpus, alpha='auto',eta='auto',id2word=texts_dict,num_topics=7,chunksize=10000,passes=10)

In [55]:
#Save model for later use
lda_model.save('LDA_Model_grp_7_15.lda')

In [24]:
#To load model
#lda_model = models.LdaModel.load('LDA_Model_grp_7_15.lda')

In [12]:
#Get top 20 words in each topic
top_topics = lda_model.top_topics(corpus, topn=20)

In [13]:
#Find the average topic coherence
avg_topic_coherence = sum([t[1] for t in top_topics]) / 10
print('Average topic coherence: %.4f.' % avg_topic_coherence)

Average topic coherence: -1.2511.


In [14]:
#Print topics and word distribution for each topic
counter = 0
for topic in top_topics:
    print('Topic {}:'.format(counter))
    counter += 1
    pprint(topic)

Topic 0:
([(0.02548651, 'pillow'),
  (0.02293437, 'sheet'),
  (0.013917377, 'vacuum'),
  (0.011032599, 'blanket'),
  (0.009316477, 'fabric'),
  (0.008612212, 'towel'),
  (0.0072261314, 'floor'),
  (0.0069879536, 'comforter'),
  (0.0066475132, 'warm'),
  (0.006570338, 'washed'),
  (0.0054044086, 'cotton'),
  (0.005298636, 'carpet'),
  (0.0052566724, 'case'),
  (0.0047472743, 'washing'),
  (0.004519601, 'comfortable'),
  (0.0043066745, 'blue'),
  (0.00410424, 'throw'),
  (0.004075046, 'bag'),
  (0.004072287, 'king'),
  (0.0039258823, 'hair')],
 -1.5690847396789387)
Topic 1:
([(0.02147665, 'knife'),
  (0.008283081, 'cook'),
  (0.008155044, 'oven'),
  (0.008044792, 'cooking'),
  (0.0073750694, 'pan'),
  (0.007208541, 'sharp'),
  (0.006936081, 'food'),
  (0.005925456, 'cake'),
  (0.0057505374, 'blade'),
  (0.0056528873, 'bread'),
  (0.004860285, 'heat'),
  (0.004433428, 'tool'),
  (0.0043840907, 'steel'),
  (0.004277366, 'iron'),
  (0.004179265, 'baking'),
  (0.003952857, 'cooker'),
  (0.00

In [25]:
# Visualize the topics
pyLDAvis.enable_notebook()
#Prepare topic visualization
vis = pyLDAvis.gensim.prepare(lda_model, corpus, texts_dict,sort_topics = False)

In [None]:
pyLDAvis.display(vis)

In [None]:
#save pyLDAvis as html
pyLDAvis.save_html(vis,"vis.html")

In [30]:
print(vis.topic_order)

[1, 2, 4, 6, 5, 3, 7]


#### The topic order here is different than gensim
#### Notice that in gensim topic starte from 0, in PyLDAvis topic starts from 1