In [None]:
# Download dataset from here: https://www.kaggle.com/ayushigaur/amazon-product-reviews

In [20]:
#load required libraries
import pandas as pd 
import numpy as np

In [21]:
import warnings
warnings.filterwarnings('ignore')

In [22]:
#load dataset
data=pd.read_csv('amazon-reviews.csv',sep="\t", error_bad_lines=False,encoding = 'ISO-8859-1')
data.head(5)

Unnamed: 0,date,summary,review,rating
0,2013-07-16,Awesine,Perfect for new parents. We were able to keep ...,5
1,2013-06-29,Should be required for all new parents!,This book is such a life saver. It has been s...,5
2,2014-03-19,Grandmother watching baby,Helps me know exactly how my babies day has go...,5
3,2013-08-17,repeat buyer,I bought this a few times for my older son and...,5
4,2014-04-01,Great,I wanted an alternative to printing out daily ...,4


In [23]:
lis=data['review'].tolist()
lis[:5]

["Perfect for new parents. We were able to keep track of baby's feeding, sleep and diaper change schedule for the first two and a half months of her life. Made life easier when the doctor would ask questions about habits because we had it all right there!",
 'This book is such a life saver.  It has been so helpful to be able to go back to track trends, answer pediatrician questions, or communicate with each other when you are up at different times of the night with a newborn.  I think it is one of those things that everyone should be required to have before they leave the hospital.  We went through all the pages of the newborn version, then moved to the infant version, and will finish up the second infant book (third total) right as our baby turns 1.  See other things that are must haves for baby at [...]',
 "Helps me know exactly how my babies day has gone with my mother in law watching him while I go to work.  It also has a section for her to write notes and let me know anything she 

In [24]:
#Shape
data.shape

(205331, 4)

In [25]:
#reduce the dataset for easy usecase
data=data[:50000]
data.shape

(50000, 4)

In [26]:
#Check any missing values
data.isnull().sum()

date        0
summary     0
review     21
rating      0
dtype: int64

In [27]:
#drop those columns has null values

data=data.dropna()
data=data.reset_index(drop=True)

In [28]:
#Check missing values
data.isnull().sum()

date       0
summary    0
review     0
rating     0
dtype: int64

In [29]:
#No null values

### Text preprocessing

In [30]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
def clean_text(df):
    all_reviews=[]
    lines=df['review'].values.tolist()
    for text in lines:
        text=text.lower()
        pattern=re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text=pattern.sub('',text)
        emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        text=emoji.sub('',text)
        text=re.sub(r"i'm", "i am",text)
        text=re.sub(r"he's", "he is",text)
        text=re.sub(r"she's","she is",text)
        text=re.sub(r"that's", "that is",text)
        text=re.sub(r"what's","what is",text)
        text = re.sub(r"where's", "where is", text) 
        text = re.sub(r"\'ll", " will", text)  
        text = re.sub(r"\'ve", " have", text)  
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"did't", "did not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"couldn't", "could not", text)
        text = re.sub(r"have't", "have not", text)
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        tokens = word_tokenize(text)
        #USE str.maketrans() and str.translate() to remove punctuation from a string
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not")
        words = [w for w in words if not w in stop_words]
        words = ' '.join(words)
        all_reviews.append(words)
    return all_reviews



In [31]:
cleaned_text=clean_text(data)
cleaned_text[:5]

['perfect new parents able keep track baby feeding sleep diaper change schedule first two half months life made life easier doctor would ask questions habits right',
 'book life saver helpful able go back track trends answer pediatrician questions communicate different times night newborn think one things everyone required leave hospital went pages newborn version moved infant version finish second infant book third total right baby turns see things must haves baby',
 'helps know exactly babies day gone mother law watching go work also section write notes let know anything may need could not happier book',
 'bought times older son bought newborn super easy use helps keep track daily routine started going sitter went back work helped know day went better prepare evening would likely go sick help keep track many diapers day producing make sure getting dehydrated note sections side bottom useful sitter writes small notes whether not liked lunch playtime included going walk etcexcellent mo

In [32]:
print('Original text:')
print('\n',data['review'].tolist()[1])
print('------------------------------------------------------')

print('Cleaned_text:')
print('\n',cleaned_text[1])

Original text:

 This book is such a life saver.  It has been so helpful to be able to go back to track trends, answer pediatrician questions, or communicate with each other when you are up at different times of the night with a newborn.  I think it is one of those things that everyone should be required to have before they leave the hospital.  We went through all the pages of the newborn version, then moved to the infant version, and will finish up the second infant book (third total) right as our baby turns 1.  See other things that are must haves for baby at [...]
------------------------------------------------------
Cleaned_text:

 book life saver helpful able go back track trends answer pediatrician questions communicate different times night newborn think one things everyone required leave hospital went pages newborn version moved infant version finish second infant book third total right baby turns see things must haves baby


#### Create vocabulary dictionary, document term matrix

In [33]:
import gensim
from gensim import corpora

In [34]:
cleaned_text=[text.split() for text in cleaned_text]
cleaned_text[:5]  #for gensim.corpora cleaned_text like this

[['perfect',
  'new',
  'parents',
  'able',
  'keep',
  'track',
  'baby',
  'feeding',
  'sleep',
  'diaper',
  'change',
  'schedule',
  'first',
  'two',
  'half',
  'months',
  'life',
  'made',
  'life',
  'easier',
  'doctor',
  'would',
  'ask',
  'questions',
  'habits',
  'right'],
 ['book',
  'life',
  'saver',
  'helpful',
  'able',
  'go',
  'back',
  'track',
  'trends',
  'answer',
  'pediatrician',
  'questions',
  'communicate',
  'different',
  'times',
  'night',
  'newborn',
  'think',
  'one',
  'things',
  'everyone',
  'required',
  'leave',
  'hospital',
  'went',
  'pages',
  'newborn',
  'version',
  'moved',
  'infant',
  'version',
  'finish',
  'second',
  'infant',
  'book',
  'third',
  'total',
  'right',
  'baby',
  'turns',
  'see',
  'things',
  'must',
  'haves',
  'baby'],
 ['helps',
  'know',
  'exactly',
  'babies',
  'day',
  'gone',
  'mother',
  'law',
  'watching',
  'go',
  'work',
  'also',
  'section',
  'write',
  'notes',
  'let',
  'know

In [35]:
dictionary=corpora.Dictionary(cleaned_text)

In [37]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x20d27cc4340>

In [38]:
doc_term_matrix=[dictionary.doc2bow(review) for review in cleaned_text]

In [39]:
doc_term_matrix

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 2),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1)],
 [(0, 1),
  (2, 2),
  (12, 1),
  (18, 1),
  (19, 1),
  (22, 1),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 2),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1)],
 [(27, 1),
  (32, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1)],
 [(11, 4),
  (22, 4),
  (24, 1),
  (26, 1),
  (32, 1

### Build a model

In [43]:
#build LDA model
lda_model=gensim.models.ldamodel.LdaModel(corpus=doc_term_matrix,
                                          id2word=dictionary,
                                          num_topics=10, #randomly taken
                                          random_state=10,  
                                          chunksize=1000, #Number of documents to be used in each training chunk.
                                          passes=50,      #Number of passes through the corpus during training.
                                          iterations=10)  

In [44]:
#print topics
lda_model.print_topics(num_words=10)

[(0,
  '0.032*"bottles" + 0.026*"pump" + 0.025*"bottle" + 0.015*"not" + 0.014*"milk" + 0.013*"use" + 0.012*"cup" + 0.011*"parts" + 0.009*"nipples" + 0.009*"nipple"'),
 (1,
  '0.035*"not" + 0.022*"would" + 0.021*"one" + 0.020*"nt" + 0.014*"baby" + 0.011*"get" + 0.011*"time" + 0.010*"first" + 0.009*"like" + 0.009*"use"'),
 (2,
  '0.038*"not" + 0.009*"nt" + 0.008*"gate" + 0.008*"open" + 0.008*"get" + 0.008*"top" + 0.007*"plastic" + 0.007*"would" + 0.007*"easy" + 0.007*"one"'),
 (3,
  '0.122*"seat" + 0.073*"car" + 0.056*"stroller" + 0.020*"carrier" + 0.019*"straps" + 0.016*"strap" + 0.016*"back" + 0.013*"seats" + 0.011*"ergo" + 0.011*"infant"'),
 (4,
  '0.077*"water" + 0.077*"tub" + 0.041*"bath" + 0.023*"hot" + 0.018*"warm" + 0.017*"cold" + 0.017*"warmer" + 0.017*"sink" + 0.013*"sponge" + 0.011*"towel"'),
 (5,
  '0.024*"loves" + 0.022*"baby" + 0.021*"toy" + 0.020*"old" + 0.016*"son" + 0.016*"months" + 0.014*"toys" + 0.013*"little" + 0.012*"swing" + 0.012*"daughter"'),
 (6,
  '0.046*"baby" 

In [45]:
#Visualization

import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, doc_term_matrix, dictionary)
vis

In [46]:
#Check performance of the model  

#low perplexity + high coherence score= good model
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix,total_docs=10000))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=cleaned_text, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -8.47534698557803

Coherence Score:  0.46120976046488027


In [None]:
#Model performance is good bcoz Perplexity is less and Coherence score is more.

Evaluate

### Method to find optimal value of k(number of topics)

Method to find optimal number of topics Code from:https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#14computemodelperplexityandcoherencescore



In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=tokenized_reviews, start=2, limit=50, step=1)

In [None]:
# Show graph
limit=50; start=2; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()# Print the coherence scores

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[7]
model_topics = optimal_model.show_topics(formatted=False)
optimal_model.print_topics(num_words=10)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(optimal_model, doc_term_matrix, dictionary)
vis

reference:

https://github.com/rsreetech/LDATopicModelling/blob/main/LDADemo.ipynb