In [2]:
import pandas as pd
import numpy as np

In [3]:
#Read the .csv file using Pandas. Take a look at the top few records.
data=pd.read_csv('K8 Reviews v0.2.csv')

Review Project Analysis.
Course-end Project 1

DESCRIPTION

Help a leading mobile brand understand the voice of the customer by analyzing the reviews of their product on Amazon and the topics that customers are talking about. You will perform topic modeling on specific parts of speech. You’ll finally interpret the emerging topics.

Problem Statement: 

A popular mobile phone brand, Lenovo has launched their budget smartphone in the Indian market. The client wants to understand the VOC (voice of the customer) on the product. This will be useful to not just evaluate the current product, but to also get some direction for developing the product pipeline. The client is particularly interested in the different aspects that customers care about. Product reviews by customers on a leading e-commerce site should provide a good view.

Domain: Amazon reviews for a leading phone brand

Analysis to be done: POS tagging, topic modeling using LDA, and topic interpretation

Content: 

Dataset: ‘K8 Reviews v0.2.csv’

Columns:

Sentiment: The sentiment against the review (4,5 star reviews are positive, 1,2 are negative)

Reviews: The main text of the review

Steps to perform:

Discover the topics in the reviews and present it to business in a consumable format. Employ techniques in syntactic processing and topic modeling.

Perform specific cleanup, POS tagging, and restricting to relevant POS tags, then, perform topic modeling using LDA. Finally, give business-friendly names to the topics and make a table for business.

Tasks: 

Read the .csv file using Pandas. Take a look at the top few records.

Normalize casings for the review text and extract the text into a list for easier manipulation.

Tokenize the reviews using NLTKs word_tokenize function.

Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.

For the topic model, we should  want to include only nouns.

Find out all the POS tags that correspond to nouns.

Limit the data to only terms with these tags.

Lemmatize. 

Different forms of the terms need to be treated as one.

No need to provide POS tag to lemmatizer for now.

Remove stopwords and punctuation (if there are any). 

Create a topic model using LDA on the cleaned-up data with 12 topics.

Print out the top terms for each topic.

What is the coherence of the model with the c_v metric?

Analyze the topics through the business lens.

Determine which of the topics can be combined.

Create topic model using LDA with what you think is the optimal number of topics

What is the coherence of the model?

The business should  be able to interpret the topics.

Name each of the identified topics.

Create a table with the topic name and the top 10 terms in each to present to the  business.

You can download the datasets from here - 

 

In [4]:
data.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [5]:
type(data)

pandas.core.frame.DataFrame

In [6]:
data['sentiment'].value_counts()

0    7712
1    6963
Name: sentiment, dtype: int64

In [7]:
reviews=data['review'].values

In [8]:
type(reviews)

numpy.ndarray

In [9]:
reviews[:5]

array(['Good but need updates and improvements',
       "Worst mobile i have bought ever, Battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if I put mobile idle its getting discharged.This is biggest lie from Amazon & Lenove which is not at all expected, they are making full by saying that battery is 4000MAH & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.Don't know how Lenovo will survive by making full of us.Please don;t go for this else you will regret like me.",
       'when I will get my 10% cash back.... its already 15 January..',
       'Good',
       'The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone .Highly disappointing of amazon'],
      dtype=object)

In [10]:
#Normalize casings for the review text and extract the text into a list for easier manipulation.
reviews_lower=[txt.lower() for txt in reviews]
type(reviews_lower)

list

In [11]:
#Tokenize the reviews using NLTKs word_tokenize function
from nltk.tokenize import word_tokenize,sent_tokenize

In [12]:
# word tokenize
reviews_tokens=[ word_tokenize(review) for review in reviews_lower]

In [13]:
len(reviews_tokens)

14675

In [14]:
type(reviews_tokens)

list

In [15]:
#df = pd.DataFrame({'Column1': reviews_lower, 'Column2': reviews_tokens})

In [16]:
#df.head(10)

In [17]:
#Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.
import nltk
nltk.pos_tag(reviews_tokens[2])

[('when', 'WRB'),
 ('i', 'NN'),
 ('will', 'MD'),
 ('get', 'VB'),
 ('my', 'PRP$'),
 ('10', 'CD'),
 ('%', 'NN'),
 ('cash', 'NN'),
 ('back', 'RB'),
 ('...', ':'),
 ('.', '.'),
 ('its', 'PRP$'),
 ('already', 'RB'),
 ('15', 'CD'),
 ('january..', 'NN')]

In [18]:
reviews_pos=[]
count=0
for sent in reviews_tokens:
    reviews_pos.append(nltk.pos_tag(sent))

In [19]:
nltk.pos_tag(['good', 'but', 'need', 'updates', 'and', 'improvements'])

[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [20]:
len(reviews_pos)

14675

In [21]:
reviews_pos[1]

[('worst', 'JJS'),
 ('mobile', 'NN'),
 ('i', 'NN'),
 ('have', 'VBP'),
 ('bought', 'VBN'),
 ('ever', 'RB'),
 (',', ','),
 ('battery', 'NN'),
 ('is', 'VBZ'),
 ('draining', 'VBG'),
 ('like', 'IN'),
 ('hell', 'NN'),
 (',', ','),
 ('backup', 'NN'),
 ('is', 'VBZ'),
 ('only', 'RB'),
 ('6', 'CD'),
 ('to', 'TO'),
 ('7', 'CD'),
 ('hours', 'NNS'),
 ('with', 'IN'),
 ('internet', 'JJ'),
 ('uses', 'NNS'),
 (',', ','),
 ('even', 'RB'),
 ('if', 'IN'),
 ('i', 'JJ'),
 ('put', 'VBP'),
 ('mobile', 'JJ'),
 ('idle', 'NN'),
 ('its', 'PRP$'),
 ('getting', 'VBG'),
 ('discharged.this', 'NN'),
 ('is', 'VBZ'),
 ('biggest', 'JJS'),
 ('lie', 'NN'),
 ('from', 'IN'),
 ('amazon', 'NN'),
 ('&', 'CC'),
 ('lenove', 'NN'),
 ('which', 'WDT'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('at', 'IN'),
 ('all', 'DT'),
 ('expected', 'VBN'),
 (',', ','),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('making', 'VBG'),
 ('full', 'JJ'),
 ('by', 'IN'),
 ('saying', 'VBG'),
 ('that', 'DT'),
 ('battery', 'NN'),
 ('is', 'VBZ'),
 ('4000mah', 'CD'),
 ('&', 

Find out all the POS tags that correspond to nouns.

Limit the data to only terms with these tags.


In [22]:
#Create list of all nouns
all_nouns=[]
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
for sent in reviews_pos:
    res=[term for term,pos in sent if pos.startswith('NN' or 'NNS'or 'NNP' or 'NNPS')]
    all_nouns.append(res)

In [23]:
len(all_nouns)

14675

Lemmatize. 

Different forms of the terms need to be treated as one.

No need to provide POS tag to lemmatizer for now.

Remove stopwords and punctuation (if there are any). 


In [24]:
from nltk.corpus import stopwords
stop_nltk = stopwords.words('english')

In [25]:
len(stop_nltk),type(stop_nltk),stop_nltk[:10]

(179,
 list,
 ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're"])

In [26]:
from string import punctuation
stop_punct = list(punctuation)

In [27]:
stop_punct[:10],len(stop_punct)

(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*'], 32)

In [28]:
stop_final = stop_nltk+stop_punct

In [29]:
len(stop_final)

211

In [30]:
from nltk import WordNetLemmatizer

wnl = WordNetLemmatizer()

allnouns_clean_lemmed = []

for item in all_nouns:
    lemmed_tokens = [wnl.lemmatize(token, 'n') for token in item]
    allnouns_clean_lemmed.append(lemmed_tokens)
    
print(allnouns_clean_lemmed[:5])

[['update', 'improvement'], ['mobile', 'i', 'battery', 'hell', 'backup', 'hour', 'us', 'idle', 'discharged.this', 'lie', 'amazon', 'lenove', 'battery', 'charger', 'hour', 'don'], ['i', '%', 'cash', 'january..'], [], ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon']]


In [31]:
#Remove stopwords and punctuation (if there are any). 

review_preprocessed = []
for item in allnouns_clean_lemmed:
    if len(item)>0:        
        preprocessed_tokens = [token for token in item if token not in stop_final and len(token)>1]
        review_preprocessed.append(preprocessed_tokens)
    else:
        review_preprocessed.append(item)
    
print(review_preprocessed[:5])

[['update', 'improvement'], ['mobile', 'battery', 'hell', 'backup', 'hour', 'us', 'idle', 'discharged.this', 'lie', 'amazon', 'lenove', 'battery', 'charger', 'hour'], ['cash', 'january..'], [], ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon']]


Create a topic model using LDA on the cleaned-up data with 12 topics.

Print out the top terms for each topic.

What is the coherence of the model with the c_v metric?


In [32]:

import gensim
from gensim import corpora

# Step 1: Data preprocessing
# Assuming you have a list of preprocessed documents called 'documents'

# Create a dictionary of the words in the documents
dictionary = corpora.Dictionary(review_preprocessed)

# Convert the documents to a bag-of-words representation
corpus = [dictionary.doc2bow(doc) for doc in review_preprocessed]

# Step 2: Building the LDA model
# Specify the number of topics
num_topics = 15

# Build the LDA model
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

# Step 3: Training the LDA model
# Training happens during the model initialization
# Step 4: Interpreting the results
# Print the topics and their top words
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx},\nWords: {topic}\n')

# Step 5: Topic labeling
# Assign labels to the topics based on the top words

# Step 6: Topic visualization and analysis
# Visualize the topics using word clouds, bar charts, or other techniques


Topic: 0,
Words: 0.111*"note" + 0.082*"k8" + 0.080*"heating" + 0.068*"problem" + 0.059*"lenovo" + 0.048*"phone" + 0.038*"product" + 0.033*"battery" + 0.028*"charge" + 0.014*"buy"

Topic: 1,
Words: 0.060*"phone" + 0.054*"camera" + 0.032*"note" + 0.020*"feature" + 0.018*"battery" + 0.018*"music" + 0.017*"quality" + 0.014*"video" + 0.013*"android" + 0.012*"stock"

Topic: 2,
Words: 0.105*"charger" + 0.063*"price" + 0.061*"range" + 0.056*"phone" + 0.033*"option" + 0.025*"model" + 0.025*"turbo" + 0.025*"contact" + 0.022*"star" + 0.015*"cable"

Topic: 3,
Words: 0.401*"mobile" + 0.033*"please" + 0.028*"excellent" + 0.027*"everything" + 0.023*"camera" + 0.022*"function" + 0.020*"bit" + 0.019*"bill" + 0.014*"concern" + 0.013*"app"

Topic: 4,
Words: 0.109*"handset" + 0.080*"service" + 0.068*"experience" + 0.058*"delivery" + 0.048*"volta" + 0.031*"centre" + 0.027*"glass" + 0.026*"gud" + 0.025*"item" + 0.024*"purchase"

Topic: 5,
Words: 0.090*"issue" + 0.074*"battery" + 0.042*"phone" + 0.038*"day" 

In [33]:
print(dictionary)

Dictionary(10080 unique tokens: ['improvement', 'update', 'amazon', 'backup', 'battery']...)


In [34]:
corpus

[[(0, 1), (1, 1)],
 [(2, 1),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(14, 1), (15, 1)],
 [],
 [(2, 2), (16, 1), (17, 3), (18, 1)],
 [(19, 1), (20, 1)],
 [(17, 1), (21, 1), (22, 1)],
 [(4, 1), (23, 1)],
 [(17, 2), (18, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)],
 [(30, 1), (31, 1), (32, 1), (33, 1)],
 [(34, 1)],
 [(5, 1), (17, 1), (35, 1), (36, 1)],
 [(4, 1), (37, 1), (38, 1)],
 [(4, 1), (12, 1), (18, 2), (36, 1), (38, 1), (39, 1)],
 [(3, 1), (17, 1), (40, 1), (41, 1), (42, 1)],
 [(43, 1)],
 [(44, 1)],
 [(12, 1),
  (17, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 1)],
 [(4, 2), (38, 1), (56, 1)],
 [(57, 1)],
 [],
 [(17, 1), (18, 1), (58, 1), (59, 1)],
 [(4, 1), (60, 1), (61, 1), (62, 1)],
 [(52, 1)],
 [(4, 1), (52, 2), (60, 1), (63, 1), (64, 1)],
 [(12, 1), (32, 2), (41, 1), (65, 1), (66, 1), (67, 1), (68, 1)],
 [(17, 1)

In [35]:
from gensim.models import CoherenceModel

# Assuming you have an LDA model called 'lda_model'
# Assuming you have a corpus and dictionary created for the model

# Compute the coherence score
coherence_score = CoherenceModel(model=lda_model, texts=review_preprocessed, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_score.get_coherence()
print('Coherence Score for this LDA model is: ', coherence_lda)


Coherence Score for this LDA model is:  0.4762954609965774


Analyze the topics through the business lens.

Determine which of the topics can be combined.

In [36]:

import pyLDAvis
import warnings
warnings.simplefilter('ignore')

pyLDAvis.enable_notebook()

In [37]:
from pyLDAvis import gensim


In [38]:
# Let us visualize topics in our text

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)


In [39]:
pyLDAvis.save_html(LDAvis_prepared, 'LDA_model_vis'+'.html')


In [40]:
LDAvis_prepared

From topics printed above and model visualization, it can be concluded that following topics can be clubbed together:

Topic- 1,2,3,4,5
Topic- 9,10,11,13,14,15
Topic- 6,8,12
Topic- 7


Task-10. Create topic model using LDA with what you think is the optimal number of topics

In [41]:
# From our exercise in previous step, we find that optimum no. of topics will 4 for given text.
# Let us again create model with 6 topics
import gensim
from gensim import corpora
lda_model1 = gensim.models.LdaModel(corpus, num_topics = 4, id2word = dictionary)

for idx, topic in lda_model1.print_topics(-1):
    print("\nTopic: {} \nWords: {}".format(idx, topic ))


Topic: 0 
Words: 0.092*"product" + 0.056*"battery" + 0.035*"phone" + 0.033*"issue" + 0.024*"time" + 0.022*"performance" + 0.015*"delivery" + 0.014*"backup" + 0.013*"superb" + 0.008*"day"

Topic: 1 
Words: 0.141*"phone" + 0.047*"mobile" + 0.024*"issue" + 0.024*"price" + 0.022*"money" + 0.020*"network" + 0.019*"service" + 0.016*"product" + 0.014*"amazon" + 0.014*"lenovo"

Topic: 2 
Words: 0.051*"phone" + 0.044*"battery" + 0.040*"problem" + 0.028*"note" + 0.016*"day" + 0.015*"k8" + 0.014*"screen" + 0.014*"heating" + 0.014*"hour" + 0.013*"heat"

Topic: 3 
Words: 0.105*"camera" + 0.053*"phone" + 0.045*"quality" + 0.031*"battery" + 0.014*"performance" + 0.013*"feature" + 0.011*"processor" + 0.011*"mode" + 0.009*"product" + 0.009*"price"


In [42]:
coherence_score1 = CoherenceModel(model=lda_model1, texts=review_preprocessed, dictionary=dictionary, coherence='c_v')
coherence_lda1 = coherence_score1.get_coherence()
print('Coherence Score for new LDA model_1 is: ', coherence_lda1)

Coherence Score for new LDA model_1 is:  0.5318483139697435


Task-11. The business should be able to interpret the topics.
Name each of the identified topics.
Create a table with the topic name and the top 10 terms in each to present to the business.

In [43]:
topic_words = {}

for idx, topic in lda_model1.print_topics(-1): 
    temp = []
    for item in topic.split('+'):
        item_alpha = [letter for letter in item if letter.isalpha()]
        temp.append("".join(item_alpha))    
    topic_words[('Topic_'+str(idx+1))] = temp

topic_table = pd.DataFrame(topic_words)    
topic_table.index = ['Word_'+str(i+1) for i in range(topic_table.shape[0])]
topic_table

Unnamed: 0,Topic_1,Topic_2,Topic_3,Topic_4
Word_1,product,phone,phone,camera
Word_2,battery,mobile,battery,phone
Word_3,phone,issue,problem,quality
Word_4,issue,price,note,battery
Word_5,time,money,day,performance
Word_6,performance,network,k,feature
Word_7,delivery,service,screen,processor
Word_8,backup,product,heating,mode
Word_9,superb,amazon,hour,product
Word_10,day,lenovo,heat,price


Another method to print table of topics-words:

In [44]:
lda_model1.show_topics(formatted=False)


[(0,
  [('product', 0.09207516),
   ('battery', 0.056342654),
   ('phone', 0.03489608),
   ('issue', 0.03312302),
   ('time', 0.024246098),
   ('performance', 0.021854267),
   ('delivery', 0.015185235),
   ('backup', 0.013508419),
   ('superb', 0.012635901),
   ('day', 0.007902044)]),
 (1,
  [('phone', 0.14076255),
   ('mobile', 0.046774674),
   ('issue', 0.024476245),
   ('price', 0.02351254),
   ('money', 0.021528715),
   ('network', 0.019738259),
   ('service', 0.018842446),
   ('product', 0.015944613),
   ('amazon', 0.014179689),
   ('lenovo', 0.013917896)]),
 (2,
  [('phone', 0.05070373),
   ('battery', 0.044253744),
   ('problem', 0.039859414),
   ('note', 0.027876474),
   ('day', 0.015673125),
   ('k8', 0.014907773),
   ('screen', 0.0143647855),
   ('heating', 0.014295443),
   ('hour', 0.014148522),
   ('heat', 0.012673206)]),
 (3,
  [('camera', 0.10498125),
   ('phone', 0.053075913),
   ('quality', 0.045487747),
   ('battery', 0.031451013),
   ('performance', 0.014366571),
   (

In [46]:
LDAvis_prepared1 = pyLDAvis.gensim.prepare(lda_model1, corpus, dictionary)
pyLDAvis.save_html(LDAvis_prepared1, 'LDA_model_vis_1'+'.html')


In [47]:
LDAvis_prepared1