In [None]:
"""
Problem Statement: 

A popular mobile phone brand, Lenovo has launched their budget smartphone in the Indian market. The client wants to understand the 
VOC (voice of the customer) on the product. This will be useful to not just evaluate the current product,but to also get some direction 
for developing the product pipeline. The client is particularly interested in the different aspects that customers care about. 
Product reviews by customers on a leading e-commerce site should provide a good view.

Domain: Amazon reviews for a leading phone brand

Analysis to be done: POS tagging, topic modeling using LDA, and topic interpretation
"""

In [1]:
#Importing the required libraries and dataset
import pandas as pd
reviews_df = pd.read_csv("E:/Education/PGP Simplilearn-Purdue/PGP in Data Science/Natural Language Processing/Assessement Projects/Review Project Analysis/K8 Reviews v0.2.csv")
reviews_df.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [2]:
#Checking for null values
reviews_df.isna().sum()
#Observation:
#No null values are present

sentiment    0
review       0
dtype: int64

In [3]:
reviews_df.sentiment.value_counts(normalize=True)*100
#Observations:
#'1' corresponds to positive sentiment(47.44%) and '0' corresponds to negative sentiment(52.55)
#Negative sentiment outweighs positive sentiment by 5 percentage points

0    52.551959
1    47.448041
Name: sentiment, dtype: float64

In [4]:
#Using predefined list of stop words in nltk
from nltk.corpus import stopwords
stopwords1 = stopwords.words('english')
print(type(stopwords1))
print(stopwords1[0:10])
print("Total no.of stopwords:",len(stopwords1))

<class 'list'>
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
Total no.of stopwords: 179


In [5]:
#Punctuation Marks
from string import punctuation
print(punctuation)
print("Total Punctation Marks:",len(punctuation))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Total Punctation Marks: 32


In [6]:
#Appending puctuation to list of stop words
stopwords_punct = stopwords1 + list(punctuation)
print(type(stopwords_punct))
print(len(stopwords_punct))
print(stopwords_punct)

<class 'list'>
211
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'sam

In [7]:
#Passing reviews into a list
reviews_list = list(reviews_df.review)
print(len(reviews_list))
print(reviews_list[0:5])

14675
['Good but need updates and improvements', "Worst mobile i have bought ever, Battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if I put mobile idle its getting discharged.This is biggest lie from Amazon & Lenove which is not at all expected, they are making full by saying that battery is 4000MAH & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.Don't know how Lenovo will survive by making full of us.Please don;t go for this else you will regret like me.", 'when I will get my 10% cash back.... its already 15 January..', 'Good', 'The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone .Highly disappointing of amazon']


In [8]:
#Function for word tokenization, stopwords and punctuation removal and normalization (lower case)
from nltk import word_tokenize
def clean_text(x):
    x1 = word_tokenize(x.lower())
    x2 = [x for x in x1 if x not in stopwords_punct]
    return ' '.join(x2)

In [9]:
#Normalize casings for the review text and extract the text into a list for easier manipulation.
#Tokenize the reviews using NLTKs word_tokenize function.
tokenized_clean_text = []
for sent in reviews_list:
    tokenized_clean_text.append(clean_text(sent))
print(len(tokenized_clean_text))

14675


In [10]:
#Sample text after word tokenization, lower casing, removing stopwords and punctuation
tokenized_clean_text[0:5]

['good need updates improvements',
 "worst mobile bought ever battery draining like hell backup 6 7 hours internet uses even put mobile idle getting discharged.this biggest lie amazon lenove expected making full saying battery 4000mah booster charger fake takes least 4 5 hours fully charged.do n't know lenovo survive making full us.please go else regret like",
 'get 10 cash back .... already 15 january ..',
 'good',
 'worst phone everthey changed last phone problem still amazon returning phone .highly disappointing amazon']

In [11]:
#Lemmatization
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

clean_text_lemma = []
for sent in tokenized_clean_text:
    clean_text_lemma.append(lemmatizer.lemmatize(sent))

print(len(clean_text_lemma))
print(clean_text_lemma[0:5])

14675
['good need updates improvements', "worst mobile bought ever battery draining like hell backup 6 7 hours internet uses even put mobile idle getting discharged.this biggest lie amazon lenove expected making full saying battery 4000mah booster charger fake takes least 4 5 hours fully charged.do n't know lenovo survive making full us.please go else regret like", 'get 10 cash back .... already 15 january ..', 'good', 'worst phone everthey changed last phone problem still amazon returning phone .highly disappointing amazon']


In [12]:
#Combining lemmatized text
clean_text_lemma_combined = ' '.join(clean_text_lemma)

print(len(clean_text_lemma_combined))
print(type(clean_text_lemma_combined))
print(clean_text_lemma_combined[0:100])

1235836
<class 'str'>
good need updates improvements worst mobile bought ever battery draining like hell backup 6 7 hours 


In [13]:
#POS tagging
from nltk.tag import pos_tag

word_list = word_tokenize(clean_text_lemma_combined)
clean_text_tagged = pos_tag(word_list)

print(len(clean_text_tagged))
print(clean_text_tagged[0:5])

190017
[('good', 'JJ'), ('need', 'NN'), ('updates', 'VBZ'), ('improvements', 'NNS'), ('worst', 'RB')]


In [14]:
#For the topic model, we should  want to include only nouns.
#Find out all the POS tags that correspond to nouns.
#Limit the data to only terms with these tags.
noun_tag_text = []
for i in range(len(clean_text_tagged)):
    if((clean_text_tagged[i][1]=='NN') | (clean_text_tagged[i][1]=='NNS')):
        noun_tag_text.append(clean_text_tagged[i][0])

print(len(noun_tag_text))
print(type(noun_tag_text))
print(noun_tag_text[0:10])

84971
<class 'list'>
['need', 'improvements', 'hell', 'backup', 'hours', 'uses', 'lie', 'amazon', 'lenove', 'battery']


In [15]:
#Create a topic model using LDA on the cleaned-up data with 12 topics.
# Build a Dictionary - association word to numeric id
import gensim
from gensim import corpora

dictionary = corpora.Dictionary([noun_tag_text])
print(len(dictionary))

9325


In [16]:
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in [noun_tag_text]]
print(corpus[0][0:100])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 3), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 394), (20, 157), (21, 57), (22, 27), (23, 11), (24, 1), (25, 2), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 2), (38, 2), (39, 1), (40, 8), (41, 3), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 9), (56, 3), (57, 5), (58, 2), (59, 1), (60, 1), (61, 3), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 3), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 22), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 7), (99, 1)]


In [17]:
num_topics = 12
from gensim import models
lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)

In [18]:
print("LDA Model:")
 
for idx in range(num_topics):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

LDA Model:
Topic #0: 0.059*"phone" + 0.031*"battery" + 0.025*"camera" + 0.022*"product" + 0.019*"quality" + 0.016*"problem" + 0.012*"price" + 0.011*"note" + 0.011*"issue" + 0.010*"lenovo"
Topic #1: 0.058*"phone" + 0.033*"camera" + 0.025*"battery" + 0.022*"product" + 0.019*"problem" + 0.015*"quality" + 0.014*"note" + 0.012*"performance" + 0.009*"price" + 0.009*"lenovo"
Topic #2: 0.084*"phone" + 0.039*"battery" + 0.029*"product" + 0.026*"camera" + 0.014*"note" + 0.013*"problem" + 0.012*"quality" + 0.011*"issue" + 0.010*"lenovo" + 0.010*"price"
Topic #3: 0.077*"phone" + 0.030*"camera" + 0.027*"battery" + 0.022*"product" + 0.013*"quality" + 0.012*"problem" + 0.011*"price" + 0.011*"performance" + 0.010*"lenovo" + 0.010*"issue"
Topic #4: 0.055*"phone" + 0.030*"battery" + 0.027*"camera" + 0.020*"product" + 0.015*"problem" + 0.011*"performance" + 0.011*"lenovo" + 0.010*"quality" + 0.009*"price" + 0.009*"note"
Topic #5: 0.064*"phone" + 0.034*"camera" + 0.030*"battery" + 0.020*"product" + 0.017*

In [19]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [20]:
#Analyze the topics through the business lens.
#Determine which of the topics can be combined.
#Observations:
#As per the coherence model visualization, it can be clearly seen that the topics can be divided into 5 categories (based on inter topic distances)
#Topic I- Combination of Topics 1,2 and 3
#Topic II- Combination of Topics 4,5,6,7 and 8
#Topic III- Combination of Topics 11 and 12
#Topic IV- Topic 9
#Topic V- Topics 10

  and should_run_async(code)


In [21]:
#What is the coherence of the model with the c_v metric?
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=[noun_tag_text], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Coherence Score:  0.2501310434261979


In [22]:
#Create topic model using LDA with what you think is the optimal number of topics
num_topics = 5
from gensim import models
lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)

  and should_run_async(code)


In [23]:
print("LDA Model:")
 
for idx in range(num_topics):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

LDA Model:
Topic #0: 0.077*"phone" + 0.037*"camera" + 0.033*"battery" + 0.029*"product" + 0.017*"quality" + 0.014*"problem" + 0.013*"note" + 0.011*"performance" + 0.010*"price" + 0.010*"lenovo"
Topic #1: 0.072*"phone" + 0.028*"camera" + 0.027*"battery" + 0.020*"product" + 0.014*"problem" + 0.012*"quality" + 0.011*"note" + 0.011*"lenovo" + 0.011*"price" + 0.010*"issue"
Topic #2: 0.070*"phone" + 0.038*"battery" + 0.033*"camera" + 0.017*"product" + 0.015*"problem" + 0.011*"quality" + 0.011*"note" + 0.011*"performance" + 0.010*"lenovo" + 0.010*"issue"
Topic #3: 0.065*"phone" + 0.029*"battery" + 0.029*"camera" + 0.020*"product" + 0.016*"problem" + 0.012*"quality" + 0.010*"issue" + 0.010*"lenovo" + 0.009*"performance" + 0.009*"money"
Topic #4: 0.036*"phone" + 0.023*"camera" + 0.019*"battery" + 0.012*"quality" + 0.010*"lenovo" + 0.009*"problem" + 0.009*"note" + 0.008*"product" + 0.008*"issue" + 0.007*"price"


  and should_run_async(code)


In [24]:
#What is the coherence of the model with the c_v metric?
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=[noun_tag_text], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Coherence Score:  0.252757552986883


In [25]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

  and should_run_async(code)
