# 1. Read the .csv file using Pandas. Take a look at the top few records.

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Load the raw review export (path is relative to the notebook's working directory)
# and display the resulting frame.
data = pd.read_csv(filepath_or_buffer="K8 Reviews v0.2.csv")
data

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...
...,...,...
14670,1,"I really like the phone, Everything is working..."
14671,1,The Lenovo K8 Note is awesome. It takes best p...
14672,1,Awesome Gaget.. @ this price
14673,1,This phone is nice processing will be successf...


# Data Cleaning

In [3]:
# Normalise the review text to lower case so later token comparisons are case-insensitive.
lowercased_reviews = data["review"].str.lower()
data["review"] = lowercased_reviews
data.head()

Unnamed: 0,sentiment,review
0,1,good but need updates and improvements
1,0,"worst mobile i have bought ever, battery is dr..."
2,1,when i will get my 10% cash back.... its alrea...
3,1,good
4,0,the worst phone everthey have changed the last...


In [4]:
# Replace every character that is neither an ASCII letter nor whitespace
# (digits, punctuation, symbols) with a space. The untouched text stays in `review`;
# the cleaned version goes into a new `clean_review` column.
non_letter_pattern = r'[^a-zA-Z\s]'
data['clean_review'] = data['review'].str.replace(pat=non_letter_pattern, repl=' ', regex=True)
data.head()

Unnamed: 0,sentiment,review,clean_review
0,1,good but need updates and improvements,good but need updates and improvements
1,0,"worst mobile i have bought ever, battery is dr...",worst mobile i have bought ever battery is dr...
2,1,when i will get my 10% cash back.... its alrea...,when i will get my cash back its alrea...
3,1,good,good
4,0,the worst phone everthey have changed the last...,the worst phone everthey have changed the last...


## WORD TOKENIZATION

In [5]:
# Turn each cleaned review string into a list of word tokens.
data['clean_review'] = data['clean_review'].map(word_tokenize)
data.head()

Unnamed: 0,sentiment,review,clean_review
0,1,good but need updates and improvements,"[good, but, need, updates, and, improvements]"
1,0,"worst mobile i have bought ever, battery is dr...","[worst, mobile, i, have, bought, ever, battery..."
2,1,when i will get my 10% cash back.... its alrea...,"[when, i, will, get, my, cash, back, its, alre..."
3,1,good,[good]
4,0,the worst phone everthey have changed the last...,"[the, worst, phone, everthey, have, changed, t..."


In [6]:
# Cleaning further after tokenization: drop English stop words and tokens of
# three characters or fewer.
#
# The stop-word list is materialised once, as a set. Calling
# `stopwords.words("english")` inside the comprehension rebuilt the whole list and
# scanned it linearly for every single token, which dominated the cell's run time.
english_stopwords = set(stopwords.words("english"))

data['clean_review'] = data['clean_review'].apply(
    lambda tokens: [
        token
        for token in tokens
        if token not in english_stopwords and len(token) > 3
    ]
)
data.head(), data.shape

(   sentiment                                             review  \
 0          1             good but need updates and improvements   
 1          0  worst mobile i have bought ever, battery is dr...   
 2          1  when i will get my 10% cash back.... its alrea...   
 3          1                                               good   
 4          0  the worst phone everthey have changed the last...   
 
                                         clean_review  
 0                [good, need, updates, improvements]  
 1  [worst, mobile, bought, ever, battery, drainin...  
 2                     [cash, back, already, january]  
 3                                             [good]  
 4  [worst, phone, everthey, changed, last, phone,...  ,
 (14675, 3))

In [7]:
# Discard reviews left with fewer than two tokens, then renumber the rows from 0.
token_counts = data['clean_review'].map(len)
data = data.loc[token_counts > 1].reset_index(drop=True)
data.shape

(12952, 3)

## LEMMATIZATION

In [8]:
from nltk.stem import WordNetLemmatizer
# Download the WordNet package into the local nltk_data directory (reported as
# "already up-to-date" when present). The lemmatization cell below uses
# WordNetLemmatizer, which looks words up in this corpus.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Reduce every token to its WordNet lemma (e.g. "updates" -> "update").
# A single lemmatizer is created up front and reused for all tokens; constructing a
# new WordNetLemmatizer object per token was unnecessary work.
lemmatizer = WordNetLemmatizer()
data['clean_review'] = data['clean_review'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)
data.head()

Unnamed: 0,sentiment,review,clean_review
0,1,good but need updates and improvements,"[good, need, update, improvement]"
1,0,"worst mobile i have bought ever, battery is dr...","[worst, mobile, bought, ever, battery, drainin..."
2,1,when i will get my 10% cash back.... its alrea...,"[cash, back, already, january]"
3,0,the worst phone everthey have changed the last...,"[worst, phone, everthey, changed, last, phone,..."
4,0,only i'm telling don't buyi'm totally disappoi...,"[telling, buyi, totally, disappointedpoor, bat..."


## Extracting only nouns

In [10]:
# Keep only the tokens whose part-of-speech tag is 'NN' (singular / mass noun).
#
# Each word is tagged on its own, without sentence context, so the tag depends only
# on the word itself. We therefore tag every distinct word once, in a single
# `pos_tag_sents` call, and then filter the reviews against the resulting set.
# Calling `nltk.pos_tag([word])` for every token occurrence repeated the tagger
# set-up each time and was by far the slowest step of the notebook.
vocabulary = sorted({token for tokens in data['clean_review'] for token in tokens})
tagged_vocabulary = nltk.pos_tag_sents([[token] for token in vocabulary])

# Every entry of `tagged_vocabulary` is a one-element list: [(word, tag)].
singular_nouns = {
    tagged_word[0][0]
    for tagged_word in tagged_vocabulary
    if tagged_word[0][1] == 'NN'
}

data['clean_review'] = data['clean_review'].apply(
    lambda tokens: [token for token in tokens if token in singular_nouns]
)

In [11]:
# Keep only the records that still hold more than one token after the noun filter,
# and renumber the rows from 0.
has_multiple_tokens = data['clean_review'].map(len) > 1
data = data.loc[has_multiple_tokens].reset_index(drop=True)

In [12]:
# Preview the first rows together with the frame's shape after the noun filter.
data.head(), data.shape

(   sentiment                                             review  \
 0          1             good but need updates and improvements   
 1          0  worst mobile i have bought ever, battery is dr...   
 2          1  when i will get my 10% cash back.... its alrea...   
 3          0  the worst phone everthey have changed the last...   
 4          0  only i'm telling don't buyi'm totally disappoi...   
 
                                         clean_review  
 0                        [need, update, improvement]  
 1  [mobile, bought, battery, hell, backup, hour, ...  
 2                                    [cash, january]  
 3  [phone, everthey, phone, problem, amazon, phon...  
 4  [buyi, disappointedpoor, batterypoor, camerawa...  ,
 (11181, 3))

## Document Term Matrix

In [13]:
import gensim
from gensim import corpora

In [14]:
# Map every distinct token in the tokenised reviews to an integer id.
dictionary = corpora.Dictionary(documents=data['clean_review'])
print(dictionary)

# We have 6724 unique tokens

Dictionary(6724 unique tokens: ['improvement', 'need', 'update', 'amazon', 'backup']...)


In [15]:
# Convert each review into its bag-of-words form: a list of (token_id, count) pairs,
# where token_id is the token's id in `dictionary` and count is the number of times
# the token occurs in that review.
doc_term_matrix = data['clean_review'].map(dictionary.doc2bow)
doc_term_matrix[:10]

0                             [(0, 1), (1, 1), (2, 1)]
1    [(3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1...
2                                   [(19, 1), (20, 1)]
3                  [(3, 2), (21, 1), (22, 3), (23, 1)]
4        [(24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]
5    [(14, 1), (22, 1), (29, 1), (30, 1), (31, 1), ...
6                           [(5, 1), (36, 1), (37, 1)]
7    [(14, 2), (22, 2), (23, 2), (34, 1), (38, 1), ...
8                 [(44, 1), (45, 1), (46, 1), (47, 1)]
9                  [(8, 1), (22, 1), (48, 1), (49, 1)]
Name: clean_review, dtype: object

# LDA

In [16]:
# Fit a 12-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs give the same topics.
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=12,
    passes=10,
    random_state=45,
)

In [17]:
# List each topic id with its highest-weighted words.
ldamodel.print_topics()

[(0,
  '0.208*"camera" + 0.104*"quality" + 0.041*"phone" + 0.033*"sound" + 0.028*"front" + 0.023*"battery" + 0.022*"mode" + 0.020*"depth" + 0.019*"display" + 0.018*"rear"'),
 (1,
  '0.117*"feature" + 0.065*"phone" + 0.057*"android" + 0.032*"stock" + 0.032*"card" + 0.028*"memory" + 0.028*"contact" + 0.023*"user" + 0.022*"slot" + 0.016*"option"'),
 (2,
  '0.341*"mobile" + 0.184*"problem" + 0.094*"heating" + 0.023*"heat" + 0.017*"network" + 0.014*"battery" + 0.011*"hang" + 0.009*"month" + 0.007*"connection" + 0.006*"player"'),
 (3,
  '0.061*"screen" + 0.061*"charger" + 0.058*"phone" + 0.050*"turbo" + 0.029*"glass" + 0.020*"processor" + 0.018*"gorilla" + 0.017*"core" + 0.016*"month" + 0.016*"time"'),
 (4,
  '0.156*"update" + 0.079*"software" + 0.042*"system" + 0.036*"phone" + 0.034*"oreo" + 0.024*"need" + 0.016*"problem" + 0.016*"bill" + 0.015*"hardware" + 0.015*"lenovo"'),
 (5,
  '0.192*"phone" + 0.102*"battery" + 0.062*"price" + 0.058*"performance" + 0.057*"camera" + 0.052*"awesome" + 0.

## Compute Coherence Score

In [18]:
from gensim.models.coherencemodel import CoherenceModel

# Score the fitted topics with the 'c_v' coherence measure, using the tokenised
# reviews as reference texts.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=data['clean_review'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_score)


Coherence Score:  0.5527744184708548


## Reducing the number of topics from 12 to 6

In [20]:
# Refit LDA with 6 topics on the same corpus (same passes and seed as the 12-topic run).
# Note that this rebinds `ldamodel`, so the 12-topic model is no longer reachable.
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=6,
    passes=10,
    random_state=45,
)

In [21]:
# List each of the six topic ids with its highest-weighted words.
ldamodel.print_topics()

[(0,
  '0.157*"camera" + 0.078*"quality" + 0.039*"phone" + 0.022*"sound" + 0.021*"front" + 0.019*"mode" + 0.015*"depth" + 0.014*"display" + 0.014*"performance" + 0.014*"rear"'),
 (1,
  '0.081*"note" + 0.071*"lenovo" + 0.041*"phone" + 0.025*"call" + 0.024*"feature" + 0.019*"android" + 0.015*"product" + 0.012*"speaker" + 0.012*"option" + 0.010*"stock"'),
 (2,
  '0.168*"mobile" + 0.090*"problem" + 0.049*"heating" + 0.037*"amazon" + 0.035*"product" + 0.030*"issue" + 0.022*"return" + 0.013*"time" + 0.013*"want" + 0.012*"network"'),
 (3,
  '0.071*"phone" + 0.035*"money" + 0.034*"screen" + 0.025*"charger" + 0.023*"product" + 0.021*"turbo" + 0.019*"lenovo" + 0.019*"waste" + 0.014*"amazon" + 0.013*"value"'),
 (4,
  '0.085*"phone" + 0.045*"issue" + 0.039*"update" + 0.032*"service" + 0.027*"lenovo" + 0.027*"network" + 0.027*"problem" + 0.020*"software" + 0.014*"signal" + 0.012*"call"'),
 (5,
  '0.133*"battery" + 0.128*"phone" + 0.034*"price" + 0.030*"awesome" + 0.030*"fast" + 0.029*"camera" + 0.0

In [22]:
from gensim.models.coherencemodel import CoherenceModel

# Recompute the 'c_v' coherence for the 6-topic model so it can be compared with the
# score printed for the 12-topic model.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=data['clean_review'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_score)


Coherence Score:  0.5855503994613088


In [23]:
# Hand-assigned, human-readable labels for the six topic ids of the fitted model.
topic_names = {0:'Camera Quality',1:'Phone Features',2:'Mobile Heating',3:'Charger',4:'Network',5:'Battery'}

# `ldamodel[doc_term_matrix]` yields, per review, a list of (topic_id, probability)
# pairs. Keep the id of the most probable topic for each review. `max` returns the
# first pair with the highest probability, which is the same pair a stable
# descending sort would put first, without sorting the whole list.
topic_details = [
    max(topic_probabilities, key=lambda pair: pair[1])[0]
    for topic_probabilities in ldamodel[doc_term_matrix]
]
data["Topic_No"] = topic_details
data['Topic_Name'] = data["Topic_No"].map(topic_names)

In [24]:
# Display the reviews together with their assigned topic id and topic label.
data

Unnamed: 0,sentiment,review,clean_review,Topic_No,Topic_Name
0,1,good but need updates and improvements,"[need, update, improvement]",0,Camera Quality
1,0,"worst mobile i have bought ever, battery is dr...","[mobile, bought, battery, hell, backup, hour, ...",3,Charger
2,1,when i will get my 10% cash back.... its alrea...,"[cash, january]",0,Camera Quality
3,0,the worst phone everthey have changed the last...,"[phone, everthey, phone, problem, amazon, phon...",5,Battery
4,0,only i'm telling don't buyi'm totally disappoi...,"[buyi, disappointedpoor, batterypoor, camerawa...",4,Network
...,...,...,...,...,...
11176,0,very poour battery parformance and prosecer,"[poour, battery, parformance, prosecer]",5,Battery
11177,1,"i really like the phone, everything is working...","[phone, everything, fine, whater, phone]",5,Battery
11178,1,the lenovo k8 note is awesome. it takes best p...,"[lenovo, note, awesome, picture, camera, slim,...",5,Battery
11179,1,awesome gaget.. @ this price,"[awesome, gaget, price]",5,Battery


In [25]:
# Spot-check one record by position (row 11176) to see its tokens and assigned topic.
print(data.iloc[11176])

sentiment                                                 0
review          very poour battery parformance and prosecer
clean_review        [poour, battery, parformance, prosecer]
Topic_No                                                  5
Topic_Name                                          Battery
Name: 11176, dtype: object
