In [1]:
# Importing the respective libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

import pyLDAvis
import pyLDAvis.sklearn


In [3]:
# Importing the respective dataframe for the topic modelling
# NOTE(review): absolute, user-specific Windows path — the notebook only runs on a machine
# where this exact file exists; consider a configurable data directory instead.
# NOTE(review): the "Unnamed: 0.x" columns visible in the displayed frame look like index
# columns written out by earlier CSV round-trips — TODO confirm they can be dropped.
df = pd.read_csv(r'C:\Users\aksha\Downloads\translated_cleaned_data_1_unsupervised.csv')

In [4]:
# Display the loaded dataframe (the notebook renders the last expression of a cell)
df

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewText,overall,summary,title,brand,review_length,cleaned_reviewText,lemmatized_review_length,target,y
0,0,0,0,0,A30TL5EWN6DFXT,120401325X,Looks Good Looks Good They look good and st...,4,Looks Good,,,41,looks good look good look good stick good not ...,22,Good,1
1,1,1,1,1,ASY55RVNIL0UD,120401325X,Really great product. Really great product. ...,5,Really great product.,,,38,really great product really great product stic...,19,Good,1
2,2,2,2,2,A2TMXE2AFO7ONB,120401325X,LOVE LOVE LOVE LOVE LOVE LOVE These are awe...,5,LOVE LOVE LOVE,,,40,love love love love love love awesome make pho...,21,Good,1
3,3,3,3,3,AWJ0WZQYMYFQ4,120401325X,Cute! Cute! Item arrived in great time and ...,4,Cute!,,,53,cute cute item arrive great time perfect condi...,31,Good,1
4,4,4,4,4,ATX7CZYFXI1KW,120401325X,leopard home button sticker for iphone 4s le...,5,leopard home button sticker for iphone 4s,,,37,leopard home button sticker iphone leopard hom...,23,Good,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194171,194171,194171,194181,194511,A1YMNTFLNDYQ1F,B00LORXVUE,This works just perfect! This works just per...,5,This works just perfect!,,,32,work perfect work perfect work great like orig...,16,Good,1
194172,194172,194172,194182,194512,A15TX8B2L8B20S,B00LORXVUE,Great replacement cable. Apple certified Gre...,5,Great replacement cable. Apple certified,,,39,great replacement cable apple certify great re...,29,Good,1
194173,194173,194173,194183,194513,A3JI7QRZO1QG8X,B00LORXVUE,Real quality Real quality This is a great c...,5,Real quality,,,141,real quality real quality great cable good exp...,65,Good,1
194174,194174,194174,194184,194514,A1NHB2VC68YQNM,B00LORXVUE,I really like it becasue it works well with my...,5,I really like it becasue it works well with my...,,,51,really like becasue work well life proof reall...,27,Good,1


In [5]:
# Dropping all the null values based on cleaned_reviewText
# Only rows are filtered here; the index is not reset, so `df` keeps its original row
# labels and any dropped row leaves a gap in them.
df = df[df['cleaned_reviewText'].notna()]

In [6]:
# Number of reviews remaining after removing rows without cleaned text
len(df)

194176

In [8]:
# Constructing the Bag of words using CountVectorizer
# NOTE(review): `random_state` imported on the next line is not referenced anywhere in the
# visible notebook (`random_state=0` in the LDA cell is a keyword argument, not this name)
# and it comes from a private pandas module — looks like an accidental auto-import;
# confirm it is unused and remove it.
from pandas.core.common import random_state
vectorizer = CountVectorizer(      
                             min_df=10,                        # ignore terms found in fewer than 10 documents
                             stop_words='english',             # drop scikit-learn's built-in English stop words
                             lowercase=True,                   # lowercase the text before tokenizing
                             ngram_range=(1,3),                # unigrams, bigrams and trigrams
                             max_features=5000                 # keep only the 5000 most frequent terms in the corpus
                            )

In [9]:
# Vectorizing the lemmatized reviews
# Learns the vocabulary from `cleaned_reviewText` and returns the document-term count matrix
data_vectorized = vectorizer.fit_transform(df['cleaned_reviewText'])

In [10]:
# Running the model for 10 topics
lda_model = LatentDirichletAllocation(n_components=10, # Number of topics
                                            max_iter=10, # Maximum number of passes over the training data
                                            learning_method='online', # Mini-batch (online) variational Bayes updates
                                            random_state=0,        # Fixed seed so the topics are reproducible
                                            n_jobs = -1 , # Use all available CPUs
                                            learning_decay =0.9 # Learning-rate decay used by the online method
                                          )
# Fit the model and keep the document-topic distribution (one row per review, one column per topic)
lda_output = lda_model.fit_transform(data_vectorized)
# Perplexity is computed on the same matrix the model was fitted on, not on held-out data
print(lda_model.perplexity(data_vectorized))

1018.60124059011


In [11]:
# Enable inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()
# Build the interactive topic visualisation from the fitted model, the count matrix and the
# vectorizer, using t-SNE for the topic layout.
# NOTE(review): newer pyLDAvis releases appear to have renamed `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model` — confirm against the installed version.
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

  default_term_info = default_term_info.sort_values(


In [93]:
# Inspect the fitted topic-term weights (one row per topic, one column per vocabulary term)
lda_model.components_

array([[1.00454010e-01, 2.56170196e+01, 1.00007288e-01, ...,
        8.95570247e+01, 1.15675726e-01, 1.94486071e+00],
       [1.00013909e-01, 1.00035064e-01, 1.00002183e-01, ...,
        1.00102000e-01, 1.00006056e-01, 1.02354554e-01],
       [1.15867290e-01, 1.51095835e+02, 1.00008975e-01, ...,
        5.72101804e+00, 1.00242079e-01, 1.09844029e-01],
       ...,
       [2.64667239e+02, 9.55044006e+01, 1.00039893e-01, ...,
        2.03061696e+02, 2.94052101e+02, 1.41284977e-01],
       [1.00677211e-01, 4.08702998e+02, 1.00014510e-01, ...,
        8.72170515e+01, 1.00004668e-01, 1.00141209e-01],
       [4.74016917e-01, 2.38762529e+02, 2.67430411e+02, ...,
        1.85677949e+00, 1.00065206e-01, 1.00219566e-01]])

In [20]:
# Pulling out the top 20 words in a topic based on the topic_weights
import numpy as np
def show_topics(vectorizer=None, lda_model=None, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer, optional
        Object exposing ``get_feature_names_out()``. When omitted, the notebook-level
        ``vectorizer`` is looked up at call time.
    lda_model : fitted topic model, optional
        Object exposing ``components_`` (one row of term weights per topic). When
        omitted, the notebook-level ``lda_model`` is looked up at call time.
    n_words : int, default 20
        Number of terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.
    """
    # Defaults are resolved lazily instead of in the signature: a signature default is
    # frozen when the ``def`` cell runs (so it goes stale if the model is re-fitted), and
    # the model default used to point at ``lda_output`` — the document-topic array, which
    # has no ``components_`` attribute.
    if vectorizer is None:
        vectorizer = globals()['vectorizer']
    if lda_model is None:
        lda_model = globals()['lda_model']

    # Extracting the feature names out from the vectorizer
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []

    # Picking the top `n_words` indexes based on the weights of each individual topic and
    # mapping them back to the vocabulary terms
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Extract the 20 highest-weighted terms of every topic from the fitted model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

# Tabulate the keywords: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [f'Word {rank}' for rank in range(len(df_topic_keywords.columns))]
df_topic_keywords.index = [f'Topic {topic}' for topic in range(len(df_topic_keywords.index))]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,like,good,really,star,thing,say,review,bad,pretty,color,love,buy,try,day,product,know,way,different,problem,look
Topic 1,screen,protector,screen protector,product,great,easy,bubble,ok,come,use,awesome,iphone,clear,apply,scratch,buy,install,clean,glass,quality
Topic 2,nice,use,hold,stylus,mount,easy,like,design,work,pen,excellent,way,tip,small,device,holder,best,fit,nice case,touch
Topic 3,use,power,need,note,phone,plug,ipad,unit,galaxy,time,samsung,high,cell,provide,test,come,stand,adapter,feature,want
Topic 4,phone,cover,make,new,look,feel,want,like,screen,camera,use,htc,card,service,worth,button,think,sure,edge,good
Topic 5,work,great,phone,good,buy,work great,use,time,price,charger,recommend,cheap,item,need,car,far,purchase,month,product,fast
Topic 6,case,phone,fit,love,great,good,look,protection,iphone,like,protect,drop,price,great case,color,otterbox,cute,perfect,phone case,hard
Topic 7,battery,charge,life,day,hour,mah,battery life,perfect,phone,pack,time,battery pack,original,come,fully,capacity,good,recharge,long,fit
Topic 8,sound,headset,bluetooth,quality,use,good,ear,speaker,headphone,music,great,volume,pair,device,button,hear,sound quality,phone,voice,easy
Topic 9,charge,charger,device,cable,usb,product,iphone,port,great,car,apple,nexus,android,price,cord,use,great product,tablet,light,micro


In [21]:
# Hand-assigned theme label for each topic, listed in topic order (position i labels Topic i)
Topics_theme = [
    'Generic',                            # Topic 0
    'Screen Protector- Phone Accessory',  # Topic 1
    'Touch Screen Pens',                  # Topic 2
    'Phone Charge',                       # Topic 3
    'Phone Camera',                       # Topic 4
    'Overall Phone Experience',           # Topic 5
    'Phone Case',                         # Topic 6
    'Power Bank/Portability',             # Topic 7
    'Phone Audio/Connectivity',           # Topic 8
    'Phone Ports',                        # Topic 9
]

In [22]:
# Assigning the topic names 
# The list is matched to the rows by position, so its order must follow Topic 0..9
df_topic_keywords['topic_theme'] = Topics_theme

In [23]:
# Setting the index with topic names
# In-place: the 'topic_theme' column is moved into the index and the 'Topic N' labels are discarded
df_topic_keywords.set_index('topic_theme', inplace=True)

In [24]:
# Transposed view: one column per topic theme, one row per keyword rank
df_topic_keywords.T

topic_theme,Generic,Screen Protector- Phone Accessory,Touch Screen Pens,Phone Charge,Phone Camera,Overall Phone Experience,Phone Case,Power Bank/Portability,Phone Audio/Connectivity,Phone Ports
Word 0,like,screen,nice,use,phone,work,case,battery,sound,charge
Word 1,good,protector,use,power,cover,great,phone,charge,headset,charger
Word 2,really,screen protector,hold,need,make,phone,fit,life,bluetooth,device
Word 3,star,product,stylus,note,new,good,love,day,quality,cable
Word 4,thing,great,mount,phone,look,buy,great,hour,use,usb
Word 5,say,easy,easy,plug,feel,work great,good,mah,good,product
Word 6,review,bubble,like,ipad,want,use,look,battery life,ear,iphone
Word 7,bad,ok,design,unit,like,time,protection,perfect,speaker,port
Word 8,pretty,come,work,galaxy,screen,price,iphone,phone,headphone,great
Word 9,color,use,pen,time,camera,charger,like,pack,music,car


In [25]:
# Creating Document - Topic Matrix
# Rebinds `lda_output` (first assigned from fit_transform in the model-fitting cell) to the
# per-document topic distribution produced by the fitted model for the same count matrix
lda_output = lda_model.transform(data_vectorized)

In [26]:
# Inspect the sparse document-term matrix (shape, dtype and number of stored entries)
data_vectorized

<194176x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 6723045 stored elements in Compressed Sparse Row format>

In [27]:
# Topic theme labels: the index of the keyword table (i.e. the columns of its transpose)
topicnames = df_topic_keywords.index
# One "Doc<i>" label per review, numbered by row position in df
docnames = [f"Doc{position}" for position in range(df.shape[0])]

In [28]:
# Show the topic theme labels held in `topicnames`
topicnames

Index(['Generic', 'Screen Protector- Phone Accessory', 'Touch Screen Pens',
       'Phone Charge', 'Phone Camera', 'Overall Phone Experience',
       'Phone Case', 'Power Bank/Portability', 'Phone Audio/Connectivity',
       'Phone Ports'],
      dtype='object', name='topic_theme')

In [29]:
# Making a complete dataframe with Document-Topic Matrix
# Topic weights are rounded to 2 decimals; rows are labelled with `docnames`, columns with `topicnames`
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

In [30]:
# Display the rounded document-topic weights
df_document_topic

topic_theme,Generic,Screen Protector- Phone Accessory,Touch Screen Pens,Phone Charge,Phone Camera,Overall Phone Experience,Phone Case,Power Bank/Portability,Phone Audio/Connectivity,Phone Ports
Doc0,0.00,0.00,0.0,0.00,0.00,0.97,0.00,0.00,0.00,0.00
Doc1,0.70,0.00,0.0,0.00,0.00,0.17,0.00,0.00,0.00,0.11
Doc2,0.00,0.00,0.0,0.00,0.00,0.00,0.97,0.00,0.00,0.00
Doc3,0.00,0.97,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00
Doc4,0.00,0.82,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.15
...,...,...,...,...,...,...,...,...,...,...
Doc194171,0.00,0.00,0.0,0.05,0.00,0.72,0.00,0.20,0.00,0.00
Doc194172,0.00,0.00,0.0,0.03,0.00,0.46,0.00,0.05,0.00,0.45
Doc194173,0.00,0.06,0.0,0.00,0.08,0.27,0.00,0.13,0.06,0.39
Doc194174,0.29,0.00,0.0,0.00,0.15,0.16,0.18,0.21,0.00,0.00


In [32]:
# Getting the dominant topic for each document
# Take the argmax on the unrounded document-topic matrix: on the 2-decimal values stored
# in df_document_topic, near-equal weights can round to a tie and argmax would then return
# the lower-numbered topic instead of the truly dominant one.
dominant_topic = np.argmax(lda_output, axis=1)

# Creating a new column to place the values extracted
df_document_topic['dominant_topic'] = dominant_topic

# Resetting the index for the merge (the "Doc<i>" labels move into an 'index' column and
# the frame gets a fresh 0..n-1 index)
df_document_topic.reset_index(inplace=True)

# Merging both original reviews dataframe and df_document_topic dataframe.
# Row i of df_document_topic describes the i-th row of df *by position*, while df still
# carries the row labels it had before the null filter. Resetting df's index for the merge
# keeps the two aligned even when rows were dropped (gaps in the labels would otherwise
# pair reviews with the wrong topics and silently lose rows in the inner join).
df_sent_topic = pd.merge(df.reset_index(drop=True), df_document_topic, left_index=True, right_index=True)
# The "Doc<i>" label column is not needed in the merged frame
df_sent_topic.drop('index', axis=1, inplace=True)


In [33]:
# Display the document-topic frame, now carrying the 'index' label column and 'dominant_topic'
df_document_topic

topic_theme,index,Generic,Screen Protector- Phone Accessory,Touch Screen Pens,Phone Charge,Phone Camera,Overall Phone Experience,Phone Case,Power Bank/Portability,Phone Audio/Connectivity,Phone Ports,dominant_topic
0,Doc0,0.00,0.00,0.0,0.00,0.00,0.97,0.00,0.00,0.00,0.00,5
1,Doc1,0.70,0.00,0.0,0.00,0.00,0.17,0.00,0.00,0.00,0.11,0
2,Doc2,0.00,0.00,0.0,0.00,0.00,0.00,0.97,0.00,0.00,0.00,6
3,Doc3,0.00,0.97,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1
4,Doc4,0.00,0.82,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.15,1
...,...,...,...,...,...,...,...,...,...,...,...,...
194171,Doc194171,0.00,0.00,0.0,0.05,0.00,0.72,0.00,0.20,0.00,0.00,5
194172,Doc194172,0.00,0.00,0.0,0.03,0.00,0.46,0.00,0.05,0.00,0.45,5
194173,Doc194173,0.00,0.06,0.0,0.00,0.08,0.27,0.00,0.13,0.06,0.39,9
194174,Doc194174,0.29,0.00,0.0,0.00,0.15,0.16,0.18,0.21,0.00,0.00,0


In [35]:
# Selecting the required columns 
# Keeps the review fields plus the numeric dominant_topic code; the per-topic weight columns are left out
df_topic_theme = df_sent_topic.loc[:,['reviewerID','asin','reviewText','overall','summary','title','target','dominant_topic']]

# Function for transforming the dominant topics to topic names
def label_theme(row):
    """Translate a dominant-topic code (0-9) into its theme label.

    `row` is a single topic code (one cell value, despite the parameter name).
    Returns None when it matches none of the ten codes.
    """
    themes = (
        'Generic',
        'Screen Protector- Phone Accessory',
        'Touch Screen Pens',
        'Phone Charge',
        'Phone Camera',
        'Overall Phone Experience',
        'Phone Case',
        'Power Bank/Portability',
        'Phone Audio/Connectivity',
        'Phone Ports',
    )
    # Scan the codes in ascending order with ==, returning the label of the first match
    for code, theme in enumerate(themes):
        if row == code:
            return theme
    return None

In [36]:
# Map every numeric dominant_topic code to its theme label and store it in a new column
df_topic_theme['dominant_topic_theme'] = df_topic_theme['dominant_topic'].apply(label_theme)

In [37]:
# Display the reviews together with their dominant topic code and theme label
df_topic_theme

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,title,target,dominant_topic,dominant_topic_theme
0,A30TL5EWN6DFXT,120401325X,Looks Good Looks Good They look good and st...,4,Looks Good,,Good,5,Overall Phone Experience
1,ASY55RVNIL0UD,120401325X,Really great product. Really great product. ...,5,Really great product.,,Good,0,Generic
2,A2TMXE2AFO7ONB,120401325X,LOVE LOVE LOVE LOVE LOVE LOVE These are awe...,5,LOVE LOVE LOVE,,Good,6,Phone Case
3,AWJ0WZQYMYFQ4,120401325X,Cute! Cute! Item arrived in great time and ...,4,Cute!,,Good,1,Screen Protector- Phone Accessory
4,ATX7CZYFXI1KW,120401325X,leopard home button sticker for iphone 4s le...,5,leopard home button sticker for iphone 4s,,Good,1,Screen Protector- Phone Accessory
...,...,...,...,...,...,...,...,...,...
194171,A1YMNTFLNDYQ1F,B00LORXVUE,This works just perfect! This works just per...,5,This works just perfect!,,Good,5,Overall Phone Experience
194172,A15TX8B2L8B20S,B00LORXVUE,Great replacement cable. Apple certified Gre...,5,Great replacement cable. Apple certified,,Good,5,Overall Phone Experience
194173,A3JI7QRZO1QG8X,B00LORXVUE,Real quality Real quality This is a great c...,5,Real quality,,Good,9,Phone Ports
194174,A1NHB2VC68YQNM,B00LORXVUE,I really like it becasue it works well with my...,5,I really like it becasue it works well with my...,,Good,0,Generic
