# Topic Modelling 

### Importing libraries

In [62]:
import nltk; nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vibha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Vibha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [82]:

import numpy as np 
import pandas as pd
from pprint import pprint

# spacy for lemmatization
import spacy

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


In [64]:
! pip install pyLDAvis



In [65]:
# For visualization
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

#### Preparing stop words

In [66]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Dataset is obtained from kaggle https://www.kaggle.com/datasets/yasserh/amazon-product-reviews-dataset

In [67]:
# Loading the dataset.
data = pd.read_csv('amazon.csv')

In [68]:
data.shape

(1597, 27)

In [69]:
data.head(2)

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",,,Cristina M,,,205 grams
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,Allow me to preface this with a little history...,One Simply Could Not Ask For More,,,Ricky,,,205 grams


In [70]:
data.groupby(['id', 'reviews.text']).size()


id                    reviews.text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [71]:
data = pd.DataFrame(data)

###  Data pre-processing
Here I am using tokenization to split the raw text into small chunks of words or sentences called as tokens.

In [72]:

# Convert to list
data_1 = data['reviews.text'].values.tolist()


In [73]:
print(data_1[0][0:90])

I initially had trouble deciding between the paperwhite and the voyage because reviews mor


### Performing lemmatization

In [74]:
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    texts_out=[]
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return(texts_out)
lemmatized_txt = lemmatization(data_1)
print(lemmatized_txt[0][:90])

initially trouble decide paperwhite voyage review more less say same thing paperwhite grea


In [75]:
def sentences_to_words(texts):
    for sentence in texts:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
    return(sentence)
data_words = list(sentences_to_words(lemmatized_txt))
print(data_words[0][:90])
print(data_words[:1])

['initially', 'trouble', 'decide', 'paperwhite', 'voyage', 'review', 'more', 'less', 'say', 'same', 'thing', 'paperwhite', 'great', 'spend', 'money', 'go', 'voyage', 'fortunately', 'friend', 'own', 'so', 'end', 'buy', 'paperwhite', 'basis', 'model', 'now', 'ppi', 'dollar', 'jump', 'turn', 'pricey', 'voyage', 'page', 'press', 'always', 'sensitive', 'fine', 'specific', 'setting', 'need', 'auto', 'light', 'week', 'love', 'paperwhite', 'regret', 'touch', 'screen', 'receptive', 'easy', 'use', 'keep', 'light', 'specific', 'setting', 'regardless', 'time', 'day', 'case', 'hard', 'change', 'setting', 'either', 'only', 'change', 'light', 'level', 'certain', 'time', 'day', 'now', 'then', 'glad', 'go', 'international', 'shipping', 'option', 'extra', 'expense', 'delivery', 'time', 'tracking', 'need', 'worry', 'custom', 'use', 'third', 'party', 'shipping']
[['initially', 'trouble', 'decide', 'paperwhite', 'voyage', 'review', 'more', 'less', 'say', 'same', 'thing', 'paperwhite', 'great', 'spend', 'mo

Now here I have the stop words removed and have a cleaner text.

### Dictionary to look up words and their frequency

In [76]:
id_to_word = corpora.Dictionary(data_words)
# Create Corpus
new = data_words

# Term Document Frequency
corpus = [id_to_word.doc2bow(text) for text in new]
print(corpus[0][:20])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]


In [77]:
from collections import Counter
bow_simple = Counter(id_to_word)
# Print the 10 most common tokens
print(bow_simple.most_common(10))

[(660, 'zw'), (3075, 'zooming'), (3320, 'zoom'), (3713, 'zombie'), (1974, 'zippy'), (962, 'zip'), (3418, 'zen'), (3282, 'yr'), (3660, 'ypithvtylczt'), (1312, 'youtube')]


### Building a topic model

In [78]:
LDA_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id_to_word,
                                           num_topics=20, random_state=100,
                                           update_every=1, chunksize=100,
                                           passes=10, alpha='auto',
                                           per_word_topics=True)

In [79]:
pprint(LDA_model.print_topics())
doc_LDA = LDA_model[corpus]

[(0,
  '0.106*"tablet" + 0.063*"model" + 0.053*"complete" + 0.050*"screen" + '
  '0.045*"first" + 0.038*"new" + 0.035*"touch" + 0.030*"early" + '
  '0.027*"consider" + 0.026*"figure"'),
 (1,
  '0.087*"suppose" + 0.072*"music" + 0.047*"exactly" + 0.045*"average" + '
  '0.044*"free" + 0.043*"travel" + 0.043*"safe" + 0.041*"mean" + 0.023*"more" '
  '+ 0.021*"so"'),
 (2,
  '0.040*"work" + 0.038*"just" + 0.037*"use" + 0.036*"design" + 0.033*"nice" + '
  '0.033*"set" + 0.029*"really" + 0.028*"so" + 0.026*"love" + 0.024*"buy"'),
 (3,
  '0.051*"read" + 0.039*"light" + 0.035*"easily" + 0.033*"way" + 0.031*"so" + '
  '0.028*"finish" + 0.028*"book" + 0.027*"line" + 0.023*"turn" + 0.021*"look"'),
 (4,
  '0.046*"sound" + 0.044*"well" + 0.041*"feel" + 0.033*"more" + '
  '0.029*"product" + 0.027*"review" + 0.027*"people" + 0.026*"year" + '
  '0.025*"think" + 0.025*"purchase"'),
 (5,
  '0.117*"question" + 0.105*"ask" + 0.083*"answer" + 0.076*"replace" + '
  '0.047*"drop" + 0.046*"month" + 0.042*"clean

### Compute model and coherence score
to judge how good the given topic model is 

In [80]:
# Coherence score 
lda_coherence_model = CoherenceModel(model=LDA_model, texts=data_words, dictionary=id_to_word, coherence='c_v')
coherence_lda= lda_coherence_model.get_coherence()
print("Coherence score: ", coherence_lda)

Coherence score:  0.4096251425298079


### Visualization

In [81]:
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(LDA_model, corpus, id_to_word,mds='mmds')
vis

  default_term_info = default_term_info.sort_values(
