In [1]:
#pip install --upgrade gensim
#pip install tqdm
#pip install pyldavis
# pickle is part of the Python standard library, so it needs no installation

In [1]:
# AWS SDK (used below for the S3 uploads)
import boto3

# NLP things
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# visualisation
import pyLDAvis.gensim 
import pyLDAvis

# import others
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import io
from io import StringIO
import string
import re

import os
import json
import time
from datetime import datetime, timedelta
import pickle
from pprint import pprint
import sys
import urllib.parse
import csv



In [2]:
# Download the NLTK resources this notebook relies on:
# the stop-word lists and the 'punkt' tokenizer data
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

from nltk.corpus import stopwords
# English stop words, held as a set
stops = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#########################################
##  TEXT CLEANING FUNCTIONS
#########################################

# Function for deleting emoji
# This function is from Adam (2018): https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def deleteEmojis(text):
    """Replace every run of emoji-like characters in ``text`` with one space.

    The character class is assembled from the code-point ranges listed below.
    Note that the last two ranges are very wide (U+24C2-U+1F251 and
    U+10000-U+10FFFF), so they also match many non-emoji characters such as
    CJK ideographs; those are replaced too.

    :param text: the string to clean
    :return: ``text`` with each matched run substituted by a single space
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U00010000-\U0010ffff",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r' ', text)

# Function for deleting default tags or labels in the tweets like 'VIDEO:' and 'AUDIO:'
# This function is partly from Bica (2010): https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def remove_tweet_marks(tweet):
    """Delete boilerplate markers from a tweet.

    Every occurrence (not only a leading one) of 'VIDEO:', 'AUDIO:', '&amp'
    and 'RT @' is removed, in that order. Nothing is inserted in their place.

    :param tweet: the raw tweet text
    :return: the tweet without those markers
    """
    for marker in ('VIDEO:', 'AUDIO:', '&amp', 'RT @'):
        tweet = re.sub(marker, '', tweet)
    return tweet

# Function for expanding contractions
# This function is from Dubois (2017): https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rules are applied one after another in the order listed, so the two
    irregular forms are handled before the generic suffix rules.

    :param phrase: text that may contain contractions written with a straight apostrophe
    :return: the text with the contractions expanded
    """
    rewrite_rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrite_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

# Function for tweet text cleaning
# This function is partly from Zx81 (2014): https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet/24399874
# This function is partly from Oneporter (2014): https://stackoverflow.com/questions/817122/delete-digits-in-python-regex
def text_cleaning(data):
    """Normalise a pandas Series of raw text, one cleaning pass at a time.

    Passes, in order: emoji removal, URL removal, tweet-marker removal,
    lower-casing, contraction expansion, punctuation -> space, removal of
    single letters, removal of stand-alone numbers, whitespace collapsing.

    :param data: pandas Series of strings
    :return: a new Series holding the cleaned strings
    """
    def _replacing(pattern, replacement):
        # Build the per-string substitution that Series.map applies
        return lambda text: re.sub(pattern, replacement, text)

    cleaned = data.map(deleteEmojis)
    # anything starting with "http" up to the next whitespace is treated as a URL
    cleaned = cleaned.map(_replacing(r"http\S+", ""))
    # 'VIDEO:', 'AUDIO:' and similar markers
    cleaned = cleaned.map(remove_tweet_marks)
    cleaned = cleaned.str.lower()
    cleaned = cleaned.map(decontracted)
    # punctuation and symbols become spaces
    cleaned = cleaned.map(_replacing(r'[,\.!?:;@#&*$¥+~•₹€£=—\-\–\\→\⇢\<\>\|\“\”\’\{\}\'\"\`\[\]\(\)_\-\%\/]', ' '))
    # single letters surrounded by whitespace; the match consumes that
    # whitespace, so a letter at the very start/end of the text, or directly
    # after another removed letter, is left in place
    cleaned = cleaned.map(_replacing(r'\s+[a-zA-Z]\s+', ' '))
    # only stand-alone numbers are dropped: product names or terms such as
    # "P50", "4g", "5g" contain digits and must survive
    cleaned = cleaned.map(_replacing(r'\b\d+\b', ' '))
    # collapse runs of whitespace into one space
    cleaned = cleaned.map(_replacing(r'\s+', ' '))

    # TODO: delete records that are too short? Their length could be computed
    # from the word tokens.

    return cleaned

In [None]:
# This function is for tokenising sentences in the corpus
def tokenising_corpus(data):
    """Tokenise every document in ``data`` with NLTK's ``word_tokenize``.

    :param data: object exposing ``tolist()`` (a pandas Series of strings here)
    :return: list with one token list per document, in the original order
    """
    return [word_tokenize(sentence) for sentence in data.tolist()]

In [24]:
# This function allows user to remove stopwords
# and also allow to specify and remove some irrelevant words in this case (such sentiments) for tuning the model
# This will only apply to the first LDA model for get more clear topics
def custom_words_remover(word_lst, text_tokens):
    """Return a copy of ``text_tokens`` with every word from ``word_lst`` removed.

    :param word_lst: iterable of (hashable) words to drop, e.g. a stop-word
        set or a custom list of sentiment words
    :param text_tokens: iterable of tokenised documents (each a sequence of tokens)
    :return: list of new token lists, one per document; the inputs are not modified
    """
    # Materialise the unwanted words once: membership tests become O(1)
    # instead of scanning a list for every token, and a one-shot iterator
    # passed as word_lst is no longer exhausted after the first lookups.
    unwanted = frozenset(word_lst)
    return [[w for w in tokens if w not in unwanted] for tokens in text_tokens]

In [None]:
# This function is for stemming the words in the corpus
def stemming_words(text_tokens):
    """Stem every token of every document with NLTK's ``PorterStemmer``.

    :param text_tokens: iterable of tokenised documents
    :return: list of lists holding the stemmed tokens, same order as the input
    """
    stemmer = PorterStemmer()
    return [list(map(stemmer.stem, tokens)) for tokens in text_tokens]

In [5]:
# Function for making bigrams
# This function is from Prabhakaran (2018): https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def make_bigrams(data,min_count,thres):
    """Detect frequent word pairs in ``data`` and apply them to every document.

    :param data: iterable of tokenised documents; it is iterated twice
        (once to fit the phrase model, once to transform)
    :param min_count: passed to gensim ``Phrases`` as ``min_count``
    :param thres: passed to gensim ``Phrases`` as ``threshold``
        (a higher threshold yields fewer phrases)
    :return: list with the phrase-transformed version of each document
    """
    phrase_model = gensim.models.Phrases(data, min_count=min_count, threshold=thres)

    # Wrap the fitted model in a Phraser, which is then used for the transform
    frozen_phrases = gensim.models.phrases.Phraser(phrase_model)

    merged_docs = []
    for doc in data:
        merged_docs.append(frozen_phrases[doc])
    return merged_docs

In [42]:
# Compute the perplexity and coherence score of the model
# This function is from Kapadia (2019):https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
def model_benchmarking(data, model, dictionary, corpus):
    """Print the log-perplexity and the c_v coherence of a fitted topic model.

    :param data: tokenised texts handed to the coherence model
    :param model: fitted model exposing ``log_perplexity``
    :param dictionary: gensim dictionary used to build ``corpus``
    :param corpus: bag-of-words corpus the perplexity is evaluated on
    :return: None; both scores are only printed
    """
    perplexity = model.log_perplexity(corpus)
    print('\nPerplexity: ', perplexity)

    scorer = CoherenceModel(coherence='c_v',
                            dictionary=dictionary,
                            texts=data,
                            model=model)
    coherence_score = scorer.get_coherence()

    print('\nCoherence Score: ', coherence_score)

In [None]:
# Supporting function of the model tuning: build individual lda model and compute its coherence
# This function is from Kapadia (2019):https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
from gensim.models import CoherenceModel

def compute_coherence_values_basic(data,corpus,dictionary,k,alpha):
    """Fit one LDA model and return its c_v coherence.

    :param data: tokenised texts handed to the coherence model
    :param corpus: bag-of-words corpus the model is trained on
    :param dictionary: gensim dictionary matching ``corpus``
    :param k: number of topics
    :param alpha: document-topic prior passed straight to ``LdaMulticore``
    :return: the coherence score of the fitted model
    """
    # Fixed seed and pass count so that candidate models are comparable
    lda_settings = dict(corpus=corpus,
                        id2word=dictionary,
                        num_topics=k,
                        alpha=alpha,
                        random_state=5,
                        passes=10)
    candidate = gensim.models.LdaMulticore(**lda_settings)

    scorer = CoherenceModel(model=candidate,
                            texts=data,
                            dictionary=dictionary,
                            coherence='c_v')
    return scorer.get_coherence()

In [None]:
# This function search the optimal hyperparameter settings for the lda model
# Similar to the grid search 
# It can take a long time to run
# This function is from Kapadia (2019):https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
def tuning_lda_model(data,corpus,dictiornay, min_topics, max_topics, step_size):
    """Grid-search the number of topics and alpha for an LDA model.

    One model is trained and scored (c_v coherence) for every combination of
    corpus variant, alpha value and topic count. This can take a long time.

    :param data: tokenised texts handed to the coherence model
    :param corpus: bag-of-words corpus; must support ``len()``
    :param dictiornay: gensim dictionary matching ``corpus`` (the misspelt
        name is kept because callers pass it by keyword)
    :param min_topics: first topic count to try
    :param max_topics: upper bound of the topic counts, NOT included
        (the counts come from ``range(min_topics, max_topics, step_size)``)
    :param step_size: step between topic counts
    :return: dict of equally long lists under the keys 'Validation_Set',
        'Alpha', 'Topics' and 'Coherence', one entry per trained model
    """
    # tqdm is a progress bar for visualising the cost time
    import tqdm

    # Set topics range
    topics_range = range(min_topics, max_topics, step_size)

    # Alpha parameter: numeric priors plus gensim's two named strategies
    alpha = [0.01, 0.1, 0.3, 0.6, 1, 'symmetric', 'asymmetric']

    # Two corpus variants: the first 75% of the documents (via ClippedCorpus)
    # and the complete corpus
    num_of_docs = len(corpus)
    corpus_sets = [ gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
                    corpus]
    corpus_title = ['75% Corpus', '100% Corpus']

    model_results = {'Validation_Set': [],
                     'Alpha':[],
                     'Topics': [],
                     'Coherence': []}

    # Number of models to train, computed directly instead of by looping
    t = len(corpus_sets) * len(alpha) * len(topics_range)
    print('iteration times: ',t)

    # The context manager closes the bar even if a model fails to train
    with tqdm.tqdm(total=t) as pbar:
        # iterate through validation corpuses
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through alpha values
            for a in alpha:
                # iterate through number of topics
                for k in topics_range:
                    # Compute the coherence for each model
                    cv = compute_coherence_values_basic(data=data,
                                                        corpus=corpus_set,
                                                        dictionary=dictiornay,
                                                        k=k,
                                                        alpha=a)
                    # Save the model results
                    model_results['Validation_Set'].append(title)
                    model_results['Alpha'].append(a)
                    model_results['Topics'].append(k)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)

    return model_results

In [None]:
# This function creates the document-topic matrix,
# which shows each individual document's probability for every topic
# This function is from Wang (2019): https://stackoverflow.com/questions/56408849/after-applying-gensim-lda-topic-modeling-how-to-get-documents-with-highest-prob

def create_doc_topic_matrix(model, corpus, num_topics):
    """Build the document-topic probability matrix.

    :param model: fitted topic model; ``model[bow]`` must yield
        ``(topic_id, probability)`` pairs for one bag-of-words document
    :param corpus: iterable of bag-of-words documents
    :param num_topics: number of topics, i.e. number of columns of the result
    :return: DataFrame with one row per document (corpus order) and one column
        per topic id (integer column labels ``0 .. num_topics-1``)
    :raises KeyError: if the model reports a topic id outside ``0 .. num_topics-1``
    """
    rows = []
    for bow in corpus:
        # Start from 0.0 for every topic: the model may leave out topics whose
        # probability falls below its minimum_probability. Collecting the
        # probabilities per topic would then give columns of different
        # lengths and the DataFrame could not be built.
        row = [0.0] * num_topics
        for topic_id, prob in model[bow]:
            if not 0 <= topic_id < num_topics:
                raise KeyError(topic_id)
            row[topic_id] = prob
        rows.append(row)

    # Create documents-topic matrix
    doc_topic = pd.DataFrame(rows, columns=list(range(num_topics)))

    return doc_topic

In [88]:
# Function for creating eta matrix for training the guided lda model
# the eta matrix can be used as a prior belief on word probability
# can be use to assign probabilities for each word-topic combination
def create_eta_matrix(num_topics,top_n,lda_model,id2word,
                      base_prior=0.001, top_tier_size=10,
                      top_tier_boost=0.10, lower_tier_boost=0.05):
    """Create the eta (topic-word prior) matrix for training a guided LDA model.

    Every cell starts at ``base_prior``. For each topic, the ``top_n`` terms
    reported by ``lda_model.get_topic_terms`` are overwritten with their model
    probability plus a boost: the first ``top_tier_size`` terms receive
    ``top_tier_boost``, the remaining ones ``lower_tier_boost``.

    :param num_topics: number of topics (rows of the matrix)
    :param top_n: how many top terms per topic are boosted
    :param lda_model: fitted model exposing ``get_topic_terms(topicid, topn)``
        that yields ``(term_id, probability)`` pairs, best term first
    :param id2word: dictionary whose ``token2id`` size gives the column count
    :param base_prior: value of every non-boosted cell (default 0.001)
    :param top_tier_size: number of leading terms in the first tier (default 10)
    :param top_tier_boost: amount added for first-tier terms (default 0.10)
    :param lower_tier_boost: amount added for the other boosted terms (default 0.05)
    :return: float array of shape ``(num_topics, len(id2word.token2id))``
    """
    # get dictionary length
    dic_len = len(id2word.token2id)
    eta_matrix = np.full((num_topics, dic_len), base_prior, dtype=float)

    for topic_i in range(num_topics):
        top_words = lda_model.get_topic_terms(topicid=topic_i,topn=top_n)
        # enumerate gives the rank directly; looking the pair up again with
        # list.index() rescans the list for every term and requires a list
        for rank, (term_id, prob) in enumerate(top_words):
            boost = top_tier_boost if rank < top_tier_size else lower_tier_boost
            eta_matrix[topic_i][term_id] = prob + boost

    return eta_matrix

In [86]:
# Function is for deleting abandoned topic from the eta matrix
# This function is from Deshpande (2012): https://stackoverflow.com/questions/3877491/deleting-rows-in-numpy-array
def abandon_topic(topic_id, matrix):
    """Return a copy of ``matrix`` without the row(s) given by ``topic_id``.

    :param topic_id: row index (or indices) to remove along axis 0
    :param matrix: the eta matrix, one row per topic; it is not modified
    :return: new array with the selected row(s) removed
    """
    return np.delete(matrix, topic_id, axis=0)

In [None]:
# save the model to model_path
def save_lda_model(model, model_name, save_path):
    """Save ``model`` under ``save_path`` and list the files that belong to it.

    :param model: object exposing ``save(path)`` (a gensim model here)
    :param model_name: base name; the main file is ``<model_name>.model``
    :param save_path: target directory, with or without a trailing separator;
        it is created when missing, and an empty string means the current
        directory
    :return: sorted list of the file names in ``save_path`` that start with
        ``<model_name>.model`` (the main file plus its companion files)
    """
    target_dir = save_path or '.'
    os.makedirs(target_dir, exist_ok=True)

    base_name = '{}.model'.format(model_name)
    model.save(os.path.join(target_dir, base_name))

    # List the directory that was actually written to (the parameter, not a
    # notebook-level global), and match on the full '<name>.model' prefix so
    # that files of other models sharing the name prefix are not picked up.
    components = sorted(file for file in os.listdir(target_dir)
                        if file.startswith(base_name))

    return components

In [None]:
# Function for uploading eta matrix and list of component to S3
# This function is from Shabani (2018): https://stackoverflow.com/questions/49120069/writing-a-pickle-file-to-an-s3-bucket-in-aws
def file_upload_helper(file, file_name, bucket_name):
    """Pickle ``file`` and store it in S3 as ``<file_name>.pkl``.

    :param file: any picklable Python object (not a path)
    :param file_name: object key without the ``.pkl`` extension
    :param bucket_name: name of the destination S3 bucket
    :return: None; prints 'Success' after the upload call returns
    """
    # serialise first, then hand the bytes to S3
    payload = pickle.dumps(file)
    key = '{}.pkl'.format(file_name)

    destination = boto3.resource('s3').Object(bucket_name, key)
    destination.put(Body=payload)
    print('Success')

In [None]:
# Function for uploading model to S3
# This function is from Sophros (2020): https://stackoverflow.com/questions/61638940/save-a-gensim-lda-model-to-s3
def model_upload_helper(file_lst, local_path, bucket_name):
    """Upload the listed local files to an S3 bucket.

    :param file_lst: iterable of file names; each name is also used as the S3 key
    :param local_path: directory holding the files, with or without a
        trailing separator
    :param bucket_name: name of the destination S3 bucket
    :return: None; prints one confirmation line per uploaded file
    """
    # One client serves all uploads; there is no need to build a new S3
    # resource for every file.
    s3_client = boto3.resource('s3').meta.client
    for file_name in file_lst:
        # os.path.join works whether or not local_path ends with a separator
        file_path = os.path.join(local_path, file_name)
        s3_client.upload_file(file_path, bucket_name, file_name)
        print('successfully upload ' + file_name)

In [6]:
# read the data
bucket_name = "proxy-data-and-pre-collected-data-for-training"
file_key_text = "20191226-reviews.csv"
file_key_brand = "20191226-items.csv"

# Derive the S3 URIs from the bucket and keys above so the values cannot
# drift apart (they were previously repeated as separate literals).
data_location_text = "s3://{}/{}".format(bucket_name, file_key_text)
data_location_brand = "s3://{}/{}".format(bucket_name, file_key_brand)

In [7]:
# Load the reviews table and the items table (in that order) from their S3 locations
df_text, df_brand = (
    pd.read_csv(location)
    for location in (data_location_text, data_location_brand)
)

In [8]:
df_text.head()

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,amy m. teague,3,"March 18, 2004",False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,tristazbimmer,4,"August 28, 2005",False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0


In [9]:
# Keep only the join key and the brand from the items table
df_brand = df_brand.loc[:, ['asin', 'brand']]
# Left-join so that every review is kept, then discard the columns that are
# not needed for the analysis
df = (
    df_text
    .merge(df_brand, how='left', on='asin')
    .drop(columns=['asin', 'name', 'date', 'verified'])
)
df.head()

Unnamed: 0,rating,title,body,helpfulVotes,brand
0,3,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0,
1,1,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0,
2,5,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0,
3,3,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0,
4,4,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0,


In [10]:
# Initial exploratory data analysis: non-null counts of every column per brand
df.groupby('brand').count()

Unnamed: 0_level_0,rating,title,body,helpfulVotes
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ASUS,251,251,251,132
Apple,5145,5144,5145,1792
Google,3787,3787,3786,1743
HUAWEI,2225,2225,2225,1006
Motorola,8880,8880,8879,3686
Nokia,5915,5915,5914,2750
OnePlus,347,347,347,171
Samsung,33629,33616,33612,12567
Sony,3196,3196,3195,1676
Xiaomi,4411,4411,4411,1557


In [11]:
# Number of missing values in each column
print(df.isna().sum())

rating              0
title              14
body               21
helpfulVotes    40771
brand             200
dtype: int64


In [12]:
# Drop the reviews whose title or body is missing
df = df.dropna(subset=['title', 'body'], how='any')

In [13]:
# Renumber the rows 0..n-1 after the dropna above; drop=True discards the old
# index directly instead of adding it as an 'index' column and dropping it again
df = df.reset_index(drop=True)

In [14]:
# The text to analyse is the review title followed by the body, separated by one space
df = df.assign(text=df['title'] + ' ' + df['body'])

In [15]:
df.head()

Unnamed: 0,rating,title,body,helpfulVotes,brand,text
0,3,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0,,"Def not best, but not worst I had the Samsung ..."
1,1,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0,,Text Messaging Doesn't Work Due to a software ...
2,5,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0,,"Love This Phone This is a great, reliable phon..."
3,3,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0,,"Love the Phone, BUT...! I love the phone and a..."
4,4,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0,,"Great phone service and options, lousy case! T..."


In [16]:
#########################################
##  BASIC CLEANING AND TEXT PROCESSING
#########################################

# Replace the raw text column with its cleaned version
df = df.assign(text=text_cleaning(df['text']))

In [19]:
# Split every cleaned review into word tokens
word_tokens = tokenising_corpus(df['text'])

# Keep the unfiltered tokens on the frame for later reference
df['text_tokens'] = word_tokens

# Number of tokens in each review
df['text_len'] = df['text_tokens'].map(len)

In [25]:
###################################
#      Deleting stop words
###################################

# English stop words from NLTK, held as a set
stops = {*stopwords.words("english")}
# Strip those stop words from every tokenised review
filtered_tokens = custom_words_remover(word_lst=stops, text_tokens=word_tokens)

In [None]:
###################################
#           Stemming
###################################
# Stem the stop-word-filtered tokens
stemmed = stemming_words(text_tokens=filtered_tokens)

In [20]:
'''
When keeping the emotional words, the LDA model tends to classify topics based on positive and negative sentiments, 
rather than based on single functions. 
For example, the negative comments about the price and the negative comments about the battery
will be put under one topic. In this way, reviews about one single function might be scattered on several topics. 
However, the focus of the first LDA model should be to extract hot words about specific cellphone features or functions
and to generate the corresponding eta matrix.
Therefore, when tuning the model, irrelevant words that express sentiments should be removed from the texts.

"star" will also be deleted, 
because in this case "star" is typically used to describe the sentimental polarity of customers (5 star = best while 1star = worst)

Another example is that in the first model, the topic "screen" might be associated with strongly positive attitudes,
but when it comes to the second model, most people might actually complain about the screen of the Huawei P50
and associate negative words with the topic "screen".
'''

'\nWhen keeping the emotional words, the LDA model tends to classify topics based on positive and negative sentiments, \nrather than based on single functions. \nFor example, the negative comments about the price and the negative comments about the battery\nwill be put under one topic. In this way, reviews about one single function might be scattered on several topics. \nHowever, the focus of the first LDA model should be extract hot words about specific cellphone features or functions \nand generate the corresponding eta matrix.  \nTherefore, when tuning the model, irrelevant words that express sentiments should be removed from the texts.\n\n"star" will also be deleted, \nbecause in this case "star" is typically used to describe the sentimental polarity of customers (5 star = best while 1star = worst)\n\nAnother example is that in the first model, the topic "screen" might associate with strongly positive attitudes, \nbut when it comes to the second model, most people might actually co

In [30]:
# Use defined function to specify some irrelevant words in this case (such sentiments) for tuning the model
# This will only apply to the first LDA model for get more clear topics

# Sentiment-bearing words treated as extra stop words. The tokens are already
# stemmed at this point, hence the stemmed spellings ('happi', 'realli', ...).
# The duplicated 'thank' entry has been removed.
newstopwords = ['like', 'good', 'better', 'best', 'bad', 'worse', 'worst', 'happi', 'great', 'really','realli',
                'love', 'lov', 'also', 'awesome','awesom','amaz','lousi','far','well','perfectli','ok',
                'ever','perfect','fun','excelent','excel','excelled','absolut','less','much','more','fewer','fine','finest',
                'exactli','poor','pleas','glad','veri','high','terribl','minim','never','even','thank','gift','star']

# delete new added stopwords from the text tokens
df['processed_tokens'] = custom_words_remover(newstopwords, stemmed)

In [25]:
# Phrase modelling: merge frequent word pairs into bigram tokens.
# min_count=10, threshold=100 (a higher threshold yields fewer phrases).
# Note: this overwrites 'processed_tokens', so run the cell only once per
# preprocessing pass.
df = df.assign(
    processed_tokens=make_bigrams(df['processed_tokens'], 10, 100)
)

In [34]:
# The title and body now live in 'text', so the two source columns can go
df = df.drop(columns=['title', 'body'])

In [40]:
df.head()

Unnamed: 0,rating,helpfulVotes,brand,text,text_tokens,text_len,processed_tokens
0,3,1.0,,def not best but not worst had the samsung a60...,"[def, not, best, but, not, worst, had, the, sa...",313,"[def, samsung, a600, awhil, doo, doo, read, re..."
1,1,17.0,,text messaging does not work due to software i...,"[text, messaging, does, not, work, due, to, so...",136,"[text, messag, work, due, softwar, issu, nokia..."
2,5,5.0,,love this phone this is great reliable phone a...,"[love, this, phone, this, is, great, reliable,...",126,"[phone, reliabl, phone, purchas, phone, samsun..."
3,3,1.0,,love the phone but love the phone and all beca...,"[love, the, phone, but, love, the, phone, and,...",101,"[phone, phone, need, one, expect, price, bill,..."
4,4,1.0,,great phone service and options lousy case the...,"[great, phone, service, and, options, lousy, c...",132,"[phone, servic, option, case, phone, everi, pu..."


In [28]:
#########################################################
#  Create word dictionary and bag of words (bow) corpus
#########################################################
# The tokenised documents that make up the corpus
texts = df['processed_tokens']

# id2word assigns an integer id to every distinct token of the documents
id2word = corpora.Dictionary(texts)

# Bag-of-words form of each document:
# a list of (term id, number of occurrences in that document) pairs
corpus = list(map(id2word.doc2bow, texts))


In [43]:
##########################################################################
##            Building the basic LDA model
##########################################################################
# assumed number of topics for the baseline model
num_topics = 10

# Fit the baseline LDA model.
# minimum_probability=0 is passed so that low-probability topics are not
# filtered out of the per-document topic distributions; the document-topic
# matrix built later needs a probability for every topic.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    minimum_probability=0,
    random_state=5,
    passes=10,
)

In [44]:
# Print the perplexity and coherence score of the baseline model
model_benchmarking(data=df['processed_tokens'],
                   model=lda_model,
                   dictionary=id2word,
                   corpus=corpus)


Perplexity:  -7.131014341772236

Coherence Score:  0.4913308492786602


In [None]:
##########################################################################
##                  Tuning the LDA model on proxy data
##########################################################################
# Grid-search the LDA hyperparameters with the helper defined earlier.
# It can take a long time to run.
# The topic counts come from range(min_topics, max_topics, step_size), so
# with 5 and 6 only k=5 is evaluated; widen the range to compare more values.
model_results = tuning_lda_model(
    data=df['processed_tokens'],
    corpus=corpus,
    dictiornay=id2word,
    min_topics=5,
    max_topics=6,
    step_size=1,
)

# Tabulate the results and persist them as CSV
model_results_df = pd.DataFrame(model_results)
model_results_df.to_csv("lda_on_proxy_tuning_results.csv")

In [79]:
##########################################################################
##          Building the LDA model (with optimal parameter)
##########################################################################

# Optimal model after tuning: 
# Hyperparameters: num_topics = 11, alpha = 'symmetric', passes =10

# Number of topics chosen after tuning
num_topics = 11

# Refit the LDA model with the selected hyperparameters.
# minimum_probability=0 is passed so that low-probability topics are not
# filtered out of the per-document topic distributions.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha='symmetric',
    minimum_probability=0,
    random_state=5,
    passes=10,
)

# Perplexity:  -7.206703803237903
# Coherence Score:  0.5061053071793964

  0%|          | 0/14 [23:15<?, ?it/s]


In [80]:
# Print the perplexity and coherence score of the tuned model
model_benchmarking(data=df['processed_tokens'],
                   model=lda_model,
                   dictionary=id2word,
                   corpus=corpus)


Perplexity:  -7.206703803237903

Coherence Score:  0.5061053071793964


In [32]:
# from pprint import pprint
# print the top 20 keywords under each topic
pprint(lda_model.print_topics(num_words=20))

[(0,
  '0.061*"phone" + 0.055*"call" + 0.032*"work" + 0.021*"speaker" + '
  '0.021*"issu" + 0.020*"wifi" + 0.019*"connect" + 0.016*"problem" + '
  '0.016*"sound" + 0.015*"hear" + 0.014*"time" + 0.011*"volum" + '
  '0.011*"bluetooth" + 0.011*"sometim" + 0.010*"para" + 0.010*"make" + '
  '0.010*"freez" + 0.010*"signal" + 0.009*"slow" + 0.009*"precio"'),
 (1,
  '0.056*"phone" + 0.026*"use" + 0.015*"text" + 0.013*"call" + 0.010*"get" + '
  '0.010*"need" + 0.010*"want" + 0.009*"one" + 0.008*"go" + 0.008*"would" + '
  '0.008*"featur" + 0.008*"time" + 0.007*"messag" + 0.007*"nokia" + '
  '0.007*"make" + 0.006*"thing" + 0.005*"set" + 0.005*"window" + 0.005*"iphon" '
  '+ 0.005*"work"'),
 (2,
  '0.168*"batteri" + 0.091*"phone" + 0.076*"life" + 0.056*"five" + '
  '0.040*"charg" + 0.032*"day" + 0.030*"last" + 0.027*"fast" + 0.025*"product" '
  '+ 0.022*"work" + 0.015*"long" + 0.013*"four" + 0.013*"hour" + 0.013*"use" + '
  '0.012*"expect" + 0.009*"time" + 0.008*"samsung" + 0.007*"nice" + '
  '0.0

In [None]:
# Visualisation
# Render the fitted topics as an interactive pyLDAvis view inside the notebook.
# NOTE(review): "lambda" presumably refers to the relevance slider of the
# pyLDAvis view, for which 0.6 was found to work well — confirm.
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [82]:
##########################################################
##   Check topics and contents under each topic
##########################################################
# 'test' is another name for df (no copy is made)
test = df
texts = test['processed_tokens']
# Bag-of-words corpus built with the dictionary stored on the fitted model
corpus_new = list(map(lda_model.id2word.doc2bow, texts))

In [84]:
# Document-topic matrix: one row per review, one column per topic,
# holding the probability of that topic for that review
doc_topic = create_doc_topic_matrix(lda_model, corpus_new, num_topics)
print(doc_topic.head())

         0         1         2         3         4         5         6   \
0  0.000612  0.284389  0.000612  0.070213  0.034841  0.000612  0.000612   
1  0.001340  0.394747  0.001340  0.001340  0.130991  0.214136  0.001340   
2  0.001526  0.465676  0.001526  0.146594  0.001526  0.242606  0.001526   
3  0.002331  0.297373  0.101239  0.113062  0.002332  0.002332  0.002331   
4  0.001653  0.001654  0.001654  0.001654  0.045375  0.001654  0.001653   

         7         8         9         10  
0  0.087441  0.059395  0.009382  0.451889  
1  0.032093  0.208670  0.012665  0.001340  
2  0.034764  0.001527  0.001527  0.101202  
3  0.002332  0.472006  0.002331  0.002332  
4  0.031937  0.306900  0.001654  0.604212  


In [103]:
# Put the topic probabilities next to the review columns; rows are aligned on
# the index and all index labels are kept (outer join is the default)
joined_df = pd.concat((df, doc_topic), axis='columns')

In [88]:
# Select the 20 comments that are most relevant to topic n
# Notice that the column name is INT value in this case
# head(20) returns 20 rows; the previous slice iloc[0:19] stopped after 19
joined_df.sort_values(by = 10,ascending=False)['text'].head(20).tolist()

['not very rugged bought this phone because my previous one kept falling out of my belt clip and getting scratched up figured would pay the extra money to get mil spec phone and not have to worry about damage happening have kayak and wanted to be able to take phone out on the boat without worry about water getting in it also purchased verizon belt clip with holder that wraps around the phone well this first thing found was that the belt clip kept coming off my belt and the phone kept landing on hard floors etc the second thing know the outside screen thin plastic is cracked verizon wants minimum of to replace it with used phone unless would purchased insurance phone repair places can not get parts to repair it of course this eliminates the waterproofing feature since water can likely get in the cracks on the screen do not mind the cracks themselves since do not really use the outside screen third happening was when the holder that wraps around the phone started coming off and letting t

In [76]:
# show the 20 highest-probability words of topic 0
lda_model.show_topic(0, topn=20)

[('phone', 0.11901073),
 ('work', 0.062967874),
 ('new', 0.06042277),
 ('came', 0.028426593),
 ('look', 0.026468102),
 ('condit', 0.025736837),
 ('brand', 0.023311382),
 ('price', 0.022512201),
 ('expect', 0.018078534),
 ('everyth', 0.017559929),
 ('scratch', 0.016999235),
 ('buy', 0.015849011),
 ('purchas', 0.01523436),
 ('arriv', 0.014811356),
 ('fast', 0.014788588),
 ('product', 0.012467896),
 ('seller', 0.010434202),
 ('charger', 0.010263215),
 ('ship', 0.008981515),
 ('describ', 0.008774676)]

In [89]:
#################################
#    Create eta matrix
#################################
# Build the eta matrix from the top 20 words of each topic.
# It serves as the prior belief on word probabilities when training the
# guided LDA model, i.e. one value per word-topic combination.
eta_matrix = create_eta_matrix(num_topics=num_topics,
                               top_n=20,
                               lda_model=lda_model,
                               id2word=id2word)

In [90]:
# Sanity check: matrix shape, the top terms of topic 5, and the stored values
# for term ids 72 and 122 in the topic-5 row
print(eta_matrix.shape, '\n')
print(lda_model.get_topic_terms(topicid=5, topn=20), '\n')
for term_id in (72, 122):
    print(eta_matrix[5][term_id], '\n')

(11, 26345) 

[(1793, 0.03089669), (1410, 0.026323322), (116, 0.0187444), (72, 0.01663353), (1327, 0.016337896), (755, 0.014705432), (3577, 0.010032764), (158, 0.007873814), (367, 0.007668735), (619, 0.007611291), (43, 0.007158337), (401, 0.0068745613), (740, 0.006723654), (182, 0.0061422177), (185, 0.0057118465), (217, 0.0056817266), (788, 0.0056452076), (181, 0.0055978703), (1391, 0.0054754717), (783, 0.005114052)] 

0.11663352921605111 

0.001 



In [39]:
# Deleting abandoned topic from the eta matrix
# topic 3 is reviews in Spanish and topic 4 is talking about cellphone refurbishment
# They can be considered irrelevant in the future analysis, so need to be abandoned
# Remove topic 3 (Spanish) and topic 4 (refurbishment) in a single call.
# Both indices refer to the rows before deletion, so no index shifting has
# to be accounted for.
# Note: this overwrites eta_matrix, so run the cell only once per freshly
# created matrix.
eta_matrix = abandon_topic(topic_id = [3, 4], matrix = eta_matrix)

In [43]:
# Sanity check after the deletion: new shape, the model's terms for
# topics 3, 4 and 5, and one value from row 3 of the reduced matrix
#eta_matrix[:,72]
print(eta_matrix.shape,'\n')
for topic in (3, 4, 5):
    print(lda_model.get_topic_terms(topic),'\n')
print(eta_matrix[3][72],'\n')

(9, 26345) 

[(72, 0.19704744), (191, 0.0590912), (122, 0.030578958), (61, 0.024904126), (116, 0.020562498), (367, 0.019361166), (341, 0.019017749), (314, 0.015218113), (146, 0.0146378195), (258, 0.014177452)] 

[(72, 0.07276104), (147, 0.021716094), (122, 0.021425875), (217, 0.016713087), (15, 0.015563478), (123, 0.014971952), (258, 0.013351448), (235, 0.012475495), (227, 0.01203926), (43, 0.011461665)] 

[(1793, 0.03986148), (72, 0.028285624), (116, 0.018964099), (755, 0.015180794), (618, 0.01181858), (1410, 0.0098934015), (43, 0.008738315), (1391, 0.008016017), (401, 0.0079249935), (2993, 0.0075179385)] 

0.17828562445938587 



In [96]:
#################################
#    Save the model
#################################
#homepath = '/home/ec2-user/SageMaker/'
homepath = os.getcwd()
# os.getcwd() has no trailing separator, so plain string concatenation would
# glue the folder name onto the last path component. Join the parts instead;
# the trailing '' keeps model_path ending with a separator.
model_path = os.path.join(homepath, 'LDA_model_on_proxy_data', '')
print(model_path)

/home/ec2-user/SageMaker/LDA_model_on_proxy_data/


In [100]:
# Write the model into model_path and remember the files that belong to it
components = save_lda_model(model=lda_model,
                            model_name='lda_model_on_proxy_data',
                            save_path=model_path)

In [102]:
# Check if it works well
print(os.listdir(model_path),'\n')
print(components)

['lda_model_on_proxy_data.model.id2word', 'lda_model_on_proxy_data.model', 'lda_model_on_proxy_data.model.expElogbeta.npy', '.ipynb_checkpoints', 'lda_model_on_proxy_data.model.state']


In [None]:
#################################
#    Upload to S3
#################################
bucket_name = 'lda-model-on-proxy-data'
#homepath = '/home/ec2-user/SageMaker/'
#model_path = homepath + 'LDA_model_on_proxy_data/'

In [229]:
# Upload eta_matrix and list of component to S3
# The destination comes from bucket_name (set in the "Upload to S3" cell)
# rather than a repeated literal, so all uploads target the same bucket.
file_upload_helper(file = eta_matrix, file_name ='eta_matrix', bucket_name=bucket_name)
file_upload_helper(file = components, file_name ='components', bucket_name=bucket_name)

Success


In [151]:
# Upload every saved model file to the S3 bucket
model_upload_helper(file_lst=components,
                    local_path=model_path,
                    bucket_name=bucket_name)