### Load Libraries and Tools Required

In [1]:
# Tomotopy
import tomotopy as tp

# Toolkit
import pandas as pd
import spacy
import sys
import numpy as np
import pprint

# NLTK
import re
import nltk
from nltk.corpus import stopwords

# Gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import Phrases,CoherenceModel
from gensim.models.phrases import Phraser

### Load Data

In [2]:
data = pd.read_csv('data/random_sample_data.csv', sep=',', header=0)
data = data.dropna()
data.head()

Unnamed: 0,review_body
0,UPDATED: The Roy has slipped and they can't ge...
1,I went to renew my drivers license at this loc...
2,Much has been written about Olive Garden and h...
3,I've been coming here since July. I had never ...
4,"I just recently moved to Scottsdale, AZ from C..."


### Preprocessing Function Wrapper

In [3]:
def run_preprocess(df, min_token_len=3, rm_accent=True, bigram_min_cnt=5, bigram_thresh=100,
                   extra_stops=['from','subject','re', 'edu','use'],
                   postags=['NOUN','VERB','ADV','ADJ']):

    '''Function wrapper to preprocess the dataset and generate ready to model results
    
    *** Inputs**
    df: dataframe with "review_body" column containing text inputs
    min_token_len: int -> tokens less than this number are excluded during tokenization
    rm_accent : bool -> flag whether to remove deaccents
    bigram_min_cnt: int -> ignore all words and bigrams with total collected count lower than this value
    bigram_thresh: int -> threshold for building phrases, higher means fewer phrases
    extra_stops: list -> extra stopwords to ignore asidr from NLTK default
    postags:list -> words/bigrams to include based on POS (part-of-speech)
    
    ** Returns**
    df: Master df with data and labels
    word_list_lemmatized: list -> list of lists w/ lemmatized bigrams 
    '''
    
    ### Setting up stopwords and Spacy
    nltk.download('stopwords', quiet=True)
    st_words = stopwords.words('english')
    st_words.extend(extra_stops)

    # Convert values to list
    doc_list = df.review_body.values.tolist()

    # Remove email signs, newlines, single quotes
    doc_list = [re.sub(r'\S*@\S*\s?', '', txt) for txt in doc_list]
    doc_list = [re.sub(r'\s+', ' ', txt) for txt in doc_list]
    doc_list = [re.sub(r"\'", "", txt) for txt in doc_list]

    # Tokenize based on min_token_len and deaccent flags
    print("Tokenizing...\n")
    word_list = [simple_preprocess(txt, deacc=rm_accent, min_len=min_token_len) for txt in doc_list]
     
    # Create bigram models
    bigram = Phrases(word_list, min_count=bigram_min_cnt, threshold=bigram_thresh) # use original wordlist to build model
    bigram_model = Phraser(bigram)
    
    # Remove stopwords
    print("Removing Stopwords...\n")
    word_list_nostops = [[word for word in txt if word not in st_words] for txt in word_list]
    
    # Implement bigram models
    print("Create bigrams...\n")
    word_bigrams = [bigram_model[w_vec] for w_vec in word_list_nostops] # implement it in the list w/ no stopwords
    
    # Lemmatize POS-tags to keep
    print("Lemmatizing, keeping " + ",".join(postags)+ " POS tags...\n")
    word_list_lemmatized = lemmatize(word_bigrams, ptags=postags)

    print("Done preprocessing " + str(df.shape[0]) + " documents")
    return df, word_list_lemmatized
    

In [4]:
# Helper function    
def lemmatize(word_list, ptags):
    '''Lemmatizes words based on allowed postags, input format is list of sublists 
       with strings'''
    spC = spacy.load('en_core_web_sm')
    lem_lists =[]
    for vec in word_list:
        sentence = spC(" ".join(vec))
        lem_lists.append([token.lemma_ for token in sentence if token.pos_ in ptags])
    # Remove Empty Lists after Lemmatizing
    lem_lists = list(filter(None, lem_lists))
    return lem_lists

### Run Preprocessing

In [5]:
df, word_list_lemmatized = run_preprocess(data)

Tokenizing...

Removing Stopwords...

Create bigrams...

Lemmatizing, keeping NOUN,VERB,ADV,ADJ POS tags...

Done preprocessing 670866 documents


In [6]:
# Preview Processed Data
print(word_list_lemmatized[0])
df.head()

['update', 'slip', 'can', 'pop', 'recently', 'post', 'baby', 'stroller', 'quick', 'drink', 'take', 'bartender', 'minute', 'take', 'order', 'find', 'odd', 'considering', 'may', 'people', 'place', 'drink', 'selection', 'food', 'selection', 'bore', 'baby', 'stroller', 'signage', 'could', 'do', 'totally', 'different', 'way', 'due', 'limited', 'space', 'ask', 'people', 'leave', 'baby', 'stroller', 'least', 'give', 'people', 'option', 'piss', 'sign', 'would', 'thanking', 'patron', 'come', 'spend', 'money', 'establishment', 'would', 'want', 'offend', 'people', 'resident', 'area', 'need', 'new', 'serve', 'traditional', 'irish', 'fare', 'reasonable', 'price', 'open', 'great', 'job', 'make', 'former', 'digsof', 'soon', 'walk', 'feel', 'home', 'welcome', 'many', 'place', 'day', 'owner', 'actually', 'greet', 'soon', 'walk', 'issue', 'would', 'comment', 'lack', 'space', 'front', 'bar', 'could', 'configured', 'different', 'way', 'would', 'allowed', 'room', 'stool', 'bar', 'lead', 'comment', 'authent

Unnamed: 0,review_body
0,UPDATED: The Roy has slipped and they can't ge...
1,I went to renew my drivers license at this loc...
2,Much has been written about Olive Garden and h...
3,I've been coming here since July. I had never ...
4,"I just recently moved to Scottsdale, AZ from C..."


### Build PAM Model

In [7]:
# Function to Train the HDP Model
def train_Pachinko_model(k1, k2, rm_top, min_cf, word_list, iterations, top_n=10):
    '''Wrapper function to train tomotopy HDP Model object
    
    *** Inputs**
    k1: int -> number of super-topics(document-topic frequency)
    k2: int -> number of sub topics (word-topic frequency)
    rm_top: int -> number of top words to be removed. default is 0
    min_cf: int -> minimum frequency of words. Those less than min_cf are excluded
    word_list: list -> lemmatized word list of lists
    iterations : int -> number of iterations, in increments of 10, to train the model
    top_n: int -> number of keywords to generate per sub-topic
    
    ** Returns**
    PAM: trained PAM Model 
    '''
    PAM = tp.PAModel(k1=k1,
                     k2=k2,
                     rm_top=rm_top,
                     min_cf=min_cf
                     )
    # Add docs to train
    for vec in word_list:
        PAM.add_doc(vec)

    print('Starting training model')
    for i in range(0, 1000, iterations):
        PAM.train(iterations)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, PAM.ll_per_word))
    for k in range(PAM.k1):
        subtopics = PAM.get_sub_topics(k)
        print('\n\nSubtopics of topic #%s' % k)
        for subtopic, probability in subtopics:
            print('Top 10 words of subtopic topic #%s: probability in supertopic #%s: %r' % (subtopic, k, probability))
            print('%r' % PAM.get_topic_words(subtopic, top_n=top_n))
        
    print("Done\n")  
    return PAM

In [8]:
PAM_mdl = train_Pachinko_model(k1=4, 
                               k2=5,
                               rm_top=100,
                               min_cf=50,
                               word_list=word_list_lemmatized,
                               iterations=10
                               )

Starting training model
Iteration: 0	Log-likelihood: -10.139930041875736
Iteration: 10	Log-likelihood: -9.487160654009745
Iteration: 20	Log-likelihood: -9.366111707817879
Iteration: 30	Log-likelihood: -9.349898029101945
Iteration: 40	Log-likelihood: -9.34554649224123
Iteration: 50	Log-likelihood: -9.344710931362744
Iteration: 60	Log-likelihood: -9.344964528040455
Iteration: 70	Log-likelihood: -9.344842011395947
Iteration: 80	Log-likelihood: -9.34437556670947
Iteration: 90	Log-likelihood: -9.34419823489943
Iteration: 100	Log-likelihood: -9.343431213578812
Iteration: 110	Log-likelihood: -9.342986520735392
Iteration: 120	Log-likelihood: -9.34249805384473
Iteration: 130	Log-likelihood: -9.34117902017156
Iteration: 140	Log-likelihood: -9.34108757908582
Iteration: 150	Log-likelihood: -9.3396596257298
Iteration: 160	Log-likelihood: -9.339401235201548
Iteration: 170	Log-likelihood: -9.338533200431389
Iteration: 180	Log-likelihood: -9.337970612794223
Iteration: 190	Log-likelihood: -9.3373775424

In [9]:
from pprint import pprint
# View topics per sub-topic - k2 value
for i in range(0, PAM_mdl.k2):
    subtopic_num = i
    print(f"Sub Topic #{subtopic_num}")
    pprint(PAM_mdl.get_topic_words(subtopic_num))
    print("\n")

Sub Topic #0
[('car', 0.009082858450710773),
 ('care', 0.0058250329457223415),
 ('do', 0.005801891442388296),
 ('help', 0.005711236968636513),
 ('job', 0.005666292272508144),
 ('manager', 0.00539853610098362),
 ('thank', 0.004992503207176924),
 ('company', 0.00497567281126976),
 ('guy', 0.004796850029379129),
 ('hair', 0.004788052290678024)]


Sub Topic #1
[('hotel', 0.007915666326880455),
 ('stay', 0.007871336303651333),
 ('show', 0.006334336474537849),
 ('store', 0.0058915442787110806),
 ('fun', 0.004742237739264965),
 ('play', 0.004234579391777515),
 ('music', 0.00422906968742609),
 ('old', 0.003973362036049366),
 ('kid', 0.0039525749161839485),
 ('parking', 0.003924274351447821)]


Sub Topic #2
[('pizza', 0.015741825103759766),
 ('coffee', 0.010375240817666054),
 ('cheese', 0.00828607752919197),
 ('breakfast', 0.008261464536190033),
 ('sandwich', 0.007934670895338058),
 ('burger', 0.007767122704535723),
 ('flavor', 0.005895623937249184),
 ('delicious', 0.005863597150892019),
 ('swe