In [None]:
!pip install gensim==3.8.3 pyLDAvis top2vec

In [2]:
import gensim

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

pyLDAvis.enable_notebook()

import pandas as pd
import numpy as np
import re

import wrappers
utils = wrappers.utils

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load CSV dataset (relative path: resolved against the notebook's working directory).
# Later cells read the 'manual_notes2' and 'Contact ID' columns of this frame.
df = pd.read_csv('merged_crm_meta.csv')

# Data Preparation

### Load Stopwords ###

In [4]:
# Load the stopword list: frequently occurring words with minimal impact on topic generation.
# The file is read as one stopword per line.
stopwords_path = 'text-file/manualnotes_stopwords.txt'
# A context manager guarantees the file handle is closed once the list has been read.
with open(stopwords_path) as stopwords_file:
    stopwords = stopwords_file.read().splitlines()

# Initialise the LDA wrapper with the base stopwords, i.e. the words to be removed from the
# dataset before it is fitted into the LDA model.
lda = wrappers.LdaWrap(stopwords, manual_note=True)

# Extra stopwords to add to / remove from the base list (passed to lda.process_text later).
extend_stopwords = ["customer","cx","needed","nfa","said","advise","adv","insurance","n","s","can",
                    "will","via","ph","get","canx"]

remove_stopwords = []



### Lemmatization of Words ###

In [5]:
# Lemmatization is a text pre-processing technique which groups words or abbreviations into a single term.

# Lemmatize list: each entry is (list of variants, replacement term).
# NOTE(review): the exact structure returned by utils.make_lemmatize_list is not visible here — confirm in wrappers.utils.
lemmatize = utils.make_lemmatize_list([
 (["cx","cust","cm"], "customer"),
 (["adv"], "advise"),
 (["alr"], "already"),
 (["purchased","buy","bought"], "purchase"),
 (["renewal","renewed"], "renew"),
 (["paid","payment"], "pay"),
 (["successfully"], "success"),
 (["thru"], "through"),
 (["nvr","never"], "not"),
 (["goes"], "go"),
 (["checking"], "check"),
 (["days"], "day"),
 (["saved"], "save"),
 (["ph"], "phone")
])


### Permutation of phrases: creates bigram (two-word sequence) combinations and permutations of words ###
# Example: 2 lists ["known"], ["well","not"] returns ["known_well", "known_not", "well_known", "not_known"]

# Create permutations for successful payment
phrase_perm = utils.phrase_permutation(["success","already","proceed"],["purchase","pay","renew"])

# Custom bigrams: the generated payment permutations plus hand-picked domain phrases.
phrases = [
    *phrase_perm,
    "axs_machine", "credit_card", "wire_card", "debit_card", "go_through", "cross_sell",
    "resend_email", "call_back", "generate_new", "day_later", "save_quote",
]

# Additional stopwords to remove after the phrases have been formed
stopwords_after_phrases = ["already","not","dont","purchase", "pay","go","through", "day"]

# Lemmatization of phrases: each entry is (list of variants, replacement term).
phrase_lemmatize = utils.make_lemmatize_list([
    (["axs_machine"], "axs"),
    (["cc", "card"], "credit_card"),
    (["wire_card"], "wirecard"),
    (phrase_perm, "successful_payment"),
    # Each variant must be its own list element; a single "crosssell, xsell" string
    # would be treated as one (never-occurring) variant.
    (["crosssell", "xsell"], "cross_sell"),
    (["call_back"], "callback"),
])

In [6]:
### Process the raw text and convert it into a format usable by the LDA model ###

# Feed the 'manual_notes2' column through the wrapper together with the stopword,
# lemmatization and phrase (bigram) settings prepared in the cells above.
# Two values come back: the processed data and a preprocessing object that is
# handed to lda.predict later on.
processed_df, df_process_obj = lda.process_text(
    df['manual_notes2'],
    extend_stopwords=extend_stopwords,
    remove_stopwords=remove_stopwords,
    custom_bigrams=phrases,
    context_words=["not", "dont", "send", "check"],
    stopwords_aft_phrase=stopwords_after_phrases,
    lemmatize=lemmatize,
    lemmatize_bigrams=phrase_lemmatize,
    freq_bigrams=True,
    filter_email=True,
)

Processing text...
Text processed in 2.250196933746338 seconds
Making N-grams...
N-grams completed in 0.12698578834533691 seconds


### Create Word Dictionary ###

In [7]:
# Create dictionary which converts processed dataset to bag-of-words corpus for model fitting
# NOTE(review): filter_extremes=False is passed together with no_below/no_above — confirm in
# LdaWrap.make_dictionary whether the two thresholds described below are still applied in that case.
dictionary = lda.make_dictionary(processed_df,no_below=10, no_above=0.1, filter_extremes=False)

# no_below : Words that appear in 10 or more data rows are included in the dictionary
# no_above : Words that appear in more than 10% of data rows are removed

Creating dictionary...
Dictionary complete in 0.012746572494506836 seconds


# Topic Modeling (Manual Notes)

## Create Topic Model

In [8]:
# Creates multiple models exhaustively to find the optimal number of topics (searches through 2 - 10 topics).
# min_seed == max_seed == 100 and min_passes == max_passes == 1, so presumably only the topic
# count varies between runs — confirm how exhaustive_search expands these ranges.
# The 3rd and 6th return values are discarded.
models, topics, _, coherences, perplexities ,_ \
= lda.exhaustive_search(processed_df, dictionary, 
                        max_seed=100, min_topics=2, max_topics=10, 
                        min_passes=1,max_passes=1, min_seed=100)

# Determine a suitable number of topics from the coherence scores (they indicate how interpretable the generated topics are)

Iteration 9/9
Mean coherence score: 0.235512
Mean perplexity: -6.396460
Estimated time of completion: 0.000000 seconds

Training LDA model...
LDA model complete in 0.7315361499786377 seconds
model with highest coherence score: 8
highest coherence score: 0.2724708061424101


In [9]:
### Review prediction distribution for dataset ###

# Select the model used for prediction.
# NOTE(review): 8 is a hard-coded index into `models`; confirm it matches the best model
# reported by lda.exhaustive_search (is the reported "8" an index or a topic count?).
models_selected = models[8]

# Predict a topic for every row of the original notes, reusing the preprocessing object
# returned by lda.process_text. The second return value is discarded.
predictions,_ = lda.predict(models_selected, 
                            dictionary, 
                            join_id = df['Contact ID'], 
                            actual_text = df['manual_notes2'], 
                            preprocess_obj = df_process_obj)
predictions.head()

Processing text...
Text processed in 0.23102807998657227 seconds
Making N-grams...
N-grams completed in 0.1284475326538086 seconds
Predicting...
 476/476 predictions complete.
Prediction completed in 0.40522193908691406 seconds


Unnamed: 0,Contact ID,prediction,probability
0,27499ae1-f149-4b74-907c-6db8b35b2ec2,0,0.91816
1,cf8a5e6d-a49a-444b-9563-1a47a04e231a,5,0.546969
2,ff8c0c6a-96c4-44af-91d2-69394d672b51,2,0.887469
3,18f51a26-70ba-4fd0-9786-9dc450c4b116,2,0.93075
4,67f3cfcb-268b-44c0-88ce-99ff9d12f411,5,0.924979


## Generated Topic Evaluation

### 1) Unique Words of Topic

In [10]:
### Evaluate and interpret each topic by its unique (non-duplicate) words ###

# A word listed under one topic is not listed again under any later topic.

# NOTE(review): 8 is a hard-coded index into `topics`; keep it in sync with the index used to pick the model.
highest_coherence_topic = topics[8]

output_dict = {}
# Words already claimed by an earlier topic. Starting from an empty set means the first
# topic keeps all of its words without needing a special case.
set_of_words_already = set()
for index, topic in enumerate(highest_coherence_topic):
    # Element [0] of each entry in topic['words'] is the word itself.
    words_i = [word[0] for word in topic['words'] if word[0] not in set_of_words_already]
    set_of_words_already.update(words_i)
    output_dict[index] = words_i

In [11]:
# Display the words kept for each topic after removing words already seen in earlier topics.
output_dict

# Possible Topics (labels presumably assigned by reading the keywords — confirm with the author)
# 0 : Policy Cancellation Guide
# 1 : Account Verification / Activation
# 2 : Cancellation and Refund on Motorcycle Product
# 3 : Inquiry about Motor Product
# 4 : Activation and Downloading of FWD App
# 5 : Changing of Travel Product
# 6 : Verification of Policy Refund
# 7 : Updating/Cancellation of Policy
# 8 : Logging in to FWD Account
# 9 : Travel Product Extension and Claim

{0: ['"policy"',
  '"cp"',
  '"cancel"',
  '"email"',
  '"guide"',
  '"phone"',
  '"note"',
  '"app"',
  '"online"',
  '"extend"'],
 1: ['"claim"', '"submit"', '"account"', '"verify"'],
 2: ['"refund"', '"pnmc"', '"date"', '"road_tax"'],
 3: ['"car"'],
 4: ['"activation"', '"activate"', '"download"'],
 5: ['"change"', '"travel"', '"new"'],
 6: ['"add"'],
 7: ['"update"', '"cancellation"'],
 8: ['"login"'],
 9: ['"trip"', '"poi"', '"start"']}

### 2) All the Words within Topic

In [13]:
### Inspect every topic through its complete keyword list ###

# The value printed next to each word is its weight (importance) within that topic.

separator = '-' * 25
for topic_entry in highest_coherence_topic:
    print('topic {}'.format(topic_entry['topic']))
    for keyword_pair in topic_entry['words']:
        # keyword_pair[0] is the word, keyword_pair[1] its weight; both left-aligned to 20 columns.
        print('{:<20} {:<20}'.format(keyword_pair[0], keyword_pair[1]))
    print(separator)

topic 0
"policy"             0.054               
"cp"                 0.040               
"cancel"             0.037               
"email"              0.029               
"guide"              0.027               
"phone"              0.026               
"note"               0.016               
"app"                0.016               
"online"             0.014               
"extend"             0.013               
-------------------------
topic 1
"claim"              0.055               
"submit"             0.033               
"online"             0.020               
"phone"              0.019               
"guide"              0.019               
"account"            0.017               
"policy"             0.016               
"cp"                 0.012               
"verify"             0.012               
"cancel"             0.011               
-------------------------
topic 2
"policy"             0.043               
"cp"                 0.039               


In [None]:
# Possible Topics
# 0 : Policy Cancellation Guide
# 1 : Account Verification / Activation
# 2 : Cancellation and Refund on Motorcycle Product
# 3 : Inquiry about Motor Product
# 4 : Activation and Downloading of FWD App
# 5 : Changing of Travel Product
# 6 : Verification of Policy Refund
# 7 : Updating/Cancellation of Policy
# 8 : Logging in to FWD Account
# 9 : Travel Product Extension and Claim