In [4]:
import gensim
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

pyLDAvis.enable_notebook()

import pandas as pd
import numpy as np
import re
import os, json

import wrappers
utils = wrappers.utils

In [5]:
# Load Transcribed JSON Call Data into Dataframe
path_to_script = 'script_data/'
json_files = [pos_json for pos_json in os.listdir(path_to_script) if pos_json.endswith('.json')]
df_jsons = pd.DataFrame()

for index, js in enumerate(json_files):
    with open(os.path.join(path_to_script, js)) as json_file:
        json_text = json.load(json_file)
        df_jsons = df_jsons.append(json_text, ignore_index = True)
        
df_jsons

Unnamed: 0,accountId,jobName,results,status
0,365339902192,sample92,{'transcripts': [{'transcript': 'Good morning....,COMPLETED
1,365339902192,sample62,{'transcripts': [{'transcript': 'Good afternoo...,COMPLETED
2,365339902192,sample19,"{'transcripts': [{'transcript': 'Good morning,...",COMPLETED
3,365339902192,sample6,{'transcripts': [{'transcript': 'good afternoo...,COMPLETED
4,365339902192,sample94,{'transcripts': [{'transcript': 'Good afternoo...,COMPLETED
...,...,...,...,...
95,365339902192,sample34,{'transcripts': [{'transcript': 'wD good after...,COMPLETED
96,365339902192,sample1,{'transcripts': [{'transcript': 'Good afternoo...,COMPLETED
97,365339902192,sample99,{'transcripts': [{'transcript': 'Good afternoo...,COMPLETED
98,365339902192,sample29,{'transcripts': [{'transcript': 'Good afternoo...,COMPLETED


# Data Extraction & Transformation

In [10]:
### Extract Transcript fron Dataframe ###

# List to store transcript
transcript_list = []
i = 0

# For each data row extract the results column from dataset
while i < len(df_jsons['results']):
    transcript_list.append(df_jsons['results'][i]['transcripts'])
    i = i + 1

# Extract embedded transcript list within the list
transcript = list(map(lambda x:x[0],transcript_list))

# Concatenate each transcript into a list 
script = []
i = 0

while i < len(transcript):
    script.append(transcript[i]['transcript'])
    i = i + 1

script

["Good morning. My name is Elena, how may I assist you? Hi good morning. Yeah. Okay uh last uh yesterday I bought travel insurance, annual travel insurance. Okay and uh I would like to uh request for a copy to be mailed to my uh email. Alright um May I get your I. D. Number S. 112. Hold on s. 0987 Demo. Alright and may I get your um under your policy? Right. How many are there? Uh Two. And how much is I'm sorry may I get your registered email address as well? Guns. Alright thank you so much for the verification. Yeah how do I address you is it? Yes. Okay so for your policy uh you would like to get the hard copy, this is for the policy that starts on 22nd August. Your mailing address, can I confirm is 97 court um 596553. Yeah. Uh No no no you can give me a copy as well through my email has already been sent to you on 20 August when you are you at the email right now? Sure last night actually received one but I cannot open. Yeah uh can I get you where to open or you can also get it from 

In [12]:
transcript

[{'transcript': "Good morning. My name is Elena, how may I assist you? Hi good morning. Yeah. Okay uh last uh yesterday I bought travel insurance, annual travel insurance. Okay and uh I would like to uh request for a copy to be mailed to my uh email. Alright um May I get your I. D. Number S. 112. Hold on s. 0987 Demo. Alright and may I get your um under your policy? Right. How many are there? Uh Two. And how much is I'm sorry may I get your registered email address as well? Guns. Alright thank you so much for the verification. Yeah how do I address you is it? Yes. Okay so for your policy uh you would like to get the hard copy, this is for the policy that starts on 22nd August. Your mailing address, can I confirm is 97 court um 596553. Yeah. Uh No no no you can give me a copy as well through my email has already been sent to you on 20 August when you are you at the email right now? Sure last night actually received one but I cannot open. Yeah uh can I get you where to open or you can al

In [13]:
script

["Good morning. My name is Elena, how may I assist you? Hi good morning. Yeah. Okay uh last uh yesterday I bought travel insurance, annual travel insurance. Okay and uh I would like to uh request for a copy to be mailed to my uh email. Alright um May I get your I. D. Number S. 112. Hold on s. 0987 Demo. Alright and may I get your um under your policy? Right. How many are there? Uh Two. And how much is I'm sorry may I get your registered email address as well? Guns. Alright thank you so much for the verification. Yeah how do I address you is it? Yes. Okay so for your policy uh you would like to get the hard copy, this is for the policy that starts on 22nd August. Your mailing address, can I confirm is 97 court um 596553. Yeah. Uh No no no you can give me a copy as well through my email has already been sent to you on 20 August when you are you at the email right now? Sure last night actually received one but I cannot open. Yeah uh can I get you where to open or you can also get it from 

In [None]:
df_transcript['transcript'] = script
df_transcript.to_csv("df_transcript.csv")

# Data Preparation 

### Load Stopwords

In [None]:
# Load Stopwords list for removal of frequently occuring words with minimal impact on Topic Generation
stopwords_path = 'text-file/manualnotes_stopwords.txt'
stopwords = open(stopwords_path)
stopwords = stopwords.read().splitlines()

# Initializing top2vec Wrapper with stopwords, importing a list of words to be removed from dataset before fitting into LDA Model
lda = wrappers.LdaWrap(stopwords, manual_note=True)

#extend and remove stopwords
extend_stopwords = ["customer","cx","needed","nfa","said","advise","adv","insurance","n","s","can",
                    "will","via","ph","get","canx"]

remove_stopwords = []

### Lemmatization of Words

In [None]:
# Lemmatization is a test pre-processing technique which groups words  or abreviations into a single term.

# lemmatize list
lemmatize = utils.make_lemmatize_list([
 (["cx","cust","cm"], "customer"),
 (["adv"], "advise"),
 (["alr"], "already"),
 (["purchased","buy","bought"], "purchase"),
 (["renewal","renewed"], "renew"),
 (["paid","payment"], "pay"),
 (["successfully"], "success"),
 (["thru"], "through"),
 (["nvr","never"], "not"),
 (["goes"], "go"),
 (["checking"], "check"),
 (["days"], "day"),
 (["saved"], "save"),
 (["ph"], "phone")
])


### Permutation of phrases, creates bigrams (two-word sequence) combination and permutation of words ###
# Example: 2 lists ["known"], ["well","not"] returns ["known_well", "known_not", "well_known", "not_known"]

# create permuations for successful payment
phrase_perm = utils.phrase_permutation(["success","already","proceed"],["purchase","pay","renew"])

phrases = [*phrase_perm,"axs_machine", "credit_card","wire_card", "debit_card", "go_through", "cross_sell", "resend_email", "call_back","generate_new", "day_later","save_quote"]

# removing additonal stopwords after permutation
stopwords_after_phrases = ["already","not","dont","purchase", "pay","go","through", "day"]

# lemmatizing of phrases
phrase_lemmatize = utils.make_lemmatize_list([(["axs_machine"], "axs"), (["cc","card"], "credit_card"), (["wire_card"], "wirecard"), 
                    (phrase_perm, "successful_payment"), (["crosssell, xsell"], "cross_sell"),
                    (["call_back"], "callback")])

In [None]:
### Method to process text data and convert it into usable format for LDA model ###

# Apply Stopwords, Lemmatization and Permuation into text dataset
processed_df, df_process_obj = lda.process_text(df_transcript['transcript'],extend_stopwords=extend_stopwords,
                                                            remove_stopwords=remove_stopwords, 
                                                            custom_bigrams = phrases, context_words = ["not","dont","send","check"], 
                                                            stopwords_aft_phrase = stopwords_after_phrases,
                                                            lemmatize = lemmatize, lemmatize_bigrams = phrase_lemmatize,
                                                            freq_bigrams=True, filter_email=True)

### Create Word Dictionary 

In [None]:
# Create dictionary which converts processed dataset to bag-of-words corpus for model fitting
dictionary = lda.make_dictionary(df_transcript['transcript'],no_below=10, no_above=0.1, filter_extremes=False)

# no_below : Word that appears in 10 or more data rows would be included in dictionary
# no_above : Words that appears in more than 10% of data rows would be removed

# Tropic Model (Transcript)

## Create Topic Model

In [None]:
# creates multiple models exhaustively to find optimal topicnum (searches through 2 - 10 of topics)
models, topics, _, coherences, perplexities ,_ \
= lda.exhaustive_search(df_transcript['transcript'], dictionary, 
                        max_seed=100, min_topics=2, max_topics=10, 
                        min_passes=1,max_passes=1, min_seed=100)

# Determine suitable number of topics by Coherence Scores (Shows how interpretable and quality the topics generated were)

In [None]:
### Review prediction distribution for dataset ###

# Define number of Topics Generated
models_selected = models[8]

# 
predictions,_ = lda.predict(models_selected, 
                            dictionary, 
                            join_id = df['Contact ID'], 
                            actual_text = df['manual_notes2'], 
                            preprocess_obj = df_process_obj)
predictions.head()

## Generated Topic Evaluation

### 1) Unqiue Words of Topic

In [None]:
### Evaluate and Perceive each Topics by unique/non-duplicate words ###

# Words that appear in one topic would not appear in subsequent topics 

highest_coherence_topic = topics[8]

output_dict = {}
for index, topic in enumerate(highest_coherence_topic):
    if index == 0:
        words_i = [word[0] for word in highest_coherence_topic[index]['words']]
        set_of_words_already = set(words_i)
        output_dict[index] = words_i
    else:
        words_i = [word[0] for word in highest_coherence_topic[index]['words'] if word[0] not in set_of_words_already]
        set_of_words_already = set_of_words_already.union(set(words_i))
        output_dict[index] = words_i

In [None]:
output_dict

# Possible Topics
# 0 : Policy Cancellation Guide
# 1 : Account Verification / Activation
# 2 : Cancellation and Refund on Motorcycle Product
# 3 : Inquire of Motor Product
# 4 : Activation and Downloading of FWD APP
# 5 : Changing of Travel Product
# 6 : Verification of Policy Refund
# 7 : Updating/Cancellation of policy
# 8 : Login into FWD Account
# 9 : Travel Product Extension and Claim

### 2) All Words within Topics

In [None]:
### Evaluate and Perceive each Topics by all keywords ###

# Each word in Topic showcases its weightage/importance to the Topic generation


for topic in highest_coherence_topic:
    print('topic {}'.format(topic['topic']))
    for word_list in topic['words']:
        word = word_list[0]
        value = word_list[1]
        print('{:<20} {:<20}'.format(word, value))
    print('-'*25)

In [None]:
# Possible Topics
# 0 : Policy Cancellation Guide
# 1 : Account Verification / Activation
# 2 : Cancellation and Refund on Motorcycle Product
# 3 : Inquire of Motor Product
# 4 : Activation and Downloading of FWD APP
# 5 : Changing of Travel Product
# 6 : Verification of Policy Refund
# 7 : Updating/Cancellation of policy
# 8 : Login into FWD Account
# 9 : Travel Product Extension and Claim