In [99]:
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### Fetching preprocessed data

In [179]:
preprocessed_data=pd.read_csv('Preprocessed_Transcripts_28012020160602.csv')

In [180]:
preprocessed_data.head()

Unnamed: 0,Call_ID,Raw Transcripts,Preprocessed Transcripts
0,56482635,Welcome to HDFC life my name is Kai how may I ...,"['welcome', 'life', 'help', 'hello', 'child', ..."
1,61876574,"Good afternoon, HDFC Life customer care. This ...","['good', 'afternoon', 'customer', 'care', 'hel..."
2,85550179,Good morning. Welcome to HDFC insurance? How c...,"['good', 'morning', 'welcome', 'insurance', 'h..."
3,98762981,Good afternoon.\rHDFC life customer care this ...,"['good', 'afternoon', 'customer', 'care', 'hel..."
4,114965010,Graph noon AGC life customer care. This is Ale...,"['graph', 'noon', 'life', 'customer', 'care', ..."


#### Tokenizing preprocessed transcripts to be used for genism dictionary

In [181]:
tokenising=[]
for text in preprocessed_data['Preprocessed Transcripts']:
    tokenising.append(gensim.utils.simple_preprocess(text))

In [182]:
preprocessed_data['tokenised']= tokenising

#### Creating the dictionary object that maps each tokenized word to a unique id

In [183]:
dictionary = gensim.corpora.Dictionary(preprocessed_data['tokenised'])

In [184]:
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 50:
        break

0 account
1 alright
2 angel
3 assist
4 check
5 child
6 chill
7 continuous
8 easy
9 facility
10 fine
11 hello
12 help
13 icsi
14 inform
15 insurance
16 know
17 life
18 line
19 link
20 maybe
21 phone
22 pleasant
23 policy
24 policy_number
25 post
26 procedure
27 process
28 purpose
29 register
30 registering
31 registration
32 second
33 section
34 service
35 space
36 summary
37 time
38 type
39 unit
40 update
41 value
42 verification
43 want
44 welcome
45 afternoon
46 agent
47 agree
48 benefit
49 card
50 care


#### Creating corpus (bag of words) -  a corpus contains each word’s id and its frequency count in that document

In [185]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_data['tokenised']]

In [186]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 1
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 2 ("angel") appears 1 time.
Word 8 ("easy") appears 1 time.
Word 12 ("help") appears 3 time.
Word 16 ("know") appears 4 time.
Word 26 ("procedure") appears 1 time.
Word 29 ("register") appears 1 time.
Word 43 ("want") appears 4 time.
Word 45 ("afternoon") appears 1 time.
Word 46 ("agent") appears 1 time.
Word 47 ("agree") appears 1 time.
Word 48 ("benefit") appears 1 time.
Word 49 ("card") appears 2 time.
Word 50 ("care") appears 1 time.
Word 51 ("collect") appears 1 time.
Word 52 ("come") appears 1 time.
Word 53 ("confirmation") appears 1 time.
Word 54 ("contact") appears 2 time.
Word 55 ("customer") appears 1 time.
Word 56 ("document") appears 2 time.
Word 57 ("easily") appears 1 time.
Word 58 ("enroll") appears 2 time.
Word 59 ("everybody") appears 1 time.
Word 60 ("good") appears 1 time.
Word 61 ("graph") appears 1 time.
Word 62 ("great") appears 1 time.
Word 63 ("immediately") appears 1 time.
Word 64 ("information") appears 2 time.
Word 65 ("mail") appears 1 time.
Word 66 ("m

#### Multi core Gensim LDA model on transcript

In [187]:

'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dictionary,                                    
                                   passes =5,
                                   workers = 1)

In [188]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.037*"plan" + 0.030*"know" + 0.030*"premium" + 0.027*"payment" + 0.024*"help" + 0.022*"want" + 0.015*"benefit" + 0.014*"good" + 0.013*"life" + 0.011*"time"


Topic: 1 
Words: 0.056*"policy" + 0.027*"know" + 0.024*"want" + 0.018*"premium" + 0.017*"help" + 0.013*"benefit" + 0.012*"assignment" + 0.010*"click" + 0.009*"type" + 0.009*"contact"


Topic: 2 
Words: 0.026*"claim" + 0.026*"know" + 0.026*"help" + 0.024*"want" + 0.018*"plan" + 0.017*"agent" + 0.017*"number" + 0.016*"need" + 0.015*"online" + 0.014*"policy_number"


Topic: 3 
Words: 0.038*"policy" + 0.026*"policy_number" + 0.025*"help" + 0.023*"know" + 0.022*"check" + 0.016*"need" + 0.015*"want" + 0.012*"register" + 0.011*"insurance" + 0.011*"sorry"




##### Perplexity and Coherence

In [189]:
print('\nPerplexity Score: ', lda_model.log_perplexity(bow_corpus))


Perplexity Score:  -5.949112015203176


In [190]:
# Compute Coherence Score
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=preprocessed_data['tokenised'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.25349447394426916


In [212]:
doc_lda = lda_model[bow_corpus]
for topic in doc_lda:
     print("Document Topics      : ", topic)  

Document Topics      :  [(2, 0.99509585)]
Document Topics      :  [(4, 0.99589247)]
Document Topics      :  [(4, 0.99518335)]
Document Topics      :  [(0, 0.9951229)]
Document Topics      :  [(2, 0.995408)]
Document Topics      :  [(4, 0.995648)]
Document Topics      :  [(0, 0.9959039)]
Document Topics      :  [(1, 0.99553454)]
Document Topics      :  [(0, 0.9968037)]
Document Topics      :  [(2, 0.99379224)]
Document Topics      :  [(1, 0.9956531)]
Document Topics      :  [(0, 0.9958857)]
Document Topics      :  [(4, 0.99279404)]
Document Topics      :  [(0, 0.9949156)]
Document Topics      :  [(1, 0.99478745)]
Document Topics      :  [(2, 0.99443704)]
Document Topics      :  [(1, 0.99509674)]
Document Topics      :  [(1, 0.9960071)]
Document Topics      :  [(1, 0.9935154)]
Document Topics      :  [(0, 0.9964995)]
Document Topics      :  [(4, 0.99481297)]
Document Topics      :  [(3, 0.99546427)]
Document Topics      :  [(3, 0.9949249)]
Document Topics      :  [(1, 0.99523324)]
Docume

### LDA with bigrams

In [192]:
# Getting bigrams  
vectorizer = CountVectorizer(ngram_range = (2,2)) 

In [193]:
X1=vectorizer.fit_transform(preprocessed_data['Preprocessed Transcripts'])

In [194]:
bigrams=[]
for text in preprocessed_data['Preprocessed Transcripts']:
    text=[text,]
    X1=(vectorizer.fit_transform(text))
    bigrams.append(vectorizer.get_feature_names()) 

In [141]:
bigrams

[['account register',
  'alright unit',
  'angel policy_number',
  'assist maybe',
  'check phone',
  'check value',
  'child policy',
  'chill insurance',
  'continuous update',
  'easy time',
  'facility registration_process',
  'fine register',
  'hello child',
  'help hello',
  'icsi insurance',
  'inform assist',
  'insurance registering',
  'insurance verification',
  'know check',
  'know policy_number',
  'life help',
  'life pleasant',
  'line register',
  'link policy',
  'maybe registration_process',
  'phone value',
  'policy chill',
  'policy policy',
  'policy summary',
  'policy want',
  'policy_number alright',
  'policy_number angel',
  'policy_number second',
  'post continuous',
  'procedure easy',
  'process post',
  'purpose know',
  'register facility',
  'register line',
  'register service',
  'registering procedure',
  'registration process',
  'registration_process icsi',
  'registration_process life',
  'second check',
  'section account',
  'service type',
 

In [195]:
preprocessed_data['Bigram_Transcript']=bigrams

In [196]:
dictionary = gensim.corpora.Dictionary(preprocessed_data['Bigram_Transcript'])

In [197]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_data['Bigram_Transcript']]

In [198]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dictionary,                                    
                                   passes =5,
                                   workers = 1)

In [199]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.004*"want know" + 0.004*"premium month" + 0.003*"payment update" + 0.003*"month january" + 0.003*"hour payment" + 0.003*"ignore message" + 0.003*"time payment" + 0.003*"premium time" + 0.003*"good morning" + 0.003*"life help"


Topic: 1 
Words: 0.005*"insurance help" + 0.005*"good afternoon" + 0.004*"welcome life" + 0.004*"afternoon welcome" + 0.003*"life insurance" + 0.003*"want know" + 0.003*"help today" + 0.002*"policy number" + 0.002*"welcome insurance" + 0.002*"help want"


Topic: 2 
Words: 0.005*"help today" + 0.005*"want know" + 0.004*"customer care" + 0.004*"good afternoon" + 0.004*"care help" + 0.003*"verification purpose" + 0.003*"know policy_number" + 0.003*"life help" + 0.003*"purpose know" + 0.003*"life customer"


Topic: 3 
Words: 0.006*"want know" + 0.005*"help today" + 0.004*"good afternoon" + 0.003*"know premium" + 0.003*"know policy_number" + 0.003*"customer care" + 0.003*"care help" + 0.002*"purpose know" + 0.002*"verification purpose" + 0.002*"ser

In [211]:
doc_lda = lda_model[bow_corpus]
for topic in doc_lda:
     print("Document Topics      : ", topic)  

Document Topics      :  [(2, 0.99509585)]
Document Topics      :  [(4, 0.99589247)]
Document Topics      :  [(4, 0.9951832)]
Document Topics      :  [(0, 0.99512386)]
Document Topics      :  [(2, 0.995408)]
Document Topics      :  [(4, 0.9956479)]
Document Topics      :  [(0, 0.995904)]
Document Topics      :  [(1, 0.99553454)]
Document Topics      :  [(0, 0.9968036)]
Document Topics      :  [(2, 0.9937906)]
Document Topics      :  [(1, 0.9956531)]
Document Topics      :  [(0, 0.9958856)]
Document Topics      :  [(4, 0.99279404)]
Document Topics      :  [(0, 0.9949151)]
Document Topics      :  [(1, 0.99478734)]
Document Topics      :  [(2, 0.99443704)]
Document Topics      :  [(1, 0.99509674)]
Document Topics      :  [(1, 0.99600756)]
Document Topics      :  [(1, 0.9935153)]
Document Topics      :  [(0, 0.9964994)]
Document Topics      :  [(4, 0.99481297)]
Document Topics      :  [(3, 0.99546486)]
Document Topics      :  [(3, 0.9949249)]
Document Topics      :  [(1, 0.99523324)]
Docume

### LDA with trigrams

In [201]:
# Getting trigram  
vectorizer = CountVectorizer(ngram_range = (3,3)) 
X1=vectorizer.fit_transform(preprocessed_data['Preprocessed Transcripts'])
trigrams=[]
for text in preprocessed_data['Preprocessed Transcripts']:
    text=[text,]
    X1=(vectorizer.fit_transform(text))
    trigrams.append(vectorizer.get_feature_names()) 

preprocessed_data['trigram_Transcript']=trigrams
dictionary = gensim.corpora.Dictionary(preprocessed_data['trigram_Transcript'])
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_data['trigram_Transcript']]
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dictionary,                                    
                                   passes =5,
                                   workers = 1)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.003*"customer care help" + 0.003*"care help today" + 0.002*"good afternoon life" + 0.002*"life customer care" + 0.002*"afternoon life customer" + 0.001*"help today want" + 0.001*"insurance help want" + 0.001*"want know premium" + 0.001*"number phone number" + 0.001*"welcome insurance help"


Topic: 1 
Words: 0.003*"verification purpose know" + 0.003*"purpose know policy_number" + 0.003*"hour payment update" + 0.003*"premium month january" + 0.002*"month january possible" + 0.002*"ignore message receive" + 0.002*"message receive today" + 0.002*"kindly ignore message" + 0.002*"confirmation message regard" + 0.002*"help premium time"


Topic: 2 
Words: 0.003*"verification purpose know" + 0.002*"good afternoon welcome" + 0.002*"purpose know policy_number" + 0.002*"life insurance help" + 0.002*"welcome life insurance" + 0.002*"toll_free number insurance" + 0.002*"inconvenience verification purpose" + 0.002*"number request claim" + 0.002*"toll_free number request" + 0.002*

In [210]:
doc_lda = lda_model[bow_corpus]
for topic in doc_lda:
     print("Document Topics      : ", topic)  

Document Topics      :  [(2, 0.99509585)]
Document Topics      :  [(4, 0.99589247)]
Document Topics      :  [(4, 0.9951832)]
Document Topics      :  [(0, 0.99512404)]
Document Topics      :  [(2, 0.995408)]
Document Topics      :  [(4, 0.99564785)]
Document Topics      :  [(0, 0.9959039)]
Document Topics      :  [(1, 0.99553454)]
Document Topics      :  [(0, 0.9968036)]
Document Topics      :  [(2, 0.99379224)]
Document Topics      :  [(1, 0.9956531)]
Document Topics      :  [(0, 0.99588567)]
Document Topics      :  [(4, 0.99279404)]
Document Topics      :  [(0, 0.99491525)]
Document Topics      :  [(1, 0.99478745)]
Document Topics      :  [(2, 0.994437)]
Document Topics      :  [(1, 0.99509674)]
Document Topics      :  [(1, 0.9960069)]
Document Topics      :  [(1, 0.9935154)]
Document Topics      :  [(0, 0.9964994)]
Document Topics      :  [(4, 0.9948125)]
Document Topics      :  [(3, 0.99546486)]
Document Topics      :  [(3, 0.9949249)]
Document Topics      :  [(1, 0.99523324)]
Docum

### LDA with unigram,bigram,trigram

In [203]:
# Getting unigram,bigram,trigram  
vectorizer = CountVectorizer(ngram_range = (1,3)) 
X1=vectorizer.fit_transform(preprocessed_data['Preprocessed Transcripts'])
trigrams=[]
for text in preprocessed_data['Preprocessed Transcripts']:
    text=[text,]
    X1=(vectorizer.fit_transform(text))
    trigrams.append(vectorizer.get_feature_names()) 

preprocessed_data['trigram_Transcript']=trigrams
dictionary = gensim.corpora.Dictionary(preprocessed_data['trigram_Transcript'])
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_data['trigram_Transcript']]
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5, 
                                   id2word = dictionary,                                    
                                   passes =5,
                                   workers = 1)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.004*"help" + 0.003*"policy" + 0.003*"want" + 0.003*"know" + 0.003*"premium" + 0.002*"time" + 0.002*"sorry" + 0.002*"good" + 0.002*"plan" + 0.002*"insurance"


Topic: 1 
Words: 0.004*"help" + 0.004*"know" + 0.003*"good" + 0.003*"life" + 0.003*"policy_number" + 0.003*"right" + 0.003*"payment" + 0.002*"today" + 0.002*"afternoon" + 0.002*"thing"


Topic: 2 
Words: 0.005*"help" + 0.004*"know" + 0.004*"want" + 0.004*"plan" + 0.003*"life" + 0.003*"today" + 0.003*"policy" + 0.002*"policy_number" + 0.002*"good" + 0.002*"help today"


Topic: 3 
Words: 0.003*"want" + 0.003*"help" + 0.002*"know" + 0.002*"plan" + 0.002*"insurance" + 0.002*"premium" + 0.002*"need" + 0.002*"time" + 0.002*"payment" + 0.002*"want know"


Topic: 4 
Words: 0.003*"help" + 0.003*"want" + 0.003*"know" + 0.002*"plan" + 0.002*"benefit" + 0.002*"want know" + 0.002*"life" + 0.002*"policy" + 0.002*"insurance" + 0.002*"welcome"




In [209]:
doc_lda = lda_model[bow_corpus]
for topic in doc_lda:
    print("Document Topics      : ", topic)  

Document Topics      :  [(2, 0.99509585)]
Document Topics      :  [(4, 0.99589247)]
Document Topics      :  [(4, 0.9951832)]
Document Topics      :  [(0, 0.9951239)]
Document Topics      :  [(2, 0.99540806)]
Document Topics      :  [(4, 0.995648)]
Document Topics      :  [(0, 0.9959039)]
Document Topics      :  [(1, 0.99553454)]
Document Topics      :  [(0, 0.9968035)]
Document Topics      :  [(2, 0.99379224)]
Document Topics      :  [(1, 0.99565315)]
Document Topics      :  [(0, 0.9958857)]
Document Topics      :  [(4, 0.9927942)]
Document Topics      :  [(0, 0.9949156)]
Document Topics      :  [(1, 0.99478734)]
Document Topics      :  [(2, 0.994437)]
Document Topics      :  [(1, 0.99509674)]
Document Topics      :  [(1, 0.99600756)]
Document Topics      :  [(1, 0.9935154)]
Document Topics      :  [(0, 0.9964995)]
Document Topics      :  [(4, 0.99481297)]
Document Topics      :  [(3, 0.99546427)]
Document Topics      :  [(3, 0.994925)]
Document Topics      :  [(1, 0.99523324)]
Documen

In [161]:
doc_lda

<gensim.interfaces.TransformedCorpus at 0x20103e566d8>

In [155]:
trigrams

[['account',
  'account register',
  'account register line',
  'alright',
  'alright unit',
  'alright unit link',
  'angel',
  'angel policy_number',
  'angel policy_number second',
  'assist',
  'assist maybe',
  'assist maybe registration_process',
  'check',
  'check phone',
  'check phone value',
  'check value',
  'check value policy',
  'child',
  'child policy',
  'child policy want',
  'chill',
  'chill insurance',
  'chill insurance verification',
  'continuous',
  'continuous update',
  'continuous update fine',
  'easy',
  'easy time',
  'easy time registration',
  'facility',
  'facility registration_process',
  'facility registration_process icsi',
  'fine',
  'fine register',
  'fine register service',
  'hello',
  'hello child',
  'hello child policy',
  'help',
  'help hello',
  'help hello child',
  'icsi',
  'icsi insurance',
  'icsi insurance registering',
  'inform',
  'inform assist',
  'inform assist maybe',
  'insurance',
  'insurance registering',
  'insurance