# Topic Analysis

In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import spacy

In [2]:
df = pd.read_excel('AllTranscriptions.xlsx')  #reading reviews dataset

In [3]:
print(df.shape)
df.head()

(65, 3)


Unnamed: 0,Sno,Text,State
0,1,"Hi, welcome to Pizza Hut. My name is Christine...",Null
1,2,"Hi, welcome to a Pizza Hut is Chris may have y...",NSW
2,3,Thank you for calling Pizza Hut. This is Jenny...,Newtown
3,4,Thanks for calling Pizza Hut to spell. Can I h...,Victoria
4,5,"Hi, welcome to Pizza Hut. My name is Christine...",Null


In [4]:
df.isnull().sum() #Checking Null Values

Sno      0
Text     0
State    0
dtype: int64

In [5]:
nlp = spacy.load('en_core_web_sm') #Loading spacy english
# Add Stop words
nlp.Defaults.stop_words |= {"hut","called","calling",}

# Text Preprocessing

In [6]:
# Text Preprocessing
import string
punct = string.punctuation
def text_clean(text):
    text = text.lower()  #Convert text in lower case
    punc_removed = [char for char in text if char not in punct]  #Removing Punctuations
    punc_removed_join = ''.join(punc_removed) 
    
    removal=['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']  #get Noun phrase
    text_out = []
    doc= nlp(punc_removed_join)
    for token in doc:
        if token.is_stop == False and token.is_alpha and len(token)>2 and token.pos_ not in removal:
            lemma = token.lemma_            #lemmatization of token word
            text_out.append(lemma)
            
    return text_out

In [7]:
df['clean_doc'] = df['Text'].apply(text_clean) #Apply Preprocessing of text
df.head()

Unnamed: 0,Sno,Text,State,clean_doc
0,1,"Hi, welcome to Pizza Hut. My name is Christine...",Null,"[welcome, pizza, christine, pleasehi, dear, ip..."
1,2,"Hi, welcome to a Pizza Hut is Chris may have y...",NSW,"[welcome, pizza, chris, pleasesee, pay, like, ..."
2,3,Thank you for calling Pizza Hut. This is Jenny...,Newtown,"[thank, pizza, jenny, nameis, pizza, cannewtow..."
3,4,Thanks for calling Pizza Hut to spell. Can I h...,Victoria,"[thank, pizza, spell, pleasehi, pick, threei, ..."
4,5,"Hi, welcome to Pizza Hut. My name is Christine...",Null,"[welcome, pizza, christine, pleasehi, dear, ip..."


In [8]:

# print(df['review'][1])
# print(len(df['review'][1]))
# print(df['clean_doc'][1])
# print(len(df['clean_doc'][1]))
# print("percent token : %0.2f" %(float(len(df['clean_doc'][1]) * 100)/len(df['review'][1])))

In [9]:
# Getting text in list
val = []
for i in range(len(df)):
    val.append(df['clean_doc'][i])
# val

In [10]:
# import gensim library

from gensim import models,corpora
# Converting str to int in corpora
dictionary = corpora.Dictionary(val)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x174b6d01af0>

In [11]:
bow_corpus = [dictionary.doc2bow(doc) for doc in val]   #Creating bag of words
# bow_corpus

# Topic Identification

In [12]:
# process gensim model for topic identification
import gensim
lda_model =  gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, passes = 10,workers = 2)

In [13]:
lda_model

<gensim.models.ldamulticore.LdaMulticore at 0x174b38d2fd0>

In [14]:
# Printing Topics and Its keywords
num_topics = 10
for i in range(num_topics):
    tt = lda_model.get_topic_terms(i,10)
    topic = ', '.join([dictionary[pair[0]] for pair in tt])
    print("TOPIC: {} \nTOPIC WORDS : {}".format(i+1, topic ))
    print()

TOPIC: 1 
TOPIC WORDS : order, time, place, pizza, store, let, thank, track, number, delivery

TOPIC: 2 
TOPIC WORDS : price, pizza, garlic, thank, pepperoni, large, bread, park, delivery, street

TOPIC: 3 
TOPIC WORDS : unique, app, mattress, thatuh, mystic, excited, emaildidoh, folderwhat, satellite, basisuh

TOPIC: 4 
TOPIC WORDS : order, pizza, thank, like, pick, want, large, ready, let, traditional

TOPIC: 5 
TOPIC WORDS : pizza, order, like, pick, thank, chicken, yeah, add, go, want

TOPIC: 6 
TOPIC WORDS : thank, pizza, order, like, pick, number, yeah, store, yes, know

TOPIC: 7 
TOPIC WORDS : pizza, order, thank, like, yeah, pick, number, code, let, ill

TOPIC: 8 
TOPIC WORDS : pizza, like, thank, order, barbecue, large, lover, meat, delivery, chicken

TOPIC: 9 
TOPIC WORDS : order, pizza, place, get, want, thank, line, say, number, alpha

TOPIC: 10 
TOPIC WORDS : order, like, pizza, store, thank, number, email, say, yeah, yes



In [15]:
# for idx, topic in lda_model.print_topics():
#     print("Topic: {} \n Words: {}".format(idx, topic ))
#     print("\n")

In [16]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
vis