In [27]:
#Packages Used and Imported
import re #clean our text using regex
import csv #this will read the csv file
from collections import defaultdict # accumlating values
from nltk.corpus import stopwords #remove stopwords
from gensim import corpora #create corpus and dictionary for LDA model
from gensim.models import LdaModel #use the LDA model
import pyLDAvis.gensim #visualise LDA model 
import pandas as pd # bring in panda data frames to easier readable format
import nltk #nlp library to perform NLP
nltk.download('stopwords') #stopwords run trial

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
#This is performing file operations to get the file contents
fileContents = defaultdict(list)
with open('FinalData.csv', 'r', encoding="utf-8") as f: #configure utf-8 to bypass any text issues
    reader = csv.DictReader(f)
    for row in reader: # read a row in this format {column1: value1, column2: value2,...}
        for (k,v) in row.items(): # go over each column name and value. K and V is each one value
            fileContents[k].append(v) # this will append the value into the appropriate list

In [29]:
#print file contents for review (#in our case it is under the extract column in the csv file)
from nltk.tokenize import word_tokenize
reviews = fileContents[ 'extract']
print(reviews) 



In [30]:
#cleaning the data. This will remove uncessary spaces between words and punctuation as well
reviews = [re.sub(r'[^\w\s]','',str(item)) for item in reviews]
print(reviews)



In [31]:
#Stop Words Implementation
# It is important to use stopwords because stop words occur in abundance mean they provide little to no unique information
#stopwords = set(stopwords.words('english'))
#from nltk.corpus import stopwords
#stopwords=stopwords.words('english')
#stopwords.extend(["diehard", "sim", "phone", "week", "release","used",]) 
#print(stopwords)
#clean = [word for word in reviews if word not in stopwords]
#clean_text = ' '.join(clean)
#words = re.findall('\w+', clean_text)
#print(clean)

In [33]:
#this is splitting the words using comma so it will be easier to read each word
#this also make sure this is also implemented alongside the stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords=stopwords.words('english')
#use extend for specific words that add no value or use to the model. Pretty important
stopwords.extend(["diehard", "sim", "phone", "phones", "mobile","smartphone", "iphone", "Apple", "apple", "blakberry", "Blackberry", "Nokia","nokia", "Samsung", "samsung", "Use", "Nubia", "Talk", "im","1", "2", "3", "4", "5", "6", "7", "8", "9", "one", "bought", "Bought", "case", "android", "case", "One"]) 
texts = [[word for word in document.lower().split() if word not in stopwords] for document in reviews]
print(texts)



In [34]:
#Taking out the less frequent words
#take a look at the words with most occurences
frequency = defaultdict(int)
for text in texts:
    for token in text:
         frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]
print(frequency)



In [35]:
#This will assemble our text into a corpus dictionary
dictionary = corpora.Dictionary(texts)
print(dictionary)

Dictionary(3631 unique tokens: ['amazed', 'amazing', 'battery', 'defective', 'device']...)


In [36]:
#Convert document (a list of words) into the bag-of-words format. 
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)], [(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)], [(28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)], [(43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1)], [(9, 1), (12, 1), (48, 1), (56, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1)], [(50, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1)], [(56, 1), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (10

In [37]:
#We will guess the number of topics that could be in our LDA model
#Note: For the LDA model to run we need a corpus, a set number of topics, a dicationary, and a set number of iterations
#that will be passed in your paramter also known as paramter passing. Finally, you can visualize the model. 
##This might take some time. So please be patient.
NUM_TOPICS = 20
ldamodel = LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
print(ldamodel)

LdaModel(num_terms=3631, num_topics=20, decay=0.5, chunksize=2000)


In [38]:
#This will show LDA model topics
#Yes, this does look hard to read and messy that's why we setup pandas dataframe below to read it easier
topics = ldamodel.show_topics()
for topic in topics:
    print(topic)

(16, '0.116*"great" + 0.107*"good" + 0.060*"product" + 0.050*"excellent" + 0.043*"works" + 0.036*"condition" + 0.032*"price" + 0.025*"service" + 0.020*"problems" + 0.018*"love"')
(5, '0.037*"item" + 0.022*"buying" + 0.021*"buy" + 0.016*"go" + 0.015*"would" + 0.015*"money" + 0.012*"iphones" + 0.012*"product" + 0.011*"cheap" + 0.011*"loved"')
(8, '0.042*"working" + 0.034*"fine" + 0.026*"works" + 0.024*"everything" + 0.023*"exactly" + 0.020*"would" + 0.017*"worked" + 0.017*"box" + 0.017*"return" + 0.016*"properly"')
(15, '0.016*"screen" + 0.016*"time" + 0.014*"get" + 0.013*"apps" + 0.013*"touch" + 0.012*"good" + 0.012*"would" + 0.012*"thats" + 0.011*"days" + 0.011*"great"')
(2, '0.049*"battery" + 0.037*"life" + 0.036*"good" + 0.035*"awesome" + 0.031*"fast" + 0.029*"love" + 0.028*"great" + 0.025*"performance" + 0.022*"camera" + 0.017*"quality"')
(0, '0.066*"work" + 0.031*"could" + 0.029*"didnt" + 0.026*"att" + 0.026*"charger" + 0.025*"sprint" + 0.025*"unlocked" + 0.015*"get" + 0.014*"would

In [39]:
##This is our setup of pandas dataframe below to read it easier
#This is shown like an excel format for each topic
word_dict = {};
for i in range(NUM_TOPICS):
    words = ldamodel.show_topic(i, topn = 20)
    word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words]
pd.DataFrame(word_dict)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10,Topic # 11,Topic # 12,Topic # 13,Topic # 14,Topic # 15,Topic # 16,Topic # 17,Topic # 18,Topic # 19,Topic # 20
0,work,camera,battery,good,best,item,battery,great,working,talk,use,wanted,card,happy,nice,screen,great,son,new,perfect
1,could,good,life,battery,blackberry,buying,charge,like,fine,found,easy,3g,great,keyboard,described,time,good,get,like,galaxy
2,didnt,screen,good,heating,pleased,buy,using,years,works,looking,need,want,brilliant,amazing,cell,get,product,really,came,screen
3,att,quality,awesome,issues,ever,go,bad,os,everything,straight,screen,contract,hand,loves,month,apps,excellent,best,used,even
4,charger,better,fast,call,work,would,day,daughter,exactly,back,text,plan,still,scratch,needed,touch,works,description,battery,htc
5,sprint,best,love,20,time,money,even,blackberry,would,received,functions,reliable,full,basic,broke,good,condition,first,works,s8
6,unlocked,front,great,problem,long,iphones,months,using,worked,locked,great,data,sd,say,would,would,price,dont,well,n
7,get,display,performance,quality,verizon,product,charging,new,box,ordered,texting,expensive,yes,screen,within,thats,service,model,really,size
8,would,light,camera,issue,used,cheap,took,apps,return,get,old,back,fantastic,nice,replaced,days,problems,good,brand,change
9,card,comes,quality,goes,button,loved,dont,bb,properly,sent,people,seller,apps,tough,charger,great,love,try,screen,much


In [40]:
#we use a pyplot to display the LDA model in a multi-dimeonsinal model
#we can also traverse through mutiple topics in a multi-heirchal model
#for our insights, the most relevant terms with the most frequency can give the most accurate reviews and vice versa
#as people could commonly come on an agreement on a certain issue of the smartphone or a likeable feature
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)