In [1]:
data_dir = 'C:/Users/maruv/Desktop/DSB/LDA/'

In [2]:
import json
import os
import glob
import numpy as np
from scipy.stats import itemfreq

In [3]:
import pandas as pd
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
'''Read the numpy array'''
df = getDF(data_dir+'reviews_Cell_Phones_and_Accessories_5.json.gz')

In [5]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4.0,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5.0,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5.0,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4.0,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5.0,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [6]:
'''Get the item list'''
item_id = df.as_matrix(columns=df.columns[1:2])
freq_items =itemfreq(item_id)
#sorted_items = freq_items[a[:,1].argsort()]

In [7]:
'''Creating a dataframe of frequent items for ease of use'''
freq_df = pd.DataFrame(data=freq_items,columns=['item','freq'])

In [8]:
freq_df.head()

Unnamed: 0,item,freq
0,120401325X,7
1,3998899561,10
2,6073894996,37
3,7532385086,9
4,7887421268,13


In [9]:
'''Accesing the maximum Frequency item'''
freq_df.loc[freq_df['freq'].idxmax()]

item    B005SUHPO6
freq           837
Name: 3340, dtype: object

In [10]:
'''Creating a dataframe of items with highest reviews'''
max_reviewed_item = df.loc[df.asin.isin(['B005SUHPO6']),['reviewText']]
len(max_reviewed_item) #to check if executed properly, checking length of data frame with the frequency from earlier line of code

837

## Finding topics for most reviewed item
    The next few steps try to extract the topics from the most reviewed items.

In [11]:
max_reviewed_item.head()

Unnamed: 0,reviewText
59707,excellent product at 1/2 the price as sale at ...
59708,Sometimes the flap over the charging place is ...
59709,Great case. Fits like every other Otterbox De...
59710,Use these for our technicians and anyone that ...
59711,It's very strong and protects my 4S phone! I t...


In [12]:
final_max_corp = max_reviewed_item['reviewText']

## Cleaning the text
    For the given reviews, in the next few lines of code, stop words and punctuations will me removed and duplicate words are omitted.
    Code taken from : https://github.com/birupakhya/Topic-Modeling-of-Amazon-Reviews/blob/master/Topic%20Modeling%20of%20Amazon%20Reviews.ipynb

In [13]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_max_clean = [clean(doc).split() for doc in final_max_corp]

In [14]:
doc_max_clean[:2]

[['excellent',
  'product',
  '12',
  'price',
  'sale',
  'electronic',
  'store',
  'wow',
  'fit',
  'perfect',
  'iphone'],
 ['sometimes',
  'flap',
  'charging',
  'place',
  'hard',
  'stay',
  'locked',
  'in',
  'keep',
  'trying',
  'trying',
  'lock',
  'there',
  'drive',
  'crazy',
  'love',
  'color',
  'bought',
  'blue',
  'one',
  'used',
  'yet',
  'maybe',
  'next',
  'year',
  'like',
  'change',
  'awhileother',
  'locking',
  'flap',
  'happy',
  'them']]

## Creating the document term matrix
    This piece of code creates a document term matrix that can further be used to build LDA models.
    
    Testing for complete reviews on the same item.

In [16]:
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_max_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_max_term_matrix = [dictionary.doc2bow(doc) for doc in doc_max_clean]

Using TensorFlow backend.


## Building the topics using gensim
    

In [17]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_max_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [18]:
print(ldamodel.print_topics(num_topics=5, num_words=3))

[(0, '0.023*"otterbox" + 0.014*"iphone" + 0.013*"phone"'), (1, '0.037*"case" + 0.030*"phone" + 0.022*"it"'), (2, '0.039*"phone" + 0.030*"case" + 0.011*"well"'), (3, '0.051*"case" + 0.025*"phone" + 0.016*"iphone"'), (4, '0.022*"phone" + 0.014*"case" + 0.013*"otterbox"')]


## Observations:

    It can be observed that the product with asin : B005SUHPO6 is a phone case, from intial topics. 
    
    -> It was observed that people are taking about the "DEFENDER" series of cases and point out that the cases are "great"!
        This happened when the topics were collected on small number of reviews (ten).
        Link to product: https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=defender+iphone+7&rh=i%3Aaps%2Ck%3Adefender+iphone+7
        
    -> Later when the topics were collected on the complete set(837), it can be observed that people talk about otterbox iphone case.

# Test for items with fewer reviews

In [19]:
'''Accesing the minimum Frequency item'''
freq_df.loc[freq_df['freq'].idxmin()]

item    8199406933
freq             5
Name: 5, dtype: object

In [20]:
'''Creating a dataframe of items with highest reviews'''
min_reviewed_item = df.loc[df.asin.isin(['8199406933']),['reviewText']]
len(min_reviewed_item) #to check if executed properly, checking length of data frame with the frequency from earlier line of code

5

In [21]:
min_reviewed_item.head()

Unnamed: 0,reviewText
76,"very good charger, it woks fine, no complaints..."
77,This product arrived when promised and in the ...
78,Great product. Use this with my Galaxy S4. At ...
79,Honestly I have always loved this specific mod...
80,"I've had this charger well over a year now, an..."


In [22]:
final_min_corp = min_reviewed_item['reviewText']

In [23]:
doc_min_clean = [clean(doc).split() for doc in final_min_corp]

In [24]:
doc_min_clean[:2]

[['good',
  'charger',
  'wok',
  'fine',
  'complaint',
  'would',
  'recommend',
  'it',
  'good',
  'using',
  'anymore'],
 ['product',
  'arrived',
  'promised',
  'condition',
  'promised',
  'genuine',
  'blackberry',
  'charger',
  'brand',
  'new',
  'worked',
  'perfectly',
  'price',
  'dont',
  'think',
  'ask',
  'much',
  'that']]

In [25]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_min_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_min_term_matrix = [dictionary.doc2bow(doc) for doc in doc_min_clean]

In [26]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_min_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [27]:
print(ldamodel.print_topics(num_topics=5, num_words=3))

[(0, '0.081*"good" + 0.044*"would" + 0.044*"recommend"'), (1, '0.072*"charger" + 0.038*"work" + 0.038*"now"'), (2, '0.013*"charger" + 0.013*"would" + 0.013*"price"'), (3, '0.065*"promised" + 0.035*"product" + 0.035*"price"'), (4, '0.048*"end" + 0.048*"need" + 0.026*"ever"')]


## Observations:
    Fewer data is not so helpful for observing the topics, only the second topic was able to point out correctly that the reviews were about a charger. However, it can catch the sentiment of the reviews as good.
    
    Also, most of the time the second topic gives the exact topic and the first one is pointing out the sentiment.