# Nouns only Topic Modelling
This notebook identifies the topics discussed in the reviews by keeping only the nouns of each review and discarding every other word.

In [41]:
import os

# Folder that holds the review / metadata archives read further down.
# Set the LDA_DATA_DIR environment variable to point at another location;
# without it the original local path is used.
data_dir = os.environ.get('LDA_DATA_DIR', 'C:/Users/maruv/Desktop/DSB/LDA/')
# File names are appended with plain string concatenation later on, so the
# folder must end with a path separator.
if not data_dir.endswith(('/', '\\')):
    data_dir += '/'

### Reading the Data
This piece of code unzips the package and parses the JSON document into a pandas DataFrame.

In [42]:
import json
import os
import glob
import numpy as np
from scipy.stats import itemfreq

In [43]:
import pandas as pd
import gzip

def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    path: location of the gzip archive; every line is expected to hold a
          single dict-style literal.
    Yields the object produced by evaluating each line.
    """
    # The context manager closes the archive as soon as the generator is
    # exhausted or discarded; previously the handle was never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() runs whatever code the line contains. Only feed
            # this trusted files; ast.literal_eval / json.loads are safer if
            # the file format permits them.
            yield eval(l)

def getDF(path):
    """Load every record produced by parse(path) into a pandas DataFrame.

    Records are keyed by their 0-based position in the file, which becomes
    the row index of the returned frame.
    """
    rows_by_position = {position: record for position, record in enumerate(parse(path))}
    return pd.DataFrame.from_dict(rows_by_position, orient='index')

In [44]:
# Build the archive locations first, then load each one into its own frame.
reviews_archive = data_dir + 'reviews_Cell_Phones_and_Accessories_5.json.gz'
metadata_archive = data_dir + 'meta_Cell_Phones_and_Accessories.json.gz'

df_reviews = getDF(reviews_archive)
df_metadata = getDF(metadata_archive)

### Functions to Extract Nouns
By using the part-of-speech (POS) tagger in NLTK, we can extract the nouns from a given sentence/review.

In [45]:
import nltk as nt
# POS tags (Penn Treebank names) whose words are kept as "nouns".
# The personal-pronoun tags "PRP$" and "PRP" are deliberately left out.
noun_tags = [
    "NN",   # noun, singular or mass
    "NNP",  # proper noun, singular
    "NNS",  # noun, plural
    "POS",  # possessive ending
    "WP",   # wh-pronoun
]

In [46]:
def tags(sentence):
    """Tokenize `sentence` with NLTK and return the POS-tagged tokens."""
    return nt.pos_tag(nt.word_tokenize(sentence))

In [47]:
def noun_words(review):
    """Return the words of `review` whose POS tag appears in `noun_tags`.

    review: text of a single review.
    Returns a list of words in their original order; repeated words are kept.
    """
    # A membership test replaces the nested scan over noun_tags, so a token is
    # appended at most once even if noun_tags ever contains a repeated tag.
    return [word for word, tag in tags(review) if tag in noun_tags]

Forming a new data frame with the reviews of a selected item

In [48]:
# Creating a dataframe of items with required item reviews:
# keep only the review text of the rows whose `asin` is the chosen product.
is_selected_item = df_reviews.asin.isin(['B005SUHPO6'])
max_reviewed_item = df_reviews.loc[is_selected_item, ['reviewText']]
max_reviewed_item.head()

Unnamed: 0,reviewText
59707,excellent product at 1/2 the price as sale at ...
59708,Sometimes the flap over the charging place is ...
59709,Great case. Fits like every other Otterbox De...
59710,Use these for our technicians and anyone that ...
59711,It's very strong and protects my 4S phone! I t...


In [49]:
# The corpus is the review-text column of the selected item's frame.
final_max_corp = max_reviewed_item.loc[:, 'reviewText']

## Extracting only Nouns and various forms of nouns
Nouns were extracted; personal pronouns (I, we, them, they, etc.) are not considered.

In [52]:
# One list of nouns per review, in the same order as the corpus.
total_nouns = [noun_words(review_text) for review_text in final_max_corp]

In [53]:
# Sanity check: number of processed reviews and the nouns of the first one.
(len(total_nouns), total_nouns[0])

(837, ['product', 'price', 'sale', 'store', 'fit', 'perfect', 'iphone'])

Since we already have a cleaned matrix, we are not using NLTK functions to remove stop words or punctuation.

## Creating Doc Term Matrix

This piece of code creates a document term matrix that can further be used to build LDA models.

Testing for complete reviews on the same item.

In [54]:
import gensim
from gensim import corpora

# Term dictionary of the corpus: every unique noun gets its own integer index.
dictionary = corpora.Dictionary(total_nouns)

# Document-term matrix: each review's noun list encoded as a bag of words
# through the dictionary built above.
doc_max_term_matrix = list(map(dictionary.doc2bow, total_nouns))

## Building Topics using gensim

In [55]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document term matrix.
# random_state pins the model's random initialisation so that re-running the
# notebook reproduces the same topics instead of a different set each time.
ldamodel = Lda(doc_max_term_matrix, num_topics=5, id2word=dictionary, passes=50, random_state=42)

In [56]:
# Show the three highest-weighted terms of each of the five topics.
top_terms_per_topic = ldamodel.print_topics(num_topics=5, num_words=3)
print(top_terms_per_topic)

[(0, '0.015*"OtterBox" + 0.008*"Series" + 0.007*"one"'), (1, '0.062*"case" + 0.026*"Otterbox" + 0.024*"phone"'), (2, '0.053*"phone" + 0.041*"case" + 0.035*"iPhone"'), (3, '0.067*"phone" + 0.067*"case" + 0.016*"cases"'), (4, '0.084*"case" + 0.063*"phone" + 0.021*"screen"')]
