In [1]:
import pandas as pd
import numpy as np

In [2]:
new = pd.read_csv('Amazon_review_sentiment_score.csv', index_col=0)
new

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,polarity,subjectivity,good review
0,1,B001E4KFG0,A3SGXH7AUHU8GW,1,1,5,2011-04-27,good quality dog food,bought several vitality canned dog food produc...,0.425000,0.400000,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,0,0,1,2012-09-07,not as advertised,product arrived labeled jumbo salted peanutsth...,0.216667,0.762963,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,1,1,4,2008-08-18,delight says it all,confection around centuries light pillowy citr...,0.187000,0.548000,1
3,4,B000UA0QIQ,A395BORC6FGVXV,3,3,2,2011-06-13,cough medicine,looking secret ingredient robitussin believe f...,0.150000,0.650000,0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,0,0,5,2012-10-21,great taffy,great taffy great price wide assortment yummy ...,0.458333,0.600000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
568406,568450,B001EO7N10,A28KG5XORO54AY,0,0,5,2011-03-09,will not do without,great sesame chickenthis good better resturant...,0.675000,0.662500,1
568407,568451,B003S1WTCU,A3I8AFVPEE8KI5,0,0,2,2012-03-09,disappointed,disappointed flavor chocolate notes especially...,-0.250000,0.492857,0
568408,568452,B004I613EE,A121AA1GQV751Z,2,2,5,2012-02-21,perfect for our maltipoo,stars small give 1015 one training session tri...,-0.021875,0.418750,1
568409,568453,B004I613EE,A3IBEVCTXKNOH,1,1,5,2012-03-13,favorite training and reward treat,best treats training rewarding dog good groomi...,0.521429,0.678571,1


# Topic Modeling: Good reviews + only NOUNS

In [4]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [6]:
data_nouns = pd.DataFrame(new_good_reviews.Text.apply(nouns))
data_nouns

Unnamed: 0,Text
0,vitality food products quality product stew me...
2,confection centuries pillowy gelatin nuts case...
4,price assortment yummy taffy delivery taffy lo...
5,hair taffy pound bag flavors root beer melon p...
6,saltwater taffy flavors chewy none candies ver...
...,...
568405,complaint theres use amount spice jar sister
568406,sesame chickenthis resturants husband recipes
568408,stars training session train dog ceaser dog tr...
568409,treats dog calories doggies potatoes wet noses


In [7]:
new_good_reviews= new.loc[new['good review'] ==1]

# Create document-term matrix
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv_matrix = cv.fit_transform(data_nouns['Text'])

In [8]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=4)

# document topic matrix for cv_matrix
lda_output = lda.fit_transform(cv_matrix)
print(lda_output.shape)
print(lda_output)

(486404, 4)
[[0.02326464 0.02401771 0.92964661 0.02307104]
 [0.19044229 0.01232999 0.01287973 0.78434799]
 [0.30262062 0.02825773 0.02850944 0.64061221]
 ...
 [0.01318844 0.0135063  0.96067125 0.01263401]
 [0.03125577 0.23445684 0.69857643 0.03571095]
 [0.04414069 0.64535789 0.04262532 0.2678761 ]]


In [9]:
# topics and words matrix
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

(4, 127730)
[[0.25002756 0.25000852 1.249854   ... 1.24989451 1.24986764 0.25002337]
 [0.25003488 0.25001506 0.25005564 ... 0.25004876 0.25005153 1.24829167]
 [0.25003091 0.25001283 0.25004865 ... 0.2500288  0.250041   0.25166263]
 [1.24990665 3.24996358 0.25004171 ... 0.25002794 0.25003983 0.25002233]]


In [10]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(data_nouns))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,topic
Doc0,0.02,0.02,0.93,0.02,2
Doc1,0.19,0.01,0.01,0.78,3
Doc2,0.3,0.03,0.03,0.64,3
Doc3,0.14,0.01,0.18,0.67,3
Doc4,0.02,0.12,0.02,0.84,3
Doc5,0.05,0.05,0.05,0.85,3
Doc6,0.04,0.04,0.68,0.24,2
Doc7,0.03,0.03,0.9,0.03,2
Doc8,0.09,0.72,0.01,0.18,1
Doc9,0.01,0.01,0.91,0.06,2


In [11]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
3,150445
0,146973
2,100467
1,88519


In [12]:
# topic word matrix
print(lda.components_)

# topic-word matrix
df_topic_words = pd.DataFrame(lda.components_)

# column and index
df_topic_words.columns = cv.get_feature_names()
df_topic_words.index = topic_names

df_topic_words.head()

[[0.25002756 0.25000852 1.249854   ... 1.24989451 1.24986764 0.25002337]
 [0.25003488 0.25001506 0.25005564 ... 0.25004876 0.25005153 1.24829167]
 [0.25003091 0.25001283 0.25004865 ... 0.2500288  0.250041   0.25166263]
 [1.24990665 3.24996358 0.25004171 ... 0.25002794 0.25003983 0.25002233]]


Unnamed: 0,078ounce,092ounce,0ptions,100calories,100cals,100degrees,10calories,10cents,10packs,10years,...,zzzzzs,zzzzzz,zzzzzzzz,zzzzzzzzzzbr,µg,½inch,½ounce,ça,çaykur,île
Topic0,0.250028,0.250009,1.249854,0.250047,0.250012,0.250006,0.267647,0.250016,2.241221,0.281597,...,5.249223,0.250008,0.263456,0.362666,0.250002,0.250002,0.250012,1.249895,1.249868,0.250023
Topic1,0.250035,0.250015,0.250056,10.832942,0.250017,0.250007,2.232332,0.253344,0.253854,1.205042,...,0.25003,0.250014,0.325747,1.137252,2.249992,1.248971,0.262687,0.250049,0.250052,1.248292
Topic2,0.250031,0.250013,0.250049,0.253658,0.253061,0.250007,0.25001,0.250016,0.250007,0.263341,...,0.250021,1.240675,0.250006,0.250045,0.250003,0.251025,0.704127,0.250029,0.250041,0.251663
Topic3,1.249907,3.249964,0.250042,0.663354,1.246909,1.24998,0.25001,1.246624,0.254918,0.25002,...,0.250726,0.259304,1.160791,0.250038,0.250002,0.250002,0.783173,0.250028,0.25004,0.250022


In [13]:
# print top n keywords for each topic
def print_topic_words(cv, lda_model, n_words):
    words = np.array(cv.get_feature_names())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))
    return topic_words

topic_keywords = print_topic_words(cv=cv, lda_model=lda, n_words=15)        

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,coffee,tea,flavor,taste,cup,product,price,time,order,drink,use,water,day,amazon,morning
Topic 1,water,flavor,taste,product,oil,use,salt,sauce,rice,time,dont,coconut,sugar,ingredients,price
Topic 2,food,dog,dogs,product,treats,cat,cats,time,treat,day,price,loves,eat,foods,years
Topic 3,taste,chocolate,flavor,chips,snack,cookies,sugar,product,milk,bars,butter,bag,time,calories,eat
