In [1]:
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
"""Load data..."""

import load
df = load.loadDF(1e11)

#Slice required columns
df_reviews = df[['reviewText','category']]

category = df_reviews.category.unique()
data = [[str(i) for i in df_reviews[df_reviews.category == cat]['reviewText']] for cat in category] 

len(category)

11

In [10]:
n_samples = 20
n_features = 100000
n_components = 10
n_top_words = 10


In [4]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [5]:
"""COUNT VECTORIZER FOR LDA"""
ct_vectorizer = CountVectorizer(min_df=0.01,max_features=n_features,
                                stop_words='english')

ct_vectorizer.fit([i for j in data for i in j])

ct = [ct_vectorizer.transform(i) for i in data]


print('A total of %s features extracted' %(len(ct_vectorizer.get_feature_names())))
print('First 20 features extracted:')
ct_vectorizer.get_feature_names()[0:20]

A total of 377 features extracted
First 20 features extracted:


['10',
 '100',
 '20',
 'able',
 'absolutely',
 'action',
 'actually',
 'add',
 'ago',
 'amazing',
 'amazon',
 'aren',
 'arrived',
 'art',
 'available',
 'away',
 'awesome',
 'bad',
 'based',
 'beat']

In [6]:
"""Stemmed vectroizer"""
import multiprocessing as mp
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def stemm_1item(it):
    return ' '.join([stemmer.stem(word) for word in it.split(' ')])
    
#new_corpus=[stemm_1item(text) for text in data]

with mp.Pool() as pool:
    new_corpus = pool.map(stemm_1item, [i for j in data for i in j])

ct_vectorizer.fit(new_corpus)

ct = [ct_vectorizer.transform(i) for i in data]

print('A total of %s features extracted' %(len(ct_vectorizer.get_feature_names())))
print('First 20 features extracted:')
ct_vectorizer.get_feature_names()[0:20]

A total of 412 features extracted
First 20 features extracted:


['10',
 '100',
 '20',
 'abl',
 'absolut',
 'action',
 'actual',
 'ad',
 'add',
 'addit',
 'ago',
 'allow',
 'alreadi',
 'alway',
 'amaz',
 'amazon',
 'ani',
 'anoth',
 'anyon',
 'anyth']

In [7]:
"""Lemmatized Vectorizer"""
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

def lemm_1item(it):
    return ' '.join([lemmer.lemmatize(word, pos = 'v') for word in it.split(' ')])

with mp.Pool() as pool:
    new_corpus = pool.map(lemm_1item, [i for j in data for i in j])

ct_vectorizer.fit(new_corpus)

ct = [ct_vectorizer.transform(i) for i in data]

print('A total of %s features extracted' %(len(ct_vectorizer.get_feature_names())))
print('First 20 features extracted:')
ct_vectorizer.get_feature_names()[0:20]

A total of 375 features extracted
First 20 features extracted:


['10',
 '100',
 '20',
 'able',
 'absolutely',
 'action',
 'actually',
 'add',
 'ago',
 'allow',
 'amaze',
 'amazing',
 'amazon',
 'aren',
 'arrive',
 'art',
 'available',
 'away',
 'awesome',
 'bad']

In [11]:
"""Performing LDA on Vectorized set"""
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,
                                n_jobs=-1)


for cat in range(len(category)):
    #lda_fit = lda.fit(ct[cat])
    print("\nTopics in LDA model for %s:" %(category[cat]))
    tf_feature_names = ct_vectorizer.get_feature_names()
    print_top_words(lda_fit, tf_feature_names, n_top_words)


Topics in LDA model for Software:
Topic #0: great product fast good super deal original add wonderful ship
Topic #1: work just used don want card look didn buy like
Topic #2: time need just way long don make plastic tape like
Topic #3: use easy amazon used home product better come problem light
Topic #4: nice like really fine use little write just pencil desk
Topic #5: good quality price colors set cartridges best box different exactly
Topic #6: love perfect buy recommend definitely highly book size especially sturdy
Topic #7: printer ink paper pen color print black cartridge smooth blue
Topic #8: pens office year expensive cost worth school machine old use
Topic #9: works great phone excellent item ok perfectly useful power stuff


Topics in LDA model for All_Beauty:
Topic #0: great product fast good super deal original add wonderful ship
Topic #1: work just used don want card look didn buy like
Topic #2: time need just way long don make plastic tape like
Topic #3: use easy amazon us

In [13]:
len([i for j in data for i in j])

2068055

In [None]:
# importing all necessery modules 
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 
# Reads data files
#df = load.loadDF()



stopwords = set(STOPWORDS) 


# iterate through the csv file 
for val in data: 
    comment_words = '' 
    # typecaste each val to string 
    val = str(val)
  
    # split the value 
    tokens = val.split() 
      
    # Converts each token into lowercase 
    for i in range(len(tokens)): 
        tokens[i] = tokens[i].lower() 
      
    comment_words += " ".join(tokens)+" "
  
    wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 10).generate(comment_words) 
  
    # plot the WordCloud image                        
    plt.figure(figsize = (8, 8), facecolor = None) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
  
    plt.show() 
