In [1]:
# !pip install textblob
# !pip install plotly
#!pip install cufflinks

In [2]:
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from textblob import TextBlob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the reviews CSV (path is relative to the working directory).
# Later cells read its 'review' and 'stars' columns.
df = pd.read_csv('etiko_reviews.csv')

In [4]:
# Empty frame; the statements after preprocess() fill it with the cleaned
# review text and the features derived from it.
df1 = pd.DataFrame()
def preprocess(ReviewText):
    """Strip HTML remnants from a Series of review strings.

    Removes ``<br/>`` tags, whole ``<a ...>...</a>`` anchors and the
    ``&amp``/``&gt``/``&lt`` entities (with their trailing ``;`` when
    present), and turns non-breaking spaces into ordinary spaces.

    Args:
        ReviewText: pandas Series of strings; missing values pass through
            unchanged.

    Returns:
        A new Series of the same length with the substitutions applied.
    """
    # (pattern, replacement) pairs, applied in this order.
    cleanup_rules = [
        (r"<br/>", ""),
        # Non-greedy so that text sitting between two separate links survives.
        (r"<a.*?>.*?</a>", ""),
        (r"&amp;?", ""),
        (r"&gt;?", ""),
        (r"&lt;?", ""),
        ("\xa0", " "),
    ]
    for pattern, replacement in cleanup_rules:
        # regex=True must be explicit: pandas 2.0 switched the default of
        # Series.str.replace to literal matching.
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText
# Cleaned review text plus simple per-review features.
df1['Review Text'] = preprocess(df['review'])
# str() guards against non-string cells (e.g. NaN for a missing review), which
# TextBlob would reject; it matches the str coercion used for the two length
# features below.
df1['polarity'] = df1['Review Text'].map(lambda text: TextBlob(str(text)).sentiment.polarity)
df1['review_len'] = df1['Review Text'].astype(str).apply(len)  # characters per review
df1['word_count'] = df1['Review Text'].apply(lambda x: len(str(x).split()))  # whitespace-separated tokens
df1['rating'] = df['stars']

In [5]:
# Polarity tells us how positive or negative a sentence is: the closer it is to +1,
# the more positive the sentence; the closer it is to -1, the more negative.

import plotly
from plotly.offline import iplot

# Histogram of the per-review polarity scores, in 50 bins.
df1['polarity'].iplot(
    kind='hist', bins=50, linecolor='black',
    title='Sentiment Polarity Distribution',
    xTitle='polarity', yTitle='count',
)

In [6]:
# NOTE: plain assignment, so `dr` is a second name for the same DataFrame
# object as `df1`, not a copy.
dr = df1

In [7]:
def get_top_n_words(corpus, n=None):
    """Return the most frequent words in `corpus`, ignoring English stop words.

    Args:
        corpus: iterable of text documents, as accepted by CountVectorizer.
        n: how many (word, count) pairs to return; None returns all of them.

    Returns:
        List of (word, count) tuples ordered by count, highest first.
    """
    vec = CountVectorizer(stop_words='english')
    # One pass learns the vocabulary and builds the count matrix; the earlier
    # fit() followed by transform() tokenised the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each vocabulary term.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Twenty most frequent words (stop words excluded), drawn as a bar chart.
common_words = get_top_n_words(dr['Review Text'], 20)
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df2.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 words in reviews after removing stop words',
    )
)

In [8]:
def get_top_n_bigram(corpus, n=None):
    """Return the most frequent two-word phrases in `corpus`, ignoring English stop words.

    Args:
        corpus: iterable of text documents, as accepted by CountVectorizer.
        n: how many (bigram, count) pairs to return; None returns all of them.

    Returns:
        List of (bigram, count) tuples ordered by count, highest first.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # One pass learns the vocabulary and builds the count matrix; the earlier
    # fit() followed by transform() tokenised the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each bigram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Twenty most frequent stop-word-free bigrams, drawn as a bar chart.
common_words = get_top_n_bigram(dr['Review Text'], 20)
df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
# Skip the three highest-ranked rows, so 17 bars are drawn although the title
# says "Top 20".
# NOTE(review): presumably those rows are scraping artefacts; confirm against the data.
df4 = df4.iloc[3:]
(
    df4.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 bigrams in reviews after removing stop words',
    )
)

In [9]:
def get_top_n_trigram(corpus, n=None):
    """Return the most frequent three-word phrases in `corpus`, ignoring English stop words.

    Args:
        corpus: iterable of text documents, as accepted by CountVectorizer.
        n: how many (trigram, count) pairs to return; None returns all of them.

    Returns:
        List of (trigram, count) tuples ordered by count, highest first.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    # One pass learns the vocabulary and builds the count matrix; the earlier
    # fit() followed by transform() tokenised the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each trigram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Twenty most frequent stop-word-free trigrams, drawn as a bar chart.
common_words = get_top_n_trigram(dr['Review Text'], 20)
df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
# Skip the four highest-ranked rows, so 16 bars are drawn although the title
# says "Top 20".
# NOTE(review): presumably those rows are scraping artefacts; confirm against the data.
df6 = df6.iloc[4:]
(
    df6.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 trigrams in reviews after removing stop words',
    )
)


# Reference

* https://medium.com/swlh/nlp-text-visualization-twitter-sentiment-analysis-in-r-65b14240258f

In [59]:
# !pip install wordcloud

In [56]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
 
# # Create a list of word
# new = ""
# for each in dr['Review Text']:
#     new = new + each
# # Create the wordcloud object
# wordcloud = WordCloud(max_font_size=20, min_font_size=10,stopwords=["Read Read", "review stating"]).generate(text)
 
# # Display the generated image:
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.margins(x=0, y=0)
# plt.show()


In [57]:
# text = ""
# for each in range(0,len(df2)):
#     for i in range(0,df2['count'][each]):
# #         print(df2['ReviewText'][each])
#         text = text + " " + str(df2['ReviewText'][each])

In [58]:
# text