### Getting the Data

In [53]:
import requests

In [54]:
from bs4 import BeautifulSoup

In [55]:
r = requests.get('https://www.amazon.in/Blue-Tokai-Coffee-Roasters-Attikan/product-reviews/B07968P6RD/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews')

In [56]:
r.status_code

200

In [57]:
r.text



In [58]:
soup = BeautifulSoup(r.text, 'html.parser')

In [59]:
results = soup.findAll(class_='a-size-base review-text review-text-content')

In [60]:
reviews=[]
for result in results:
  reviews.append(result.find('span').text)

In [61]:
for review in reviews:
    print(review, '\n')


  Not high quality Arabica to begin with. Very small size seeds which may seem like Robusta and maybe is Robusta. Most importantly, whatever it maybe low quality Arabica or Robusta, the roasting is very pathetic. It's not a medium dark roast but a light roast, hence if you try brewing an espresso from it, you'll get a sour product. Even other techniques cannot be used to overcome the flaw in this coffee's roasting blunder. You don't roast coffee like peanuts and expect it to taste good. If you have ever tasted good coffee then you will not appreciate what they have done with the roasting. Blue Tokai does not hold any expertise in coffee roasting, it's just they know how to package and send it to you at such a high price. They would have done a better job if they had sent green beans to me for roasting afresh. If a consumer can shift from buying ground coffee to whole beans roasted then why can't a consumer demand green beans and roast them at home. I will never buy again from Blue Tok

### Getting the Metrics and Data Cleaning

In [62]:
import pandas as pd
import numpy as np

In [74]:
df = pd.DataFrame(np.array(reviews), columns=['reviews'])

In [75]:
df['reviews'] = df['reviews'].apply(lambda x: x[3:-2])

In [76]:
df

Unnamed: 0,reviews
0,Not high quality Arabica to begin with. Very s...
1,I have also bought the Byenemara estate and I ...
2,I was tired of paying 750 every time for Starb...
3,I am long time customer of blue tokkai coffee ...
4,I purchased this for use in a French press. I'...
5,The beans have intense aroma and an earthy fla...
6,"This coffee is premium quality, Single Origin ..."
7,There aren't any Blue Tokai outlets in Souther...
8,I am a coffee junkie. Everyone teases me about...
9,I opened the packet and found it almost half e...


In [81]:
df['word count'] = df['reviews'].apply(lambda x: len(x.split()))

In [82]:
df

Unnamed: 0,reviews,word count
0,Not high quality Arabica to begin with. Very s...,184
1,I have also bought the Byenemara estate and I ...,66
2,I was tired of paying 750 every time for Starb...,28
3,I am long time customer of blue tokkai coffee ...,67
4,I purchased this for use in a French press. I'...,52
5,The beans have intense aroma and an earthy fla...,76
6,"This coffee is premium quality, Single Origin ...",97
7,There aren't any Blue Tokai outlets in Souther...,75
8,I am a coffee junkie. Everyone teases me about...,112
9,I opened the packet and found it almost half e...,97


In [83]:
df['char_count'] = df['reviews'].apply(len)

In [84]:
df.head()

Unnamed: 0,reviews,word count,char_count
0,Not high quality Arabica to begin with. Very s...,184,999
1,I have also bought the Byenemara estate and I ...,66,367
2,I was tired of paying 750 every time for Starb...,28,163
3,I am long time customer of blue tokkai coffee ...,67,359
4,I purchased this for use in a French press. I'...,52,276


In [86]:
def avg_words(review):
    words = review.split()
    return sum(len(word) for word in words)/len(words)

In [87]:
df['reviews'].apply(avg_words)

0    4.434783
1    4.575758
2    4.857143
3    4.373134
4    4.326923
5    4.789474
6    5.020619
7    4.813333
8    4.017857
9    3.742268
Name: reviews, dtype: float64

In [88]:
df['word_length'] = df['reviews'].apply(avg_words)

In [89]:
df.head()

Unnamed: 0,reviews,word count,char_count,word_length
0,Not high quality Arabica to begin with. Very s...,184,999,4.434783
1,I have also bought the Byenemara estate and I ...,66,367,4.575758
2,I was tired of paying 750 every time for Starb...,28,163,4.857143
3,I am long time customer of blue tokkai coffee ...,67,359,4.373134
4,I purchased this for use in a French press. I'...,52,276,4.326923


In [90]:
from nltk.corpus import stopwords

In [91]:
stop_words = stopwords.words('english')

In [92]:
len(stop_words)

179

In [94]:
df['reviews'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))

0    86
1    32
2    12
3    31
4    24
5    33
6    33
7    34
8    53
9    50
Name: reviews, dtype: int64

In [95]:
df['stopword_count'] = df['reviews'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))

In [96]:
df.head()

Unnamed: 0,reviews,word count,char_count,word_length,stopword_count
0,Not high quality Arabica to begin with. Very s...,184,999,4.434783,86
1,I have also bought the Byenemara estate and I ...,66,367,4.575758,32
2,I was tired of paying 750 every time for Starb...,28,163,4.857143,12
3,I am long time customer of blue tokkai coffee ...,67,359,4.373134,31
4,I purchased this for use in a French press. I'...,52,276,4.326923,24


In [97]:
df['stopword_rate'] = df['stopword_count']/df['word count']

In [98]:
df.head()

Unnamed: 0,reviews,word count,char_count,word_length,stopword_count,stopword_rate
0,Not high quality Arabica to begin with. Very s...,184,999,4.434783,86,0.467391
1,I have also bought the Byenemara estate and I ...,66,367,4.575758,32,0.484848
2,I was tired of paying 750 every time for Starb...,28,163,4.857143,12,0.428571
3,I am long time customer of blue tokkai coffee ...,67,359,4.373134,31,0.462687
4,I purchased this for use in a French press. I'...,52,276,4.326923,24,0.461538


In [100]:
df.sort_values(by = 'stopword_rate')

Unnamed: 0,reviews,word count,char_count,word_length,stopword_count,stopword_rate
6,"This coffee is premium quality, Single Origin ...",97,583,5.020619,33,0.340206
2,I was tired of paying 750 every time for Starb...,28,163,4.857143,12,0.428571
5,The beans have intense aroma and an earthy fla...,76,439,4.789474,33,0.434211
7,There aren't any Blue Tokai outlets in Souther...,75,435,4.813333,34,0.453333
4,I purchased this for use in a French press. I'...,52,276,4.326923,24,0.461538
3,I am long time customer of blue tokkai coffee ...,67,359,4.373134,31,0.462687
0,Not high quality Arabica to begin with. Very s...,184,999,4.434783,86,0.467391
8,I am a coffee junkie. Everyone teases me about...,112,561,4.017857,53,0.473214
1,I have also bought the Byenemara estate and I ...,66,367,4.575758,32,0.484848
9,I opened the packet and found it almost half e...,97,459,3.742268,50,0.515464


In [101]:
df.describe()

Unnamed: 0,word count,char_count,word_length,stopword_count,stopword_rate
count,10.0,10.0,10.0,10.0,10.0
mean,85.4,464.1,4.495129,38.8,0.452146
std,42.190573,225.503117,0.400415,20.247359,0.046384
min,28.0,163.0,3.742268,12.0,0.340206
25%,66.25,361.0,4.338476,31.25,0.438991
50%,75.5,437.0,4.50527,33.0,0.462113
75%,97.0,535.5,4.807368,46.0,0.471759
max,184.0,999.0,5.020619,86.0,0.515464


In [103]:
df['review_lower'] = df['reviews'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [104]:
df['review_nopunc'] = df['review_lower'].str.replace('[^\w\s]', '')

  df['review_nopunc'] = df['review_lower'].str.replace('[^\w\s]', '')


In [105]:
df['review_nopunc_nostop'] = df['review_nopunc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

In [108]:
freq= pd.Series(" ".join(df['review_nopunc_nostop']).split()).value_counts()[:30]

In [109]:
freq

coffee       25
beans        10
roast         8
like          7
aroma         6
light         5
get           5
roasting      5
blue          4
good          4
roasted       4
quality       4
espresso      4
tokai         3
use           3
try           3
estate        3
medium        3
brew          3
bitter        3
robusta       3
im            3
also          3
black         3
ground        3
would         3
starbucks     3
arabica       3
taste         3
ive           2
dtype: int64

In [110]:
other_stopwords = ['get', 'us', 'see', 'use', 'said', 'asked', 'day', 'go',
  'even', 'ive', 'right', 'left', 'always', 'would', 'told',
  'get', 'us', 'would', 'get', 'one', 'im', 'go', 'even',
  'also', 'ever', 'try', 'take', 'let' ]

In [111]:
df['review_nopunc_nostop_nocommon'] = df['review_nopunc_nostop'].apply(lambda x: "".join(" ".join(x for x in x.split() if x not in other_stopwords)))

In [112]:
df.head()

Unnamed: 0,reviews,word count,char_count,word_length,stopword_count,stopword_rate,review_lower,review_nopunc,review_nopunc_nostop,review_nopunc_nostop_nocommon
0,Not high quality Arabica to begin with. Very s...,184,999,4.434783,86,0.467391,not high quality arabica to begin with. very s...,not high quality arabica to begin with very sm...,high quality arabica begin small size seeds ma...,high quality arabica begin small size seeds ma...
1,I have also bought the Byenemara estate and I ...,66,367,4.575758,32,0.484848,i have also bought the byenemara estate and i ...,i have also bought the byenemara estate and i ...,also bought byenemara estate give comparison t...,bought byenemara estate give comparison two at...
2,I was tired of paying 750 every time for Starb...,28,163,4.857143,12,0.428571,i was tired of paying 750 every time for starb...,i was tired of paying 750 every time for starb...,tired paying 750 every time starbucks beans me...,tired paying 750 every time starbucks beans me...
3,I am long time customer of blue tokkai coffee ...,67,359,4.373134,31,0.462687,i am long time customer of blue tokkai coffee ...,i am long time customer of blue tokkai coffee ...,long time customer blue tokkai coffee beans gr...,long time customer blue tokkai coffee beans gr...
4,I purchased this for use in a French press. I'...,52,276,4.326923,24,0.461538,i purchased this for use in a french press. i'...,i purchased this for use in a french press im ...,purchased use french press im quite fastidious...,purchased french press quite fastidious brew c...


### Lemmatization

In [116]:
#Import Text Blob
from textblob import Word

In [117]:
df['cleaned_review'] = df['review_nopunc_nostop_nocommon'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [118]:
df['cleaned_review']

0    high quality arabica begin small size seed may...
1    bought byenemara estate give comparison two at...
2    tired paying 750 every time starbucks bean med...
3    long time customer blue tokkai coffee bean gri...
4    purchased french press quite fastidious brew c...
5    bean intense aroma earthy flavor roast nowhere...
6    coffee premium quality single origin arabica u...
7    arent blue tokai outlet southern bombay hadnt ...
8    coffee junkie everyone tease coffee running th...
9    opened packet found almost half empty r 400 to...
Name: cleaned_review, dtype: object

### Sentiment Analysis

In [119]:
from textblob import TextBlob

In [120]:
df['polarity'] = df['cleaned_review'].apply(lambda x: TextBlob(x).sentiment[0])

In [121]:
df[['reviews', 'polarity']]

Unnamed: 0,reviews,polarity
0,Not high quality Arabica to begin with. Very s...,0.074706
1,I have also bought the Byenemara estate and I ...,0.090476
2,I was tired of paying 750 every time for Starb...,0.033333
3,I am long time customer of blue tokkai coffee ...,0.33125
4,I purchased this for use in a French press. I'...,0.0
5,The beans have intense aroma and an earthy fla...,0.203704
6,"This coffee is premium quality, Single Origin ...",0.271088
7,There aren't any Blue Tokai outlets in Souther...,0.366667
8,I am a coffee junkie. Everyone teases me about...,0.31125
9,I opened the packet and found it almost half e...,0.043122


In [122]:
df['subjectivity'] = df['cleaned_review'].apply(lambda x: TextBlob(x).sentiment[1])

In [123]:
df[['reviews', 'subjectivity']]

Unnamed: 0,reviews,subjectivity
0,Not high quality Arabica to begin with. Very s...,0.463529
1,I have also bought the Byenemara estate and I ...,0.583333
2,I was tired of paying 750 every time for Starb...,0.9
3,I am long time customer of blue tokkai coffee ...,0.3875
4,I purchased this for use in a French press. I'...,0.0
5,The beans have intense aroma and an earthy fla...,0.533333
6,"This coffee is premium quality, Single Origin ...",0.639116
7,There aren't any Blue Tokai outlets in Souther...,0.555556
8,I am a coffee junkie. Everyone teases me about...,0.476667
9,I opened the packet and found it almost half e...,0.328042


In [None]:
#The results of this analysis are insignificant as the number of reviews was just 10. 