# Part 1 :  Data Scrapping from Amazon
****
****

### Step 1: Import required libraries..
***

In [12]:
from bs4 import BeautifulSoup as bs
import requests

### Step 2: Url link  & page requests.. 
***

In [13]:
# provide review page link so that we can fetch the data.
url = 'https://www.amazon.in/boAt-BassHeads-100-Headphones-Black/product-reviews/B071Z8M4KX/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'

# set page request..
page = requests.get(url)

### Step 3: Parse the html data using beautiful soup
***

In [14]:
soup = bs(page.content, 'html.parser')
# print(soup.prettify())
# the prettify function will print data in the satndard format so that it will readable

### Step 4: Fetch the data required from diff html tags from url resource page/by page inspect method
***

#### 4.1 Fetching User names..
***

In [26]:
# do right click on user name & inspect. observe where the user name is provided.
#..in this case user name is in 'span' tag. But html page contains huge no. of span tags so we take check
#...in which class all user name span tag is associated. 

users = soup.find_all('span', class_='a-profile-name')

# since it is list, crossverifying first name from the given list..
users[0]

<span class="a-profile-name">Neshway</span>

In [29]:
# extracting only text from users list..

user_names = []
for i in range(0, len(users)):
    user_names.append(users[i].get_text())

In [20]:
user_names

['Neshway',
 'Himalayan Rony',
 'Neshway',
 'Himalayan Rony',
 'Hrithik Modi',
 'Aashita ',
 'Akshay joshi',
 'Soham Roy',
 'martin',
 'Nikhil',
 'Amazon Customer',
 'Gnc']

#### 4.2 Fetching Review titles...
***

![](https://github.com/ShrikantUppin/NLP/blob/main/amazon_review.png?raw=true)

In [55]:
titles = soup.find_all('a', class_='review-title-content')

# since it is list, crossverifying revie title from the given list..
titles[0]

<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title" href="/gp/customer-reviews/R270ZWETAMQXXT?ASIN=B071Z8M4KX">
<span>Worth the money</span>
</a>

In [56]:
# extracting only review text from titltles list..

review_title = []
for i in range(0, len(titles)):
    review_title.append(titles[i].get_text())
review_title 

['\nWorth the money\n',
 '\nMust watch this detailed 👉 REVIEW ☺️\n',
 '\nVFM Product\n',
 '\nThe repair/replacement process is hassle free and easy!\n',
 '\nGood but lack bass...\n',
 '\nQuality sound\n',
 '\nBest for the price\n',
 '\nExcellent after sales services and value for money\n',
 '\nBroken headset\n',
 '\nGood\n']

In [57]:
# We can obeserve that \n in above review_titles.So need to clean it..

for i in range(len(review_title)):
    review_title[i] = review_title[i].lstrip('\n')
    review_title[i] = review_title[i].rstrip('\n')
    
review_title

['Worth the money',
 'Must watch this detailed 👉 REVIEW ☺️',
 'VFM Product',
 'The repair/replacement process is hassle free and easy!',
 'Good but lack bass...',
 'Quality sound',
 'Best for the price',
 'Excellent after sales services and value for money',
 'Broken headset',
 'Good']

#### 4.3 Fetching Review ratings i.e. reviews star numbers..
***

In [58]:
ratings = soup.find_all('i', class_='review-rating')

# since it is list, crossverifying first rating from the given list..
ratings[0]

<i class="a-icon a-icon-star a-star-5 review-rating" data-hook="review-star-rating-view-point"><span class="a-icon-alt">5.0 out of 5 stars</span></i>

In [69]:
# extracting only review text from ratings list..

rating_numbers = []
for i in range(0, len(ratings)):
    rating_numbers.append(ratings[i].get_text())
    
rating_numbers

['5.0 out of 5 stars',
 '3.0 out of 5 stars',
 '5.0 out of 5 stars',
 '3.0 out of 5 stars',
 '5.0 out of 5 stars',
 '5.0 out of 5 stars',
 '3.0 out of 5 stars',
 '4.0 out of 5 stars',
 '4.0 out of 5 stars',
 '5.0 out of 5 stars',
 '1.0 out of 5 stars',
 '4.0 out of 5 stars']

In [71]:
# putting only float type rating in rating_number list..

for i in range(0, len(rating_numbers)):
    rating_numbers[i] = float(rating_numbers[i][0:3]) # Used lsit slicing +string slicing..

In [72]:
rating_numbers

[5.0, 3.0, 5.0, 3.0, 5.0, 5.0, 3.0, 4.0, 4.0, 5.0, 1.0, 4.0]

#### 4.4 Fetching Review text
***

![](https://github.com/ShrikantUppin/NLP/blob/main/revie_text.png?raw=true)

In [75]:
# if we observe the review span tag is not having any class name...
#.. we will take the upper span & will access without using class name..
# so to access any tag data without using class name.. the dict method is used in beautiful soup 
#........................................................................................as below.


review_para = soup.find_all('span', {"data-hook":"review-body"})

# since it is list, crossverifying first review from the given list..
review_para[0]

<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>
  One of the best pair of earphone for the price<br/><br/>1} Sound quality is amazing for the price 😍<br/>2} Built quality is good but not the best but  for the budget price I can say its good ⚡<br/>3} Headphone jack is not gold plated ⚠️<br/>4} I have been using this earphone from past more than a year and have replaced it once under warranty which was super easy hassel free🎶<br/>5} I have used many other brand earphones such as JBL, Sony, Sennheiser, RHA, Boultaudio and recently Infinity and I am not a professional but have experience of many earphones so I can say this is the best sounding in budget and very comfortable in shape to use for longer duration 💯💥<br/><br/>I can definitely recommend this earphones must buy in budget segment 🔝🔝✅🔥🔥
</span>
</span>

In [77]:
# extracting only review para from ratings list..

review_detail = []
for i in range(0, len(review_para)):
    review_detail.append(review_para[i].get_text())
    
review_detail[0]

'\n\n  One of the best pair of earphone for the price1} Sound quality is amazing for the price 😍2} Built quality is good but not the best but  for the budget price I can say its good ⚡3} Headphone jack is not gold plated ⚠️4} I have been using this earphone from past more than a year and have replaced it once under warranty which was super easy hassel free🎶5} I have used many other brand earphones such as JBL, Sony, Sennheiser, RHA, Boultaudio and recently Infinity and I am not a professional but have experience of many earphones so I can say this is the best sounding in budget and very comfortable in shape to use for longer duration 💯💥I can definitely recommend this earphones must buy in budget segment 🔝🔝✅🔥🔥\n\n'

In [79]:
# We can obeserve that \n in above review_details .So need to clean it..

for i in range(len(review_detail)):
    review_detail[i] = review_detail[i].lstrip('\n')
    review_detail[i] = review_detail[i].rstrip('\n')
    
review_detail[0]

'  One of the best pair of earphone for the price1} Sound quality is amazing for the price 😍2} Built quality is good but not the best but  for the budget price I can say its good ⚡3} Headphone jack is not gold plated ⚠️4} I have been using this earphone from past more than a year and have replaced it once under warranty which was super easy hassel free🎶5} I have used many other brand earphones such as JBL, Sony, Sennheiser, RHA, Boultaudio and recently Infinity and I am not a professional but have experience of many earphones so I can say this is the best sounding in budget and very comfortable in shape to use for longer duration 💯💥I can definitely recommend this earphones must buy in budget segment 🔝🔝✅🔥🔥'

### Step 5: Creating pandas Dataframe
***

In [113]:
# since pandas dataframe requires equal length arrays/lists. Need to check list lengths first.
import numpy as np
import pandas as pd 

lists = [user_names, review_title, rating_numbers, review_detail]

length = [] 
for i in range(len(lists)):
    length.append(len(lists[i]))
    if len(lists[i])>max(length):
        lists[i] = lists[i]
    else:
        difference = max(length)-len(lists[i])
        lists[i].extend([np.nan for i in range(difference)])


In [114]:
amazon_reviews = pd.DataFrame({'User_Name':user_names, 'Review_title':review_title,\
                              'Ratings':rating_numbers, 'Review':review_detail})

In [115]:
amazon_reviews

Unnamed: 0,User_Name,Review_title,Ratings,Review
0,Neshway,Worth the money,5.0,One of the best pair of earphone for the pri...
1,Himalayan Rony,Must watch this detailed 👉 REVIEW ☺️,3.0,I have purchased this boat earphone few mont...
2,Neshway,VFM Product,5.0,Build quality (5/5)Durability (4/5)Wire leng...
3,Himalayan Rony,The repair/replacement process is hassle free ...,3.0,"Love the sound quality, but moreover the ser..."
4,Hrithik Modi,Good but lack bass...,5.0,I bought this in sept and I'm NOT SATISFIED ...
5,Aashita,Quality sound,5.0,This is a very great earphone with a good qu...
6,Akshay joshi,Best for the price,3.0,"For a price of rs 400 it's worth the cost, i..."
7,Soham Roy,Excellent after sales services and value for m...,4.0,I bought this earphone cause I had read in t...
8,martin,Broken headset,4.0,One of my friend has this product that's y I...
9,Nikhil,Good,5.0,Got it for 400. Okay for the price.
