# WEB SCRAPPING

In [None]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", 200)
import requests
from bs4 import BeautifulSoup

In [199]:
urls = ["https://www.amazon.com/Ulta-Beauty-Email-Delivery/product-reviews/B01CT5PDO6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"]

In [200]:
for url in urls:
    review_html = requests.get(url).text

In [201]:
HEADERS = ({'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})

In [202]:
webpage = requests.get(url, headers=HEADERS)

In [203]:
# Was response success
webpage

<Response [200]>

In [204]:
type(webpage.content)

bytes

In [205]:
# Initialize the beautifulsoup framework
soup = BeautifulSoup(webpage.content, "html.parser")

In [206]:
## Extract the review title
new_review_rating = soup.findAll('span',attrs={'class':'a-icon-alt'})
new_review_text = soup.findAll('span', attrs={'data-hook': 'review-body'})

In [207]:
new_review_rating[0:4]

[<span class="a-icon-alt">4.9 out of 5 stars</span>,
 <span class="a-icon-alt">5.0 out of 5 stars</span>,
 <span class="a-icon-alt">3.0 out of 5 stars</span>,
 <span class="a-icon-alt">5.0 out of 5 stars</span>]

In [208]:
new_review_text[0:4]

[<span class="a-size-base review-text review-text-content" data-hook="review-body">
 <span>Do you actually make for a email delivery gift card? It worked as expected. Lol you can save the card in your Google pay and pay that way instead of online</span>
 </span>,
 <span class="a-size-base review-text review-text-content" data-hook="review-body">
 <span>Quick and easy delivery, perfect for a last minute birthday gift</span>
 </span>,
 <span class="a-size-base review-text review-text-content" data-hook="review-body">
 <span>This is so easy and quick! I used it one day when I was very short on time. My daughter was so happy she didn’t have to wait! You will love it!</span>
 </span>,
 <span class="a-size-base review-text review-text-content" data-hook="review-body">
 <span>Appreciated the ease. And my 78 year old mom could easily figure out how to print the email and loved using it!</span>
 </span>]

In [209]:
all_urls = []
for i in range(20):
    if i == 0:
        all_urls.append( url )
    else:
        all_urls.append( url + "&pageNumber=" + str(i) )

In [210]:
all_review_texts = []
all_review_rating = []
for each_url in all_urls:
    webpage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")
    new_review_rating = soup.findAll('span',attrs={'class':'a-icon-alt'})
    new_review_text = soup.findAll('span', attrs={'data-hook': 'review-body'}) 
    this_review_rating = [r.text for r in new_review_rating]
    all_review_rating = all_review_rating + this_review_rating
    this_review_texts = [r.text for r in new_review_text]
    all_review_texts = all_review_texts + this_review_texts

In [211]:
all_review_rating[:5]

['4.9 out of 5 stars',
 '5.0 out of 5 stars',
 '3.0 out of 5 stars',
 '5.0 out of 5 stars',
 '5.0 out of 5 stars']

In [212]:
all_review_texts[:5]

['\nDo you actually make for a email delivery gift card? It worked as expected. Lol you can save the card in your Google pay and pay that way instead of online\n',
 '\nQuick and easy delivery, perfect for a last minute birthday gift\n',
 '\nThis is so easy and quick! I used it one day when I was very short on time. My daughter was so happy she didn’t have to wait! You will love it!\n',
 '\nAppreciated the ease. And my 78 year old mom could easily figure out how to print the email and loved using it!\n',
 '\nPurchases for a birthday present, fast delivery and super convenient\n']

In [213]:
# Remove all characters other than alpha numerics from the review titles as well
import re
r_rating = list(map(lambda x: x.replace(" out of 5 stars", ""),all_review_rating))
r_reviewText = list( map( lambda x: re.sub("[^a-zA-Z]", " ", x ), all_review_texts ))

In [None]:
from itertools import zip_longest
data_dict1 = {"Rating": [], "ReviewText": []}
for rating, review_text in zip_longest(r_rating, r_reviewText, fillvalue=float('nan')):
    data_dict1["Rating"].append(rating)
    data_dict1["ReviewText"].append(review_text)

# Convert dictionary to dataframe
new_df = pd.DataFrame(data_dict1)

In [None]:
existing_df = pd.read_csv("Original_dataset.csv")

In [None]:
new_df

Unnamed: 0,Rating,ReviewText
0,4.9,She loved it
1,5.0,My girlfriend loves Aerie and the gift card was off for Black Friday so it was a no brainer I encountered issues with delivery with USPS nothing to do with Amazon but Amazon customer supp...
2,3.0,Bought this gift card for Like getting a discount
3,5.0,Grand daughter liked it She wants to go shopping
4,5.0,Gift card Works as advertised D
...,...,...
255,5.0,
256,5.0,
257,5.0,
258,5.0,


In [None]:
df = pd.concat([existing_df, new_df], ignore_index=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Rating      1560 non-null   object
 1   ReviewText  1200 non-null   object
dtypes: object(2)
memory usage: 24.5+ KB


In [215]:
df.to_csv("GiftReviews_TestData.csv",index=False)

In [216]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Rating      1560 non-null   object
 1   ReviewText  1200 non-null   object
dtypes: object(2)
memory usage: 24.5+ KB
