In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

In [2]:
def get_soup(url, headers, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the GET request.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the
            notebook forever. Defaults to 30.

    Returns:
        The response body parsed with the ``"html.parser"`` backend.

    Raises:
        requests.HTTPError: If the response status code is not 200.
    """
    webpage = requests.get(url, headers=headers, timeout=timeout)

    if webpage.status_code != 200:
        # Raise instead of calling exit(): exit() targets the interpreter /
        # kernel itself, whereas an exception stops only this call and tells
        # the caller which status actually came back.
        raise requests.HTTPError(
            f"Erro ao requisitar a página! (status {webpage.status_code})",
            response=webpage,
        )

    return bs(webpage.text, "html.parser")

def get_reviews(soup):
    """Collect the rating and text of every review found in ``soup``.

    Each element matching ``div.review`` yields one dict with the keys
    ``"rating"`` and ``"content"``. The rating is the text of the
    ``i.review-rating`` child with the phrase "out of 5 stars" removed; the
    content is the text of the ``span.review-text`` child. Either value is
    ``None`` when the corresponding child element is not found.

    Args:
        soup: Parsed page exposing ``select``; the matched elements must
            expose ``select_one`` and the found children a ``text`` attribute.

    Returns:
        A list of dicts, one per review, in document order.
    """

    def _extract(node):
        # Rating: strip the fixed suffix, leaving only the leading score text.
        rating_tag = node.select_one("i.review-rating")
        if rating_tag:
            score = rating_tag.text.replace("out of 5 stars", "")
        else:
            score = None

        # Review body: taken verbatim from the text element.
        body_tag = node.select_one("span.review-text")
        if body_tag:
            body = body_tag.text
        else:
            body = None

        return {"rating": score, "content": body}

    return [_extract(node) for node in soup.select("div.review")]


In [3]:

# Amazon "all reviews" page of the product whose reviews are scraped.
search_url = "https://www.amazon.com/BERIBES-Cancelling-Transparent-Soft-Earpads-Charging-Black/product-reviews/B0CDC4X65Q/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

# Browser-like headers sent with the page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# Fetch and parse the page, then pull out the list of review dicts.
soup = get_soup(search_url, headers)
data = get_reviews(soup)

# Build the working frame from the list of review dicts.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,rating,content
0,5.0,\nWow! I ordered these over-the-ear headphones...
1,5.0,\nI have been wanting a cordless over the ear ...
2,4.0,"\nI'm not an audiophile, so I am not very know..."
3,5.0,"\nSound quality is very clear,not a lot of bas..."
4,5.0,\nWas pleasantly surprised by this purchase.PR...


In [4]:
# Shape check: (rows, columns) of the scraped review frame.
df.shape

(10, 2)

In [5]:
# Remove the newline characters embedded in the scraped review text.
# The vectorised .str accessor leaves missing values untouched, so a review
# scraped without a text element (content is None) no longer raises
# AttributeError the way calling None.replace(...) inside apply() did.
# regex=False keeps this a literal substring replacement.
df['content'] = df['content'].str.replace('\n', '', regex=False)
df


Unnamed: 0,rating,content
0,5.0,Wow! I ordered these over-the-ear headphones b...
1,5.0,I have been wanting a cordless over the ear he...
2,4.0,"I'm not an audiophile, so I am not very knowle..."
3,5.0,"Sound quality is very clear,not a lot of bass ..."
4,5.0,Was pleasantly surprised by this purchase.PROS...
5,3.0,I really like the price of these headphones. ...
6,5.0,"Honestly, I bought these headphones having a l..."
7,5.0,We are very satisfied with these headphones. W...
8,5.0,"When I bought this, I wasn't expecting much. I..."
9,5.0,I have been looking for headphones that are NO...


In [6]:
# Add an identifier column computed as the row index plus one.
df['id'] = df.index + 1
df

Unnamed: 0,rating,content,id
0,5.0,Wow! I ordered these over-the-ear headphones b...,1
1,5.0,I have been wanting a cordless over the ear he...,2
2,4.0,"I'm not an audiophile, so I am not very knowle...",3
3,5.0,"Sound quality is very clear,not a lot of bass ...",4
4,5.0,Was pleasantly surprised by this purchase.PROS...,5
5,3.0,I really like the price of these headphones. ...,6
6,5.0,"Honestly, I bought these headphones having a l...",7
7,5.0,We are very satisfied with these headphones. W...,8
8,5.0,"When I bought this, I wasn't expecting much. I...",9
9,5.0,I have been looking for headphones that are NO...,10


In [7]:
# Reorder the columns so the identifier comes first.
df = df.reindex(columns=['id', 'rating', 'content'])
df

Unnamed: 0,id,rating,content
0,1,5.0,Wow! I ordered these over-the-ear headphones b...
1,2,5.0,I have been wanting a cordless over the ear he...
2,3,4.0,"I'm not an audiophile, so I am not very knowle..."
3,4,5.0,"Sound quality is very clear,not a lot of bass ..."
4,5,5.0,Was pleasantly surprised by this purchase.PROS...
5,6,3.0,I really like the price of these headphones. ...
6,7,5.0,"Honestly, I bought these headphones having a l..."
7,8,5.0,We are very satisfied with these headphones. W...
8,9,5.0,"When I bought this, I wasn't expecting much. I..."
9,10,5.0,I have been looking for headphones that are NO...


In [8]:
# New frame with the columns renamed to Id / Score / Text.
reviews = df.rename(columns={'id':'Id', 'rating':'Score', 'content':'Text'})
reviews

Unnamed: 0,Id,Score,Text
0,1,5.0,Wow! I ordered these over-the-ear headphones b...
1,2,5.0,I have been wanting a cordless over the ear he...
2,3,4.0,"I'm not an audiophile, so I am not very knowle..."
3,4,5.0,"Sound quality is very clear,not a lot of bass ..."
4,5,5.0,Was pleasantly surprised by this purchase.PROS...
5,6,3.0,I really like the price of these headphones. ...
6,7,5.0,"Honestly, I bought these headphones having a l..."
7,8,5.0,We are very satisfied with these headphones. W...
8,9,5.0,"When I bought this, I wasn't expecting much. I..."
9,10,5.0,I have been looking for headphones that are NO...


In [14]:
# Convert the scraped rating text (e.g. "5.0 ") into numbers.
# - astype(str) makes the cell safe to re-run: once Score is already numeric,
#   calling float.replace(...) inside apply() raised AttributeError.
# - .where(notna) puts missing ratings (None from the scraper) back as NaN
#   instead of letting them turn into the literal string "None".
# - Malformed rating text still raises in pd.to_numeric rather than being
#   silently dropped.
reviews['Score'] = pd.to_numeric(
    reviews['Score']
    .astype(str)
    .str.replace(' ', '', regex=False)
    .where(reviews['Score'].notna())
)


In [16]:
# Display the review table after the Score conversion.
reviews

Unnamed: 0,Id,Score,Text
0,1,5.0,Wow! I ordered these over-the-ear headphones b...
1,2,5.0,I have been wanting a cordless over the ear he...
2,3,4.0,"I'm not an audiophile, so I am not very knowle..."
3,4,5.0,"Sound quality is very clear,not a lot of bass ..."
4,5,5.0,Was pleasantly surprised by this purchase.PROS...
5,6,3.0,I really like the price of these headphones. ...
6,7,5.0,"Honestly, I bought these headphones having a l..."
7,8,5.0,We are very satisfied with these headphones. W...
8,9,5.0,"When I bought this, I wasn't expecting much. I..."
9,10,5.0,I have been looking for headphones that are NO...
