In [2]:
from pathlib import Path

import requests, re
from bs4 import BeautifulSoup
import pandas as pd


In [6]:
def amazon_review_scraper(url, page, timeout=30):
    """Fetch one page of Amazon product reviews and parse it into records.

    Args:
        url: Review-listing URL. The page number is appended as a
            ``pageNumber`` query parameter, joined with ``&`` when the URL
            already has a query string and with ``?`` otherwise.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``"Name"``, ``"Rating"`` and
        ``"Comments"``, one per review container found in the response.
        A field is ``None`` when its element is missing from the review
        markup. The list is empty when no review container is found.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` when the request fails or times out.
    """

    def text_or_none(node):
        # find() returns None when nothing matches; pass that through
        # instead of raising AttributeError on ``.text`` and losing the
        # reviews already collected.
        return node.text if node is not None else None

    separator = "&" if "?" in url else "?"
    url = f"{url}{separator}pageNumber={page}"

    user_agent = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }

    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(url=url, headers=user_agent, timeout=timeout)
    soup = BeautifulSoup(response.content, "lxml")

    reviews = []
    for review in soup.find_all("div", {"class": "a-section review aok-relative"}):
        reviews.append(
            {
                "Name": text_or_none(
                    review.find("span", {"class": "a-profile-name"})
                ),
                "Rating": text_or_none(
                    review.find("i", {"data-hook": "review-star-rating"})
                ),
                "Comments": text_or_none(
                    review.find(
                        "div", {"class": "a-row a-spacing-small review-data"}
                    )
                ),
            }
        )

    return reviews


# Amazon review-listing pages to scrape; add one entry per product.
_OMRON_REVIEWS_URL = (
    "https://www.amazon.in/Omron-Automatic-Intellisense-Technology-Measurement/product-reviews/B00F38B3NW"
)

urls = [_OMRON_REVIEWS_URL]


def scrape_reviews(urls, num_pages=2, scraper=None):
    """Scrape several review pages for each URL and format the results.

    Args:
        urls: Iterable of review-listing URLs.
        num_pages: Number of pages to request per URL. Pages are requested
            from 1 through ``num_pages`` inclusive.
        scraper: Callable ``scraper(url, page)`` returning a list of dicts
            that have ``"Rating"`` and ``"Comments"`` keys. Defaults to
            ``amazon_review_scraper``.

    Returns:
        A flat list of strings of the form
        ``"Rating: <rating>, Comments: <comments>"``, in scrape order.
    """
    if scraper is None:
        # Resolved at call time so the default follows the notebook's
        # current definition of the Amazon scraper.
        scraper = amazon_review_scraper

    all_reviews = []
    for url in urls:
        # Add the ``th=1`` parameter with the separator that matches
        # whether the URL already carries a query string.
        separator = "&" if "?" in url else "?"
        url = f"{url}{separator}th=1"
        print(f"Scraping for {url} started")

        # The previous ``range(2, num_pages)`` with ``num_pages = 2`` was
        # empty, so no page was ever requested.
        for page in range(1, num_pages + 1):
            reviews = scraper(url, page)
            all_reviews.extend(
                f"Rating: {review['Rating']}, Comments: {review['Comments']}"
                for review in reviews
            )

        print(f"Scraping for {url} completed")
    return all_reviews


# Run the Amazon scrape; the result is one formatted string per review.
all_reviews = scrape_reviews(urls)

# Each string becomes one row of a single-column frame.
amazon_df = pd.DataFrame(all_reviews)

# Save DataFrame to Excel file. The output folder is created first so that a
# missing "data/" directory cannot fail the save after the scrape has run.
amazon_output_path = Path("data/amazon_reviews.xlsx")
amazon_output_path.parent.mkdir(parents=True, exist_ok=True)
amazon_df.to_excel(amazon_output_path, index=False)


Scraping for https://www.amazon.in/Omron-Automatic-Intellisense-Technology-Measurement/product-reviews/B00F38B3NW?th=1 started
Scraping for https://www.amazon.in/Omron-Automatic-Intellisense-Technology-Measurement/product-reviews/B00F38B3NW?th=1 completed


In [35]:
def flipkart_review_scraper(url, page, timeout=30):
    """Fetch one page of Flipkart product reviews and parse it into records.

    Args:
        url: Review-listing URL. The page number is appended as a ``page``
            query parameter, joined with ``&`` when the URL already has a
            query string and with ``?`` otherwise.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``"Name"``, ``"Rating"`` and
        ``"Comments"``, one per review container found in the response.
        A field is ``None`` when its element is missing from the review
        markup. Comments have any "READ MORE" text removed and surrounding
        whitespace stripped.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` when the request fails or times out.
    """

    def text_or_none(node):
        # find() returns None when nothing matches; pass that through
        # instead of raising AttributeError and aborting the whole run.
        return node.text if node is not None else None

    separator = "&" if "?" in url else "?"
    url = f"{url}{separator}page={page}"

    user_agent = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }

    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(url=url, headers=user_agent, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    reviews = []
    # The class names below are the site's generated CSS classes for the
    # review container, reviewer name, rating and comment body.
    for review in soup.find_all("div", {"class": "_27M-vq"}):
        name = text_or_none(review.find("p", {"class": "_2sc7ZR _2V5EHH"}))
        rating = text_or_none(review.find("div", {"class": "_3LWZlK"}))
        comments = text_or_none(review.find("div", {"class": "t-ZTKy"}))

        if comments is not None:
            # Remove 'READ MORE' links if present, then trim whitespace.
            comments = re.sub(r"\s*READ\s+MORE\s*", "", comments).strip()

        reviews.append(
            {
                "Name": name,
                "Rating": rating,
                "Comments": comments,
            }
        )

    return reviews


url = "https://www.flipkart.com/samsung-galaxy-watch4-bluetooth-4-0cm-bt-calling-health-monitoring-fall-detection/product-reviews/itmb9f795a3e99ce?pid=SMWGBYRGYCHRVMXV&lid=LSTSMWGBYRGYCHRVMXVV14CCB&marketplace=FLIPKART"
print("Scraping Started")

all_reviews = []

# Walk review pages 1 through 30 and keep one formatted string per review.
for page in range(1, 31):
    reviews = flipkart_review_scraper(url, page)
    all_reviews.extend(
        f"Rating: {review['Rating']}, Comments: {review['Comments']}"
        for review in reviews
    )

print("Scraping Completed")

# Each string becomes one row of a single-column frame.
flipkart_df = pd.DataFrame(all_reviews)

# Save DataFrame to Excel file. The output folder is created first so that a
# missing "data/" directory cannot fail the save after the scrape has run.
flipkart_output_path = Path("data/flipkart_reviews.xlsx")
flipkart_output_path.parent.mkdir(parents=True, exist_ok=True)
flipkart_df.to_excel(flipkart_output_path, index=False)


Scraping Started


In [8]:
def snapdeal_review_scraper(url, page, timeout=30):
    """Fetch one page of Snapdeal product reviews and parse it into records.

    Args:
        url: Review-listing URL. The page number is appended as a ``page``
            query parameter, joined with ``?`` when the URL has no query
            string yet and with ``&`` otherwise.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``"Name"``, ``"Rating"`` and
        ``"Comments"``. ``"Rating"`` is the number of active star icons
        inside the review. ``"Name"`` and ``"Comments"`` are ``None`` when
        their element is missing. The first two ``user-review`` containers
        in the response are always skipped.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` when the request fails or times out.
    """
    separator = "&" if "?" in url else "?"
    url = f"{url}{separator}page={page}"

    user_agent = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }

    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(url=url, headers=user_agent, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Skip the first two reviews.
    # NOTE(review): presumably these are highlighted entries repeated on
    # every page -- confirm against the live markup.
    review_nodes = soup.find_all("div", {"class": "user-review"})[2:]

    reviews = []
    for review in review_nodes:
        # The rating is not printed as text; count the filled star icons.
        rating = len(review.find_all("i", class_="sd-icon sd-icon-star active"))

        # find() returns None when nothing matches; record None for that
        # field rather than raising AttributeError mid-scrape.
        name_node = review.find("div", {"class": "_reviewUserName"})
        comment_node = review.find("p")

        reviews.append(
            {
                "Name": name_node.get("title") if name_node is not None else None,
                "Rating": rating,
                "Comments": comment_node.text if comment_node is not None else None,
            }
        )

    return reviews


url = "https://www.snapdeal.com/product/omron-auto-b-p-monitor/1290000232/reviews"

all_reviews = []

print("Scraping Started")

# Walk review pages 1 through 130 and keep one formatted string per review.
for page in range(1, 131):
    reviews = snapdeal_review_scraper(url, page)
    all_reviews.extend(
        f"Rating: {review['Rating']}, Comments: {review['Comments']}"
        for review in reviews
    )

print("Scraping Completed")

# Each string becomes one row of a single-column frame.
snapdeal_df = pd.DataFrame(all_reviews)

# Save DataFrame to Excel file. The output folder is created first so that a
# missing "data/" directory cannot fail the save after the scrape has run.
snapdeal_output_path = Path("data/snapdeal_reviews.xlsx")
snapdeal_output_path.parent.mkdir(parents=True, exist_ok=True)
snapdeal_df.to_excel(snapdeal_output_path, index=False)


Scraping Started
Scraping Completed
