In [17]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import schedule
import time

In [18]:
def get_title(soup):
  try:
    title = soup.find("span",attrs={"id":"productTitle"})
    title_value = title.text # content inside span tag
    title_string = title_value.strip()

  except AttributeError:
    title_string = ""
  
  return title_string

In [19]:
def get_price(soup):
    try:
        price = soup.find("span", attrs={"class": "a-price-whole"}).text.strip()[:-1]
        # remove the comma from the price
        price = price.replace(",", "")
    except:
        price = ""
    return price


In [20]:
def get_discount(soup):
    try:
        discount = soup.find("span", attrs={"class": "a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage"}).text.strip()[1:-1]
    except:
        discount = ""
    return discount


In [21]:
def get_rating(soup):
  try:
        rating = soup.find("span", attrs={"class": "a-icon-alt"}).text[:3]
  except:
        rating = ""
  return rating

In [22]:
def get_all_reviews(soup):
    reviews = []
    try:
        review_divs = soup.find_all("div", class_="a-expander-content reviewText review-text-content a-expander-partial-collapse-content")
        for review_div in review_divs:
            review_span = review_div.find("span")
            if review_span:
                review = review_span.get_text(strip=True)
                reviews.append(review)
    except Exception as e:
        print(f"Error occurred: {e}")
    return reviews


In [23]:
HEADERS = ({'User-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36','Accept-Language': 'en-US, en;q=0.5'})


In [24]:
URLS = {
    "boAt Rockerz 255" : "https://www.amazon.in/boAt-Rockerz-255-Pro-Earphones/dp/B08TV2P1N8",
    "Oneplus Bullets Z2": "https://www.amazon.in/Oneplus-Bluetooth-Wireless-Earphones-Bombastic/dp/B09TVVGXWS/ref=sr_1_3?crid=1AD2BORMPFJHA&dib=eyJ2IjoiMSJ9.TCdYft94Omg-JxhEPSoOK2Y1bkzd6rS6K4SeCeQxH6p4Y9H8OcZ7iiZn7LK2LoClTDcKFcV5h3vCzYzLBd0VsFgdWJiPtOMuIzmHjOnNPZNxPbKA0sSxX1hjONJBqaEKzcFGKE0hmaFXmbr0aQ4igAuxXgXGTggmD0CY5IdJNIriYxcdTvEi55HTyBJg6O4Jz9wWEpG6N6TFh-R0tHdqW3fEMnWuDI8ldew88aJnEeY.gj-XLLo5c1Q5ukdSAiIzaDoFpVQBU-N3xbQOirwPxGg&dib_tag=se&keywords=bluetooth%2Bearphones%2Bwired&qid=1737911857&sprefix=bluetooth%2Bearphones%2Bwire%2Caps%2C270&sr=8-3&th=1",
    "Realme Buds Wireless 3 Neo":"https://www.amazon.in/realme-Buds-Wireless-Bluetooth-Resistannt/dp/B0D3HT2S1M/ref=sr_1_10?crid=1AD2BORMPFJHA&dib=eyJ2IjoiMSJ9.TCdYft94Omg-JxhEPSoOK2Y1bkzd6rS6K4SeCeQxH6p4Y9H8OcZ7iiZn7LK2LoClTDcKFcV5h3vCzYzLBd0VsFgdWJiPtOMuIzmHjOnNPZNxPbKA0sSxX1hjONJBqaEKzcFGKE0hmaFXmbr0aQ4igAuxXgXGTggmD0CY5IdJNIriYxcdTvEi55HTyBJg6O4Jz9wWEpG6N6TFh-R0tHdqW3fEMnWuDI8ldew88aJnEeY.gj-XLLo5c1Q5ukdSAiIzaDoFpVQBU-N3xbQOirwPxGg&dib_tag=se&keywords=bluetooth%2Bearphones%2Bwired&qid=1737911857&sprefix=bluetooth%2Bearphones%2Bwire%2Caps%2C270&sr=8-10&th=1",
    "JBL Tune 215BT":"https://www.amazon.in/JBL-Playtime-Bluetooth-Earphones-Assistant/dp/B08FB2LNSZ/ref=sr_1_18?crid=1AD2BORMPFJHA&dib=eyJ2IjoiMSJ9.TCdYft94Omg-JxhEPSoOK2Y1bkzd6rS6K4SeCeQxH6p4Y9H8OcZ7iiZn7LK2LoClTDcKFcV5h3vCzYzLBd0VsFgdWJiPtOMuIzmHjOnNPZNxPbKA0sSxX1hjONJBqaEKzcFGKE0hmaFXmbr0aQ4igAuxXgXGTggmD0CY5IdJNIriYxcdTvEi55HTyBJg6O4Jz9wWEpG6N6TFh-R0tHdqW3fEMnWuDI8ldew88aJnEeY.gj-XLLo5c1Q5ukdSAiIzaDoFpVQBU-N3xbQOirwPxGg&dib_tag=se&keywords=bluetooth%2Bearphones%2Bwired&qid=1737911857&sprefix=bluetooth%2Bearphones%2Bwire%2Caps%2C270&sr=8-18&th=1"

    }

In [25]:
# Scrape Price, Discount, Rating, Review for each product

In [26]:
# Dataframe to store the scraped data - Product_name, Price, Discount, Date 
competitor_data_today = pd.DataFrame(columns=["Product_name", "Price", "Date"])


In [27]:
# Create review dataframe
# reviews_today = product_name, review
reviews_today = pd.DataFrame(columns=["Product_name", "Review"])

In [28]:
for product, url in URLS.items():
    # Create a dictionary to store the data
    competitor_data = {"Product_name": [], "Price": [], "Discount": [], "Date": time.strftime("%Y-%m-%d")}
    reviews_data = {"Product_name": [], "Review": []}
    
    # Get the page content
    page = requests.get(url, headers=HEADERS)
    
    # Create a BeautifulSoup object
    soup = BeautifulSoup(page.content, "html.parser")
    
    # Get the title
    title = get_title(soup)
    
    # Get the price
    price = get_price(soup)
    
    # Get the discount
    discount = get_discount(soup)
    
    # Get all reviews
    all_reviews = get_all_reviews(soup)  # Use the updated function
    
    # Store the product data in the dictionary
    competitor_data["Product_name"].append(product)
    competitor_data["Price"].append(price)
    competitor_data["Discount"].append(discount)
    
    # Add the product data to the dataframe
    competitor_data_today = pd.concat([competitor_data_today, pd.DataFrame(competitor_data)])
    
    # Store all review data in the dictionary
    for review in all_reviews:
        reviews_data["Product_name"].append(product)
        reviews_data["Review"].append(review)
    
    # Add the review data to the dataframe
    reviews_today = pd.concat([reviews_today, pd.DataFrame(reviews_data)])


In [29]:
competitor_data_today

Unnamed: 0,Product_name,Price,Date,Discount
0,boAt Rockerz 255,998,2025-01-28,75
0,Oneplus Bullets Z2,1799,2025-01-28,22
0,Realme Buds Wireless 3 Neo,1299,2025-01-28,48
0,JBL Tune 215BT,1699,2025-01-28,43


In [30]:
reviews_today

Unnamed: 0,Product_name,Review
0,boAt Rockerz 255,I am an earphone addict. I think I am eligible...
1,boAt Rockerz 255,"I’ve been using the boAt Rockerz 255 ANC, and ..."
2,boAt Rockerz 255,i like the design and quality of the neckband...
3,boAt Rockerz 255,"Pros: durability, design, plastic quality, com..."
4,boAt Rockerz 255,I would Rate - 4.1 of 5Construct and Build - N...
5,boAt Rockerz 255,"Your product is very good, I myself have been ..."
6,boAt Rockerz 255,Tinha um fone parecido de outra marca; entreta...
7,boAt Rockerz 255,Melhor fone que já tive. Não cai devido ao sup...
8,boAt Rockerz 255,"Fone bom, som alto limpo e com bons graves, ve..."
9,boAt Rockerz 255,I specifically bought this to try this product...


In [31]:
# Save the data to a CSV file with today's date
today = time.strftime("%Y-%m-%d")
competitor_data_today.to_csv(f"competitor_data_{today}.csv", index=False)


In [32]:
# Save the review data to a CSV file with today's date
reviews_today.to_csv(f"reviews_data_{today}.csv", index=False)

In [None]:
# # open competitor_data.csv 

# competitor_data = pd.read_csv("competitor_data.csv")
# # select unique product names
# # create a new dataframe to such that it will all data including today's data
# competitor_data_new = pd.DataFrame(columns=["product_name", "price", "discount", "date"])

# unique_products = competitor_data["product_name"].unique()
# # for each of the unique products, add the data for today
# for product in unique_products:
#     # get the data for the product
#     product_data_today = competitor_data_today[competitor_data_today["product_name"] == product]
#     product_data = competitor_data[competitor_data["product_name"] == product]
#     # append the data for today to the existing data
#     product_data = pd.concat([product_data, product_data_today])
#     # add the data to the new dataframe
#     competitor_data_new = pd.concat([competitor_data_new, product_data])


# # save the new data to a csv file
# competitor_data_new.to_csv("competitor_data.csv", index=False)    

In [None]:
# do the same for reviews
# reviews = pd.read_csv("reviews.csv")
# # select unique product names
# # create a new dataframe to such that it will all data including today's data
# reviews_new = pd.DataFrame(columns=["product_name", "review"])

# unique_products = reviews["product_name"].unique()
# # for each of the unique products, add the data for today
# for product in unique_products:
#     # get the data for the product
#     product_reviews_today = reviews_today[reviews_today["Product_name"] == product]
#     product_reviews = reviews[reviews["product_name"] == product]
#     # append the data for today to the existing data
#     product_reviews = pd.concat([product_reviews, product_reviews_today])
#     # add the data to the new dataframe
#     reviews_new = pd.concat([reviews_new, product_reviews])

# # save the new data to a csv file
# reviews_new.to_csv("reviews.csv", index=False)
