In [None]:

import logging  # Import logging for tracking the execution of the code
from bs4 import BeautifulSoup  # Import BeautifulSoup for web scraping
import requests  # Import requests for making HTTP requests
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations
from datetime import datetime  # Import datetime for date and time manipulation
from dateutil.relativedelta import relativedelta  # Import relativedelta for calculating date differences

# Configure the root logger: INFO and above, one "<timestamp> - <level> - <message>" line per record
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [None]:

# Function to extract the product name
def get_productname(soup):
    """Return the product name found in a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        The whitespace-stripped text of the first ``<h3>`` whose ``class``
        attribute matches the title selector, or ``""`` when the lookup
        raises ``AttributeError`` (for example when no element matches and
        ``find`` returns ``None``).
    """
    # NOTE(review): the class string looks like generated utility classes and is
    # presumably tied to the site's current markup - confirm if extraction breaks.
    title_selector = {"class": 'block m-0 line-clamp-2 font-regular text-base leading-sm text-darkOnyx-800 pt-0.5 h-full'}
    try:
        logging.info("Extracting product name")
        return soup.find("h3", attrs=title_selector).text.strip()
    except AttributeError:
        logging.error("Product name not found")
        return ""

# Function to extract the brand name
def get_brand(soup):
    """Return the brand name found in a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        The whitespace-stripped text of the first ``<span>`` whose ``class``
        attribute matches the brand selector, or ``""`` when the lookup
        raises ``AttributeError`` (for example when ``find`` returns ``None``).
    """
    # NOTE(review): the class string looks like generated styled-component
    # names - presumably changes when the site is rebuilt; confirm.
    brand_selector = {"class": 'Label-sc-15v1nk5-0 BrandName___StyledLabel2-sc-hssfrl-1 gJxZPQ keQNWn'}
    try:
        logging.info("Extracting brand name")
        return soup.find("span", attrs=brand_selector).text.strip()
    except AttributeError:
        logging.error("Brand name not found")
        return ""

# Function to extract the product code
def get_code(soup):
    """Return the product code found in a parsed product page.

    The element is located by its inline ``style`` attribute rather than by
    a class name.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        The whitespace-stripped text of the first matching ``<div>``, or
        ``""`` when the lookup raises ``AttributeError`` (for example when
        ``find`` returns ``None``).
    """
    code_selector = {"style": "font-family: 'ProximaNova-Regular';font-size:13px;line-height: 18px;color:8f8f8f;"}
    try:
        logging.info("Extracting product code")
        return soup.find("div", attrs=code_selector).text.strip()
    except AttributeError:
        logging.error("Product code not found")
        return ""

# Function to extract the product price
def get_price(soup):
    """Return the displayed price found in a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        The whitespace-stripped text of the first ``<span>`` whose ``class``
        attribute matches the price selector, or ``""`` when the lookup
        raises ``AttributeError`` (for example when ``find`` returns ``None``).
    """
    price_selector = {'class': 'Label-sc-15v1nk5-0 Pricing___StyledLabel-sc-pldi2d-1 gJxZPQ AypOi'}
    try:
        logging.info("Extracting product price")
        # Use .text (as the other extractors in this notebook do) instead of
        # .string: .string is None whenever the tag has more than one child
        # node, which would discard a price that is actually present.
        price = soup.find("span", attrs=price_selector).text.strip()
    except AttributeError:
        logging.error("Product price not found")
        price = ""
    return price

# Function to extract the weight/volume of the product
def get_wv(soup):
    """Return the weight/volume label found in a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        The whitespace-stripped text of the first ``<span>`` whose ``class``
        attribute matches the pack-size selector, or ``""`` when the lookup
        raises ``AttributeError`` (for example when ``find`` returns ``None``).
    """
    pack_size_selector = {"class": 'Label-sc-15v1nk5-0 PackSizeSelector___StyledLabel2-sc-l9rhbt-2 gJxZPQ hDJUsF'}
    try:
        logging.info("Extracting weight/volume")
        return soup.find("span", attrs=pack_size_selector).text.strip()
    except AttributeError:
        logging.error("Weight/volume not found")
        return ""

# Function to extract the product description
def get_desc(soup):
    """Return the text used as the product description for a parsed page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        The whitespace-stripped text of the first ``<span>`` whose ``class``
        attribute matches the selector below, or ``""`` when the lookup
        raises ``AttributeError`` (for example when ``find`` returns ``None``).
    """
    # NOTE(review): this selector is identical to the one get_wv() uses
    # (PackSizeSelector label), so the "description" returned here is the same
    # element as the weight/volume. Looks like a copy-paste leftover - confirm
    # the intended description element and update the selector.
    description_selector = {"class": 'Label-sc-15v1nk5-0 PackSizeSelector___StyledLabel2-sc-l9rhbt-2 gJxZPQ hDJUsF'}
    try:
        logging.info("Extracting product description")
        return soup.find("span", attrs=description_selector).text.strip()
    except AttributeError:
        logging.error("Product description not found")
        return ""

# Function to extract product image URLs
def get_imgurl(soup):
    """Return image URLs found in a parsed product page.

    Collects the ``src`` attribute of every ``<img>`` carrying
    ``data-nimg="intrinsic"`` and keeps only the entries at odd positions
    (second, fourth, ...).

    Args:
        soup: Parsed page object exposing ``find_all(name, attrs=...)``
            (a ``BeautifulSoup`` instance in this notebook).

    Returns:
        A list of ``src`` values (an entry is ``None`` if a tag has no
        ``src``), or ``[]`` when the lookup raises ``AttributeError``.
    """
    try:
        logging.info("Extracting product image URLs")
        sources = []
        for img_tag in soup.find_all("img", attrs={"data-nimg": 'intrinsic'}):
            sources.append(img_tag.get("src"))
        # Keep every second entry, starting from the second one.
        # NOTE(review): presumably each image is emitted twice in the markup and
        # this picks one copy of each pair - confirm against the live page.
        return sources[1::2]
    except AttributeError:
        logging.error("Product images not found")
        return []


In [None]:

if __name__ == '__main__':
    # Scrape the category listing page, visit every product link found on it,
    # collect the extracted fields column-wise in `d`, and write the rows that
    # have a product name to file.csv.
    logging.info("Starting the scraping process")

    # Define HTTP headers, including User-Agent and Accept-Language
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
        'Accept-Language': 'en-US, en;q=0.5'
    })

    # Define the URL of the webpage to scrape
    URL = "https://www.bigbasket.com/cl/cleaning-household/?nc=nb"

    # Site root that product hrefs are appended to.
    # NOTE(review): assumes every href on the listing page is root-relative - confirm.
    BASE_URL = "https://www.bigbasket.com"

    # requests applies no timeout by default, so one stalled connection would
    # hang the whole run; bound every request (seconds).
    REQUEST_TIMEOUT = 30

    # Initialize a dictionary to store product details (one list per output column)
    d = {
        "Product Name": [],
        "brand": [],
        "category": [],
        "bar code": [],
        "price (M.R.P.)": [],
        "weight/volume": [],
        "Unit": [],
        "Product Description": [],
        "Images URL": [],
        "Product URL": [],
        "Relative Expiry time w.r.t. today": [],
        "Relative expiry in months": []
    }

    # The date arithmetic does not depend on the product, so compute it once
    # instead of on every loop iteration.
    current_date = datetime.now()

    # Example specific date for calculation
    # NOTE(review): this is a hard-coded placeholder, not a scraped value, so
    # every row gets the same "expiry" figures (time elapsed since this date).
    specific_date = datetime(2020, 12, 25)

    # Calculate the difference between current date and specific date
    difference = relativedelta(current_date, specific_date)
    relative = f'{difference.years} years, {difference.months} months, and {difference.days} days'
    rmonths = difference.years * 12 + difference.months

    # One Session for all requests: shared headers and connection reuse.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        # Send an HTTP request to the listing page
        webpage = session.get(URL, timeout=REQUEST_TIMEOUT)
        logging.info(f"Requested URL: {URL}")
        if not webpage.ok:
            # Keep going (best effort), but make the likely-empty result explainable.
            logging.error(f"Listing page returned HTTP {webpage.status_code}")

        # Create a BeautifulSoup object to parse the webpage content
        soup = BeautifulSoup(webpage.content, "html.parser")
        logging.info("Parsed the webpage content")

        # Find all link elements with the specified class and store their hrefs.
        # Anchors without an href are skipped: concatenating None to the base
        # URL below would raise TypeError.
        links = soup.find_all("a", attrs={'class': 'h-full'})
        links_list = [link.get('href') for link in links if link.get('href')]

        # Loop through each link to extract product details
        for link in links_list:
            product_url = BASE_URL + link
            logging.info(f"Processing link: {product_url}")

            # Send an HTTP request to the product page; a failure on one
            # product must not discard everything collected so far.
            try:
                new_webpage = session.get(product_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                logging.error(f"Request failed for {product_url}: {exc}")
                continue
            if not new_webpage.ok:
                logging.error(f"Skipping {product_url}: HTTP {new_webpage.status_code}")
                continue
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            # Extract weight and unit from the product page; the label must be
            # exactly "<quantity> <unit>" (two space-separated parts).
            try:
                quant, unit = get_wv(new_soup).split(" ")
            except ValueError:
                logging.error("Weight/volume format is incorrect")
                quant, unit = "", ""

            # Extract and append product details to the dictionary
            d['Product Name'].append(get_productname(new_soup))
            d['brand'].append(get_brand(new_soup))
            d['category'].append("Household and care")
            d['bar code'].append(get_code(new_soup))
            d['price (M.R.P.)'].append(get_price(new_soup))
            d['weight/volume'].append(quant)
            d['Unit'].append(unit)
            d['Product Description'].append(get_desc(new_soup))
            d['Images URL'].append(get_imgurl(new_soup))
            d['Product URL'].append(product_url)
            d['Relative Expiry time w.r.t. today'].append(relative)
            d['Relative expiry in months'].append(rmonths)

    # Create a DataFrame from the dictionary
    file = pd.DataFrame.from_dict(d)

    # Replace empty strings with NaN and drop rows with NaN in 'Product Name'.
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the selected column: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    file['Product Name'] = file['Product Name'].replace('', np.nan)
    file = file.dropna(subset=['Product Name'])

    # Save the DataFrame to a CSV file
    file.to_csv("file.csv", header=True, index=False)
    logging.info("Saved the product details to file.csv")
