In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
from selenium.common.exceptions import NoSuchElementException

In [2]:
USER_AGENT_LIST = ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
                'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4']
DRIVER_FILE_PATH = "/Users/qunishdash/.wdm/drivers/chromedriver/mac64/116.0.5845.96/chromedriver-mac-x64/chromedriver"

In [3]:
def get_chrome_driver(headless_flag):
    # Set up the Selenium webdriver
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    service = Service(executable_path=DRIVER_FILE_PATH)
    chrome_options = Options()

    if headless_flag:
        # in case you want headless browser
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("user-agent={}".format(random.choice(USER_AGENT_LIST)))
    else:
        # in case  you want to open browser
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("user-agent={}".format(random.choice(USER_AGENT_LIST)))
        chrome_options.headless = False

    driver = webdriver.Chrome(service=service, options=chrome_options) 

    return driver

In [4]:
# Function to scroll to a specific element on the page
def scroll_to_element(driver, element_selector, offset_percentage=0.5):
    element = driver.find_element(By.CSS_SELECTOR, element_selector)
    # Calculate the offset position
    offset = -driver.execute_script("return window.innerHeight") * offset_percentage
    # Scroll to the element with the calculated offset
    driver.execute_script("arguments[0].scrollIntoView(); window.scrollBy(0, arguments[1]);", element, offset)
    time.sleep(3)

# Function to click the "Show More" button if it's present
def click_show_more(driver):
    try:
        show_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".show-more button"))  # Adjust the class
        )
        show_more_button.click()
    except:
        pass

In [5]:
def extract_data_from_card(card_element):
    try:
        product_name = card_element.find_element(By.CSS_SELECTOR, ".prod-name").text.replace("\n", " ")
    except Exception as e:
        product_name = ""
        print(e)
    try:
        product_url = card_element.find_element(By.CSS_SELECTOR, "a.ng-binding").get_attribute("href")
    except Exception as e:
        product_url = ""
        print(e)
    
    data = {
        "product_name": product_name,
        "product_url": product_url,
    }

    return data

In [6]:
import pymongo

MONGODB_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "bigbasket_data_db"

def push_data_to_mongodb(data, collection_name):
    try:
        client = pymongo.MongoClient(MONGODB_URI)
        db = client[DATABASE_NAME]
        collection = db[collection_name]
        collection.insert_many(data)
        print(f"Data pushed to MongoDB collection '{collection_name}' successfully.")
    except Exception as e:
        print("Error while pushing data to MongoDB:", e)
    finally:
        client.close()

In [7]:
def scrape_data(collection_url, collection_name):
    driver = get_chrome_driver(False)
    driver.get(collection_url)

    product_section = driver.find_element(By.CSS_SELECTOR, '.pl-wrap')

    print(f"Scraping data from {collection_url}")

    while True:
        try:
            click_show_more(driver)
            time.sleep(2)

            initial_scroll_position = driver.execute_script("return window.scrollY;")
            scroll_to_element(driver, '.styl:nth-child(1) .pad-wrapper', offset_percentage=0.5)
            new_scroll_position = driver.execute_script("return window.scrollY;")

            if new_scroll_position <= initial_scroll_position:
                print("Scroll position did not change, breaking the loop")
                break
        except NoSuchElementException:
            print("Show More button not found, breaking the loop")
            break

    card_elements = product_section.find_elements(By.CSS_SELECTOR, '.row.ng-scope')

    all_data = []

    for card_element in card_elements:
        product_data = extract_data_from_card(card_element)
        all_data.append(product_data)

    print(all_data)
    push_data_to_mongodb(all_data, collection_name)
    
    print(f"Total data collected from {collection_url}: {len(all_data)}")
    driver.quit()

In [8]:
collection_urls = [
    # {"fruits-vegetables-lv": "https://www.bigbasket.com/cl/fruits-vegetables/#!page=1"},
    # {"foodgrains-oil-masala-lv": "https://www.bigbasket.com/cl/foodgrains-oil-masala/#!page=1"},
    {"bakery-cakes-dairy-lv": "https://www.bigbasket.com/cl/bakery-cakes-dairy/#!page=1"},
    # {"beverages-lv": "https://www.bigbasket.com/cl/beverages/#!page=1"},
    # {"snacks-branded-foods-lv": "https://www.bigbasket.com/cl/snacks-branded-foods/#!page=1"},
    # {"beauty-hygiene-lv": "https://www.bigbasket.com/cl/beauty-hygiene/#!page=1"},
    # {"cleaning-household-lv": "https://www.bigbasket.com/cl/cleaning-household/#!page=1"},
    # {"kitchen-garden-pets-lv": "https://www.bigbasket.com/cl/kitchen-garden-pets/#!page=1"},
    # {"eggs-meat-fish-lv": "https://www.bigbasket.com/cl/eggs-meat-fish/#!page=1"},
    # {"gourmet-world-food-lv": "https://www.bigbasket.com/cl/gourmet-world-food/#!page=1"},
    # {"baby-care-lv": "https://www.bigbasket.com/cl/baby-care/#!page=1"}
]

In [9]:
for collection_info in collection_urls:
    collection_name = next(iter(collection_info.keys()))
    url = collection_info[collection_name]
    
    scrape_data(url, collection_name)

  chrome_options.headless = False


Scraping data from https://www.bigbasket.com/cl/bakery-cakes-dairy/#!page=1
Scroll position did not change, breaking the loop
[{'product_name': 'Milky Mist Paneer - Rich In Protein, Calcium, Excellent Taste 3.9 20417 Ratings', 'product_url': 'https://www.bigbasket.com/pd/137575/milky-mist-paneer-premium-fresh-1-kg-pouch/?nc=bakery-cakes-dairy&t_pg=l1-bakery-cakes-dairy&t_p=bakery-cakes-dairy&t_s=bakery-cakes-dairy&t_pos=3&t_ch=desktop'}, {'product_name': 'Milky Mist Paneer - Rich In Calcium 3.7 114 Ratings', 'product_url': 'https://www.bigbasket.com/pd/40003150/milky-mist-paneer-premium-fresh-500-g-pouch/?nc=bakery-cakes-dairy&t_pg=l1-bakery-cakes-dairy&t_p=bakery-cakes-dairy&t_s=bakery-cakes-dairy&t_pos=3&t_ch=desktop'}, {'product_name': 'Nandini GoodLife UHT Treated Toned Milk 4.1 6073 Ratings', 'product_url': 'https://www.bigbasket.com/pd/244999/nandini-goodlife-toned-milk-500-ml-pouch/?nc=bakery-cakes-dairy&t_pg=l1-bakery-cakes-dairy&t_p=bakery-cakes-dairy&t_s=bakery-cakes-dairy&t_