In [29]:
from selenium import webdriver  # To initialize and control a browser instance programmatically
from selenium.webdriver.chrome.service import Service  # To manage the ChromeDriver executable service for Selenium
from selenium.webdriver.chrome.options import Options  # To configure options for the Chrome browser instance (e.g., headless mode)
from selenium.webdriver.common.by import By  # To locate elements on a webpage using different strategies (e.g., ID, name, XPath)
from selenium.webdriver.support.ui import WebDriverWait  # To pause execution until a specific condition is met (e.g., element visibility)
from selenium.webdriver.support import expected_conditions as EC  # To define conditions to wait for (e.g., element clickable, text presence)
from selenium.common.exceptions import TimeoutException, NoSuchElementException  # To handle specific Selenium exceptions during interaction
import pandas as pd  # For data manipulation and analysis, especially to store scraped data in structured formats like DataFrames
import time  # To introduce delays between operations or simulate user-like behavior
import requests  # To make HTTP requests (e.g., GET, POST) for retrieving web data or interacting with APIs

In [11]:
# Build a ChromeDriver service from a locally downloaded driver binary.
# The path is specific to one Windows machine; adjust it before running elsewhere.
service = Service(
    executable_path=r'C:\Users\data_architect\Downloads\chromedriver-win64 (1)\chromedriver-win64\chromedriver.exe'
)

In [19]:
# Smoke test: confirm a headless Chrome session can start and load a page.
options = Options()
options.add_argument("--headless")  # run without opening a browser window

# No driver path is passed here, so Selenium is left to locate ChromeDriver itself
# (presumably via Selenium Manager or PATH -- confirm for the installed version).
service = Service()
driver = webdriver.Chrome(service=service, options=options)

try:
    driver.get('https://www.google.com')
    print(f"Page title is: {driver.title}")
finally:
    # Always shut the browser down, even if navigation fails, so no orphaned
    # Chrome/ChromeDriver process is left behind.
    driver.quit()


Page title is: Google


In [21]:
# IndiaMART search-results URL for "korean flavours", split into
# endpoint / main query / tags / trailing parameter for readability.
url = (
    'https://dir.indiamart.com/search.mp'
    '?ss=korean+flavours&v=4&mcatid=177793&catid=15&prdsrc=1'
    '&tags=stype:attr=1|qr_nm=splt-gd|res=RC5|com-cf:nl|ptrs=na|ktp=N0|mc=6529|mtp=G|qry_typ=P|lang=en|wc=2'
    '&cs=9275'
)

In [23]:
# Plain HTTP fetch of the search page, without a browser.
# The timeout keeps the cell from hanging forever if the server never responds;
# requests applies no timeout unless one is given.
r = requests.get(url, timeout=30)

In [25]:
# Print the HTTP status code of the response fetched above.
print(r.status_code)

200


In [None]:
## Working code: scrape the search results with Selenium and export them to Excel
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time

# Scrape a listing page with Selenium and export the records to Excel in one shot.

# Column order used for the DataFrame and every exported workbook.
_RECORD_COLUMNS = ['Product Name', 'Product Link', 'Seller Name', 'Seller Address']


def _scroll_until_stable(driver, pause_seconds):
    """Scroll to the bottom repeatedly until the page height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give lazily loaded results time to be appended before re-measuring.
        time.sleep(pause_seconds)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("No more data to load.")
            break
        last_height = new_height


def _collect_records(driver):
    """Read every card currently in the DOM and return one dict per card.

    Best-effort: if reading the page fails part-way, the error is printed and
    the records gathered so far are returned.
    """
    records = []
    try:
        products = driver.find_elements(By.CLASS_NAME, "cardlinks")
        addresses = driver.find_elements(By.CSS_SELECTOR, "span.elps.elps1")

        for i, product in enumerate(products):
            product_name = product.text.strip()
            product_link = product.get_attribute('href')

            try:
                seller_name = product.find_element(By.XPATH, ".//following-sibling::a").text.strip()
            except NoSuchElementException:
                seller_name = "N/A"

            # NOTE(review): addresses are paired with cards purely by position; if
            # the page renders a different number of address spans than card links
            # the pairing drifts -- confirm against the live DOM.
            seller_address = addresses[i].text.strip() if i < len(addresses) else 'N/A'

            records.append({
                'Product Name': product_name,
                'Product Link': product_link,
                'Seller Name': seller_name,
                'Seller Address': seller_address,
            })
    except Exception as e:
        print(f"Error during scraping: {e}")
    return records


def scrape_and_save_data(url, driver_path, step_size=2, output_prefix="korean",
                         scroll_pause=3, debug=False):
    """Scrape listing cards from ``url`` and export them to Excel workbooks.

    Opens the page in Chrome, waits for the result cards to appear, pauses so
    the user can log in and click "Show More" by hand, scrolls until the page
    height stops changing, then reads every card once and writes:

    * ``{output_prefix}_scraped_data.xlsx`` -- every record;
    * ``{output_prefix}_products.xlsx`` -- rows 0, step_size, 2*step_size, ...;
    * ``{output_prefix}_seller.xlsx`` -- rows step_size-1, 2*step_size-1, ...

    Args:
        url: Page to scrape.
        driver_path: Path to the ChromeDriver executable.
        step_size: Stride used to split the records into the two subset files.
        output_prefix: Filename prefix for the three workbooks.
        scroll_pause: Seconds to wait after each scroll before re-measuring.
        debug: When True, print the first 1000 characters of the page source
            and the text of every matched element before saving.

    Returns:
        None. If the cards do not load within 20 seconds an error is printed
        and nothing is written.

    Raises:
        ValueError: If ``step_size`` is smaller than 1.
    """
    # Fail before launching a browser rather than after a full scrape.
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size!r}")

    chrome_options = Options()
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)

        # Wait for the initial cards and address spans to be present.
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "cardlinks")))
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.elps.elps1")))
        except TimeoutException:
            print("Error: Elements did not load in time.")
            return

        # Pause for manual interaction.
        print("Please log in and click the 'Show More' button manually. Press Enter to continue...")
        input()

        # Load everything first, then read the DOM exactly once. Scraping after
        # every scroll re-read the cards already collected and appended them
        # again, duplicating rows and shifting the every-Nth split below.
        _scroll_until_stable(driver, scroll_pause)
        df = pd.DataFrame(_collect_records(driver), columns=_RECORD_COLUMNS)

        if debug:
            print("Page Source (first 1000 chars):", driver.page_source[:1000])
            print("Products:", [elem.text for elem in driver.find_elements(By.CLASS_NAME, "cardlinks")])
            print("Addresses:", [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, "span.elps.elps1")])

        # Full export.
        df.to_excel(f"{output_prefix}_scraped_data.xlsx", index=False)

        # Rows 0, step_size, 2*step_size, ...
        specific_records = df.iloc[::step_size].reset_index(drop=True)
        # Rows step_size-1, 2*step_size-1, ...
        nth_records = df.iloc[step_size - 1::step_size].reset_index(drop=True)

        specific_records.to_excel(f"{output_prefix}_products.xlsx", index=False)
        nth_records.to_excel(f"{output_prefix}_seller.xlsx", index=False)
    finally:
        # Close the browser on every exit path, including errors and the
        # early return above.
        driver.quit()

    print("Data scraping and saving completed!")

# Example usage of the function
# Run the scraper against the search URL assigned to `url` in an earlier cell.
# The previous argument, `url_korean`, is not defined anywhere in the visible
# notebook and raised NameError on a fresh kernel.
scrape_and_save_data(
    url=url,
    # Machine-specific ChromeDriver location; adjust before running elsewhere.
    driver_path=r'C:\Users\data_architect\Downloads\chromedriver-win64 (1)\chromedriver-win64\chromedriver.exe',
    step_size=2
)
