Import Key Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import re
from datetime import datetime
from selenium.webdriver.common.keys import Keys 
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, WebDriverException, InvalidSessionIdException
)

### Step 1: Get all product links

In [2]:
# Inputs to search
search_box_text = 'sports shoes for women'
website_link = "https://www.flipkart.com/"

# Crawl settings: how many result pages to visit and how long (seconds) each
# explicit wait may block before raising TimeoutException.
TOTAL_PAGES = 25
WAIT_SECONDS = 120


def build_pagination_links(first_page_link, page_count):
    """Return the URLs for result pages 1..page_count derived from one page URL.

    If ``first_page_link`` already carries a ``page=<number>`` query parameter,
    that number is substituted; otherwise a ``page`` parameter is appended.
    This avoids assuming that the URL ends in a single-character page number.
    """
    page_param = re.compile(r'([?&]page=)\d+')
    page_numbers = range(1, page_count + 1)
    if page_param.search(first_page_link):
        return [page_param.sub(rf'\g<1>{number}', first_page_link) for number in page_numbers]
    separator = '&' if '?' in first_page_link else '?'
    return [f"{first_page_link}{separator}page={number}" for number in page_numbers]


# session start time
session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} ---------------------------> ")

all_pagination_links = []
all_product_links = []

# starting the browser
driver = webdriver.Chrome()
try:
    driver.get(website_link)
    driver.maximize_window()

    print('Waiting for search input...')
    search_input = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[autocomplete="off"]'))
    )

    print('Typing in search input...')
    search_input.send_keys(search_box_text)

    print('Submitting search form...')
    search_input.send_keys(Keys.RETURN)

    print('Waiting for search results...')
    WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[target="_blank"]'))
    )

    print('Collecting pagination links...')

    # Take the first pagination link and derive the remaining page URLs from it.
    # If the results have no pagination nav at all, crawl only the current page.
    nav_links = driver.find_elements(By.CSS_SELECTOR, 'nav a')
    first_page_link = nav_links[0].get_attribute('href') if nav_links else None
    if first_page_link:
        all_pagination_links = build_pagination_links(first_page_link, TOTAL_PAGES)
    else:
        all_pagination_links = [driver.current_url]

    print('Pagination Links Count:', len(all_pagination_links))
    print("All Pagination Links: ", all_pagination_links)

    print("Collecting Product Detail Page Links")

    for link in all_pagination_links:
        driver.get(link)
        try:
            # Wait for the page to load by checking document.readyState
            WebDriverWait(driver, WAIT_SECONDS).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            # wait until product elements are located
            WebDriverWait(driver, WAIT_SECONDS).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'rPDeLR'))
            )
        except TimeoutException:
            # One slow or empty page must not discard the links gathered so far.
            print(f"{link} timed out waiting for products, skipping ------>")
            continue

        all_products = driver.find_elements(By.CLASS_NAME, 'rPDeLR')
        # get_attribute returns None when the element has no href; skip those.
        hrefs = (element.get_attribute('href') for element in all_products)
        all_links = [href for href in hrefs if href]

        print(f"{link} Done ------>")

        all_product_links.extend(all_links)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and runs even if a step above raised.
    driver.quit()

print('All Product Detail Page Links Captured: ', len(all_product_links))


# Creating a DataFrame from the list
df_product_links = pd.DataFrame(all_product_links, columns=['product_links'])
# remove any duplicates
df_product_links = df_product_links.drop_duplicates(subset=['product_links'])

print("Total Product Detail Page Links", len(df_product_links))
df_product_links.to_csv('flipkart_product_links.csv', index = False)

session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} ---------------------------> ")

Session Start Time: 17:24:20.015527 ---------------------------> 
Waiting for search input...
Typing in search input...
Submitting search form...
Waiting for search results...
Collecting pagination links...
Pagination Links Count: 25
All Pagination Links:  ['https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=1', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=2', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=3', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=4', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=5', 'https://www.flipkart.com/search?q=sports+

### Step 2: Get individual product information

In [3]:
# session start
session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} ---------------------------> ")

# Read the product links written by the link-collection step.
# Rows whose link is blank load as NaN and cannot be opened in the browser,
# so they are dropped before the crawl.
df_product_links = pd.read_csv("flipkart_product_links.csv")
df_product_links = df_product_links.dropna(subset=['product_links'])
# For a quick demo run, limit the crawl here, e.g.:
# df_product_links = df_product_links.head(10)
all_product_links = df_product_links['product_links'].tolist()
print("Collecting Individual Product Detail Information")

# Crawl state shared with the scraping loop and the export step below.
driver = None                      # WebDriver handle; stays None until the browser starts
complete_product_details = []      # one row (list of fields) per parsed product
unavailable_products = []          # links that were unavailable or failed to load
successful_parsed_urls_count = 0
complete_failed_urls_count = 0

def safe_text(by, selector):
    """Return the stripped text of the first matching element, or "" on failure.

    Uses the module-level ``driver``. Lookup errors are treated as "field not
    present", but a dead browser session is re-raised so the crawl loop can
    stop instead of silently recording empty fields.
    """
    try:
        return driver.find_element(by, selector).text.strip()
    except InvalidSessionIdException:
        raise
    except Exception:
        return ""


try:
    # Initialize driver (adjust options if needed)
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless=new")  # uncomment to run headless (careful with UI-popup detection)
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)  # Selenium Manager in modern selenium will manage chromedriver
    print("Driver session id:", getattr(driver, "session_id", None))

    for product_page_link in all_product_links:
        try:
            driver.get(product_page_link)

            # Wait until the document reports it has finished loading.
            try:
                WebDriverWait(driver, 30).until(lambda d: d.execute_script('return document.readyState') == 'complete')
            except TimeoutException:
                print("Page load timed out, continuing to next URL.")
                unavailable_products.append(product_page_link)
                complete_failed_urls_count += 1
                continue

            # Flipkart pages vary, so only <body> presence is required here.
            try:
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            except TimeoutException:
                print("Body element not found, marking unavailable.")
                unavailable_products.append(product_page_link)
                complete_failed_urls_count += 1
                continue

            # Check availability (adapt selectors if Flipkart changes them)
            try:
                product_status = driver.find_element(By.CLASS_NAME, 'Z8JjpR').text
                if product_status in ('Currently Unavailable', 'Sold Out'):
                    unavailable_products.append(product_page_link)
                    successful_parsed_urls_count += 1
                    print(f"URL {successful_parsed_urls_count} completed ---> unavailable")
                    continue
            except NoSuchElementException:
                # element not found — proceed (product might be available)
                pass

            # Extract fields with safe fallbacks
            brand = safe_text(By.CLASS_NAME, 'mEh187')
            title = safe_text(By.CLASS_NAME, 'VU-ZEz')
            title = re.sub(r'\s*\([^)]*\)', '', title)  # remove parentheses content
            price_raw = safe_text(By.CLASS_NAME, 'Nx9bqj')
            price = ''.join(re.findall(r'\d+', price_raw))  # digits only; "" when no price text

            # Discount: digits of the discount text divided by 100, "" if absent.
            discount = ""
            try:
                discount_raw = driver.find_element(By.CLASS_NAME, 'UkUFwK').text
                discount_digits = re.findall(r'\d+', discount_raw)
                if discount_digits:
                    discount = int(''.join(discount_digits)) / 100.0
            except NoSuchElementException:
                discount = ""

            # Ratings: skipped only when the page explicitly says there are no
            # reviews yet. If the review element exists with any other text,
            # the rating elements are still read.
            avg_rating = ""
            total_ratings = ""
            try:
                product_review_status = driver.find_element(By.CLASS_NAME, 'E3XX7J').text
            except NoSuchElementException:
                product_review_status = ""

            if product_review_status != 'Be the first to Review this product':
                try:
                    avg_rating = driver.find_element(By.CLASS_NAME, 'XQDdHH').text
                except NoSuchElementException:
                    avg_rating = ""

                try:
                    total_ratings_text = driver.find_element(By.CLASS_NAME, 'Wphh3N').text.split(' ')[0]
                    total_ratings = int(total_ratings_text.replace(',', '')) if total_ratings_text else ""
                except InvalidSessionIdException:
                    # a dead session must reach the handler below, not be hidden
                    raise
                except Exception:
                    total_ratings = ""

            successful_parsed_urls_count += 1
            print(f"URL {successful_parsed_urls_count} completed *******")
            complete_product_details.append([product_page_link, title, brand, price, discount, avg_rating, total_ratings])

        except InvalidSessionIdException as ise:
            # Session died mid-run; nothing further can be fetched with this driver.
            # Must be listed before WebDriverException, which is its base class.
            print("InvalidSessionIdException — the browser session is no longer valid:", ise)
            complete_failed_urls_count += 1
            unavailable_products.append(product_page_link)
            break  # the finally block quits the driver; restart the scraping afterwards
        except WebDriverException as wde:
            print(f"WebDriverException for URL {product_page_link}: {wde}")
            complete_failed_urls_count += 1
            unavailable_products.append(product_page_link)
            continue
        except Exception as e:
            print(f"Failed to establish a connection for URL {product_page_link}: {e}")
            complete_failed_urls_count += 1
            unavailable_products.append(product_page_link)
            continue

finally:
    # graceful cleanup: always try to end the browser session
    if driver is not None:
        print("Attempting to quit driver. current session id:", getattr(driver, "session_id", None))
        try:
            driver.quit()
        except InvalidSessionIdException:
            # session already invalid - ignore
            print("Driver session already invalid when quitting.")
        except Exception as e:
            print("Exception while quitting driver:", e)

# Build the result tables from the scraped rows and write them to CSV.
product_columns = ['product_link', 'title', 'brand', 'price', 'discount', 'avg_rating', 'total_ratings']
# Rows that agree on all of these fields are treated as the same product;
# the first occurrence is kept and later ones go to the duplicates table.
dedup_keys = ['brand', 'price', 'discount', 'avg_rating', 'total_ratings']

scraped_products = pd.DataFrame(complete_product_details, columns=product_columns)
is_repeat = scraped_products.duplicated(subset=dedup_keys)
df_duplicate_products = scraped_products[is_repeat]
df = scraped_products[~is_repeat]
df_unavailable_products = pd.DataFrame(unavailable_products, columns=['link'])

# Summary counts for this run.
run_summary = (
    ("Total product pages scrapped: ", all_product_links),
    ("Final Total Products: ", df),
    ("Total Unavailable Products : ", df_unavailable_products),
    ("Total Duplicate Products: ", df_duplicate_products),
)
for label, collection in run_summary:
    print(label, len(collection))

# Persist each table to its own CSV file, without the index column.
csv_outputs = (
    (df, 'flipkart_product_data.csv'),
    (df_unavailable_products, 'unavailable_products.csv'),
    (df_duplicate_products, 'duplicate_products.csv'),
)
for table, csv_name in csv_outputs:
    table.to_csv(csv_name, index=False)

session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} ---------------------------> ")

Session Start Time: 17:29:23.251078 ---------------------------> 
Collecting Individual Product Detail Information
Driver session id: e4f69598da1de6598266fd39a24cad1b
URL 1 completed *******
URL 2 completed *******
URL 3 completed *******
URL 4 completed *******
URL 5 completed *******
URL 6 completed *******
URL 7 completed *******
URL 8 completed *******
URL 9 completed *******
URL 10 completed *******
URL 11 completed *******
URL 12 completed *******
URL 13 completed *******
URL 14 completed *******
URL 15 completed *******
URL 16 completed *******
URL 17 completed *******
URL 18 completed *******
URL 19 completed *******
URL 20 completed *******
URL 21 completed *******
URL 22 completed *******
URL 23 completed *******
URL 24 completed *******
URL 25 completed *******
URL 26 completed ---> unavailable
URL 27 completed *******
URL 28 completed *******
URL 29 completed *******
URL 30 completed *******
URL 31 completed *******
URL 32 completed *******
URL 33 completed *******
URL 34 c