# **Initialization**

In [1]:
!pip install selenium
from selenium import webdriver
!apt-get update
!apt install chromium-driver
def web_driver():
  """Build a Chrome WebDriver configured with the command-line flags below.

  Returns:
    The ``webdriver.Chrome`` instance created with those options.
  """
  chrome_flags = (
      "--verbose",
      '--headless',
      '--no-sandbox',
      '--disable-gpu',
      '--disable-dev-shm-usage',
  )
  chrome_options = webdriver.ChromeOptions()
  # Flags are added in the order listed above.
  for flag in chrome_flags:
    chrome_options.add_argument(flag)
  return webdriver.Chrome(options=chrome_options)
# Create the notebook-level driver; the scraping cell reuses it when its
# session id is still set and only builds a new one otherwise.
driver = web_driver()

Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 3,917 B in 3s (1,427 B/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (

# **Data Extraction**

In [2]:
def extract_car_data(car_text):
    """
    Extract car details from the text content of one listing.

    The first line supplies the model year and the car name; every following
    line is classified as at most one of kilometres driven, fuel type,
    transmission or price. When several lines match the same field, the last
    matching line wins.

    Args:
        car_text: Multi-line text of a listing, one attribute per line.

    Returns:
        A dictionary with the keys 'Car_Name', 'Year', 'Kilometers_Driven',
        'Fuel_Type', 'Transmission' and 'Price'. Fields that were not found
        are left as empty strings. Parsing errors are printed, and whatever
        was collected up to that point is returned.
    """
    # Imported locally so the function works even when the cell that imports
    # `re` at notebook level has not been executed yet.
    import re

    lines = car_text.strip().split('\n')

    data = {
        'Car_Name': '',
        'Year': '',
        'Kilometers_Driven': '',
        'Fuel_Type': '',
        'Transmission': '',
        'Price': ''
    }

    try:
        # First line usually contains year and car name
        first_line = lines[0].strip()
        year_match = re.search(r'\b(19|20)\d{2}\b', first_line)
        if year_match:
            # The name is only recorded when the line also carries a year.
            data['Year'] = year_match.group(0)
            data['Car_Name'] = first_line

        # Parse through other lines
        for line in lines[1:]:
            line = line.strip()
            lowered = line.lower()

            # Kilometers driven (e.g., "34.11k km" or "96.90k km").
            # A digit is required: 'km' alone also occurs in ordinary words
            # and notes, which must not overwrite the reading.
            if 'km' in lowered and re.search(r'\d', line):
                data['Kilometers_Driven'] = line

            # Fuel type
            elif any(fuel in lowered for fuel in ('petrol', 'diesel', 'cng', 'electric', 'hybrid')):
                data['Fuel_Type'] = line

            # Transmission ('auto' also covers 'automatic')
            elif any(trans in lowered for trans in ('manual', 'auto')):
                data['Transmission'] = line

            # Price (looking for lakh pattern)
            elif 'lakh' in lowered and '₹' in line:
                # Extract the main price (e.g., "₹3.35 lakh"); matched
                # case-insensitively so it agrees with the guard above.
                price_match = re.search(r'₹[\d.]+\s*lakh', line, re.IGNORECASE)
                if price_match:
                    data['Price'] = price_match.group(0)

    except Exception as e:
        print(f"Error parsing data: {e}")

    return data

# **Web Scraping**

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import InvalidSessionIdException
import time
import csv
import re
from datetime import datetime

# Assuming web_driver function is defined in a previous cell


# Crawl settings, kept together so they can be tuned in one place.
LISTING_URL = "https://www.cars24.com/buy-used-car/?f=make%3A%3D%3Ahyundai&search=hyundai&listingSource=Search_HP&storeCityId=2378"
CAR_CARD_CLASS = "styles_outer__NTVth"  # class name used to locate each car listing element
MAX_ITERATIONS = 100  # maximum number of scroll/click rounds
MAX_STALLED_ITERATIONS = 8  # stop after this many consecutive rounds without new listings

try:
    # Ensure a fresh driver instance is available
    if 'driver' not in locals() or not driver.session_id:
        print("Creating a new WebDriver instance...")
        driver = web_driver()

    # Navigate to the website
    driver.get(LISTING_URL)
    time.sleep(5)  # Wait for page to load completely

    # Scroll and click "Load More" or similar buttons to load all cars
    print("Loading all car listings...")
    print(f"URL: {driver.current_url}\n")

    no_change_iterations = 0
    iteration = 0

    while iteration < MAX_ITERATIONS:
        iteration += 1

        # Get current count of car elements
        current_elems = driver.find_elements(By.CLASS_NAME, CAR_CARD_CLASS)
        current_count = len(current_elems)

        print(f"Iteration {iteration}: Found {current_count} cars")

        # Try different scrolling methods
        # Method 1: Scroll in increments
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 300);")
            time.sleep(0.2)

        # Method 2: Scroll to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        # Method 3: Try scrolling within specific containers
        try:
            containers = driver.find_elements(By.CSS_SELECTOR, "div[class*='container'], div[class*='list'], div[class*='grid']")
            for container in containers[:3]:  # Try first 3 containers
                driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", container)
        except Exception:
            # Best effort only; `Exception` (not a bare except) so that
            # interrupting the cell still stops the crawl.
            pass

        time.sleep(1)

        # Try to find and click the "Next" button or other likely load more buttons
        button_clicked = False
        button_selectors = [
            "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'next')]",
            "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'load more')]",
            "//a[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'next')]",
            "//a[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'load more')]",
            "//*[contains(@class, 'pagination')]//button",
            "//*[contains(@class, 'next')]",
        ]

        for selector in button_selectors:
            try:
                buttons = driver.find_elements(By.XPATH, selector)
                for button in buttons:
                    if not (button.is_displayed() and button.is_enabled()):
                        continue

                    # Read the label BEFORE clicking. Reading it afterwards can
                    # fail once the page has re-rendered, and that failure used
                    # to be mistaken for a failed click and trigger a second
                    # (JavaScript) click on the same button.
                    label = button.text[:50]

                    try:
                        # Scroll to button
                        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", button)
                        time.sleep(0.5)
                        # Try clicking
                        button.click()
                        print(f"  → Clicked button: {label}")
                        button_clicked = True
                    except Exception:
                        # Native click failed: try JavaScript click
                        try:
                            driver.execute_script("arguments[0].click();", button)
                            print(f"  → JS-clicked button: {label}")
                            button_clicked = True
                        except Exception:
                            # Neither click worked; move on to the next candidate.
                            continue

                    time.sleep(2)
                    break
                if button_clicked:
                    break
            except Exception:
                continue

        time.sleep(1)

        # Check if new cars were loaded
        new_elems = driver.find_elements(By.CLASS_NAME, CAR_CARD_CLASS)
        new_count = len(new_elems)

        if new_count > current_count:
            print(f"  ✓ Loaded {new_count - current_count} more cars (Total: {new_count})")
            no_change_iterations = 0
        else:
            no_change_iterations += 1
            print(f"  → No new cars loaded (attempt {no_change_iterations}/{MAX_STALLED_ITERATIONS})")

        # If no new content loaded for MAX_STALLED_ITERATIONS consecutive iterations, stop
        if no_change_iterations >= MAX_STALLED_ITERATIONS:
            print("\n" + "="*60)
            print("No more cars to load. This appears to be all available cars.")
            print("="*60 + "\n")
            break

    print("Finished loading. Collecting all car listings...\n")

    # Find all car listing elements after loading everything
    elems = driver.find_elements(By.CLASS_NAME, CAR_CARD_CLASS)

    print(f"Total cars found: {len(elems)}")
    print("Extracting data...\n")

    # Collect data from all elements
    all_cars_data = []

    for idx, elem in enumerate(elems, 1):
        try:
            # Get the text content of each car listing
            car_text = elem.text
            car_data = extract_car_data(car_text)  # defined in the Data Extraction cell
            all_cars_data.append(car_data)
            print(f"✓ Car #{idx}: {car_data['Car_Name']}")
        except Exception as e:
            print(f"✗ Error extracting Car #{idx}: {str(e)}")

    # Save to CSV file
    if all_cars_data:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_filename = f"cars24_data_{timestamp}.csv"

        fieldnames = ['Car_Name', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Price']

        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_cars_data)

        print(f"\n{'='*60}")
        print(f"✓ Data successfully saved to '{csv_filename}'")
        print(f"✓ Total records: {len(all_cars_data)}")
        print(f"{'='*60}")
    else:
        print("No data found to write to CSV")

except InvalidSessionIdException:
    print("Caught InvalidSessionIdException. The WebDriver session was lost.")
    print("Please try re-running the cells to create a new session.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

finally:
    # Shut the browser down if the driver object is still valid
    if 'driver' in locals() and driver.session_id:
        try:
            time.sleep(2)
            # quit() rather than close(): close() only closes the current
            # window, while quit() ends the session and stops the driver
            # process, so nothing is left running after the cell finishes.
            driver.quit()
            print("\nBrowser closed.")
        except InvalidSessionIdException:
            print("\nWebDriver session already closed or invalid during cleanup.")
    else:
        print("\nNo active WebDriver session to close.")

Loading all car listings...
URL: https://www.cars24.com/buy-used-car/?f=make%3A%3D%3Ahyundai&search=hyundai&listingSource=Search_HP&storeCityId=2378

Iteration 1: Found 20 cars
  → Clicked button: Next
  ✓ Loaded 20 more cars (Total: 40)
Iteration 2: Found 40 cars
  → JS-clicked button: Next
  → No new cars loaded (attempt 1/8)
Iteration 3: Found 40 cars
  → Clicked button: Next
  → No new cars loaded (attempt 2/8)
Iteration 4: Found 40 cars
  → Clicked button: Next
  → No new cars loaded (attempt 3/8)
Iteration 5: Found 40 cars
  → Clicked button: Next
  ✓ Loaded 20 more cars (Total: 60)
Iteration 6: Found 60 cars
  → Clicked button: Next
  ✓ Loaded 20 more cars (Total: 80)
Iteration 7: Found 80 cars
  → Clicked button: Next
  → No new cars loaded (attempt 1/8)
Iteration 8: Found 80 cars
  → Clicked button: Next
  ✓ Loaded 20 more cars (Total: 100)
Iteration 9: Found 100 cars
  → Clicked button: Next
  ✓ Loaded 20 more cars (Total: 120)
Iteration 10: Found 120 cars
  → Clicked button:

In [4]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem.
# NOTE(review): the CSV cells visible in this notebook read and write relative
# paths rather than this mount point — confirm the mount is still needed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd

# Prefer the file written by the scraping cell in this session (`csv_filename`).
# Its name carries a fresh timestamp on every run, so a pinned name alone
# breaks "Restart & Run All". The pinned export remains the fallback for when
# the scraping cell has not been run.
csv_file_path = globals().get('csv_filename', 'cars24_data_20251018_124728.csv')
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError:
    print(f"Error: The file '{csv_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
else:
    # Kept outside the try body so a display problem is not reported as a
    # CSV read error.
    print(f"Successfully loaded '{csv_file_path}'")
    display(df)

Successfully loaded 'cars24_data_20251018_124728.csv'


Unnamed: 0,Car_Name,Year,Kilometers_Driven,Fuel_Type,Transmission,Price
0,2016 Hyundai Grand i10 SPORTZ 1.2 KAPPA VTVT,2016,34.11k km,Petrol,Manual,₹3.35 lakh
1,2018 Hyundai Verna 1.6 VTVT SX (O) AT,2018,30.23k km,Petrol,Auto,₹6.68 lakh
2,2018 Hyundai Grand i10 SPORTZ 1.2 KAPPA VTVT,2018,63.23k km,Petrol,Manual,₹3.50 lakh
3,2014 Hyundai Xcent SX 1.2,2014,96.90k km,Petrol,Manual,₹3.00 lakh
4,2020 Hyundai GRAND I10 NIOS SPORTZ AMT 1.2 KAP...,2020,40.15k km,Petrol,Auto,₹4.55 lakh
...,...,...,...,...,...,...
432,2012 Hyundai i20,2012,71.71k km,Petrol,Manual,₹2.65 lakh
433,2018 Hyundai Grand i10,2018,55.95k km,Petrol,Auto,₹4.00 lakh
434,2015 Hyundai Xcent,2015,36.89k km,Petrol,Manual,₹3.00 lakh
435,2013 Hyundai Grand i10,2013,21.60k km,Petrol,Manual,₹2.55 lakh
