## Environment Setup

In [38]:
!apt-get update
!apt-get install -y wget
!wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add -
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list
!apt-get update
!apt-get install -y google-chrome-stable
!apt-get install -y chromium-chromedriver
!pip install selenium

0% [Working]            Hit:1 https://dl.google.com/linux/chrome/deb stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.81)] [                                                                               Hit:3 https://cli.github.com/packages stable InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.81)] [                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for he

## Import Libraries

In [39]:

import logging
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


## Initial Page Scraping
* Fetches Honda car collection page from Ackodrive website
* Parses HTML content using BeautifulSoup for data extraction

In [40]:
from bs4 import BeautifulSoup
import requests

web_link = "https://ackodrive.com/collection/honda+cars/"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops a 4xx/5xx error page from being parsed as the listing.
response = requests.get(web_link, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

## Car Data Extraction
* Extracts all car names from the webpage
* Filters cars that have "Express Delivery" available
* Creates a list of Honda car models to process further

In [41]:
CAR_NAME_CLASS = "BuyCarCard_carName__SAJVh"
CAR_CARD_CLASS = "BuyCarCard_card__AsGXF"

# Every model name shown on the collection page.
raw_cars = soup.find_all(class_=CAR_NAME_CLASS)
all_cars = [name.text.strip() for name in raw_cars]

# Models whose card advertises "Express Delivery", in page order, no duplicates.
cars_to_check = []
car_names = soup.find_all(class_=CAR_CARD_CLASS)
for card in car_names:
    card_text = card.text.strip()
    if "Express Delivery" not in card_text:
        continue

    # Prefer the name element inside this card: matching the global name list by
    # substring would also pick up a model whose name is a prefix of another.
    # NOTE(review): assumes the name element is nested in the card - if it is
    # not, the original substring matching below is used instead.
    name_tag = card.find(class_=CAR_NAME_CLASS)
    if name_tag is not None and name_tag.text.strip():
        matches = [name_tag.text.strip()]
    else:
        matches = [car for car in all_cars if car in card_text]

    for car in matches:
        if car not in cars_to_check:
            cars_to_check.append(car)


In [42]:
# Display the models that will be passed to the variant scraper.
cars_to_check

['Honda City', 'Honda Elevate', 'Honda Amaze']

In [43]:
logger = logging.getLogger(__name__)

# Locators used on the variants list page.
_VARIANT_CARD_XPATH = "//div[contains(@class, 'CarVariantCard')]"
_VARIANT_NAME_XPATH = ".//h2[contains(@class, 'CarVariantCard_variantName')]"
_EXPLORE_BUTTON_XPATH = ".//button[contains(@class, 'CarVariantCard_exploreButton')]"
_DISCONTINUED_BADGE_CLASS = "CarVariantCard_discontinuedBadge__bqsA8"
_VIEW_ALL_LINK_CLASS = "VariantWisePrice_viewMoreLink__CkpjG"

# Locators used on the page opened by a variant's explore button.
_PRICE_CLASS = "MMVPriceBreakup_priceValue__uoYlB"
_SPEC_VALUE_CLASS = "SpecsAndFeatureCard_card__text__wxvgC"
# Result key -> XPath of the spec card whose value text is read for that key.
_SPEC_CARD_XPATHS = {
    'fuel_type': "//div[contains(@class, 'sf-content-block__card--fuel-type')]",
    'engine_capacity': "//div[contains(@class, 'sf-content-block__card--engine-capacity')]",
    'transmission': "//p[contains(@class, 'SpecsAndFeatureCard_card__title') and contains(text(), 'Transmission')]/ancestor::div[contains(@class, 'SpecsAndFeatureCard_card__X_t2v')]",
    'mileage': "//p[contains(@class, 'SpecsAndFeatureCard_card__title__dpB5f') and contains(text(), 'ARAI mileage')]/ancestor::div[contains(@class, 'SpecsAndFeatureCard_card__X_t2v')]",
    'seating_capacity': "//p[contains(@class, 'SpecsAndFeatureCard_card__title__dpB5f') and contains(text(), 'Seat capacity')]/ancestor::div[contains(@class, 'SpecsAndFeatureCard_card__X_t2v')]",
}


def _expand_all_variants(driver, settle_seconds):
    """Click the "view all variants" link if one can be found (best effort)."""
    try:
        link = driver.find_element(By.CLASS_NAME, _VIEW_ALL_LINK_CLASS)
        driver.execute_script("arguments[0].click();", link)
        time.sleep(settle_seconds)
    except Exception as exc:
        # No link or the click failed: carry on with whatever cards are visible.
        logger.debug("View-all link not clicked: %r", exc)


def _card_name(card):
    """Return the stripped variant name shown on a card (raises if unreadable)."""
    return card.find_element(By.XPATH, _VARIANT_NAME_XPATH).text.strip()


def _is_discontinued(card):
    """Return True when the card carries the discontinued badge."""
    return len(card.find_elements(By.CLASS_NAME, _DISCONTINUED_BADGE_CLASS)) > 0


def _list_active_variant_names(driver):
    """Return unique, non-empty names of non-discontinued variants in page order."""
    names = []
    seen = set()
    for card in driver.find_elements(By.XPATH, _VARIANT_CARD_XPATH):
        try:
            name = _card_name(card)
            if name and name not in seen and not _is_discontinued(card):
                names.append(name)
                seen.add(name)
        except Exception:
            # Unreadable card (e.g. a matching div that has no name heading).
            continue
    return names


def _find_variant_card(driver, variant_name):
    """Return the first non-discontinued card named `variant_name`, or None."""
    for card in driver.find_elements(By.XPATH, _VARIANT_CARD_XPATH):
        try:
            if _card_name(card) == variant_name and not _is_discontinued(card):
                return card
        except Exception:
            continue
    return None


def _read_variant_details(driver, variant_name):
    """Read price and specs from the currently open variant page.

    Every field that cannot be located is left as None.
    """
    variant_info = {
        'name': variant_name,
        'on_road_price': None,
        'fuel_type': None,
        'engine_capacity': None,
        'mileage': None,
        'seating_capacity': None,
        'transmission': None
    }

    try:
        price_element = driver.find_element(By.CLASS_NAME, _PRICE_CLASS)
        variant_info['on_road_price'] = price_element.text.strip()
    except Exception as exc:
        logger.debug("No on-road price for %r: %r", variant_name, exc)

    for field, card_xpath in _SPEC_CARD_XPATHS.items():
        try:
            spec_card = driver.find_element(By.XPATH, card_xpath)
            variant_info[field] = spec_card.find_element(By.CLASS_NAME, _SPEC_VALUE_CLASS).text.strip()
        except Exception as exc:
            logger.debug("No %s for %r: %r", field, variant_name, exc)

    return variant_info


def get_complete_variants(car):
    """Scrape price and key specs for every active variant of one car model.

    Opens https://ackodrive.com/cars/<car-slug>/variants/ in headless Chrome,
    visits each non-discontinued variant and reads its details.

    Args:
        car: Model name such as "Honda City"; lower-cased and hyphenated to
            build the URL slug.

    Returns:
        A list of dicts with the keys 'name', 'on_road_price', 'fuel_type',
        'engine_capacity', 'mileage', 'seating_capacity' and 'transmission'
        (missing values are None). The list is empty when the browser cannot
        be started or the page cannot be loaded; scraping errors are logged
        and never raised to the caller.
    """
    driver = None
    variants_data = []
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-blink-features=AutomationControlled')

        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(30)

        car_domain = car.lower().replace(" ", "-")
        url = f"https://ackodrive.com/cars/{car_domain}/variants/"

        driver.get(url)
        time.sleep(3)
        _expand_all_variants(driver, settle_seconds=2)

        for variant_name in _list_active_variant_names(driver):
            navigated = False
            try:
                # Re-locate the card on every pass: elements found before a
                # navigation are stale once we come back to the list.
                target_card = _find_variant_card(driver, variant_name)
                if target_card is None:
                    continue

                select_button = target_card.find_element(By.XPATH, _EXPLORE_BUTTON_XPATH)
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", select_button)
                time.sleep(0.5)
                navigated = True
                driver.execute_script("arguments[0].click();", select_button)
                time.sleep(4)

                variants_data.append(_read_variant_details(driver, variant_name))
            except Exception as exc:
                logger.warning("Skipping variant %r of %s: %r", variant_name, car, exc)

            # Only go back if the explore button was actually clicked; going
            # back without having navigated would leave the variants list and
            # break every remaining variant.
            if navigated:
                try:
                    driver.back()
                    time.sleep(2)
                except Exception as exc:
                    logger.warning("Could not return to the variant list for %s: %r", car, exc)
                    break
                _expand_all_variants(driver, settle_seconds=1)

    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
        logger.warning("Scraping %s stopped early: %r", car, exc)

    finally:
        if driver:
            try:
                driver.quit()
            except Exception as exc:
                logger.debug("driver.quit() failed: %r", exc)

    return variants_data

## Data Processing Setup
* Imports pandas for data manipulation and analysis


In [44]:
import pandas as pd

## Main Data Collection Loop
* Iterates through each car model and collects variant data
* Structures data into rows with proper column names
* Saves collected data to CSV file
* Processes all three Honda models successfully

In [45]:
# Output column -> key in the dicts returned by get_complete_variants().
FIELD_BY_COLUMN = {
    "Variant": "name",
    "Price": "on_road_price",
    "Fuel": "fuel_type",
    "Engine": "engine_capacity",
    "Transmission": "transmission",
    "Mileage": "mileage",
    "Seats": "seating_capacity",
}

rows = []

for car in cars_to_check:
    variants = get_complete_variants(car)

    if not variants:
        # Make a failed scrape visible instead of silently dropping the model.
        print(f"No variants collected for {car}")

    for v in variants:
        row = {"Model": car}
        for column, field in FIELD_BY_COLUMN.items():
            row[column] = v.get(field, "N/A")
        rows.append(row)

    time.sleep(2)  # pause before starting the next model

# Explicit columns keep the CSV header intact even when nothing was scraped;
# otherwise the file has no header and the cleaning cells fail on missing columns.
df = pd.DataFrame(rows, columns=["Model", *FIELD_BY_COLUMN])
df.to_csv("car_variants.csv", index=False, encoding="utf-8")

print("Saved to car_variants.csv using pandas")


Saved to car_variants.csv using pandas


Shows 27 variants across three Honda models with detailed specifications

In [46]:
# Reload the scraped data from disk so the cleaning steps below start from the saved CSV.
df = pd.read_csv("car_variants.csv")
df

Unnamed: 0,Model,Variant,Price,Fuel,Engine,Transmission,Mileage,Seats
0,Honda City,1.5 SV-R,"‚Çπ11,95,300",Petrol,1498 cc,Manual,17.0 kmpl,5 seater
1,Honda City,1.5 V-R,"‚Çπ12,69,500",Petrol,1498 cc,Manual,17.0 kmpl,5 seater
2,Honda City,1.5 VX-R,"‚Çπ13,72,900",Petrol,1498 cc,Manual,17.0 kmpl,5 seater
3,Honda City,1.5 V-R CVT,"‚Çπ13,90,200",Petrol,1498 cc,Automatic,18.0 kmpl,5 seater
4,Honda City,1.5 Sports CVT,"‚Çπ14,37,500",Petrol,1498 cc,Automatic,18.0 kmpl,5 seater
5,Honda City,1.5 ZX-R,"‚Çπ14,86,800",Petrol,1498 cc,Manual,17.0 kmpl,5 seater
6,Honda City,1.5 VX-R CVT,"‚Çπ14,93,500",Petrol,1498 cc,Automatic,18.0 kmpl,5 seater
7,Honda City,1.5 ZX-R CVT,"‚Çπ16,07,400",Petrol,1498 cc,Automatic,18.0 kmpl,5 seater
8,Honda City,1.5 ZX-R eHEV,"‚Çπ19,48,200",Hybrid,1498 cc,Automatic,27.0 kmpl,5 seater
9,Honda Elevate,SV-R,"‚Çπ10,99,900",Petrol,1498 cc,Manual,15.0 kmpl,5 seater


## Data cleaning

In [47]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Model         27 non-null     object
 1   Variant       27 non-null     object
 2   Price         27 non-null     object
 3   Fuel          27 non-null     object
 4   Engine        27 non-null     object
 5   Transmission  27 non-null     object
 6   Mileage       27 non-null     object
 7   Seats         27 non-null     object
dtypes: object(8)
memory usage: 1.8+ KB
None


In [48]:
# Number of missing values in each column.
print(df.isna().sum())

Model           0
Variant         0
Price           0
Fuel            0
Engine          0
Transmission    0
Mileage         0
Seats           0
dtype: int64


In [49]:
# Clean Price: keep only the numeric part (digits, thousands commas, optional
# decimals), drop the commas and convert to float.
# Extracting the number instead of deleting a specific currency symbol means the
# cell does not depend on how that symbol is encoded, and re-running it on an
# already numeric column gives the same values.
price_digits = (
    df["Price"]
    .astype(str)
    .str.extract(r"(\d[\d,]*(?:\.\d+)?)", expand=False)
    .str.replace(",", "", regex=False)
)

# Values without any number (e.g. a missing price) become NaN.
df["Price"] = pd.to_numeric(price_digits, errors="coerce").astype(float)


In [50]:
# Clean Engine: drop the "cc" unit (any letter case, optional trailing dot) and
# all whitespace, then convert to a whole number.
df["Engine"] = (
    df["Engine"]
    .astype(str)
    .str.replace(r"cc\.?|\s", "", case=False, regex=True)
)

df["Engine"] = pd.to_numeric(df["Engine"], errors="coerce")

# Missing or unparseable values become 0 so the column can be a plain int;
# a 0 here therefore means "unknown", not a real engine size.
df["Engine"] = df["Engine"].fillna(0).astype(int)


In [51]:
# Clean Mileage: drop the "kmpl" unit (any letter case) and all whitespace,
# then convert to a number.
df["Mileage"] = (
    df["Mileage"]
    .astype(str)
    .str.replace(r"kmpl|\s", "", case=False, regex=True)
)

df["Mileage"] = pd.to_numeric(df["Mileage"], errors="coerce")

# Missing or unparseable values become 0; a 0 here means "unknown".
df["Mileage"] = df["Mileage"].fillna(0)


In [52]:
# Clean Seats: drop the "seater" suffix (any letter case) and all whitespace,
# then convert to a whole number.
df["Seats"] = (
    df["Seats"]
    .astype(str)
    .str.replace(r"seater|\s", "", case=False, regex=True)
)

df["Seats"] = pd.to_numeric(df["Seats"], errors="coerce")

# Missing or unparseable values become 0 so the column can be a plain int;
# a 0 here means "unknown".
df["Seats"] = df["Seats"].fillna(0).astype(int)


In [53]:
# Trim leading/trailing whitespace in every object-dtype column; other columns
# are passed through unchanged.
def _strip_if_text(column):
    if column.dtype == "object":
        return column.str.strip()
    return column


df = df.apply(_strip_if_text)


### Final cleaned dataset with proper numeric types, ready for analysis

In [54]:
# Display the cleaned frame.
df

Unnamed: 0,Model,Variant,Price,Fuel,Engine,Transmission,Mileage,Seats
0,Honda City,1.5 SV-R,1195300.0,Petrol,1498,Manual,17.0,5
1,Honda City,1.5 V-R,1269500.0,Petrol,1498,Manual,17.0,5
2,Honda City,1.5 VX-R,1372900.0,Petrol,1498,Manual,17.0,5
3,Honda City,1.5 V-R CVT,1390200.0,Petrol,1498,Automatic,18.0,5
4,Honda City,1.5 Sports CVT,1437500.0,Petrol,1498,Automatic,18.0,5
5,Honda City,1.5 ZX-R,1486800.0,Petrol,1498,Manual,17.0,5
6,Honda City,1.5 VX-R CVT,1493500.0,Petrol,1498,Automatic,18.0,5
7,Honda City,1.5 ZX-R CVT,1607400.0,Petrol,1498,Automatic,18.0,5
8,Honda City,1.5 ZX-R eHEV,1948200.0,Hybrid,1498,Automatic,27.0,5
9,Honda Elevate,SV-R,1099900.0,Petrol,1498,Manual,15.0,5


### Exports cleaned and processed data to new CSV file

In [55]:
# Write the cleaned frame to a new file (without the index column); the raw
# car_variants.csv is left untouched.
df.to_csv("honda_cleaned_data.csv", index=False)

### File Download
Provides download functionality for the cleaned dataset

In [56]:
# Offer the cleaned CSV as a browser download when running in Google Colab.
try:
  from google.colab import files
except ImportError:
  # Only a missing google.colab package means "not in Colab"; other errors are
  # reported separately below instead of being mislabelled.
  print("\n💡 Not in Colab environment - file saved locally")
else:
  import glob
  import os

  csv_files = glob.glob("honda_cleaned_data*.csv")
  if csv_files:
      # Pick the most recently modified match (a plain max() would compare the
      # file names alphabetically).
      latest_file = max(csv_files, key=os.path.getmtime)
      print(f"\n⬇️ Downloading {latest_file}...")
      try:
          files.download(latest_file)
      except Exception as exc:
          print(f"\nDownload failed: {exc}")



‚¨áÔ∏è Downloading honda_cleaned_data.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>