In [1]:
import re
from dataclasses import dataclass, asdict
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Site root against which the hrefs scraped from listing pages are resolved.
BASE_URL = "https://www.carwale.com"
# Default HTTP headers passed to every requests.get call in this notebook.
# NOTE(review): the browser-like User-Agent/Referer are presumably there so the
# site serves its regular desktop HTML — confirm before changing them.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Referer": "https://www.google.com/",
}

@dataclass
class CarSpecifications:
    """
    One scraped row: the specification sheet of a single car variant.

    Every field holds the raw text scraped from the page and defaults to
    "N/A" when the corresponding element is not found. No parsing or unit
    conversion is applied here.
    """
    # --- Identification (passed in by the caller, not scraped from the spec list) ---
    brand_name: str = "N/A"
    car_name: str = "N/A"
    variant_name: str = "N/A"
    # --- Prices ---
    variant_price: str = "N/A"  # price text from the variants table row; scrape_all_cars writes it into the exported dict
    ex_showroom_price: str = "N/A"
    # --- Engine and drivetrain ---
    engine: str = "N/A"
    engine_type: str = "N/A"
    fuel_type: str = "N/A"
    max_power: str = "N/A"
    max_torque: str = "N/A"
    mileage: str = "N/A"
    driving_range: str = "N/A"
    drivetrain: str = "N/A"
    transmission_manual: str = "N/A"
    emission_standard: str = "N/A"
    electric_motor: str = "N/A"
    others: str = "N/A"
    valve_per_cylinder: str = "N/A"
    # --- Dimensions ---
    length: str = "N/A"
    width: str = "N/A"
    height: str = "N/A"
    wheel_base: str = "N/A"
    ground_clearance: str = "N/A"
    # --- Capacity ---
    doors: str = "N/A"
    seating_capacity: str = "N/A"
    no_of_rows: str = "N/A"
    boot_space: str = "N/A"
    fuel_tank_capacity: str = "N/A"
    # --- Suspension, brakes and steering ---
    front_suspension: str = "N/A"
    rear_suspension: str = "N/A"
    front_brake_type: str = "N/A"
    rear_brake_type: str = "N/A"
    minimum_turning_radius: str = "N/A"

def scrape_all_cars(brand_name, brand_url, timeout=30):
    """
    Scrapes all car models, their variants, and specifications for a given brand URL.

    Args:
        brand_name: Label stored in the ``brand_name`` column of every row.
        brand_url: URL of the brand's model listing page.
        timeout: Seconds to wait for the brand page before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A DataFrame with one row per variant (columns are the
        ``CarSpecifications`` fields). Empty when the brand page cannot be
        fetched or yields no usable variants.
    """
    try:
        response = requests.get(brand_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status code: report and return empty.
        print(f"Failed to fetch brand page. Error: {exc}")
        return pd.DataFrame()
    if response.status_code != 200:
        print(f"Failed to fetch brand page. Status code: {response.status_code}")
        return pd.DataFrame()

    # Parse the brand page to get car model links
    soup = BeautifulSoup(response.text, 'html.parser')
    category_section = soup.find('div', class_="o-dpDliG o-eAyrtt o-cglRxs aGK5Mk o-fpkJwH o-dCyDMp o-fzovSM")
    car_items = category_section.find_all('li', class_=re.compile(r'o-fz')) if category_section else []

    car_details_dict = {}
    for car in car_items:
        car_name_tag = car.find('h3')
        car_name = car_name_tag.text.strip() if car_name_tag else "N/A"
        car_link_tag = car.find('a', href=True)
        # urljoin copes with both relative and already-absolute hrefs.
        car_link = urljoin(BASE_URL, car_link_tag['href']) if car_link_tag else None
        # A missing or non-HTTP link would make the later request raise, so
        # drop the model here (same rule fetch_variants applies to variants).
        if not car_link or not car_link.startswith("http"):
            print(f"Invalid model link for {car_name}. Skipping...")
            continue
        car_details_dict[car_name] = car_link

    all_car_specs = []

    # Iterate over each car model to fetch variants and specifications
    for car_name, car_link in car_details_dict.items():
        print(f"Scraping variants for {car_name}...")
        try:
            variants = fetch_variants(car_link)
        except requests.RequestException as exc:
            # One unreachable model page must not discard rows already collected.
            print(f"Failed to fetch variants for {car_name}. Error: {exc}")
            continue

        for variant_name, variant_link, variant_price in variants:
            print(f"Scraping details for {variant_name}...")
            try:
                car_spec = scrape_variant_details(brand_name, car_name, variant_name, variant_link)
            except requests.RequestException as exc:
                print(f"Failed to fetch variant details for {variant_name}. Error: {exc}")
                continue
            if car_spec is None:
                continue
            car_spec_dict = asdict(car_spec)
            # The price comes from the variants table, not the variant page.
            car_spec_dict['variant_price'] = variant_price
            all_car_specs.append(car_spec_dict)

    # Convert the list of car specifications into a DataFrame
    return pd.DataFrame(all_car_specs)

def fetch_variants(car_link, timeout=30):
    """
    Fetches the variants, their links, and prices for a specific car model.

    Args:
        car_link: Absolute URL of the model page.
        timeout: Seconds to wait for the page before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A list of ``(variant_name, variant_link, variant_price)`` tuples.
        Rows without a usable HTTP link are skipped. The list is empty when
        the page cannot be fetched or has no variants table.
    """
    try:
        response = requests.get(car_link, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status code: report and return empty.
        print(f"Failed to fetch car page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch car page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    variants_table = soup.find('tbody', class_="o-dJmcbh")
    if variants_table is None:
        return []

    # Compile once instead of twice per row.
    name_class = re.compile(r'o-fzp')
    variants = []

    for row in variants_table.find_all('tr'):
        name_tag = row.find('div', class_=name_class)
        variant_name = name_tag.text.strip() if name_tag else "N/A"

        variant_link_tag = row.find('a', href=True)
        # urljoin resolves relative hrefs and leaves absolute ones intact;
        # plain concatenation produced an invalid URL for absolute hrefs that
        # still passed the "http" prefix check below.
        variant_link = urljoin(BASE_URL, variant_link_tag['href']) if variant_link_tag else None

        # Skip variants with missing links
        if not variant_link or not variant_link.startswith("http"):
            print(f"Invalid variant link for {variant_name}. Skipping...")
            continue

        # Extract price information
        price_tag = row.find('div', class_="o-jjpuv o-eqqVmt o-dJmcbh o-fzpilz")
        variant_price = price_tag.text.strip() if price_tag else "N/A"

        variants.append((variant_name, variant_link, variant_price))

    return variants

def scrape_variant_details(brand_name, car_name, variant_name, variant_link, timeout=30):
    """
    Scrapes specifications for a specific variant.

    Args:
        brand_name: Stored as-is in the result.
        car_name: Stored as-is in the result.
        variant_name: Stored as-is in the result and used in log messages.
        variant_link: Absolute URL of the variant page.
        timeout: Seconds to wait for the page before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A ``CarSpecifications`` instance, or ``None`` when the link is invalid
        or the page cannot be fetched. Any value not found on the page is
        "N/A". ``variant_price`` is left at its default for the caller to fill.
    """
    if not variant_link or not variant_link.startswith("http"):
        print(f"Invalid URL for variant {variant_name}. Skipping...")
        return None

    try:
        response = requests.get(variant_link, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status code: report and give up on this variant.
        print(f"Failed to fetch variant details for {variant_name}. Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch variant details for {variant_name}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    info_table = soup.find('ul', class_="o-eFudgX o-bQXvsa o-cKSifp o-ItVGT o-cRSqer")
    if info_table is None:
        # Still return a row (all "N/A"), but say why so a markup change is noticed.
        print(f"Specification list not found for {variant_name}; all values will be N/A.")

    value_class = "o-cJrNdO o-biMxfO o-cYdrZi o-ckGLSv o-cMwvCl o-emwzWU"

    def safe_find(data_itemid):
        """
        Returns the stripped value text for a given data-itemid, or 'N/A'
        when the list, the item, or its value element is missing.
        """
        if info_table is None:
            return "N/A"
        element = info_table.find('div', {'data-itemid': data_itemid})
        if element is None:
            return "N/A"
        value_tag = element.find('div', class_=value_class)
        return value_tag.text.strip() if value_tag is not None else "N/A"

    def safe_find_price():
        """
        Returns the stripped ex-showroom price text, or 'N/A' when the list,
        the price container, or the price span is missing.
        """
        if info_table is None:
            return "N/A"
        price_div = info_table.find('div', class_="o-NBTwp o-SoIQT o-cpnuEd")
        if price_div is None:
            return "N/A"
        # No trailing space: BeautifulSoup compares a multi-class string to the
        # space-joined class list, which never ends in whitespace, so the old
        # "… o-eqqVmt " could not match.
        price_span = price_div.find('span', class_="o-Hyyko o-bPYcRG o-eqqVmt")
        return price_span.text.strip() if price_span is not None else "N/A"

    # CarSpecifications field -> data-itemid of the element holding its value.
    spec_item_ids = {
        'engine': '484',
        'engine_type': '13',
        'fuel_type': '26',
        'max_power': '249',
        'max_torque': '250',
        'mileage': '12',
        'driving_range': '459',
        'drivetrain': '31',
        'transmission_manual': '500',
        'emission_standard': '520',
        'electric_motor': '489',
        'others': '502',
        'valve_per_cylinder': '252',
        'length': '1',
        'width': '2',
        'height': '3',
        'wheel_base': '4',
        'ground_clearance': '5',
        'doors': '11',
        'seating_capacity': '9',
        'no_of_rows': '7',
        'boot_space': '8',
        'fuel_tank_capacity': '10',
        'front_suspension': '39',
        'rear_suspension': '40',
        'front_brake_type': '41',
        'rear_brake_type': '43',
        'minimum_turning_radius': '46',
    }
    scraped_values = {
        field_name: safe_find(item_id)
        for field_name, item_id in spec_item_ids.items()
    }

    return CarSpecifications(
        brand_name=brand_name,
        car_name=car_name,
        variant_name=variant_name,
        ex_showroom_price=safe_find_price(),
        **scraped_values,
    )
"""
# Example usage
brand_url = "https://www.carwale.com/maruti-suzuki-cars/"
maruti_df = scrape_all_cars("Maruti Suzuki", brand_url)

# Save to CSV
maruti_df.to_csv("maruti_car_details_with_prices.csv", index=False)

# Display DataFrame
print(maruti_df)

"""

'\n# Example usage\nbrand_url = "https://www.carwale.com/maruti-suzuki-cars/"\nmaruti_df = scrape_all_cars("Maruti Suzuki", brand_url)\n\n# Save to CSV\nmaruti_df.to_csv("maruti_car_details_with_prices.csv", index=False)\n\n# Display DataFrame\nprint(maruti_df)\n\n'

In [None]:


# Driver: scrape every configured brand and write the merged result to one CSV.
if __name__ == "__main__":
    # (brand label, brand listing URL) pairs to crawl
    brands_and_urls = [
        ("Maruti Suzuki", "https://www.carwale.com/maruti-suzuki-cars/"),
        ("Mahindra", "https://www.carwale.com/mahindra-cars/"),
    ]

    # Per-brand DataFrames; brands that return nothing are left out
    all_brands_data = []

    for brand_name, brand_url in brands_and_urls:
        print(f"Scraping data for {brand_name}...")
        brand_df = scrape_all_cars(brand_name, brand_url)
        if brand_df.empty:
            print(f"No data found for {brand_name}!")
            continue
        all_brands_data.append(brand_df)

    if not all_brands_data:
        print("No data scraped for any brand.")
    else:
        # Stack the brand frames under a fresh 0..n-1 index
        combined_data = pd.concat(all_brands_data, ignore_index=True)

        # Persist the merged table without the index column
        combined_data.to_csv("all_car_details_with_prices.csv", index=False)

        # Show a preview of the merged table
        print("Combined Data:")
        print(combined_data.head())
