# **Scrapping and Parsing the data (ACKODRIVE website)**

Using BeautifulSoup Library

In [1]:
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# load the page
url = "https://ackodrive.com/collection/mahindra+cars/"
response = requests.get(url)

In [3]:
# check status code
if response.status_code != 200:
  print("Error:", response.status_code)
else:
  print("Page loaded succesfully!")

Page loaded succesfully!


In [4]:
# initialize beautifulsoup
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
# locate the parent container
parent_class = "BuyCarCard_card__AsGXF"
cards = soup.find_all("div", class_=parent_class)
print(len(cards))

15


In [6]:
# saving a raw html file
with open ("ackoDrive_raw.html","w",encoding="utf-8") as f:
  f.write(response.text)
print("Raw Html saved as ackoDrive_raw.html")

Raw Html saved as ackoDrive_raw.html


## **Extracting the car details in a python object**

In [7]:
# Extrating the car details in an python object
cars=[]

# extracting the attributes using class names
for card in cards:
  name = card.find("a", class_="BuyCarCard_carName__SAJVh")
  price_range = card.find("p", class_="BuyCarCard_priceRange__2q8tm")
  fuel = card.find_all("p", class_="BuyCarCard_specificationItem__yn5cu")
  price_label = card.find("p", class_="BuyCarCard_priceLabel__18XLX")
  variants = card.find("p", class_="BuyCarCard_carVariants__uju0j")

  # appending each car along with details in python object
  cars.append({
      "name": name.get_text(strip=True) if name else None,
      "price_range": price_range.get_text(strip=True) if price_range else None,
      "fuel_type": fuel[0].get_text(strip=True) if len(fuel) > 0 else None,
      "transmission": fuel[1].get_text(strip=True) if len(fuel) > 1 else None,
      "price_label": price_label.get_text(strip=True) if price_label else None,
      "variants": variants.get_text(strip=True) if variants else None
  })

print("Total Cars Extracted : ",len(cars))
print(cars[0])

Total Cars Extracted :  15
{'name': 'Mahindra XUV700', 'price_range': '₹16.5 lakh – ₹29.0 lakh', 'fuel_type': 'Petrol • Diesel', 'transmission': '• Manual • Automatic', 'price_label': 'On-road price in Delhi', 'variants': '65 Variants'}


## **Data Cleaning on Extracted Features**

In [8]:
# Firstly checking about the data types of each feature
df = pd.DataFrame(cars)
df = df.convert_dtypes() # Give exact data type
print(df.dtypes)

name            string[python]
price_range     string[python]
fuel_type       string[python]
transmission    string[python]
price_label     string[python]
variants        string[python]
dtype: object


In [9]:
# Checking Missing or Null Values
print("Null or Missing Values:")
print(df.isnull().sum())
print("-" * 40)

Null or Missing Values:
name            0
price_range     0
fuel_type       0
transmission    0
price_label     0
variants        0
dtype: int64
----------------------------------------


In [10]:
df.head(len(df))

Unnamed: 0,name,price_range,fuel_type,transmission,price_label,variants
0,Mahindra XUV700,₹16.5 lakh – ₹29.0 lakh,Petrol • Diesel,• Manual • Automatic,On-road price in Delhi,65 Variants
1,Mahindra Scorpio-N,₹15.6 lakh – ₹29.6 lakh,Petrol • Diesel,Manual • Automatic,On-road price in Delhi,58 Variants
2,Mahindra Scorpio Classic,₹16.2 lakh – ₹20.6 lakh,Diesel,Manual,On-road price in Delhi,3 Variants
3,Mahindra XUV400,₹16.6 lakh – ₹18.9 lakh,Electric,Automatic,On-road price in Delhi,3 Variants
4,Mahindra XUV 3XO,₹8.7 lakh – ₹17.2 lakh,Petrol • Diesel,Manual • Automatic,On-road price in Delhi,29 Variants
5,Mahindra Bolero,₹9.4 lakh – ₹11.4 lakh,Diesel,Manual,On-road price in Delhi,4 Variants
6,Mahindra Bolero Neo,₹10.1 lakh – ₹13.0 lakh,Diesel,Manual,On-road price in Delhi,6 Variants
7,Mahindra XEV 9e,₹23.0 lakh – ₹32.7 lakh,Electric,Automatic,On-road price in Delhi,15 Variants
8,Mahindra BE 6,₹19.9 lakh – ₹30.0 lakh,Electric,Automatic,On-road price in Delhi,20 Variants
9,Mahindra Thar Roxx,₹15.4 lakh – ₹27.4 lakh,Petrol • Diesel,• Manual • Automatic,On-road price in Delhi,25 Variants


In [11]:
new_df = df.copy()
print("New DataFrame Variable")
new_df.head()

New DataFrame Variable


Unnamed: 0,name,price_range,fuel_type,transmission,price_label,variants
0,Mahindra XUV700,₹16.5 lakh – ₹29.0 lakh,Petrol • Diesel,• Manual • Automatic,On-road price in Delhi,65 Variants
1,Mahindra Scorpio-N,₹15.6 lakh – ₹29.6 lakh,Petrol • Diesel,Manual • Automatic,On-road price in Delhi,58 Variants
2,Mahindra Scorpio Classic,₹16.2 lakh – ₹20.6 lakh,Diesel,Manual,On-road price in Delhi,3 Variants
3,Mahindra XUV400,₹16.6 lakh – ₹18.9 lakh,Electric,Automatic,On-road price in Delhi,3 Variants
4,Mahindra XUV 3XO,₹8.7 lakh – ₹17.2 lakh,Petrol • Diesel,Manual • Automatic,On-road price in Delhi,29 Variants


#### Cleaning the Fuel Features on Copy DataFrame

In [12]:
def clean_fuel_type(field):
    """
    Convert messy separators like '•', ',', '/' into a clean format:
    Example:
      'Petrol • Diesel' -> 'Petrol and Diesel'
      'Diesel' -> 'Diesel'
    """
    # Replace all possible separators with a uniform '|'
    s = field.replace("•", "|").replace(",", "|").replace("/", "|")

    # Split, trim, remove duplicates
    parts = [p.strip() for p in s.split("|")]
    parts = [p for p in parts if p]                     # remove empty
    parts = list(dict.fromkeys(parts))                  # remove duplicates, keep order

    # Join based on how many items exist
    if len(parts) == 0:
        return None
    elif len(parts) == 1:
        return parts[0]
    else:
        return " and ".join(parts)

# Apply to DataFrame
new_df["fuel_type"] = new_df["fuel_type"].apply(clean_fuel_type)

# Optional quick check
print(new_df[["name", "fuel_type"]].head())


                       name          fuel_type
0           Mahindra XUV700  Petrol and Diesel
1        Mahindra Scorpio-N  Petrol and Diesel
2  Mahindra Scorpio Classic             Diesel
3           Mahindra XUV400           Electric
4          Mahindra XUV 3XO  Petrol and Diesel


#### Cleaning the Price Range Features

In [13]:
def extract_price_range(text):
    """
    Extracts min and max price in lakhs from expressions like:
    '₹16.5 lakh – ₹29.0 lakh'

    Returns (min_lakh, max_lakh) as floats.
    If only one value found, uses it for both.
    If nothing found, returns (None, None).
    """

    # find all numbers before the word 'lakh'
    nums = re.findall(r"([\d]+(?:\.[\d]+)?)\s*lakh", text, flags=re.IGNORECASE)

    # case: two numbers found (normal case)
    if len(nums) >= 2:
        return float(nums[0]), float(nums[1])

    # case: only one value found
    if len(nums) == 1:
        v = float(nums[0])
        return v, v

    # no values found
    return None, None


# Apply to DataFrame
new_df["price_range"] = new_df["price_range"].apply(extract_price_range)

# Optional quick view
print(new_df[["name", "price_range"]].head())


                       name   price_range
0           Mahindra XUV700  (16.5, 29.0)
1        Mahindra Scorpio-N  (15.6, 29.6)
2  Mahindra Scorpio Classic  (16.2, 20.6)
3           Mahindra XUV400  (16.6, 18.9)
4          Mahindra XUV 3XO   (8.7, 17.2)


#### Cleaning the Transmission Features

In [14]:
def clean_transmission(field):
    """
    Normalize transmission types into a readable string:
    Examples:
        '• Manual • Automatic' -> 'Manual and Automatic'
        'Manual • Automatic'  -> 'Manual and Automatic'
        'Manual'              -> 'Manual'
    """
    # Replace all possible separators with a uniform symbol
    s = field.replace("•", "|").replace(",", "|").replace("/", "|")

    # Split & clean tokens
    parts = [p.strip() for p in s.split("|")]
    parts = [p for p in parts if p]                 # remove empty
    parts = list(dict.fromkeys(parts))              # remove duplicates, keep order

    # Join for final readable output
    if len(parts) == 0:
        return None
    elif len(parts) == 1:
        return parts[0]
    else:
        return " and ".join(parts)

# Apply to DataFrame
new_df["transmission"] = new_df["transmission"].apply(clean_transmission)

# Optional check
print(new_df[["name","transmission"]].head())


                       name          transmission
0           Mahindra XUV700  Manual and Automatic
1        Mahindra Scorpio-N  Manual and Automatic
2  Mahindra Scorpio Classic                Manual
3           Mahindra XUV400             Automatic
4          Mahindra XUV 3XO  Manual and Automatic


#### Cleaning the Variants Features

In [15]:
def clean_variants(field):
    """
    Extracts the numeric part from strings like:
        '65 Variants'
        '3 Variants'
        '20 Variant'
    Returns an integer, or None if no number exists.
    """
    match = re.search(r"\d+", field)
    if match:
        return int(match.group(0))
    return None

# Apply to DataFrame
new_df["variants"] = new_df["variants"].apply(clean_variants)

# Optional check
print(new_df[["name", "variants"]].head())


                       name  variants
0           Mahindra XUV700        65
1        Mahindra Scorpio-N        58
2  Mahindra Scorpio Classic         3
3           Mahindra XUV400         3
4          Mahindra XUV 3XO        29


#### Cleaning the Price Label Features

In [16]:
def clean_price_label(label):
    """
    From strings like:
        'On-road price in Delhi'
    extract only:
        'Delhi'
    """
    # Remove the common phrase (case-insensitive)
    temp = label.lower().replace("on-road price in", "").strip()

    # Capitalize properly (optional but cleaner)
    return temp.title()

# Apply to DataFrame
new_df["Location"] = new_df["price_label"].apply(clean_price_label)

# Optional check
print(new_df[["name", "Location"]].head())


                       name Location
0           Mahindra XUV700    Delhi
1        Mahindra Scorpio-N    Delhi
2  Mahindra Scorpio Classic    Delhi
3           Mahindra XUV400    Delhi
4          Mahindra XUV 3XO    Delhi


In [17]:
new_df.head(15)

Unnamed: 0,name,price_range,fuel_type,transmission,price_label,variants,Location
0,Mahindra XUV700,"(16.5, 29.0)",Petrol and Diesel,Manual and Automatic,On-road price in Delhi,65,Delhi
1,Mahindra Scorpio-N,"(15.6, 29.6)",Petrol and Diesel,Manual and Automatic,On-road price in Delhi,58,Delhi
2,Mahindra Scorpio Classic,"(16.2, 20.6)",Diesel,Manual,On-road price in Delhi,3,Delhi
3,Mahindra XUV400,"(16.6, 18.9)",Electric,Automatic,On-road price in Delhi,3,Delhi
4,Mahindra XUV 3XO,"(8.7, 17.2)",Petrol and Diesel,Manual and Automatic,On-road price in Delhi,29,Delhi
5,Mahindra Bolero,"(9.4, 11.4)",Diesel,Manual,On-road price in Delhi,4,Delhi
6,Mahindra Bolero Neo,"(10.1, 13.0)",Diesel,Manual,On-road price in Delhi,6,Delhi
7,Mahindra XEV 9e,"(23.0, 32.7)",Electric,Automatic,On-road price in Delhi,15,Delhi
8,Mahindra BE 6,"(19.9, 30.0)",Electric,Automatic,On-road price in Delhi,20,Delhi
9,Mahindra Thar Roxx,"(15.4, 27.4)",Petrol and Diesel,Manual and Automatic,On-road price in Delhi,25,Delhi


In [18]:
new_df = new_df.drop("price_label", axis = 1)
new_df.head()

Unnamed: 0,name,price_range,fuel_type,transmission,variants,Location
0,Mahindra XUV700,"(16.5, 29.0)",Petrol and Diesel,Manual and Automatic,65,Delhi
1,Mahindra Scorpio-N,"(15.6, 29.6)",Petrol and Diesel,Manual and Automatic,58,Delhi
2,Mahindra Scorpio Classic,"(16.2, 20.6)",Diesel,Manual,3,Delhi
3,Mahindra XUV400,"(16.6, 18.9)",Electric,Automatic,3,Delhi
4,Mahindra XUV 3XO,"(8.7, 17.2)",Petrol and Diesel,Manual and Automatic,29,Delhi


#### Assigning Correct DataType

In [19]:
# 1. Numeric fields → float or int
new_df["variants"] = new_df["variants"].astype(int)

# 2. Cleaned string fields → string dtype
new_df["transmission"] = new_df["transmission"].astype(str)
new_df["fuel_type"] = new_df["fuel_type"].astype(str)
new_df["Location"] = new_df["Location"].astype(str)
new_df["name"] = new_df["name"].astype(str)

In [20]:
new_df.dtypes

Unnamed: 0,0
name,object
price_range,object
fuel_type,object
transmission,object
variants,int64
Location,object


## **Data Presentation on Cleaned Data**

In [21]:
# Export DataFrame to CSV file
new_df.to_csv("Cleaned_Car_Data.csv", index=False)
print("CSV exported successfully: Cleaned_Car_Data.csv")

CSV exported successfully: Cleaned_Car_Data.csv


# Data Quality Check

In [22]:
df = pd.read_csv("Cleaned_Car_Data.csv")
df.head()

Unnamed: 0,name,price_range,fuel_type,transmission,variants,Location
0,Mahindra XUV700,"(16.5, 29.0)",Petrol and Diesel,Manual and Automatic,65,Delhi
1,Mahindra Scorpio-N,"(15.6, 29.6)",Petrol and Diesel,Manual and Automatic,58,Delhi
2,Mahindra Scorpio Classic,"(16.2, 20.6)",Diesel,Manual,3,Delhi
3,Mahindra XUV400,"(16.6, 18.9)",Electric,Automatic,3,Delhi
4,Mahindra XUV 3XO,"(8.7, 17.2)",Petrol and Diesel,Manual and Automatic,29,Delhi


In [23]:
print("Columns:", df.columns.tolist())
print("\nShape (rows, columns):", df.shape)
df.info()

Columns: ['name', 'price_range', 'fuel_type', 'transmission', 'variants', 'Location']

Shape (rows, columns): (15, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          15 non-null     object
 1   price_range   15 non-null     object
 2   fuel_type     15 non-null     object
 3   transmission  15 non-null     object
 4   variants      15 non-null     int64 
 5   Location      15 non-null     object
dtypes: int64(1), object(5)
memory usage: 852.0+ bytes


In [24]:
# Check Missing Column
expected_cols = ["name", "fuel_type", "transmission", "variants",
                 "price_range", "Location"]

missing = set(expected_cols) - set(df.columns)

if missing:
  print("Missing Column found")
  print("Missing Columns:", missing)
else:
  print("No Missing Column")

No Missing Column


In [25]:
# Check Missing Values
print("Null Count per Column:")
print(df.isnull().sum())

Null Count per Column:
name            0
price_range     0
fuel_type       0
transmission    0
variants        0
Location        0
dtype: int64


In [26]:
# Printing any missing value
df[df.isnull().any(axis=1)]

Unnamed: 0,name,price_range,fuel_type,transmission,variants,Location


In [27]:
# Checking for duplicates
duplicate_count = df.duplicated().sum()
print("Duplicate Rows:", duplicate_count)

# View duplicates
df[df.duplicated()]

Duplicate Rows: 0


Unnamed: 0,name,price_range,fuel_type,transmission,variants,Location


In [28]:
# Drop duplicates if any
df = df.drop_duplicates()

In [29]:
df.dtypes

Unnamed: 0,0
name,object
price_range,object
fuel_type,object
transmission,object
variants,int64
Location,object


In [30]:
# summary statistics
df.describe(include='all')

Unnamed: 0,name,price_range,fuel_type,transmission,variants,Location
count,15,15,15,15,15.0,15
unique,15,15,3,3,,1
top,Mahindra XUV700,"(16.5, 29.0)",Petrol and Diesel,Manual and Automatic,,Delhi
freq,1,1,7,7,,15
mean,,,,,18.933333,
std,,,,,19.575738,
min,,,,,2.0,
25%,,,,,4.5,
50%,,,,,15.0,
75%,,,,,25.0,


In [31]:
# Strip any unwanted character
df["name"] = df["name"].str.strip()
df["Location"] = df["Location"].str.strip()

In [32]:
# Clean and Fix Data Formats
df["fuel_type"] = df["fuel_type"].str.title()
df["transmission"] = df["transmission"].str.title()
df["Location"] = df["Location"].str.title()

In [33]:
# Final Check
print("Final Shape:", df.shape)
print("\nNull Check:\n", df.isnull().sum())
print("\nDuplicate Check:", df.duplicated().sum())
print("\nFinal Data Types:\n", df.dtypes)

Final Shape: (15, 6)

Null Check:
 name            0
price_range     0
fuel_type       0
transmission    0
variants        0
Location        0
dtype: int64

Duplicate Check: 0

Final Data Types:
 name            object
price_range     object
fuel_type       object
transmission    object
variants         int64
Location        object
dtype: object


# **Final CSV file**

In [34]:
df.to_csv("Final_Clean_Dataset.csv", index=False)
print("Dataset Approved & Saved ")

Dataset Approved & Saved 
