# Processing DataSet

In [5]:
import pandas as pd
import numpy as np
# Load the raw Cars24 listings CSV into df and preview the first rows.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — adjust when running elsewhere.
df = pd.read_csv('/content/Cars24 dataset.csv')
df.head()

Unnamed: 0,Car_Name,Year,Kilometers_Driven,Fuel_Type,Transmission,Price
0,2016 Hyundai Grand i10,2016,34.11k km,Petrol,Manual,₹3.35 lakh
1,2018 Hyundai Verna 1.6 VTVT SX (O) AT,2018,30.23k km,Petrol,Auto,₹6.68 lakh
2,2018 Hyundai Grand i10,2018,63.23k km,Petrol,Manual,₹3.50 lakh
3,2014 Hyundai Xcent SX 1.2,2014,96.90k km,Petrol,Manual,₹3.00 lakh
4,2020 Hyundai GRAND I10 NIOS,2020,40.15k km,Petrol,Auto,₹4.55 lakh


In [6]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Car_Name           435 non-null    object
 1   Year               435 non-null    int64 
 2   Kilometers_Driven  435 non-null    object
 3   Fuel_Type          435 non-null    object
 4   Transmission       435 non-null    object
 5   Price              435 non-null    object
dtypes: int64(1), object(5)
memory usage: 20.5+ KB


In [7]:
# Strip the leading four-digit model year (and the whitespace after it) from
# each car name; the year is already available in the separate 'Year' column.
df['Car_Name'] = df['Car_Name'].str.replace(
    pat=r'^\d{4}\s+',
    repl='',
    regex=True,
)

print(df['Car_Name'])

0                          Hyundai Grand i10
1           Hyundai Verna 1.6 VTVT SX (O) AT
2                          Hyundai Grand i10
3                       Hyundai Xcent SX 1.2
4                     Hyundai GRAND I10 NIOS
                       ...                  
430                     Hyundai i20 ASTA 1.2
431                        Hyundai Grand i10
432                      Hyundai Xcent S 1.2
433                        Hyundai Grand i10
434    Hyundai Grand i10 ASTA 1.2 KAPPA VTVT
Name: Car_Name, Length: 435, dtype: object


In [9]:
import re

def clean_kilometers(value):
    """Convert an odometer reading such as '34.11k km' into kilometres.

    Accepted forms (case-insensitive, optional trailing 'km'/'kms', optional
    thousands separators):
      * '34.11k km'                  -> thousands (x 1,000)
      * '1.45L km' / '1.2 lakh km'   -> lakhs     (x 1,00,000)
      * '96900 km' / 96900           -> plain number

    Args:
        value: Raw cell value; it is converted with ``str()`` before parsing.

    Returns:
        float: Distance in kilometres.

    Raises:
        ValueError: If no number can be parsed from ``value``.
    """
    # Local import keeps the cell self-contained. Decimal scales the number
    # exactly; binary floats can land a hair below the intended whole number,
    # which a later integer cast would truncate.
    from decimal import Decimal, InvalidOperation

    text = str(value).strip()
    # Drop the trailing distance unit first so its 'k' can never be mistaken
    # for the thousands suffix.
    text = re.sub(r'\s*kms?\.?$', '', text, flags=re.IGNORECASE)
    text = text.replace(',', '')

    # Split "<number><optional magnitude suffix>". The suffix is matched only at
    # the end of the string (instead of searching for any 'k'), so the 'k'
    # inside 'lakh' is not read as "thousands".
    match = re.fullmatch(r'(.*?)\s*(lakhs?|lacs?|l|k)?', text, flags=re.IGNORECASE)
    if match is None:
        raise ValueError(f"could not parse kilometers from {value!r}")

    number_text = match.group(1)
    suffix = (match.group(2) or '').lower()
    if suffix == 'k':
        multiplier = 1000
    elif suffix:
        # Any remaining suffix is one of the lakh spellings.
        multiplier = 100000
    else:
        multiplier = 1

    try:
        return float(Decimal(number_text) * multiplier)
    except InvalidOperation:
        # Keep the exception type that float() raised in the original code.
        raise ValueError(f"could not parse kilometers from {value!r}") from None

# Apply to DataFrame.
# Round before the integer cast: astype(int) truncates toward zero, so a float
# artefact just below a whole number would otherwise lose one kilometre.
df['Kilometers_Driven'] = df['Kilometers_Driven'].apply(clean_kilometers).round().astype(int)

In [10]:
import re

def clean_price(value):
    """Convert a price string such as '₹3.35 lakh' into rupees.

    The currency symbol and thousands separators are removed, then an optional
    trailing unit (case-insensitive, singular or plural) scales the number:
      * 'lakh' / 'lakhs' / 'lac' / 'lacs'  -> x 1,00,000
      * 'crore' / 'crores'                 -> x 1,00,00,000
      * no unit                            -> plain number

    Args:
        value: Raw cell value; it is converted with ``str()`` before parsing.

    Returns:
        float: Price in rupees.

    Raises:
        ValueError: If no number can be parsed from ``value``.
    """
    # Local import keeps the cell self-contained. Decimal scales exactly,
    # whereas e.g. 4.35 * 100000 in binary floating point is
    # 434999.99999999994 and would truncate to 434999 on an integer cast.
    from decimal import Decimal, InvalidOperation

    # Remove currency symbol, thousands separators and surrounding spaces.
    text = str(value).replace('₹', '').replace(',', '').strip()

    # Split "<number><optional unit>"; the unit is only recognised at the end.
    match = re.fullmatch(r'(.*?)\s*(lakhs?|lacs?|crores?)?', text, flags=re.IGNORECASE)
    if match is None:
        raise ValueError(f"could not parse price from {value!r}")

    number_text = match.group(1)
    unit = (match.group(2) or '').lower()
    if unit.startswith('c'):
        multiplier = 10000000   # crore
    elif unit:
        multiplier = 100000     # lakh
    else:
        multiplier = 1          # plain numeric value

    try:
        return float(Decimal(number_text) * multiplier)
    except InvalidOperation:
        # Keep the exception type that float() raised in the original code.
        raise ValueError(f"could not parse price from {value!r}") from None

# Apply the function.
# Round before the integer cast: astype(int) truncates toward zero, so a float
# artefact such as 434999.99999999994 would otherwise become 434999.
df['Price'] = df['Price'].apply(clean_price).round().astype(int)
# Rename column 'Price' to 'Price(INR)' so the unit is visible in the header.
# NOTE: after this rename the cell cannot be re-run without reloading df,
# because the 'Price' column no longer exists.
df.rename(columns={'Price': 'Price(INR)'}, inplace=True)

In [11]:
# Preview the first rows of df to check the converted numeric columns.
df.head()

Unnamed: 0,Car_Name,Year,Kilometers_Driven,Fuel_Type,Transmission,Price(INR)
0,Hyundai Grand i10,2016,34110,Petrol,Manual,335000
1,Hyundai Verna 1.6 VTVT SX (O) AT,2018,30230,Petrol,Auto,668000
2,Hyundai Grand i10,2018,63230,Petrol,Manual,350000
3,Hyundai Xcent SX 1.2,2014,96900,Petrol,Manual,300000
4,Hyundai GRAND I10 NIOS,2020,40150,Petrol,Auto,455000


In [13]:
# Save the cleaned DataFrame to a new CSV file.
# index=False leaves the row index out of the file; the relative path is
# resolved against the kernel's current working directory.
df.to_csv('Refined_Cars24_DataSet.csv', index=False)

print("Refined dataset saved successfully as 'Refined_Cars24_DataSet.csv'")

Refined dataset saved successfully as 'Refined_Cars24_DataSet.csv'


# Analysis on Clean DataSet

In [19]:
# Re-check column dtypes and non-null counts on df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Car_Name           435 non-null    object
 1   Year               435 non-null    int64 
 2   Kilometers_Driven  435 non-null    int64 
 3   Fuel_Type          435 non-null    object
 4   Transmission       435 non-null    object
 5   Price(INR)         435 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 20.5+ KB


In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Price(INR)
count,435.0,435.0,435.0
mean,2016.195402,65061.19,464882.7
std,3.733147,76565.66,284363.0
min,2010.0,1510.0,89000.0
25%,2013.0,34405.0,266500.0
50%,2016.0,57730.0,369000.0
75%,2019.0,81940.0,648000.0
max,2025.0,1450000.0,1695000.0


In [15]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [16]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
Car_Name,0
Year,0
Kilometers_Driven,0
Fuel_Type,0
Transmission,0
Price(INR),0


In [17]:
# Number of distinct values in each column.
df.nunique()

Unnamed: 0,0
Car_Name,141
Year,16
Kilometers_Driven,374
Fuel_Type,3
Transmission,2
Price(INR),300


In [18]:
# Frequency of each distinct full row (all columns combined).
# NOTE(review): df has no duplicated rows, so every count here is 1 — a
# per-column call such as df['Fuel_Type'].value_counts() may have been intended; confirm.
df.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count
Car_Name,Year,Kilometers_Driven,Fuel_Type,Transmission,Price(INR),Unnamed: 6_level_1
Hyundai i20 SPORTZ 1.4 CRDI,2013,130000,Diesel,Manual,315000,1
Hyundai ALCAZAR 1.5 SIGNATURE (O) AT 6STR,2021,37330,Diesel,Auto,1662000,1
Hyundai ALCAZAR 1.5 SIGNATURE (O) AT 6STR,2021,180000,Diesel,Auto,1237000,1
Hyundai i20 SPORTZ 1.2,2010,48840,Petrol,Manual,128000,1
Hyundai i20 MAGNA 1.2,2013,73100,Petrol,Manual,190000,1
...,...,...,...,...,...,...
Hyundai AURA S 1.2 CNG,2024,8900,Petrol,Manual,791000,1
Hyundai AURA S 1.2 CNG,2021,73240,Petrol,Manual,576000,1
Hyundai AURA S 1.2 CNG,2021,20610,Petrol,Manual,637000,1
Hyundai AURA S 1.2,2024,28900,Petrol,Manual,582000,1


## Car Age

Age of each car in years, computed as the reference year (2025) minus the model `Year`.

In [21]:
# Age of each car in years relative to a hard-coded reference year.
# The resulting Series is only displayed; it is not stored as a column on df.
current_year = 2025
current_year - df['Year']

Unnamed: 0,Year
0,9
1,7
2,7
3,11
4,5
...,...
430,13
431,7
432,10
433,12


## Price per Kilometer

Listing price in rupees divided by the kilometers driven, for each car.

In [23]:
# Price in rupees per kilometre driven, row by row.
# The resulting Series is only displayed; it is not stored as a column on df.
df['Price(INR)'] / df['Kilometers_Driven']

Unnamed: 0,0
0,9.821167
1,22.097254
2,5.535347
3,3.095975
4,11.332503
...,...
430,3.695440
431,7.149240
432,8.132285
433,11.805509
