### Importing Libraries

---

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Adding `utils` to `sys.path`

---

In [2]:
# Make the project's helper modules importable from this notebook.
# The membership guard keeps repeated runs of this cell from appending the
# same directory to sys.path over and over.
utils_dir = os.path.abspath("../utils")
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

### Reading CSV File

---

In [3]:
# Import the project's load_csv helper from the read_data module.
# NOTE(review): read_data is presumably resolved through the ../utils entry
# added to sys.path earlier — confirm.
from read_data import load_csv
# NOTE(review): the two arguments look like a data directory name and a file
# name — confirm against read_data.load_csv.
cars = load_csv('scrape_data', 'scrape_data.csv')
# Preview the first rows of the loaded data
cars.head()

Unnamed: 0,model_name,km_driven,fuel_type,transmission,owner,price,link,engine_capacity
0,2012 Maruti Wagon R 1.0,88.76k km,Petrol,Manual,1st owner,₹2.19 lakh,https://www.cars24.com/buy-used-maruti-wagon-r...,998cc
1,2016 Maruti Alto 800,17.92k km,Petrol,Manual,1st owner,₹2.66 lakh,https://www.cars24.com/buy-used-maruti-alto-80...,796cc
2,2014 Maruti Ertiga,9.94k km,Petrol,Manual,1st owner,₹4.96 lakh,https://www.cars24.com/buy-used-maruti-ertiga-...,1373cc
3,2016 Tata Tiago,67.34k km,Petrol,Manual,2nd owner,₹3.55 lakh,https://www.cars24.com/buy-used-tata-tiago-201...,1199cc
4,2023 Maruti New Wagon-R,30.39k km,Petrol,Manual,1st owner,₹5.30 lakh,https://www.cars24.com/buy-used-maruti-new-wag...,998cc


### Dropping Unnecessary Columns

---

In [4]:
# Drop the listing URL column. Rebinding `cars` (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# is a no-op rather than a KeyError for the already-removed column.
cars = cars.drop(columns='link', errors='ignore')

### Summary of the DataFrame

---

In [5]:
# Import the project's dataframe_summary helper from the summary module
from summary import dataframe_summary
# Print the summary report for `cars` (per the output below: shape, duplicate
# rows, memory usage, missing values, info and description)
dataframe_summary(cars)

                          DataFrame Summary                           
Shape: 2826 rows × 7 columns
Duplicate Rows: 0
Memory Usage: 1.20MB
Missing Values:
                 Missing Values     %
engine_capacity             182  6.44
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2826 entries, 0 to 2825
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   model_name       2826 non-null   object
 1   km_driven        2826 non-null   object
 2   fuel_type        2826 non-null   object
 3   transmission     2826 non-null   object
 4   owner            2826 non-null   object
 5   price            2826 non-null   object
 6   engine_capacity  2644 non-null   object
dtypes: object(7)
memory usage: 154.7+ KB
DataFrame Description:
No numerical columns in the dataframe


### Cleaning `model_name` Column

---

In [6]:
# The "model_name" column packs several pieces of information into one string:
#   - the manufacturing year (e.g. 2019, 2021)
#   - the brand name (e.g. Maruti, Tata)
#   - the model name (e.g. Alto 800, Tiago)
# Each of them is extracted into its own column in the cells below.
cars['model_name'].head(n=5)

0    2012 Maruti Wagon R 1.0
1       2016 Maruti Alto 800
2         2014 Maruti Ertiga
3            2016 Tata Tiago
4    2023 Maruti New Wagon-R
Name: model_name, dtype: object

In [7]:
# Manufacturing year: the first whitespace-separated token of "model_name"
cars['model_name'].str.strip().str.split(' ', n=1).str[0].head()

0    2012
1    2016
2    2014
3    2016
4    2023
Name: model_name, dtype: object

In [8]:
# Sanity check: list the distinct year tokens to spot any garbage values
cars['model_name'].str.strip().str.split(' ', n=1).str[0].unique()

array(['2012', '2016', '2014', '2023', '2021', '2024', '2020', '2022',
       '2018', '2025', '2019', '2017', '2015', '2013', '2011', '2010',
       '2007'], dtype=object)

In [9]:
# Store the manufacturing year as an integer "year" column
year_token = cars['model_name'].str.strip().str.split(' ', n=1).str[0]
cars['year'] = year_token.astype(int)
cars.head()

Unnamed: 0,model_name,km_driven,fuel_type,transmission,owner,price,engine_capacity,year
0,2012 Maruti Wagon R 1.0,88.76k km,Petrol,Manual,1st owner,₹2.19 lakh,998cc,2012
1,2016 Maruti Alto 800,17.92k km,Petrol,Manual,1st owner,₹2.66 lakh,796cc,2016
2,2014 Maruti Ertiga,9.94k km,Petrol,Manual,1st owner,₹4.96 lakh,1373cc,2014
3,2016 Tata Tiago,67.34k km,Petrol,Manual,2nd owner,₹3.55 lakh,1199cc,2016
4,2023 Maruti New Wagon-R,30.39k km,Petrol,Manual,1st owner,₹5.30 lakh,998cc,2023


In [10]:
# Brand name: the second whitespace-separated token of "model_name"
cars['model_name'].str.strip().str.split(' ').str[1].head()

0    Maruti
1    Maruti
2    Maruti
3      Tata
4    Maruti
Name: model_name, dtype: object

In [11]:
# Sanity check: list the distinct brand tokens to spot any garbage values
cars['model_name'].str.strip().str.split(' ').str[1].unique()

array(['Maruti', 'Tata', 'Nissan', 'Renault', 'Hyundai', 'Honda', 'KIA',
       'MG', 'Ford', 'Skoda', 'Volkswagen', 'Mahindra', 'Toyota', 'Jeep',
       'Datsun', 'Mercedes', 'Audi', 'BMW', 'Jaguar', 'Landrover',
       'Volvo', 'Chevrolet', 'Force', 'Fiat', 'Premier', 'Mini',
       'Mitsubishi', 'Ssangyong', 'CITROEN', 'Porsche'], dtype=object)

In [12]:
# Creating "brand" column to keep details about brand name of cars.
# The value is stripped before splitting, exactly as in the preview and
# garbage-value check cells for the brand: with a leading space, split(' ')
# would yield an empty first token and the year would land in "brand".
cars['brand'] = cars['model_name'].str.strip().str.split(' ').str.get(1)
cars.head()

Unnamed: 0,model_name,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand
0,2012 Maruti Wagon R 1.0,88.76k km,Petrol,Manual,1st owner,₹2.19 lakh,998cc,2012,Maruti
1,2016 Maruti Alto 800,17.92k km,Petrol,Manual,1st owner,₹2.66 lakh,796cc,2016,Maruti
2,2014 Maruti Ertiga,9.94k km,Petrol,Manual,1st owner,₹4.96 lakh,1373cc,2014,Maruti
3,2016 Tata Tiago,67.34k km,Petrol,Manual,2nd owner,₹3.55 lakh,1199cc,2016,Tata
4,2023 Maruti New Wagon-R,30.39k km,Petrol,Manual,1st owner,₹5.30 lakh,998cc,2023,Maruti


In [13]:
# Model name: whatever remains after the year and brand tokens
# (split at most twice, then take the last piece)
cars['model_name'].str.strip().str.split(' ', n=2).str[-1].head()

0    Wagon R 1.0
1       Alto 800
2         Ertiga
3          Tiago
4    New Wagon-R
Name: model_name, dtype: object

In [14]:
# Sanity check: list the distinct model names to spot any garbage values
cars['model_name'].str.strip().str.split(' ', n=2).str[-1].unique()

array(['Wagon R 1.0', 'Alto 800', 'Ertiga', 'Tiago', 'New Wagon-R',
       'MAGNITE', 'Zest', 'Kwid', 'Xcent', 'Amaze', 'SONET', 'City',
       'TRIBER', 'S PRESSO', 'Eon', 'NEXON', 'PUNCH', 'ASTOR', 'VENUE',
       'Alto K10', 'FREESTYLE', 'Grand i10', 'Rapid', 'Polo', 'Baleno',
       'i10', 'Verna', 'Thar', 'Swift', 'NEW I20', 'Swift Dzire', 'Ciaz',
       'Harrier', 'XCENT PRIME', 'Celerio', 'KUSHAQ', 'WR-V', 'Bolero',
       'TIGOR', 'Brio', 'AURA', 'GRAND I10 NIOS', 'NEW SANTRO', 'XUV300',
       'Creta', 'Elite i20', 'URBAN CRUISER', 'ALTROZ', 'Kiger', 'SLAVIA',
       'Grand Vitara', 'Duster', 'BREZZA', 'Micra Active', 'XUV500',
       'Glanza', 'Compass', 'TUV300', 'Alto', 'Eeco', 'Jazz', 'SELTOS',
       'VIRTUS', 'Celerio X', 'HECTOR', 'Micra', 'ALCAZAR', 'HECTOR PLUS',
       'TAIGUN', 'Vitara Brezza', 'Redi Go', 'Figo Aspire', 'Ecosport',
       'Figo', 'SCORPIO-N', 'FRONX', 'S Cross', 'NEW I20 N LINE',
       'i20 Active', 'Benz GLE', 'Benz E Class', 'Q7', 'X5', 'Q3', 'X1

In [15]:
# Store the model name (text after the year and brand) in a "model" column
model_part = cars['model_name'].str.strip().str.split(' ', n=2).str[-1]
cars['model'] = model_part
cars.head()

Unnamed: 0,model_name,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand,model
0,2012 Maruti Wagon R 1.0,88.76k km,Petrol,Manual,1st owner,₹2.19 lakh,998cc,2012,Maruti,Wagon R 1.0
1,2016 Maruti Alto 800,17.92k km,Petrol,Manual,1st owner,₹2.66 lakh,796cc,2016,Maruti,Alto 800
2,2014 Maruti Ertiga,9.94k km,Petrol,Manual,1st owner,₹4.96 lakh,1373cc,2014,Maruti,Ertiga
3,2016 Tata Tiago,67.34k km,Petrol,Manual,2nd owner,₹3.55 lakh,1199cc,2016,Tata,Tiago
4,2023 Maruti New Wagon-R,30.39k km,Petrol,Manual,1st owner,₹5.30 lakh,998cc,2023,Maruti,New Wagon-R


In [16]:
# "model_name" has been split into "year", "brand" and "model", so it is no
# longer needed. Rebinding `cars` with errors='ignore' (instead of an in-place
# drop) lets this cell be re-run without a KeyError once the column is gone.
cars = cars.drop(columns='model_name', errors='ignore')

### Cleaning `km_driven` Column

---

In [17]:
# Values in "km_driven" end with a "km" unit.
# That unit has to be removed before the column can become numeric.
cars['km_driven'].head(n=5)

0    88.76k km
1    17.92k km
2     9.94k km
3    67.34k km
4    30.39k km
Name: km_driven, dtype: object

In [18]:
# Drop the trailing "km" unit: keep only the part before the first space
km_without_unit = cars['km_driven'].str.split(' ').str[0]
cars['km_driven'] = km_without_unit
cars['km_driven'].head()

0    88.76k
1    17.92k
2     9.94k
3    67.34k
4    30.39k
Name: km_driven, dtype: object

In [19]:
# Two "km_driven" values consist of digits only, with no magnitude suffix.
# They are probably the result of a data collection or web scraping error,
# so these rows are removed in the next cell.
digits_only = cars['km_driven'].str.isdigit()
cars[digits_only]

Unnamed: 0,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand,model
1230,250,Petrol,Manual,1st owner,₹5.37 lakh,998cc,2024,Maruti,Alto K10
1509,677,Petrol,Auto,1st owner,₹6.00 lakh,,2024,Renault,Kwid


In [20]:
# Keep only the rows whose "km_driven" value ends with a "k" or "L" suffix.
# .copy() turns the filtered result into an independent DataFrame, so the
# column assignments in the following cells do not raise pandas'
# SettingWithCopyWarning or write into a temporary slice.
ends_with_k = cars['km_driven'].str.endswith('k')
ends_with_l = cars['km_driven'].str.endswith('L')
cars = cars[ends_with_k | ends_with_l].copy()

In [21]:
# After filtering, the last character of "km_driven" is either "k" or "L"
cars['km_driven'].str[-1].unique()

array(['k', 'L'], dtype=object)

In [22]:
# Import the project's km_driven_cleaner helper from the helpers module.
# NOTE(review): judging by the output of the next cell it turns strings such
# as "88.76k" into plain numbers (88760); how it treats the "L" suffix is not
# visible here — confirm in helpers.py.
from helpers import km_driven_cleaner

In [23]:
# Apply km_driven_cleaner to every "km_driven" value.
# The column is replaced through assign() rather than written with
# `cars.loc[:, 'km_driven'] = ...`: on pandas >= 2.0 the .loc form sets the
# values in place and keeps the existing object dtype, so the column would
# still not be numeric. assign() returns a new frame whose column takes the
# dtype of the cleaned values.
cars = cars.assign(km_driven=cars['km_driven'].apply(km_driven_cleaner))
cars.head()

Unnamed: 0,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand,model
0,88760,Petrol,Manual,1st owner,₹2.19 lakh,998cc,2012,Maruti,Wagon R 1.0
1,17920,Petrol,Manual,1st owner,₹2.66 lakh,796cc,2016,Maruti,Alto 800
2,9940,Petrol,Manual,1st owner,₹4.96 lakh,1373cc,2014,Maruti,Ertiga
3,67340,Petrol,Manual,2nd owner,₹3.55 lakh,1199cc,2016,Tata,Tiago
4,30390,Petrol,Manual,1st owner,₹5.30 lakh,998cc,2023,Maruti,New Wagon-R


### Cleaning `fuel_type` Column

---

In [24]:
# "fuel_type" needs no real cleaning; surrounding whitespace is stripped as a
# precaution so that stray spaces cannot cause problems later on
fuel_type_stripped = cars['fuel_type'].str.strip()
cars['fuel_type'] = fuel_type_stripped
cars['fuel_type'].unique()

array(['Petrol', 'Diesel', 'CNG', 'Hybrid', 'Electric'], dtype=object)

### Cleaning `owner` Column

---

In [25]:
# "owner" needs no real cleaning; surrounding whitespace is stripped as a
# precaution so that stray spaces cannot cause problems later on
owner_stripped = cars['owner'].str.strip()
cars['owner'] = owner_stripped
cars['owner'].unique()

array(['1st owner', '2nd owner', '3rd owner', '4th owner', '5th owner',
       '6th owner'], dtype=object)

### Cleaning `transmission` Column

---

In [26]:
# "transmission" needs no real cleaning; surrounding whitespace is stripped as
# a precaution so that stray spaces cannot cause problems later on
transmission_stripped = cars['transmission'].str.strip()
cars['transmission'] = transmission_stripped
cars['transmission'].unique()

array(['Manual', 'Auto'], dtype=object)

In [27]:
# "transmission" holds two values: "Auto" and "Manual".
# "Auto" is relabelled to the clearer "Automatic".
# Series.replace matches whole cell values, unlike .str.replace which does a
# substring replacement. Re-running this cell therefore leaves "Automatic"
# untouched instead of turning it into "Automaticmatic".
cars['transmission'] = cars['transmission'].replace({'Auto': 'Automatic'})
cars.head()

Unnamed: 0,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand,model
0,88760,Petrol,Manual,1st owner,₹2.19 lakh,998cc,2012,Maruti,Wagon R 1.0
1,17920,Petrol,Manual,1st owner,₹2.66 lakh,796cc,2016,Maruti,Alto 800
2,9940,Petrol,Manual,1st owner,₹4.96 lakh,1373cc,2014,Maruti,Ertiga
3,67340,Petrol,Manual,2nd owner,₹3.55 lakh,1199cc,2016,Tata,Tiago
4,30390,Petrol,Manual,1st owner,₹5.30 lakh,998cc,2023,Maruti,New Wagon-R
