### Importing Libraries

---

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Custom Seaborn Style

---

In [2]:
# One call configures both the plotting context ("paper") and the axes style ("ticks")
sns.set_theme(context='paper', style='ticks')

### Adding `utils` to `PYTHONPATH`

---

In [3]:
# Make the project's helper modules in ../utils importable from this notebook.
# Note: the relative path is resolved against the current working directory.
utils_dir = os.path.abspath("../utils")
# Guard against duplicates so re-running this cell does not keep growing sys.path
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

### Reading Parquet File

---

In [4]:
# Importing load_parquet function from read_data module
# NOTE(review): read_data is presumably found through the "../utils" entry that the
# previous cell appends to sys.path, which would be why this import is not in the
# top import cell — confirm.
from read_data import load_parquet
# NOTE(review): the two arguments look like (folder name, file name) — confirm
# against load_parquet's signature.
cars = load_parquet('clean_data', 'clean_data.parquet')
# Preview the first rows of the loaded data
cars.head()

Unnamed: 0,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand,model
0,88760,Petrol,Manual,1st owner,219000,998,2012,Maruti,Wagon R 1.0
1,17920,Petrol,Manual,1st owner,266000,796,2016,Maruti,Alto 800
2,9940,Petrol,Manual,1st owner,496000,1373,2014,Maruti,Ertiga
3,67340,Petrol,Manual,2nd owner,355000,1199,2016,Tata,Tiago
4,30390,Petrol,Manual,1st owner,530000,998,2023,Maruti,New Wagon-R


### Handling `year` Column

---

In [5]:
# How many records fall in each "year" value (most frequent first)
year_counts = cars['year'].value_counts()
year_counts

year
2022    310
2021    305
2018    286
2017    274
2019    263
2016    243
2020    234
2023    194
2015    188
2014    168
2013    131
2024     76
2012     65
2011     49
2010     36
2025      1
2007      1
Name: count, dtype: int64

In [6]:
# The frequency table above shows that 2007 and 2025 each occur only once.
# A single record is not enough for a model to learn any pattern for that year,
# and keeping it can push the model towards overfitting, so those rows are dropped.
rare_years = [2007, 2025]
cars = cars[~cars['year'].isin(rare_years)]

### Handling `transmission` Column

---

In [7]:
# How many records fall in each "transmission" value (most frequent first)
transmission_counts = cars['transmission'].value_counts()
transmission_counts

transmission
Manual       1910
Automatic     912
Name: count, dtype: int64

### Handling `owner` Column

---

In [8]:
# How many records fall in each "owner" category (most frequent first)
owner_counts = cars['owner'].value_counts()
owner_counts

owner
1st owner    2041
2nd owner     639
3rd owner     114
4th owner      25
5th owner       2
6th owner       1
Name: count, dtype: int64

In [9]:
# "5th owner" and "6th owner" are extremely rare (2 and 1 records in the table above),
# so those rows are dropped by keeping only the four frequent categories.
# "4th owner" also occurs far less often than the other categories; it is kept here
# and renamed to "Others" in a later cell.
# This reduces overfitting and helps the model learn and generalize better.
# .copy() makes the filtered result an independent DataFrame, so the
# `cars['owner'] = ...` assignments in the following cells modify it directly
# instead of triggering pandas' SettingWithCopyWarning.
cars = cars[cars['owner'].isin(['1st owner','2nd owner','3rd owner','4th owner'])].copy()

In [10]:
# "owner" is a categorical column: dropping rows does not drop the categories
# themselves, so the ones that no longer occur have to be removed explicitly
owner_trimmed = cars['owner'].cat.remove_unused_categories()
cars['owner'] = owner_trimmed

In [11]:
# Relabel the "4th owner" category as "Others"
owner_relabels = {'4th owner': 'Others'}
cars['owner'] = cars['owner'].cat.rename_categories(owner_relabels)

In [12]:
# "owner" category counts once the rare categories are dropped and "4th owner" is relabelled
owner_counts_after = cars['owner'].value_counts()
owner_counts_after

owner
1st owner    2041
2nd owner     639
3rd owner     114
Others         25
Name: count, dtype: int64