### Importing Libraries

---

In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Custom Seaborn Style

---

In [2]:
# Configure seaborn in a single call: the "paper" plotting context with the "ticks" axes style.
sns.set_theme(context='paper', style='ticks')

### Adding `utils` to `PYTHONPATH`

---

In [3]:
# Make the helper modules in ../utils importable from this notebook.
# The relative path is resolved against the current working directory, so the
# notebook is expected to be run from its own folder.
# The membership check keeps the cell idempotent: re-running it no longer piles
# duplicate entries onto sys.path.
utils_dir = os.path.abspath("../utils")
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

### Reading Parquet File

---

In [4]:
# `read_data` is a project-local module; the import presumably resolves through the
# `../utils` directory appended to `sys.path` in an earlier cell, so it must run after that cell.
from read_data import load_parquet
# NOTE(review): the two arguments look like a data sub-directory and a file name —
# confirm against the definition of `load_parquet`.
cars = load_parquet('clean_data', 'clean_data_after_eda.parquet')
# Preview the first rows as a quick sanity check on the load.
cars.head()

Unnamed: 0,km_driven,fuel_type,transmission,owner,price,engine_capacity,year,brand,model
0,88760,Petrol,Manual,1st owner,219000,998,2012,Maruti,Wagon R 1.0
1,17920,Petrol,Manual,1st owner,266000,796,2016,Maruti,Alto 800
2,9940,Petrol,Manual,1st owner,496000,1373,2014,Maruti,Ertiga
3,67340,Petrol,Manual,2nd owner,355000,1199,2016,Tata,Tiago
4,30390,Petrol,Manual,1st owner,530000,998,2023,Maruti,New Wagon-R


In [5]:
# Rows were removed during exploratory data analysis, and a column stored with the
# `category` dtype keeps its full category list even when no remaining row uses a value.
# Trim every categorical column down to the categories that still occur in the data.
cat_col = cars.select_dtypes(include='category').columns
for column in cat_col:
    trimmed = cars[column].cat.remove_unused_categories()
    cars[column] = trimmed

### Handling High Cardinality Column  

---

In [6]:
# Cardinality of the "model" column.
# With this many categories, one-hot encoding would add one column per category and
# produce a wide, mostly-zero (sparse) feature matrix.
# The subscript uses double quotes so it does not clash with the f-string's own single
# quotes; reusing the same quote type inside an f-string only parses on Python 3.12+.
# `nunique(dropna=False)` counts the same values as `len(Series.unique())`.
n_models = cars["model"].nunique(dropna=False)
print(f'Unique categories in "model" column : {n_models}')

Unique categories in "model" column : 124


In [7]:
# Frequency-encode "model": compute each category's relative frequency (share of rows)
# so the high-cardinality column can be replaced by a single numeric feature.
# NOTE(review): the frequencies are computed on the full `cars` frame, before the
# train/test split performed further down, so test-set rows contribute to the encoding
# (data leakage). Consider deriving the frequencies from the training rows only.
model_freq = cars['model'].value_counts(normalize=True)

In [8]:
# Replace each row's model label with that model's relative frequency.
# When "model" is categorical, `Series.map` can return a categorical result if the
# mapping happens to be one-to-one, which would make the dtype of the new column depend
# on the data. The explicit cast guarantees a plain float feature in every case.
cars['model_freq'] = cars['model'].map(model_freq).astype('float64')

In [9]:
# The raw "model" labels are now redundant: "model_freq" carries the encoded feature.
cars.drop(columns=['model'], inplace=True)

### Importing Libraries for Pipeline

---

In [10]:
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict, KFold, learning_curve, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor

### Creating Training and Testing Set

---

In [11]:
# Separate the target from the predictors: "price" is what we predict,
# every other column is a feature.
y = cars['price']
X = cars.drop(columns='price')

In [12]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
# Report how many rows landed in each partition.
n_train, n_test = len(X_train), len(X_test)
print(f"Training Data : {n_train} | Testing Data : {n_test}")

Training Data : 1917 | Testing Data : 639
