In [276]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [277]:
df = pd.read_csv('cardekho_imputated.csv', index_col=0)
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data Cleaning

In [278]:
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

## Feature Engineering

In [279]:
numerical_feat = list(df.select_dtypes(exclude="object").columns)
categorical_feat = list(df.select_dtypes(include="object").columns)

In [280]:
numerical_feat
# Let us remove selling price as it is not a feature, its our output
numerical_feat.pop()

'selling_price'

In [281]:
categorical_feat

['car_name', 'brand', 'model', 'seller_type', 'fuel_type', 'transmission_type']

In [282]:
## Let us see unique values in each categorical value
for feat in categorical_feat:
    print(feat + f": {len(df[feat].unique())}")

car_name: 121
brand: 32
model: 120
seller_type: 3
fuel_type: 5
transmission_type: 2


In [283]:
# We will drop car_name and model, too many values, high cardinality, no usage
df = df.drop(columns=['car_name', 'model'])
# Too many values in brand, so we apply frequency encoding or drop, lets try with frequency encoding
# ANd onehot encoder for other values and we will use pipelining


In [284]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor 
from sklearn.impute import SimpleImputer

In [285]:
# Step 2: Encode 'brand' using frequency encoding
df['brand_freq'] = df['brand'].map(df['brand'].value_counts())
df = df.drop(columns=['brand'], axis = 1)

In [286]:
# Step 3: Define columns for transformers
onehot_cols = ['seller_type', 'fuel_type']
ordinal_cols = ['transmission_type']

In [287]:
# Step 4: Create the column transformer
preprocessor = ColumnTransformer([
    ('standardscaler', StandardScaler(), numerical_feat),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), onehot_cols),
    ('ordinal', OrdinalEncoder(), ordinal_cols)
], remainder='passthrough')  # remainder = numerical features + brand_freq

## Pipelining

In [288]:
# Step 5: Create pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

In [289]:
from sklearn.model_selection import train_test_split
# Step 6: Fit pipeline
X = df.drop(columns='selling_price')  
y = df['selling_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [290]:
pipeline.fit(X_train, y_train)

In [291]:
y_pred = pipeline.predict(X_test)

## Model Evaluation

| Metric                           | What it Measures                   | Ideal Value    | sklearn function                |
| -------------------------------- | ---------------------------------- | -------------- | ------------------------------- |
| **MAE** (Mean Absolute Error)    | Average of absolute errors         | Lower = Better | `mean_absolute_error()`         |
| **MSE** (Mean Squared Error)     | Average of squared errors          | Lower = Better | `mean_squared_error()`          |
| **RMSE** (Root MSE)              | Square root of MSE                 | Lower = Better | `np.sqrt(mean_squared_error())` |
| **R² Score (R-squared)**         | How well data fits the model       | Closer to 1    | `r2_score()`                    |
| **Adjusted R²**                  | R² adjusted for number of features | Closer to 1    | manually computed               |
| **MAPE** (Mean Absolute % Error) | Avg percentage error               | Closer to 0    | manually computed               |


In [292]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [293]:
print(mae)
print(mse)
print(rmse)
print(r2)

104162.11644423801
92909636897.67212
304810.8214904322
0.8765782620245987


## Hyperparameter Tuning

In [296]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 20, 40],
    'model__min_samples_split': [2, 5]
}


## Rule: To access parameters inside a pipeline step in GridSearchCV, use:

In [297]:
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [299]:
print("✅ Best Parameters:", grid_search.best_params_)
print("📊 Best R2 Score (CV):", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("📈 Test R2 Score:", r2_score(y_test, y_pred))
print("📉 Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

✅ Best Parameters: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 200}
📊 Best R2 Score (CV): 0.8964786923582926
📈 Test R2 Score: 0.8449854500462171
📉 Test RMSE: 341602.2986824847
