In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [13]:
df = pd.read_csv('cleaned_car_data_2026_mumbai.csv')
df.head()

Unnamed: 0,Registration Year,Insurance,Fuel Type,Seats,Kms Driven,Ownership,Engine Displacement,Transmission,Power,Drive Type,...,Diesel Mileage ARAI,Diesel Fuel Tank Capacity,Top Speed,Super Charge,Battery Capacity,Charging Time DC,Charging Time AC,Motor Type,Battery Type,Charging Port
0,2022.0,Unknown,Petrol,5.0,30000,1,999,Automatic,113.98 bhp,FWD,...,Not Applicable,Not Applicable,Unknown,0,0.0,0.0,0.0,Not Applicable,Not Applicable,
1,2023.0,Zero Dep,Diesel,5.0,22527,1,1493,Automatic,,2WD,...,19.1,50.0,Unknown,0,0.0,0.0,0.0,Not Applicable,Not Applicable,
2,2024.0,Comprehensive,Petrol,7.0,8000,1,1451,Automatic,141.04 bhp,FWD,...,Not Applicable,Not Applicable,195,0,0.0,0.0,0.0,Not Applicable,Not Applicable,
3,2024.0,Unknown,Petrol,7.0,20277,1,1482,Automatic,157.81 bhp,FWD,...,Not Applicable,Not Applicable,174,0,0.0,0.0,0.0,Not Applicable,Not Applicable,
4,2022.0,Comprehensive,Petrol,5.0,20000,1,999,Manual,71.01 bhp,FWD,...,Not Applicable,Not Applicable,Unknown,0,0.0,0.0,0.0,Not Applicable,Not Applicable,


In [14]:
df.drop(columns=['Power'], inplace=True)

In [15]:
# 1. Identify all numerical columns
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 2. Remove your target variable 'Price' from this list 
if 'Price' in numerical_features:
    numerical_features.remove('Price')

# 3. Print the list to see what we are working with
print(f"Total Numerical Features: {len(numerical_features)}")
print(numerical_features)

Total Numerical Features: 22
['Registration Year', 'Seats', 'Kms Driven', 'Ownership', 'Engine Displacement', 'Mileage', 'Max_Power_clean', 'Max_Power_RPM', 'Max_Torque_Nm', 'Max_Torque_RPM', 'No. of Cylinders', 'Valves Per Cylinder', 'Gearbox', 'Length', 'Width', 'Height', 'Ground Clearance Unladen', 'Wheel Base', 'Super Charge', 'Battery Capacity', 'Charging Time DC', 'Charging Time AC']


In [16]:
categorical_col = [col for col in df.columns if col not in numerical_features]
print(categorical_col)

['Insurance', 'Fuel Type', 'Transmission', 'Drive Type', 'Engine Type', 'Fuel Supply System', 'Turbo Charger', 'Transmission Type', 'Petrol Mileage ARAI', 'Petrol Fuel Tank Capacity', 'Emission Norm Compliance', 'Front Suspension', 'Rear Suspension', 'Steering Type', 'Steering Column', 'Front Brake Type', 'Rear Brake Type', 'Gross Weight', 'Price', 'Diesel Mileage ARAI', 'Diesel Fuel Tank Capacity', 'Top Speed', 'Motor Type', 'Battery Type', 'Charging Port']


In [17]:
def batch_feature_engineering(df, categorical_columns):
    """
    Smartly iterates through all provided columns and applies either 
    Ordinal or One-Hot encoding based on the data profile.
    """
    
    # Define features that are strictly nominal (no inherent order) 
    # and should try to stay One-Hot if possible.
    nominal_priority = ['Fuel Type', 'Transmission', 'Drive Type', 'Motor Type', 'Engine Type']
    
    for col in categorical_columns:
        if col not in df.columns:
            continue
            
        unique_count = df[col].nunique()
        
        # --- SMART LOGIC SELECTION ---
        # 1. Use One-Hot for low-cardinality nominal data
        if col in nominal_priority or unique_count <= 10:
            print(f"Applying One-Hot: {col} ({unique_count} categories)")
            df = apply_one_hot_encoding(df, col)
            
        # 2. Use Ordinal for high-cardinality or structural data (Suspension, Weight, etc.)
        else:
            print(f"Applying Ordinal: {col} ({unique_count} categories)")
            # No manual mapping provided, so it will auto-encode 1 to N
            df = apply_ordinal_encoding(df, col)
            
    return df

# --- THE HELPERS ---
def apply_ordinal_encoding(df, column_name, mapping=None):
    if mapping:
        # Map values; any missing from mapping default to 0
        df[column_name] = df[column_name].map(mapping).fillna(0).astype(int)
    else:
        # Auto-rank unique values if no mapping is provided
        unique_vals = sorted(df[column_name].unique())
        # Ensure 'Unknown' or similar is at the bottom (0) if it exists
        auto_map = {val: i for i, val in enumerate(unique_vals)}
        df[column_name] = df[column_name].map(auto_map).astype(int)
    
    return df

def apply_one_hot_encoding(df, column_name):
    """
    Applies One-Hot Encoding with drop_first=True.
    Returns the cleaned dataframe and the name of the automatically dropped column.
    """
    # 1. Standardize missing/unknown
    df[column_name] = df[column_name].astype(str).replace(['nan', 'None', 'unknown', 'Unknown', 'missing'], 'TEMP_MISSING')

    # 2. Get the full list of categories BEFORE dropping to see what WILL be dropped
    all_categories = sorted(df[column_name].unique())
    automatically_dropped_val = all_categories[0] # drop_first=True takes the first one
    dropped_dummy_name = f"{column_name}_{automatically_dropped_val}"

    # 3. Generate the One-Hot encoded columns with drop_first=True
    dummies = pd.get_dummies(df[column_name], prefix=column_name, drop_first=True, prefix_sep='_')

    # 4. Handle the TEMP_MISSING rule
    # If TEMP_MISSING was the one dropped by 'drop_first', we need to be careful.
    # If not, and it exists in the dummies, we remove it as per your "value 0" rule.
    missing_col_name = f"{column_name}_TEMP_MISSING"
    if missing_col_name in dummies.columns:
        dummies = dummies.drop(columns=[missing_col_name])

    # 5. Join and Clean up
    df = pd.concat([df, dummies], axis=1)
    print(f"Cleaned '{column_name}': Old feature removed. Baseline (dropped): {dropped_dummy_name}", "\n")
    
    df.drop(columns=[column_name], inplace=True)
    return df

In [18]:
df = batch_feature_engineering(df, categorical_col)

Applying One-Hot: Insurance (4 categories)
Cleaned 'Insurance': Old feature removed. Baseline (dropped): Insurance_Comprehensive 

Applying One-Hot: Fuel Type (4 categories)
Cleaned 'Fuel Type': Old feature removed. Baseline (dropped): Fuel Type_CNG 

Applying One-Hot: Transmission (2 categories)
Cleaned 'Transmission': Old feature removed. Baseline (dropped): Transmission_Automatic 

Applying One-Hot: Drive Type (5 categories)
Cleaned 'Drive Type': Old feature removed. Baseline (dropped): Drive Type_2WD 

Applying One-Hot: Engine Type (5 categories)
Cleaned 'Engine Type': Old feature removed. Baseline (dropped): Engine Type_In-Line 

Applying One-Hot: Fuel Supply System (5 categories)
Cleaned 'Fuel Supply System': Old feature removed. Baseline (dropped): Fuel Supply System_CRDi 

Applying One-Hot: Turbo Charger (3 categories)
Cleaned 'Turbo Charger': Old feature removed. Baseline (dropped): Turbo Charger_No 

Applying One-Hot: Transmission Type (2 categories)
Cleaned 'Transmission Typ

In [19]:
df['Price'] = df['Price'].astype(float)

In [20]:
# Fill Missing price values with mean
df['Price'] = df['Price'].fillna(df['Price'].mean())

In [21]:
# 1. Check data types - Everything MUST be int or float
print(df.dtypes.value_counts()) 

# 2. Check the Price column specifically
print(f"Target Type: {df['Price'].dtype}")

# 3. Ensure Price is NOT in your X
X = df.drop(columns=['Price'])
y = df['Price']

bool       53
float64    19
int64      10
Name: count, dtype: int64
Target Type: float64


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
model = RandomForestRegressor()

In [24]:
# 1. Define the parameter grid
param_grid = {
    'n_estimators': [500, 1000],
    'max_features': [0.5, 0.8], 
    'min_samples_split': [2, 5], 
    'min_samples_leaf': [2, 5], 
    'max_depth': [3, 5, 7],
    'bootstrap': [True, False] 
}

# 2. Initialize the Grid Search
# We use 'neg_mean_absolute_error' because GridSearch tries to "maximize" a score
grid_search = GridSearchCV(
    estimator=model, 
    param_grid=param_grid, 
    cv=3,           # 3-fold cross-validation
    scoring='neg_mean_absolute_error', 
    verbose=1, 
    n_jobs=-1       # Use all available CPU cores
)

# 3. Fit the grid search to the data
# (Ensure your X_train and y_train are ready)
grid_search.fit(X_train, y_train)

# 4. Extract the best model
best_model = grid_search.best_estimator_

print(f"Best Parameters found: {grid_search.best_params_}")

Fitting 3 folds for each of 96 candidates, totalling 288 fits


Best Parameters found: {'bootstrap': True, 'max_depth': 7, 'max_features': 0.5, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 500}


In [None]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# 2. Get Importance Scores
importances = rf.feature_importances_
feature_names = X.columns

# 3. Organize into a DataFrame for easy reading
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# 4. Show the Top 15 Features
print(feature_importance_df.tail(15))

                     Feature  Importance
2                 Kms Driven    0.202331
5                    Mileage    0.087223
18  Ground Clearance Unladen    0.086243
0          Registration Year    0.073975
17                    Height    0.038412
6            Max_Power_clean    0.033326
13       Petrol Mileage ARAI    0.028877
23                 Top Speed    0.026749
15                    Length    0.025180
3                  Ownership    0.024297
16                     Width    0.023774
19                Wheel Base    0.022967
9             Max_Torque_RPM    0.021233
8              Max_Torque_Nm    0.019013
44   Fuel Supply System_MPFi    0.017867


In [28]:
# 1. Calculate cumulative importance
feature_importance_df['Cumulative_Importance'] = feature_importance_df['Importance'].cumsum()

# 2. Define your threshold (e.g., 95% of the predictive power)
threshold = 0.95
top_features_df = feature_importance_df[feature_importance_df['Cumulative_Importance'] <= threshold]

# 3. Get the list of names
top_feature_names = top_features_df['Feature'].tolist()

print(f"Total features: {len(feature_importance_df)}")
print(f"Features needed for 95% importance: {len(top_feature_names)}")
print("Top Features to keep:", top_feature_names)

# 4. Filter your main DataFrame
# Don't forget to keep 'Price' if you are still in the training phase!
df_reduced = df[top_feature_names + ['Price']]

Total features: 81
Features needed for 95% importance: 40
Top Features to keep: ['Kms Driven', 'Mileage', 'Ground Clearance Unladen', 'Registration Year', 'Height', 'Max_Power_clean', 'Petrol Mileage ARAI', 'Top Speed', 'Length', 'Ownership', 'Width', 'Wheel Base', 'Max_Torque_RPM', 'Max_Torque_Nm', 'Fuel Supply System_MPFi', 'Engine Displacement', 'Gross Weight', 'Insurance_Third Party', 'Petrol Fuel Tank Capacity', 'Turbo Charger_Yes', 'Max_Power_RPM', 'Fuel Supply System_Direct Injection', 'Gearbox', 'Fuel Supply System_EFI', 'Diesel Mileage ARAI', 'Emission Norm Compliance_BS VI', 'Transmission_Manual', 'Transmission Type_Manual', 'Diesel Fuel Tank Capacity', 'Steering Column_Tilt Only', 'Steering Column_Fixed', 'Insurance_Zero Dep', 'Steering Column_Tilt and Telescopic', 'Engine Type_V-Type', 'Steering Type_Power', 'Emission Norm Compliance_BS IV', 'Seats', 'Engine Type_Standard', 'Drive Type_4WD', 'Emission Norm Compliance_BS VI 2.0']


In [27]:
print(feature_importance_df.tail(15))

                                              Feature    Importance
62                       Rear Suspension_Trailing Arm  4.789149e-04
11                                Valves Per Cylinder  4.724679e-04
74                          Motor Type_Not Applicable  4.147139e-04
50                      Emission Norm Compliance_BS V  2.956514e-04
77                           Battery Type_Lithium-ion  2.735225e-04
63                            Steering Type_Hydraulic  2.705369e-04
76  Motor Type_PMSM (Permanent Magnet Synchronous ...  2.437064e-04
54                       Front Suspension_Independent  2.281037e-04
53                       Emission Norm Compliance_ZEV  1.670190e-04
80                               Charging Port_CCS-II  1.587793e-04
79                        Battery Type_Not Applicable  1.329888e-04
31                                 Fuel Type_Electric  9.093446e-05
78                                 Battery Type_Ni-MH  5.351122e-05
24                                       Super C

In [None]:
# Fit data into the model 
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R^2: {r2}")

MAE: 175.57020568847656
MSE: 44330.0546875
R^2: 0.0027835965156555176


In [None]:
baseline_pred = np.full_like(y_test, y_train.mean())
baseline_mae = mean_absolute_error(y_test, baseline_pred)

print(f"Baseline MAE: {baseline_mae}")
print(f"Difference between Baseline and Model: {mae - baseline_mae}")

Baseline MAE: 178.02618328298087
Difference between Baseline and Model: -2.4559775945043043


In [None]:
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
mape

np.float64(497.7720617159054)

In [None]:
# 1. Check data types - Everything MUST be int or float
print(df.dtypes.value_counts()) 

# 2. Check the Price column specifically
print(f"Target Type: {df['Price'].dtype}")

# 3. Ensure Price is NOT in your X
X = df.drop(columns=['Price'])
y = df['Price']

bool       53
float64    18
int64      11
Name: count, dtype: int64
Target Type: int64


In [None]:
ssss

NameError: name 'ssss' is not defined

0. Registration Year

In [None]:
df['Registration Year'].dtype

dtype('float64')

---

---

1. Insurance

In [None]:
insurance_hierarchy = {
    'Third Party': 1,
    'Comprehensive': 2,
    'Zero Dep': 3
}
df = apply_ordinal_cleaning(df, 'Insurance', mapping=insurance_hierarchy)
df['Insurance'].value_counts()

Insurance
2    1940
0    1775
1    1083
3     166
Name: count, dtype: int64

2. Fuel

In [None]:
df = apply_one_hot_cleaning(df, 'Fuel Type')

Removing old feature: Fuel Type
Note for Frontend: The baseline (dropped) column is: Fuel Type_CNG


3. Seats

In [None]:
df['Seats'].value_counts()

Seats
5.0    4365
7.0     439
8.0      71
6.0      45
4.0      42
2.0       2
Name: count, dtype: int64

4. Kms Driven

In [None]:
# Reversing the order: Low mileage = High Value (5)
kms_mapping_reversed = {
    "500 - 27000": 5,
    "27000 - 42143": 4,
    "42143 - 60000": 3,
    "60000 - 86000": 2,
    "86000 - 160000": 1
}

# Apply your function with this logic
df = apply_ordinal_cleaning(df, 'Kms Driven', mapping=kms_mapping_reversed)

5. Ownership

In [None]:
df['Ownership'].value_counts()

Ownership
1    3411
2    1277
3     215
5      60
4       1
Name: count, dtype: int64

6. Engine Displacement

In [None]:
# Mapping from smallest engine to largest engine
engine_mapping = {
    "796 - 1197": 1,   # Small/Economy engines
    "1197 - 1199": 2,
    "1199 - 1497": 3,
    "1497 - 5461": 4    # Large/Powerful engines
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Engine Displacement', mapping=engine_mapping)

7. Transmission

In [None]:
df = apply_one_hot_cleaning(df, 'Transmission')

Removing old feature: Transmission
Note for Frontend: The baseline (dropped) column is: Transmission_Automatic


8. Power

In [None]:
power_mapping = {
    "37 - 67": 1,     # Low Power
    "72 - 78": 2,
    "78 - 85": 3,
    "85 - 108": 4,
    "108 - 138": 5,
    "138 - 536": 6    # High Power
}

df = apply_ordinal_cleaning(df, 'Power', mapping=power_mapping)

9. Drive Type

In [None]:
df = apply_one_hot_cleaning(df, 'Drive Type')

Removing old feature: Drive Type
Note for Frontend: The baseline (dropped) column is: Drive Type_2WD


10. Mileage

In [None]:
mileage_mapping = {
    "10 - 15": 3,    # Medium Mileage
    "17 - 18": 2,    
    "18 - 21": 1,
    "21 - 34": 0    # High Mileage
}

df = apply_ordinal_cleaning(df, 'Mileage', mapping=mileage_mapping)

11. Engine Type

In [None]:
df = apply_one_hot_cleaning(df, 'Engine Type')

Removing old feature: Engine Type
Note for Frontend: The baseline (dropped) column is: Engine Type_In-Line


12. Max_Power_clean

In [None]:
power_map = {
    "101 - 121": 4,     # Low Power
    "34 - 73": 1,
    "81 - 101": 3,
    "73 - 81": 2,
    "121 - 536": 5    # High Power
}
df = apply_ordinal_cleaning(df, 'Max_Power_clean', mapping=power_map)

13. Max_Power_RPM

In [None]:
power_rpm_mapping = {
    "4400 - 6000": 2,
    "3000 - 4400": 1,    
    "6000 - 7900": 3
}
df = apply_ordinal_cleaning(df, 'Max_Power_RPM', mapping=power_rpm_mapping)

In [None]:
# Mapping from lowest RPM range to highest
torque_rpm_mapping = {
    "1400 - 2800": 1,
    "2800 - 3500": 2,
    "3500 - 4000": 3,
    "4000 - 4500": 4,
    "4500 - 6000": 5
}

# Apply your function (this replaces the old string column with these numbers)
df = apply_ordinal_cleaning(df, 'Max_Torque_RPM', mapping=torque_rpm_mapping)

14. Max_Torque_Nm

In [None]:
torque_mapping = {
    "11-101" : 1,
    "101-113" : 2,
    "113-146" : 3,
    "146-242" : 4,
    "242-750" : 5
}
df = apply_ordinal_cleaning(df, 'Max_Torque_Nm', mapping=torque_mapping)

15. No. of Cylinders

In [None]:
df['No. of Cylinders'].value_counts()

No. of Cylinders
4.0    3729
3.0    1095
6.0     133
8.0       7
Name: count, dtype: int64

16. Valves Per Cylinder

In [None]:
df['Valves Per Cylinder'].value_counts()

Valves Per Cylinder
4.0    4618
3.0     222
2.0     124
Name: count, dtype: int64

17. Fuel Supply System

In [None]:
df = apply_one_hot_cleaning(df, 'Fuel Supply System')

Removing old feature: Fuel Supply System
Note for Frontend: The baseline (dropped) column is: Fuel Supply System_CRDi


18. Turbo Charger

In [None]:
df = apply_one_hot_cleaning(df, 'Turbo Charger')

Removing old feature: Turbo Charger
Note for Frontend: The baseline (dropped) column is: Turbo Charger_No


19. Transmission Type

In [None]:
df = apply_one_hot_cleaning(df, 'Transmission Type')

Removing old feature: Transmission Type
Note for Frontend: The baseline (dropped) column is: Transmission Type_Automatic


21. Gearbox

In [None]:
df['Gearbox'].value_counts()

Gearbox
5.0     3012
6.0      980
8.0      301
4.0      270
7.0      264
9.0       90
1.0       14
10.0       1
Name: count, dtype: int64

22. Petrol Mileage ARAI

In [None]:

mileage_mapping = {
    "7 - 17": 1,
    "17 - 18": 2,
    "18 - 19": 3,
    "19 - 27": 4,
    "Not Applicable": 0  
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Petrol Mileage ARAI', mapping=mileage_mapping)

23. Petrol Fuel Tank Capacity

In [None]:
fuel_tank_mapping = {
    "27 - 35": 1,
    "35 - 40": 2,
    "40 - 43": 3,
    "43 - 93": 4,
    "Not Applicable": 0  
}

df = apply_ordinal_cleaning(df, 'Petrol Fuel Tank Capacity', mapping=fuel_tank_mapping) 

24. Emission Norm Compliance

In [None]:
df = apply_one_hot_cleaning(df, 'Emission Norm Compliance')

Removing old feature: Emission Norm Compliance
Note for Frontend: The baseline (dropped) column is: Emission Norm Compliance_BS III


25. Front Suspension

In [None]:
df['Front Suspension'].value_counts()

Front Suspension
MacPherson Strut    4451
Standard/Other       322
Double Wishbone      169
Multi-Link            13
Independent            9
Name: count, dtype: int64

In [None]:
df = apply_one_hot_cleaning(df, 'Front Suspension')

Removing old feature: Front Suspension
Note for Frontend: The baseline (dropped) column is: Front Suspension_Double Wishbone


26. Rear Suspension

In [None]:
df = apply_one_hot_cleaning(df, 'Rear Suspension')

Removing old feature: Rear Suspension
Note for Frontend: The baseline (dropped) column is: Rear Suspension_Double Wishbone


27. Steering Type

In [None]:
df = apply_one_hot_cleaning(df, 'Steering Type')

Removing old feature: Steering Type
Note for Frontend: The baseline (dropped) column is: Steering Type_Electric


28. Steering Column

In [None]:
df = apply_one_hot_cleaning(df, 'Steering Column')

Removing old feature: Steering Column
Note for Frontend: The baseline (dropped) column is: Steering Column_Collapsible/Fixed


29. Front Brake Type

In [None]:
df = apply_one_hot_cleaning(df, 'Front Brake Type')

Removing old feature: Front Brake Type
Note for Frontend: The baseline (dropped) column is: Front Brake Type_Other


30 .Rear Brake Type

In [None]:
df = apply_one_hot_cleaning(df, 'Rear Brake Type')

Removing old feature: Rear Brake Type
Note for Frontend: The baseline (dropped) column is: Rear Brake Type_Drum


31. 'Length', 'Width', 'Height'

In [None]:
length_mapping = {
    "4-3746": 1,
    "3746-3995": 2,
    "3995-4220": 3,
    "4220-5391": 4
}
df = apply_ordinal_cleaning(df, 'Length', mapping=length_mapping)

In [None]:
# Mapping from narrowest to widest
width_mapping = {
    "1 - 1647": 1,
    "1647 - 1710": 2,
    "1710 - 1780": 3,
    "1780 - 2230": 4
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Width', mapping=width_mapping)

In [None]:
# Mapping from shortest to tallest
height_mapping = {
    "1 - 1495": 1,
    "1495 - 1530": 2,
    "1530 - 1640": 3,
    "1640 - 1995": 4
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Height', mapping=height_mapping)

34. Ground Clearance Unladen

In [None]:
# Mapping from lowest to highest ground clearance
gc_mapping = {
    "100 - 163": 1,
    "163 - 165": 2,
    "165 - 170": 3,
    "170 - 178": 4,
    "178 - 307": 5
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Ground Clearance Unladen', mapping=gc_mapping)

35. Wheel Base

In [None]:
# Mapping from shortest to longest wheel base
wheelbase_mapping = {
    "2 - 2400": 1,
    "2400 - 2456": 2,
    "2456 - 2550": 3,
    "2550 - 2650": 4,
    "2650 - 3215": 5
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Wheel Base', mapping=wheelbase_mapping)

36. Gross Weight

In [None]:
# Mapping from lightest to heaviest gross weight
weight_mapping = {
    "1140 - 1340": 1,
    "1340 - 1430": 2,
    "1430 - 1520": 3,
    "1520 - 1755": 4,
    "1755 - 3250": 5,
    "Unknown": 0       # Your function will also handle this via the 'MISSING_VAL' logic
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Gross Weight', mapping=weight_mapping)

37. Diesel Mileage ARAI

In [None]:
# Mapping: Higher number = Better Diesel Efficiency
diesel_mapping = {
    "10 - 16": 1,
    "16 - 20": 2,
    "20 - 27": 3,
    "27 - 28": 4,
    "Not Applicable": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Diesel Mileage ARAI', mapping=diesel_mapping)

In [None]:
# Mapping: Higher number = Larger Fuel Tank (indicates larger vehicle class)
diesel_tank_mapping = {
    "35 - 37": 1,
    "37 - 50": 2,
    "50 - 60": 3,
    "60 - 100": 4,
    "Not Applicable": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Diesel Fuel Tank Capacity', mapping=diesel_tank_mapping)

39 .Top Speed

In [None]:
# Mapping: Higher number = Higher Performance / Higher Price Category
top_speed_mapping = {
    "121 - 150": 1,
    "150 - 165": 2,
    "165 - 185": 3,
    "185 - 302": 4,
    "Unknown": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Top Speed', mapping=top_speed_mapping)

40. Super Charge

In [None]:
df['Super Charge'].value_counts()

Super Charge
0    4959
1       5
Name: count, dtype: int64

41. Battery Capacity

In [None]:
# Mapping: Higher number = Larger Battery / More Expensive EV technology
battery_mapping = {
    "26 - 40": 1,
    "40 - 71": 2,
    "71 - 79": 3,
    "79 - 106": 4,
    "No Battery": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Battery Capacity', mapping=battery_mapping)

42. Charging Time DC

In [None]:
# Mapping: Higher number = Better Performance (Faster Charging)
charging_mapping = {
    "30 - 50": 4,     # Fastest - Premium Tech
    "50 - 58": 3,
    "58 - 1220": 2,
    "1220 - 3599": 1, # Slowest
    "No Charging": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Charging Time DC', mapping=charging_mapping)

43. Charging Time AC

In [None]:
# Mapping: Higher number = Better Efficiency (Shorter home charging time)
ac_charging_mapping = {
    "300 - 445": 4,   # Fastest AC charging
    "445 - 480": 3,
    "480 - 564": 2,
    "564 - 900": 1,   # Slowest AC charging
    "No Charging": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Charging Time AC', mapping=ac_charging_mapping)

44.Motor Type

In [None]:
# Apply your One-Hot function to Motor Type
df = apply_one_hot_cleaning(df, 'Motor Type')

Removing old feature: Motor Type
Note for Frontend: The baseline (dropped) column is: Motor Type_Mild Hybrid Motor


45. Battery Type

In [None]:
# Mapping: Higher number = More advanced/expensive battery technology
battery_type_mapping = {
    "Ni-MH": 1,                # Older hybrid tech
    "Lithium-ion": 2,          # Standard modern EV tech
    "Blade Battery (LFP)": 3,  # Premium safety/durability tech
    "Not Applicable": 0
}

# Apply your function
df = apply_ordinal_cleaning(df, 'Battery Type', mapping=battery_type_mapping)

In [None]:
df = apply_one_hot_cleaning(df, 'Charging Port')

Removing old feature: Charging Port
Note for Frontend: The baseline (dropped) column is: Charging Port_CCS-I


In [None]:
# Print the missing values in each dataframe
print(f"The number of missing vlaues in the column 'Price': {df['Price'].isnull().sum()}, ({(df['Price'].isnull().sum() / df['Price'].shape[0] * 100):.2f}%)")

The number of missing vlaues in the column 'Price': 86, (1.73%)


In [None]:
# Fill missing values with the mean of th column 
df['Price'] = df['Price'].fillna(df['Price'].mean())

In [None]:
df.columns

Index(['Registration Year', 'Insurance', 'Seats', 'Kms Driven', 'Ownership',
       'Engine Displacement', 'Power', 'Mileage', 'Max_Power_clean',
       'Max_Power_RPM', 'Max_Torque_Nm', 'Max_Torque_RPM', 'No. of Cylinders',
       'Valves Per Cylinder', 'Gearbox', 'Petrol Mileage ARAI',
       'Petrol Fuel Tank Capacity', 'Length', 'Width', 'Height',
       'Ground Clearance Unladen', 'Wheel Base', 'Gross Weight', 'Price',
       'Diesel Mileage ARAI', 'Diesel Fuel Tank Capacity', 'Top Speed',
       'Super Charge', 'Battery Capacity', 'Charging Time DC',
       'Charging Time AC', 'Battery Type', 'Fuel Type_Diesel',
       'Fuel Type_Electric', 'Fuel Type_Petrol', 'Transmission_Manual',
       'Drive Type_4WD', 'Drive Type_AWD', 'Drive Type_FWD', 'Drive Type_RWD',
       'Engine Type_Standard', 'Engine Type_Standard Brand Engine',
       'Engine Type_Turbocharged', 'Engine Type_V-Type',
       'Fuel Supply System_Direct Injection', 'Fuel Supply System_EFI',
       'Fuel Supply System

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Price', 'Power'], axis=1), df['Price'], test_size=0.2, random_state=42)

In [None]:
model = xgb.XGBRegressor()

In [None]:
df['Max_Power_clean'].value_counts()

Max_Power_clean
4    1039
1    1018
3    1001
2     970
5     936
Name: count, dtype: int64

In [None]:
# 1. Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000],
    'colsample_bytree': [0.7, 0.8] # Percentage of features used per tree
}

# 2. Initialize the Grid Search
# We use 'neg_mean_absolute_error' because GridSearch tries to "maximize" a score
grid_search = GridSearchCV(
    estimator=model, 
    param_grid=param_grid, 
    cv=3,           # 3-fold cross-validation
    scoring='neg_mean_absolute_error', 
    verbose=1, 
    n_jobs=-1       # Use all available CPU cores
)

# 3. Fit the grid search to the data
# (Ensure your X_train and y_train are ready)
grid_search.fit(X_train, y_train)

# 4. Extract the best model
best_model = grid_search.best_estimator_

print(f"Best Parameters found: {grid_search.best_params_}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits


Best Parameters found: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}


In [None]:
# Fit data into the model 
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R^2: {r2}")

MAE: 541760.7273097517
MSE: 974709372973.4718
R^2: 0.10618414393836773
