In [4]:
import pandas as pd
import numpy as np
from pycaret.regression import *

In [7]:
df=pd.read_csv('lcar.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,brand,model
0,0,1003000,86226,Diesel,Man,1,2017,1956,5,Jeep,Compass 2.0
1,1,1283000,13248,Petrol,Auto,1,2021,1330,5,Renault,Duster RXZ
2,2,1640000,60343,Petrol,Auto,1,2016,2494,5,Toyota,Camry 2.5
3,3,777000,26696,Petrol,Auto,1,2018,1199,5,Honda,Jazz VX
4,4,515000,69414,Petrol,Man,1,2016,1199,5,Volkswagen,Polo 1.2


In [8]:
# Drop unnecessary column
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [9]:
# Feature Engineering
df["car_age"] = 2025 - df["manufacture"]
df["kms_per_year"] = df["kms_driven"] / df["car_age"].replace(0, 1)  # Avoid division by zero


In [15]:
# Convert categorical columns explicitly
df["fuel_type"] = df["fuel_type"].astype(str)
df["transmission"] = df["transmission"].astype(str)
df["brand"] = df["brand"].astype(str)
df["model"] = df["model"].astype(str)  # Treat model names as categorical


In [17]:
# PyCaret Setup
regression_setup = setup(
    data=df,
    target="car_prices_in_rupee",
    ignore_features=["model"],  # Ignore 'model' since it's too specific
    categorical_features=["fuel_type", "transmission", "brand"],
    session_id=42,
    normalize=True  # Normalize numerical features
  
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,car_prices_in_rupee
2,Target type,Regression
3,Original data shape,"(5312, 12)"
4,Transformed data shape,"(5312, 15)"
5,Transformed train set shape,"(3718, 15)"
6,Transformed test set shape,"(1594, 15)"
7,Ignore features,1
8,Numeric features,7
9,Categorical features,3


In [18]:
# compare different models
best_model=compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,268785.9317,326108125728.9486,567010.4194,0.8161,0.4439,0.4609,1.68
et,Extra Trees Regressor,259026.0653,340891814325.6408,573807.0474,0.8096,0.4377,0.4309,1.115
lightgbm,Light Gradient Boosting Machine,285758.7313,344815119368.924,584308.5934,0.8029,0.4602,0.4875,0.608
gbr,Gradient Boosting Regressor,307454.1737,386759573352.0946,619466.9245,0.7802,0.4877,0.5524,0.535
dt,Decision Tree Regressor,321190.7715,510095675210.6701,708488.064,0.7148,0.5386,0.4698,0.121
knn,K Neighbors Regressor,355276.0457,522904626853.404,719275.1465,0.7032,0.5206,0.5937,0.167
ada,AdaBoost Regressor,434503.5755,623957214181.3088,787149.0018,0.6534,0.6209,0.8237,0.185
lr,Linear Regression,431733.6845,629420669864.6067,791471.3929,0.6467,0.7732,0.8307,4.747
br,Bayesian Ridge,431854.8997,629656561244.4929,791624.6726,0.6466,0.7687,0.8299,0.219
lasso,Lasso Regression,431791.4305,629701261345.868,791647.4288,0.6465,0.7664,0.8305,0.217


In [19]:
#Step 3: Create and Train Random Forest Model
rf_model = create_model('rf')  # 'rf' is the identifier for Random Forest in Pycaret

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,279854.0614,273049564943.1246,522541.4481,0.8724,0.3997,0.3926
1,256593.4946,339485187697.1924,582653.5743,0.8013,0.4036,0.3512
2,253281.3217,340385879197.0505,583425.9843,0.8313,0.4352,0.4278
3,259945.3853,363567480343.0406,602965.5714,0.7618,0.4537,0.4883
4,313595.3136,466964650338.3811,683348.118,0.7859,0.4595,0.4421
5,259872.2446,260660845950.8289,510549.5529,0.8262,0.4434,0.434
6,252612.4514,201465923866.3479,448849.5559,0.9032,0.3923,0.3432
7,245275.8513,260764430816.9355,510650.9873,0.8663,0.3872,0.3502
8,275181.1927,320704263353.199,566307.5696,0.742,0.4903,0.5723
9,291648.0009,434033030783.3858,658811.8326,0.7708,0.574,0.8067


In [20]:
# Step 4: Tune the Random Forest Model (Optional, for better performance)
tuned_rf_model = tune_model(rf_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,325360.7113,335459446459.6704,579188.6104,0.8432,0.4376,0.4684
1,307457.1096,429403856246.0106,655289.1394,0.7487,0.4499,0.448
2,292337.8989,335354158613.6319,579097.7108,0.8338,0.4331,0.4684
3,271843.3682,372206420474.389,610087.2237,0.7561,0.4733,0.548
4,339599.9733,501824880986.491,708395.9917,0.77,0.491,0.492
5,283451.8248,314389114591.1528,560704.1239,0.7903,0.4974,0.5549
6,270428.1184,213613087205.3092,462182.9586,0.8973,0.3892,0.3551
7,289880.9292,354201753883.8041,595148.5141,0.8184,0.4287,0.4308
8,298288.2302,358131503937.1168,598440.8943,0.7119,0.5152,0.6285
9,328211.7834,466675201676.8185,683136.298,0.7536,0.5828,0.8486


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [21]:
# Step 5: Finalize the Model (Train on full data)
final_rf_model = finalize_model(tuned_rf_model)

In [22]:
# Step 6: Predictions
predictions = predict_model(final_rf_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,176190.4575,188290379216.1411,433924.3934,0.894,0.3309,0.2805


In [24]:

# Save the optimized model
save_model(tuned_rf_model, "car_model_rf")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['kms_driven', 'ownership',
                                              'manufacture', 'engine', 'Seats',
                                              'car_age', 'kms_per_year'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['fuel_type', 'transmission',
                                              'brand'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('o...
                                     transformer=OneHotEncoder(cols=['fuel_type'],
                                                               handle_missing='return_nan',
                                                               use_cat_names=True))),
                 ('rest_encoding',
                  TransformerWrapper(inclu