In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import math

In [2]:
data = pd.read_csv("quikr_car.csv")

data.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [3]:
X = data.drop(["name", "Price"], axis=1)
y = data["Price"]
X

Unnamed: 0,company,year,kms_driven,fuel_type
0,Hyundai,2007,"45,000 kms",Petrol
1,Mahindra,2006,40 kms,Diesel
2,Maruti,2018,"22,000 kms",Petrol
3,Hyundai,2014,"28,000 kms",Petrol
4,Ford,2014,"36,000 kms",Diesel
...,...,...,...,...
887,Tara,zest,,
888,Tata,2018,"27,000 kms",Diesel
889,Mahindra,2013,"40,000 kms",Diesel
890,Honda,2014,Petrol,


In [4]:
X["company"] = X["company"].fillna(X["company"].mode()[0])
X["company"] =  LabelEncoder().fit_transform(X['company'])
X

Unnamed: 0,company,year,kms_driven,fuel_type
0,14,2007,"45,000 kms",Petrol
1,20,2006,40 kms,Diesel
2,21,2018,"22,000 kms",Petrol
3,14,2014,"28,000 kms",Petrol
4,11,2014,"36,000 kms",Diesel
...,...,...,...,...
887,31,zest,,
888,32,2018,"27,000 kms",Diesel
889,20,2013,"40,000 kms",Diesel
890,13,2014,Petrol,


In [5]:
X["year"] = X["year"].fillna(X["year"].mode()[0])
X["year"] = X["year"].apply(lambda x: X["year"].mode()[0] if not x.isdigit() else x)
X

Unnamed: 0,company,year,kms_driven,fuel_type
0,14,2007,"45,000 kms",Petrol
1,20,2006,40 kms,Diesel
2,21,2018,"22,000 kms",Petrol
3,14,2014,"28,000 kms",Petrol
4,11,2014,"36,000 kms",Diesel
...,...,...,...,...
887,31,2015,,
888,32,2018,"27,000 kms",Diesel
889,20,2013,"40,000 kms",Diesel
890,13,2014,Petrol,


In [6]:
X["kms_driven"] = X["kms_driven"].fillna(X["kms_driven"].mode()[0])
X["kms_driven"] = X["kms_driven"].apply(lambda x: x.split()[0].replace(",", ""))
X["kms_driven"] = X["kms_driven"].apply(lambda x: int(x) if x.isdigit() else int(X["kms_driven"].mode()[0]))
X

Unnamed: 0,company,year,kms_driven,fuel_type
0,14,2007,45000,Petrol
1,20,2006,40,Diesel
2,21,2018,22000,Petrol
3,14,2014,28000,Petrol
4,11,2014,36000,Diesel
...,...,...,...,...
887,31,2015,35000,
888,32,2018,27000,Diesel
889,20,2013,40000,Diesel
890,13,2014,35000,


In [7]:
X["fuel_type"] = X["fuel_type"].fillna(X["fuel_type"].mode()[0])
X["fuel_type"] = LabelEncoder().fit_transform(X['fuel_type'])
X

Unnamed: 0,company,year,kms_driven,fuel_type
0,14,2007,45000,2
1,20,2006,40,0
2,21,2018,22000,2
3,14,2014,28000,2
4,11,2014,36000,0
...,...,...,...,...
887,31,2015,35000,2
888,32,2018,27000,0
889,20,2013,40000,0
890,13,2014,35000,2


In [8]:
y = y.fillna(y.mode()[0])
y = y.apply(lambda x: x.replace(",", "").replace(" ",""))
y = y.apply(lambda x: int(x) if x.isdigit() else 100000)
y

0       80000
1      425000
2      100000
3      325000
4      575000
        ...  
887    310000
888    260000
889    390000
890    180000
891    160000
Name: Price, Length: 892, dtype: int64

In [9]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Regression metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

MSE: 52530005850.3857
RMSE: 52530005850.3857
R² Score: 0.5501


In [11]:
# Predict custom example
y_pred_cust = model.predict([[6, 2014, 35000, 0]])

# Access the first element
pred_price = int(y_pred_cust[0])
print(f"Predicted price: {pred_price}")

Predicted price: 341039


