# Used Car Price Prediction

Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df=pd.read_csv('used_cars.csv')
df.sample(5)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3533,Lexus,GX 460 Base,2010,"97,097 mi.",Gasoline,301.0HP 4.6L 8 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,At least 1 accident or damage reported,Yes,"$22,000"
1738,Volkswagen,Jetta GLI,2014,"99,499 mi.",Gasoline,210.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,At least 1 accident or damage reported,Yes,"$15,000"
515,Acura,TLX A-Spec,2021,"16,878 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,10-Speed Automatic,Red,Ebony,None reported,Yes,"$38,781"
2467,Lexus,GX 460 Premium,2020,"22,466 mi.",Gasoline,301.0HP 4.6L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Black,None reported,Yes,"$45,900"
2523,BMW,335 i xDrive,2012,"153,600 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,"$17,500"


In [None]:
# checking dtype of columns

print(df.info())
print(df.shape)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   object
 4   fuel_type     3839 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   object
dtypes: int64(1), object(11)
memory usage: 376.0+ KB
None
(4009, 12)


Checking Null Values

In [5]:
df.isnull().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

### What to do with each Column
1. Brand: Apply Target Encoding
2. model: drop it
3. model_year: can do feature extraction
4. mileage: clean it 
5. fuel type: drop useless categories
6. engine: feature engineer it to get different attributes
7. transmission:Convert it to manual/automatic
8. ext_col: Group rare colors into 'other' category
9. int_col: Group rare colors into 'other' category
10. accident: Make it boolean
11. clean_title: drop it
12. Price: clean it

Price

In [6]:
df['price']=df['price'].apply(lambda x:x.replace('$','').replace(',', ''))
df['price']=df['price'].astype('int')

Mileage

In [None]:
df['milage']=df['milage'].apply(lambda x:x.replace(',', '').replace('mi.',''))
df['milage']=df['milage'].astype('int')

Brand

In [None]:
# Checking No of categories 
df['brand'].nunique()

There are 57 so best is to apply Target Encoding or mean encoding

Model

In [None]:
df.drop(columns='model',inplace=True)

Model Year

In [None]:
import datetime
current_year = datetime.datetime.now().year
df['car_age']=current_year-df['model_year']

Fuel Type

In [None]:
df['fuel_type'].value_counts()

In [None]:
df=df[df['fuel_type']!='–']
df=df[df['fuel_type']!='not supported']

In [None]:
df['fuel_type'].value_counts()

Engine

In [None]:
df['engine'].nunique()

Making Horsepower column

In [None]:
df['engine_HP']=df['engine'].str.extract(r'(\d+\.?\d*)HP').astype('float')

In [None]:
df.sample(5)

Displacement

In [None]:
df['displacement']=df['engine'].str.extract(r'(\d+\.?\d*)L | Liter').astype('float')


Cylinder

In [None]:
# Leave it for now


In [None]:
df.sample(5)

Turbo

In [None]:
df['is_Turbo']=df['engine'].str.contains('Turbo',case=False,na=False)

df.sample(5)

Dropping Engine

In [None]:
df.drop(columns='engine',inplace=True)

Transmission

In [None]:
df['transmission'].value_counts()

In [None]:
df['automatic']=df['transmission'].str.contains(r'A/T|Automatic',case=False,na=False)

# Here we are doing mampping

df['transmission'] = df['automatic'].map({True: 'Automatic', False: 'Manual'})

df.drop(columns='automatic',inplace=True)

In [None]:
df.sample(5)

Exterior Color

In [None]:
count=(df['ext_col'].value_counts())
rare_color=count[count<100].index
df['ext_col']=df['ext_col'].replace(rare_color, 'Others')

Interior Color

In [None]:
count=df['int_col'].value_counts()
rare_color=count[count<100].index
df['int_col']=df['int_col'].replace(rare_color, 'Others')
df=df[df['int_col']!='–']

Accident

In [None]:
df['accident']=df['accident'].map({'At least 1 accident or damage reported':True, 'None reported':False})

Clean Title 

In [None]:
df.drop(columns=['clean_title'],inplace=True)

Applying Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
X=df.drop(columns='price',axis=1)

y=np.log1p(df['price'])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
df.sample(3)

Handling Missing Values

In [None]:
df.isnull().sum()

Using Pipeline to apply Target Encoding on Brand and Linear Regression model

In [None]:
# !pip install category_encoders

In [None]:
X_train.sample(2)

In [None]:
X_train.isnull().sum()

In [None]:
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LinearRegression
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Accidental
X_train['accident']=X_train['accident'].fillna(False)
X_test['accident']=X_test['accident'].fillna(False)

In [None]:
X_train.head(2)

In [None]:
fuel_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

df['accident'] = df['accident'].map({True: 1, False: 0})


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import category_encoders as ce

num_features = ['milage','car_age','engine_HP','displacement','accident']
cat_features = ['fuel_type','transmission','ext_col','int_col']

col = ColumnTransformer(
    transformers=[
        ('brand_te',
         ce.TargetEncoder(
             cols=['brand'],
             handle_unknown='value',
             handle_missing='value'
         ),
         ['brand']),

        ('num',
         SimpleImputer(strategy='median'),
         num_features),

        ('cat',
         Pipeline([
             ('imputer', SimpleImputer(strategy='most_frequent')),
             ('ohe', OneHotEncoder(handle_unknown='ignore'))
         ]),
         cat_features)
    ]
)


Linear Regression Pipeline

In [None]:
# pipe=Pipeline(
#     steps=[
#     ('pre_process',col),
#     ('LR',LinearRegression()    
#     )]
# )
# pipe.fit(X_train,y_train)
# y_pred_log=pipe.predict(X_test)
# y_pred=np.expm1(y_pred_log)


Polynomial Regression Pipeline

In [None]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LinearRegression

# pipe = Pipeline([
#     ('pre_process',col),
#     ('poly', PolynomialFeatures(degree=2, include_bias=False)),
#     ('lr', LinearRegression())
# ])


# pipe.fit(X_train,y_train)
# y_pred_log=pipe.predict(X_test)
# y_pred=np.expm1(y_pred_log)

Random Forest Regressor (RF)

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import r2_score, mean_absolute_error

# # Pipeline
# pipe = Pipeline([
#     ('pre_process', col),                     
#     ('rf', RandomForestRegressor(
#         n_estimators=200, 
#         max_depth=None, 
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

# # Fit
# pipe.fit(X_train, y_train)

# # Predict
# y_pred_log = pipe.predict(X_test)
# y_pred = np.expm1(y_pred_log)




GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

pipe = Pipeline([
    ('pre_process', col),
    ('gb', GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        random_state=42
    ))
])

# Fit
pipe.fit(X_train, y_train)

# Predict
y_pred_log = pipe.predict(X_test)
y_pred = np.expm1(y_pred_log)



Evaluation Metrics

In [None]:
from sklearn.metrics import r2_score

y_test_price = np.expm1(y_test)
y_pred_price = np.expm1(y_pred_log)

r2 = r2_score(y_test_price, y_pred_price)
print("R² Score:", r2)


from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    pipe, X, y, cv=5, scoring='r2'
)
print(f'Cross Val Score for R2 {scores.mean()}')

from sklearn.metrics import mean_absolute_percentage_error

y_test_price = np.expm1(y_test)
y_pred_price = np.expm1(y_pred_log)

mape = mean_absolute_percentage_error(y_test_price, y_pred_price)
print("MAPE:", mape)


Plot comparison of 4 models

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Example scores (replace with your actual calculated scores)
models = ['Linear Regression', 'Polynomial Regression', 'Decision Tree', 'Gradient Boosting']
r2_scores = [0.53, 0.62, 0.66, 0.72]
cv_scores = [0.7777, 0.7997, 0.8200, 0.8752]
mape_scores = [0.2757, 0.2735, 0.2500, 0.2187]

# Plot R² Scores
plt.figure(figsize=(10,5))
plt.bar(models, r2_scores, color='skyblue', label='R² Score')
plt.plot(models, cv_scores, marker='o', color='red', label='CV R²')
plt.title('Model Comparison: R² & Cross-Validation R²')
plt.ylabel('Score')
plt.ylim(0,1)
plt.legend()
plt.show()

# Plot MAPE
plt.figure(figsize=(10,5))
plt.bar(models, mape_scores, color='orange')
plt.title('Model Comparison: MAPE')
plt.ylabel('MAPE')
plt.show()


### Comparison of Results
1. Linear Regression <br>
    R² Score: 0.53<br>
    Cross Val Score for R2 0.77769<br>
    MAPE: 0.2757 <br>
2. Polynomial Regression <br>
   R² Score: 0.62 <br>
    Cross Val Score for R2 0.79969 <br>
    MAPE: 0.2735


3. Random Forest Regressor <br>
    R² Score: 0.71<br>
    Cross Val Score for R2 0.84956<br>
    MAPE: 0.2402

4. Gradient Boost Regressor<br>
    R² Score: 0.72<br>
    Cross Val Score for R2 0.8752<br>
    MAPE: 0.2187


Saving Model

In [None]:
import joblib
joblib.dump(pipe,'car_price_prediction.pkl')
print('Model Saved Successfully')