# Used Car Price Prediction

## Import Liberaries

In [20]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

## Load The Data

In [21]:
df=pd.read_csv('used_cars_dataset.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   object 
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   object 
 5   seller_type         8128 non-null   object 
 6   transmission        8128 non-null   object 
 7   owner               8128 non-null   object 
 8   mileage(km/ltr/kg)  7907 non-null   float64
 9   engine              7907 non-null   float64
 10  max_power           7913 non-null   object 
 11  seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 762.1+ KB


In [23]:
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage(km/ltr/kg)', 'engine', 'max_power',
       'seats'],
      dtype='object')

In [24]:
df.shape

(8128, 12)

## Copying the Dataset for safe use

In [25]:
df_copy=df.copy()

## Data PreProcessing

In [26]:
# Dropping unnecessary columns
df_copy.drop(["name"],axis=1,inplace=True)

# Converting year into age for better processing
df_copy["car_age"]=2025 - df_copy["year"]
df_copy.drop("year",axis=1,inplace=True)

# Clean numerical-like columns
for col in ["mileage(km/ltr/kg)", "engine", "max_power"]:
    df_copy[col] = df_copy[col].astype(str).str.extract(r"(\d+\.?\d*)")[0]
    df_copy[col] = pd.to_numeric(df_copy[col])

# Encode categorical columns
encoder=LabelEncoder()
for col in ["fuel","seller_type","transmission","owner"]:
    if col in df_copy.columns and df_copy[col].dtype=="object":
        df_copy[col]=encoder.fit_transform(df_copy[col])

# Fill missing values
df_copy["mileage(km/ltr/kg)"].fillna(df_copy["mileage(km/ltr/kg)"].median(), inplace=True)
df_copy["engine"].fillna(df_copy["engine"].median(), inplace=True)
df_copy["max_power"].fillna(df_copy["max_power"].median(), inplace=True)
df_copy["seats"].fillna(df_copy["seats"].mode()[0], inplace=True)

## Spliting the data

In [27]:
X=df_copy.drop("selling_price", axis=1)
y=df_copy["selling_price"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## Train The Model

In [28]:
# Linear Regression Model
linear_model=LinearRegression()
linear_model.fit(X_train,y_train)

# Predicting
linear_predict=linear_model.predict(X_test)

# Evaluate
mse=mean_squared_error(y_test,linear_predict)
R2=r2_score(y_test,linear_predict)

# Result
print("Linear Regression MSE:",mse)
print("Linear Regression R2 Score:",R2)

Linear Regression MSE: 207737269400.9737
Linear Regression R2 Score: 0.683078259992373


In [29]:
#Random Forest Regressor
random_forest_model=RandomForestRegressor(n_estimators=100,random_state=42)
random_forest_model.fit(X_train, y_train)

# Predicting
random_forest_predict=random_forest_model.predict(X_test)

# Evaluate
mse=mean_squared_error(y_test,random_forest_predict)
R2=r2_score(y_test,random_forest_predict)

# Result
print("Random Forest MSE:",mse)
print("Random Forest R2 Score:",R2)

Random Forest MSE: 21002395110.354004
Random Forest R2 Score: 0.9679589723023967


In [30]:
#SVR model
svr_model=SVR()
svr_model.fit(X_train, y_train)

# Predicting
svr_predict=svr_model.predict(X_test)

# Evaluate
mse=mean_squared_error(y_test,svr_predict)
R2=r2_score(y_test,svr_predict)

# Result
print("SVR MSE:", mse)
print("SVR R2 Score:", R2)

SVR MSE: 690173075320.5261
SVR R2 Score: -0.05292060768740847


### Saving the models

In [31]:
# with open('linear_model.pkl', 'wb') as f:
#     pickle.dump(linear_model, f)
# with open('rf_model.pkl', 'wb') as f:
#     pickle.dump(random_forest_model, f)
# with open('svr_model.pkl', 'wb') as f:
#     pickle.dump(svr_model, f)