In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("CAR DETAILS.csv")

In [3]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
data.dtypes

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

In [5]:
data.shape

(4340, 8)

In [6]:
data.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [7]:
data.duplicated().sum()

763

In [8]:
data = data.drop_duplicates()

In [9]:
data.duplicated().sum()

0

In [10]:
def extract_manufacturer(name):
    return name.split()[0]

def extract_model(name):
    return ' '.join(name.split()[1:-1])

def extract_variant(name):
    return name.split()[-1]

In [11]:
data['Manufacturer'] = data['name'].apply(extract_manufacturer)
data['Model'] = data['name'].apply(extract_model)
data['Variant'] = data['name'].apply(extract_variant)

In [12]:
data.drop(columns=['name'], inplace=True)

In [13]:
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,Manufacturer,Model,Variant
0,2007,60000,70000,Petrol,Individual,Manual,First Owner,Maruti,800,AC
1,2007,135000,50000,Petrol,Individual,Manual,First Owner,Maruti,Wagon R LXI,Minor
2,2012,600000,100000,Diesel,Individual,Manual,First Owner,Hyundai,Verna 1.6,SX
3,2017,250000,46000,Petrol,Individual,Manual,First Owner,Datsun,RediGO T,Option
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner,Honda,Amaze VX,i-DTEC


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3577 entries, 0 to 4339
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           3577 non-null   int64 
 1   selling_price  3577 non-null   int64 
 2   km_driven      3577 non-null   int64 
 3   fuel           3577 non-null   object
 4   seller_type    3577 non-null   object
 5   transmission   3577 non-null   object
 6   owner          3577 non-null   object
 7   Manufacturer   3577 non-null   object
 8   Model          3577 non-null   object
 9   Variant        3577 non-null   object
dtypes: int64(3), object(7)
memory usage: 307.4+ KB


In [15]:
data.describe()

Unnamed: 0,year,selling_price,km_driven
count,3577.0,3577.0,3577.0
mean,2012.962538,473912.5,69250.545709
std,4.251759,509301.8,47579.940016
min,1992.0,20000.0,1.0
25%,2010.0,200000.0,36000.0
50%,2013.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [16]:
cat_cols = data.select_dtypes(include='object').columns
print(cat_cols)

Index(['fuel', 'seller_type', 'transmission', 'owner', 'Manufacturer', 'Model',
       'Variant'],
      dtype='object')


In [17]:
num_cols = data.select_dtypes(exclude='object').columns
print(num_cols)

Index(['year', 'selling_price', 'km_driven'], dtype='object')


### Data Preprocessing

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
cat_cols

Index(['fuel', 'seller_type', 'transmission', 'owner', 'Manufacturer', 'Model',
       'Variant'],
      dtype='object')

In [20]:
label_encoder = LabelEncoder()

In [21]:
for feature in cat_cols:
    data[feature] = label_encoder.fit_transform(data[feature])

In [22]:
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,Manufacturer,Model,Variant
0,2007,60000,70000,4,1,1,0,18,12,49
1,2007,135000,50000,4,1,1,0,18,979,199
2,2012,600000,100000,1,1,1,0,10,934,241
3,2017,250000,46000,4,1,1,0,5,701,205
4,2014,450000,141000,1,1,1,2,9,66,307


In [23]:
X = data.drop(columns=['selling_price'])
y = data['selling_price']

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2861, 9)
(716, 9)
(2861,)
(716,)


# Linear Regression

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [29]:
y_pred_lr = lr.predict(X_test)

In [30]:
from sklearn.metrics import mean_squared_error, r2_score
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Mean Squared Error (MSE):", mse_lr)
print("R-squared (R2) Score:", r2_lr)

Mean Squared Error (MSE): 198450063608.8422
R-squared (R2) Score: 0.38395058804781845


# Ridge and Lasso

In [31]:
from sklearn.linear_model import Ridge, Lasso

In [32]:
rid = Ridge()
rid.fit(X_train, y_train)

In [33]:
y_pred_rid = rid.predict(X_test)

In [34]:
mse_rid = mean_squared_error(y_test, y_pred_rid)
r2_rid = r2_score(y_test, y_pred_rid)

print("Mean Squared Error (MSE):", mse_rid)
print("R-squared (R2) Score:", r2_rid)

Mean Squared Error (MSE): 198433749525.8626
R-squared (R2) Score: 0.3840012319279127


In [35]:
las = Lasso()
las.fit(X_train, y_train)

In [36]:
y_pred_las = las.predict(X_test)

In [37]:
mse_las = mean_squared_error(y_test, y_pred_las)
r2_las = r2_score(y_test, y_pred_las)

print("Mean Squared Error (MSE):", mse_las)
print("R-squared (R2) Score:", r2_las)

Mean Squared Error (MSE): 198450148966.8735
R-squared (R2) Score: 0.3839503230705035


# Decision Tree Regressor

In [38]:
from sklearn.tree import DecisionTreeRegressor

In [39]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

In [40]:
y_pred_dtr = dtr.predict(X_test)

In [41]:
mse_dtr = mean_squared_error(y_test, y_pred_dtr)
r2_dtr = r2_score(y_test, y_pred_dtr)

print("Mean Squared Error (MSE):", mse_dtr)
print("R-squared (R2) Score:", r2_dtr)

Mean Squared Error (MSE): 204374379933.28946
R-squared (R2) Score: 0.3655597066264439


# Random Forest Regressor

In [42]:
from sklearn.ensemble import RandomForestRegressor

In [43]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

In [44]:
y_pred_rfr = rfr.predict(X_test)

In [45]:
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)

print("Mean Squared Error (MSE):", mse_rfr)
print("R-squared (R2) Score:", r2_rfr)

Mean Squared Error (MSE): 118638635132.88454
R-squared (R2) Score: 0.6317095591741273


# Gradient Boosting Regressor

In [46]:
from sklearn.ensemble import GradientBoostingRegressor

In [47]:
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

In [48]:
y_pred_gbr = gbr.predict(X_test)

In [49]:
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print("Mean Squared Error (MSE):", mse_gbr)
print("R-squared (R2) Score:", r2_gbr)

Mean Squared Error (MSE): 137107500507.86095
R-squared (R2) Score: 0.5743765785402463


# Support Vector Regression

In [50]:
from sklearn.svm import SVR

In [51]:
svr = SVR()
svr.fit(X_train, y_train)

In [52]:
y_pred_svr = svr.predict(X_test)

In [53]:
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("Mean Squared Error (MSE):", mse_svr)
print("R-squared (R2) Score:", r2_svr)

Mean Squared Error (MSE): 339670045609.20935
R-squared (R2) Score: -0.05443922793482314


In [54]:
models = {
    "Linear Regression": [mse_lr, r2_lr],
    "Ridge Regression": [mse_rid, r2_rid],
    "Lasso Regression": [mse_las, r2_las],
    "Decision Tree Regression": [mse_dtr, r2_dtr],
    "Random Forest Regression": [mse_rfr, r2_rfr],
    "Gradient Boosting Regression": [mse_gbr, r2_gbr],
    "Support Vector Regression": [mse_svr, r2_svr]
}

In [55]:
type(models)

dict

In [56]:
summary = pd.DataFrame(models).T.reset_index()

In [57]:
summary.columns = ['Model','MSE','R2 Score']

In [58]:
summary

Unnamed: 0,Model,MSE,R2 Score
0,Linear Regression,198450100000.0,0.383951
1,Ridge Regression,198433700000.0,0.384001
2,Lasso Regression,198450100000.0,0.38395
3,Decision Tree Regression,204374400000.0,0.36556
4,Random Forest Regression,118638600000.0,0.63171
5,Gradient Boosting Regression,137107500000.0,0.574377
6,Support Vector Regression,339670000000.0,-0.054439


In [59]:
import joblib

In [60]:
# Train and save the best model (Random Forest Regression)

best_model = RandomForestRegressor()
best_model.fit(X_train, y_train)

In [61]:
# Save the model in .pkl format

filename = "best_model_random_forest.pkl"
joblib.dump(best_model, filename)

['best_model_random_forest.pkl']

In [62]:
# Load the saved model

filename = "best_model_random_forest.pkl"
loaded_model = joblib.load(filename)