In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv("CAR DETAILS.csv")

In [4]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [5]:
data.dtypes

name             object
year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

In [6]:
data.shape

(4340, 8)

In [7]:
data.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [8]:
data.duplicated().sum()

763

In [9]:
data = data.drop_duplicates()

In [10]:
data.duplicated().sum()

0

In [11]:
def extract_manufacturer(name):
    return name.split()[0]

def extract_model(name):
    return ' '.join(name.split()[1:-1])

def extract_variant(name):
    return name.split()[-1]

In [12]:
data['Manufacturer'] = data['name'].apply(extract_manufacturer)
data['Model'] = data['name'].apply(extract_model)
data['Variant'] = data['name'].apply(extract_variant)

In [13]:
data.drop(columns=['name'], inplace=True)

In [14]:
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,Manufacturer,Model,Variant
0,2007,60000,70000,Petrol,Individual,Manual,First Owner,Maruti,800,AC
1,2007,135000,50000,Petrol,Individual,Manual,First Owner,Maruti,Wagon R LXI,Minor
2,2012,600000,100000,Diesel,Individual,Manual,First Owner,Hyundai,Verna 1.6,SX
3,2017,250000,46000,Petrol,Individual,Manual,First Owner,Datsun,RediGO T,Option
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner,Honda,Amaze VX,i-DTEC


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3577 entries, 0 to 4339
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           3577 non-null   int64 
 1   selling_price  3577 non-null   int64 
 2   km_driven      3577 non-null   int64 
 3   fuel           3577 non-null   object
 4   seller_type    3577 non-null   object
 5   transmission   3577 non-null   object
 6   owner          3577 non-null   object
 7   Manufacturer   3577 non-null   object
 8   Model          3577 non-null   object
 9   Variant        3577 non-null   object
dtypes: int64(3), object(7)
memory usage: 307.4+ KB


In [16]:
data.describe()

Unnamed: 0,year,selling_price,km_driven
count,3577.0,3577.0,3577.0
mean,2012.962538,473912.5,69250.545709
std,4.251759,509301.8,47579.940016
min,1992.0,20000.0,1.0
25%,2010.0,200000.0,36000.0
50%,2013.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [17]:
cat_cols = data.select_dtypes(include='object').columns
print(cat_cols)

Index(['fuel', 'seller_type', 'transmission', 'owner', 'Manufacturer', 'Model',
       'Variant'],
      dtype='object')


In [18]:
num_cols = data.select_dtypes(exclude='object').columns
print(num_cols)

Index(['year', 'selling_price', 'km_driven'], dtype='object')


### Data Preprocessing

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
cat_cols

Index(['fuel', 'seller_type', 'transmission', 'owner', 'Manufacturer', 'Model',
       'Variant'],
      dtype='object')

In [21]:
label_encoder = LabelEncoder()

In [22]:
for feature in cat_cols:
    data[feature] = label_encoder.fit_transform(data[feature])

In [23]:
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,Manufacturer,Model,Variant
0,2007,60000,70000,4,1,1,0,18,12,49
1,2007,135000,50000,4,1,1,0,18,979,199
2,2012,600000,100000,1,1,1,0,10,934,241
3,2017,250000,46000,4,1,1,0,5,701,205
4,2014,450000,141000,1,1,1,2,9,66,307


In [24]:
X = data.drop(columns=['selling_price'])
y = data['selling_price']

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2861, 9)
(716, 9)
(2861,)
(716,)


In [28]:
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,Manufacturer,Model,Variant
0,2007,60000,70000,4,1,1,0,18,12,49
1,2007,135000,50000,4,1,1,0,18,979,199
2,2012,600000,100000,1,1,1,0,10,934,241
3,2017,250000,46000,4,1,1,0,5,701,205
4,2014,450000,141000,1,1,1,2,9,66,307


In [40]:
random_sample = data.sample(n=20, random_state=42)

In [41]:
random_sample

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,Manufacturer,Model,Variant
1045,2018,270000,20000,4,1,1,0,18,275,77
3194,2017,525000,39000,4,0,1,0,18,857,77
1530,2016,990000,146000,1,0,1,0,26,491,243
553,2018,3800000,10000,1,0,0,0,19,131,25
1612,2007,229999,70000,4,1,1,2,9,186,195
2652,2010,325000,120000,1,1,1,4,17,1036,44
152,2008,210000,53772,4,1,1,0,10,399,144
3985,2016,500000,35000,1,1,1,0,6,674,118
1059,2018,790000,19571,1,0,1,0,9,528,107
3788,2015,360000,80000,4,1,1,2,9,57,310


In [30]:
X_test = random_sample.drop(columns=['selling_price'])
y_test = random_sample['selling_price']

In [32]:
import joblib

In [33]:
filename = "best_model_random_forest.pkl"
loaded_model = joblib.load(filename)

In [34]:
y_pred_test = loaded_model.predict(X_test)

In [38]:
from sklearn.metrics import mean_squared_error, r2_score

mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

print("Mean Squared Error (MSE):", mse_test)
print("R-squared (R2) Score:", r2_test)

Mean Squared Error (MSE): 81238284543.8484
R-squared (R2) Score: 0.8630232919856404
