In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [4]:
# Load the raw car listings from a CSV file located in the working directory.
df = pd.read_csv('car-details.csv')

In [5]:
# Peek at five random rows. The fixed seed makes the preview identical on
# every Restart & Run All, so the saved output stays reproducible.
df.sample(5, random_state=42)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
3861,Chevrolet Sail Hatchback Diesel,Chevrolet,Sail,Hatchback Diesel,2013,First,Diesel,Individual,Manual,68000,51.94,1248.0,76.9,205.0,5.0,355000
1821,Toyota Innova 2.5 G (Diesel) 7 Seater BS IV,Toyota,Innova,2.5 G (Diesel) 7 Seater BS IV,2011,First,Diesel,Individual,Manual,375000,30.08,2494.0,102.0,200.05566,8.0,650000
2539,Maruti Alto LXi,Maruti,Alto,LXi,2009,First,Petrol,Individual,Manual,100000,46.28,796.0,46.3,62.0,5.0,135000
1819,Mahindra TUV 300 T8,Mahindra,TUV,300 T8,2015,First,Diesel,Individual,Manual,58945,43.44,1493.0,100.0,240.0,7.0,700000
150,Hyundai Elantra S,Hyundai,Elantra,S,2016,First,Petrol,Dealer,Manual,49900,38.3,1797.0,147.5,177.5,5.0,805000


In [8]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [9]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,year,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
count,6926.0,6926.0,6718.0,6718.0,6717.0,6717.0,6718.0,6926.0
mean,2013.4203,73995.68,45.080782,1430.891337,87.7661,174.684585,5.434653,517270.7
std,4.078286,58358.1,10.693383,493.493277,31.724555,106.876249,0.98423,519767.0
min,1983.0,1.0,0.0,624.0,32.8,47.07192,2.0,29999.0
25%,2011.0,40000.0,38.9,1197.0,67.1,110.0,5.0,250000.0
50%,2014.0,70000.0,45.34,1248.0,81.83,160.0,5.0,400000.0
75%,2017.0,100000.0,52.44,1498.0,100.0,200.05566,5.0,633500.0
max,2020.0,2360457.0,98.7,3604.0,400.0,1863.2635,14.0,10000000.0


In [10]:
# Count the missing entries in every column.
df.isnull().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [11]:
# For every object-typed column, report how many distinct values it holds,
# list those values, and show the relative frequency of each one.
for col, values in df.select_dtypes(include='O').items():
    print(f'Column: {col}')
    print(f'Cardinality: {values.nunique()}')
    print(values.unique())
    print(values.value_counts(normalize=True))
    print()

Column: name
Cardinality: 2058
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
name
Maruti Swift Dzire VDI                       0.017037
Maruti Alto 800 LXI                          0.010973
Maruti Alto LXi                              0.009962
Maruti Swift VDI                             0.008663
Maruti Swift VDI BSIV                        0.008085
                                               ...   
Tata Tiago 1.2 Revotron XZ Plus Dual Tone    0.000144
Mahindra KUV 100 mFALCON G80 K4 Plus 5str    0.000144
Hyundai Verna SX Diesel                      0.000144
Maruti S-Presso VXI AT                       0.000144
Hyundai Santro Xing XK eRLX EuroIII          0.000144
Name: proportion, Length: 2058, dtype: float64

Column: company
Cardinality: 32
['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford' 'Renault' 'Mahindra'
 'Tata

In [12]:
# Remove the descriptive text columns before modelling; `name` alone has 2058
# distinct values. `company` is kept as the brand-level feature.
# errors='ignore' keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone from `df`.
df = df.drop(columns=['name', 'model', 'edition'], errors='ignore')
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [13]:
# Keep only the first occurrence of each fully identical row.
is_repeat = df.duplicated()
df = df.loc[~is_repeat]
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [14]:
# Sanity check: the number of remaining duplicate rows (first occurrences are not counted).
df.duplicated(keep='first').sum()

np.int64(0)

In [15]:
# Separate the regression target from the predictor columns.
y = df['selling_price'].copy()
X = df.drop(columns=['selling_price'])

In [16]:
X.shape, y.shape

((6907, 12), (6907,))

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Confirm that features and targets line up in both partitions.
print(X_train.shape, X_test.shape)
y_train.shape, y_test.shape

(5525, 12) (1382, 12)


((5525,), (1382,))

In [19]:
# Names of the numeric feature columns, taken from the training split.
numeric_view = X_train.select_dtypes(include='number')
numerical_cols = list(numeric_view.columns)
numerical_cols

['year',
 'km_driven',
 'mileage_mpg',
 'engine_cc',
 'max_power_bhp',
 'torque_nm',
 'seats']

In [20]:
# Everything that is not numeric is treated as categorical; original column order is kept.
categorical_cols = X_train.drop(columns=numerical_cols).columns.tolist()
categorical_cols

['company', 'owner', 'fuel', 'seller_type', 'transmission']

In [23]:
# Numeric branch: fill gaps with the column median, then standardize.
median_imputer = SimpleImputer(strategy='median')
num_pipe = Pipeline(steps=[('imputer', median_imputer), ('scaler', StandardScaler())])

# Categorical branch: fill gaps with the literal label 'missing', then one-hot
# encode into a dense array. Categories not seen during fit are ignored at
# transform time instead of raising.
constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_pipe = Pipeline(steps=[('imputer', constant_imputer), ('ohe', one_hot)])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, numerical_cols),
        ('cat', cat_pipe, categorical_cols),
    ]
)

# A small random forest (10 trees, depth capped at 5) with a fixed seed.
regressor = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)

# End-to-end model: preprocessing followed by the regressor.
rf_model = Pipeline(steps=[('pre', preprocessor), ('reg', regressor)])

In [24]:
# Fit the preprocessing steps and the random forest together on the training split only.
rf_model.fit(X_train, y_train)

In [25]:
# Predict on both partitions so training and held-out performance can be compared.
y_train_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train, X_test)
)

In [26]:
# Coefficient of determination on the training split.
r2_score(y_true=y_train, y_pred=y_train_pred)

0.8979695215795949

In [27]:
# Coefficient of determination on the held-out test split.
r2_score(y_true=y_test, y_pred=y_test_pred)

0.865675993366697

In [31]:
# Root mean squared error, printed for the training split first and the test split second.
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(root_mean_squared_error(actual, predicted))

169947.48964050272
172392.1313605195


In [33]:
# List the feature columns the fitted pipeline expects as input.
list(X_train.columns)

['company',
 'year',
 'owner',
 'fuel',
 'seller_type',
 'transmission',
 'km_driven',
 'mileage_mpg',
 'engine_cc',
 'max_power_bhp',
 'torque_nm',
 'seats']