In [1]:
import pandas as pd

In [5]:
# Load the used-car listings from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='car-details.csv')

# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti Swift Dzire VDI,Maruti,Swift,Dzire VDI,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda Rapid 1.5 TDI Ambition,Skoda,Rapid,1.5 TDI Ambition,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda City 2017-2020 EXi,Honda,City,2017-2020 EXi,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai i20 Sportz Diesel,Hyundai,i20,Sportz Diesel,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti Swift VXI BSIII,Maruti,Swift,VXI BSIII,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [6]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [7]:
# Count missing values per column (summing the boolean NaN mask down the rows).
df.isna().sum(axis=0)

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [8]:
# Drop the free-text descriptor columns; `company` is kept as a categorical feature.
# NOTE(review): in the preview `name` looks like company + model + edition concatenated,
# so it is presumably redundant -- confirm before relying on that.
# errors='ignore' keeps this cell idempotent: re-running it on the same kernel
# (after the columns are already gone) no longer raises KeyError.
# Trade-off: a misspelled column name would now be skipped silently.
df = df.drop(columns=['name', 'model', 'edition'], errors='ignore')

In [9]:
# Remove exact duplicate rows (compared on the remaining columns) before splitting,
# so the same listing cannot end up in both the train and test sets.
# Rebinding instead of `inplace=True` avoids mutating the existing frame object in place.
df = df.drop_duplicates()

In [12]:
# Separate the regression target (selling_price) from the predictor columns.
y = df['selling_price']
X = df.drop('selling_price', axis=1)

# Report both shapes; the row counts must match.
print(X.shape, y.shape)


(6907, 12) (6907,)


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Print the shape of each of the four partitions on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(5525, 12) (1382, 12) (5525,) (1382,)


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [18]:
# Numeric feature names, in their original column order.
num_cols = list(X_train.select_dtypes(include='number').columns)

# Every remaining column is treated as categorical (original order preserved).
cat_cols = X_train.columns[~X_train.columns.isin(num_cols)].to_list()

print(num_cols)
print(cat_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']
['company', 'owner', 'fuel', 'seller_type', 'transmission']


In [20]:
# Numeric branch: fill gaps with the column median, then standardise.
# (Several numeric columns contain missing values, per the null counts above.)
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace any missing value with the literal 'missing', then
# one-hot encode into a dense array. Categories not seen during fit are ignored
# (encoded as all zeros) rather than raising at predict time.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])


In [21]:
from sklearn.ensemble import RandomForestRegressor

# Small, shallow forest (10 trees, depth capped at 5) with a fixed seed.
regressor = RandomForestRegressor(
    n_estimators=10,
    max_depth=5,
    random_state=42,
)

# Chain preprocessing and the regressor so both are fitted on the training data only.
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

rf_model.fit(X_train, y_train)


In [23]:
from sklearn.metrics import mean_squared_error

# Predict on both partitions with the fitted pipeline.
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Mean squared error is in squared units of selling_price, hence the very large values.
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# One print call, two lines of output.
print(f'Train MSE: {train_mse:,.3f}\nTest MSE: {test_mse:,.3f}')



Train MSE: 28,882,149,235.109
Test MSE: 29,719,046,955.023
