In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

Evaluation Metrics

In [2]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

Joblib to save the model pipeline

In [3]:
from joblib import dump, load

## Load dataset

In [5]:
# Read the cleaned car listings from the working directory, then preview
# the first five rows to confirm the columns loaded as expected.
df = pd.read_csv(filepath_or_buffer='cleaned_cars_data2.csv')
df.head(n=5)

Unnamed: 0,bt,km,transmission,ownerNo,oem,model,modelYear,price_in_lakhs,Insurance Validity,Fuel Type,Seats,Safety_count,top_features_count,Color,No of Cylinder,Turbo Charger,Super Charger,City,Power
0,Hatchback,120000,Manual,3,Maruti,Maruti Celerio,2015,4.0,third party insurance,Petrol,5,13,8,White,3.0,no,no,Bangalore,67.04
1,SUV,32706,Manual,2,Ford,Ford Ecosport,2018,8.11,comprehensive,Petrol,5,27,9,White,3.0,no,no,Bangalore,121.31
2,Hatchback,11949,Manual,1,Tata,Tata Tiago,2018,5.85,comprehensive,Petrol,5,24,9,Red,3.0,no,no,Bangalore,84.0
3,Sedan,17794,Manual,1,Hyundai,Hyundai Xcent,2014,4.62,comprehensive,Petrol,5,18,9,Others,4.0,no,no,Bangalore,81.86
4,SUV,60000,Manual,1,Maruti,Maruti SX4 S Cross,2015,7.9,third party insurance,Diesel,5,22,9,Gray,4.0,yes,no,Bangalore,88.5


# Split target and features

In [6]:
# Target: the listing price (column 'price_in_lakhs').
df_y = df['price_in_lakhs']

# Features: every column except the target.
# 'Unnamed: 0' appears to be a leftover row index from when the CSV was saved
# (it reads 0, 1, 2, ... in the preview above). It describes the row position,
# not the car, so it is removed to keep the model from learning on it and to
# avoid forcing callers of the saved model to supply a meaningless column.
# errors='ignore' keeps this cell working if the CSV is later re-exported
# without that column.
df_X = (
    df.drop(columns=['price_in_lakhs'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Split Train and Test data

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# identical on every run.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_X,
    df_y,
    random_state=42,
    test_size=0.2,
)

## Model Building

In [8]:
# Partition the feature columns by dtype into two separate lists.
# Numeric columns are scaled; every remaining column is treated as categorical.
numeric_columns = df_X.select_dtypes(include=[np.number]).columns.tolist()
# Use the complement of the numeric selection rather than matching only the
# 'object' dtype: a column that is in neither list would be silently discarded
# by ColumnTransformer (its default is remainder='drop').
categorical_columns = df_X.select_dtypes(exclude=[np.number]).columns.tolist()

# Preprocessing for numeric columns: standardize to zero mean / unit variance.
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical columns: one-hot encode. handle_unknown='ignore'
# lets prediction proceed when a category was not present at fit time.
cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numeric_columns),
    ('cat', cat_transformer, categorical_columns)
])


In [9]:
# Chain preprocessing and the regressor into one pipeline so the same
# transformations are applied at fit time and at predict time.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Seed the forest: without random_state the bootstrap samples and feature
    # subsets differ on every run, so the metrics below and the saved model
    # would not be reproducible.
    ('model', RandomForestRegressor(random_state=42))
])

# Fit on the training split only; the test split is kept for evaluation.
pipeline.fit(df_X_train, df_y_train)


### Model Evaluation

In [10]:
# Score the held-out rows, then report the three regression metrics.
model_pred = pipeline.predict(df_X_test)

mse = mean_squared_error(df_y_test, model_pred)
r2 = r2_score(df_y_test, model_pred)
mae = mean_absolute_error(df_y_test, model_pred)

print(f'Random Forest Regression MSE: {mse}')
print(f'Random Forest Regression R2 Score: {r2}')
print(f'Random Forest Regression MAE: {mae}')

Random Forest Regression MSE: 44.072867922413124
Random Forest Regression R2 Score: 0.6849564315796989
Random Forest Regression MAE: 2.0618920474608924


In [11]:
# Actual price of the first held-out row, for comparison with the prediction below.
df_y_test.iat[0]

6.5

In [12]:
# Feature values of the first held-out row.
df_X_test.iloc[0, :]

Unnamed: 0,1502
bt,SUV
km,150000
transmission,Automatic
ownerNo,1
oem,Renault
model,Renault Duster
modelYear,2016
Insurance Validity,third party insurance
Fuel Type,Diesel
Seats,5


In [13]:
# Predict the price for the first held-out row.
# A list indexer (.iloc[[0]]) returns a one-row DataFrame that keeps each
# column's original dtype. Going through .iloc[0].to_frame().T instead builds a
# mixed-type Series first, which turns every column into object dtype before
# the pipeline sees it.
pred = pipeline.predict(df_X_test.iloc[[0]])
print(f'Predicted Price: {pred[0]}')

Predicted Price: 5.626999999999999


## Training on the whole dataset

In [14]:
# Refit the same pipeline object on all rows (train and test together).
# This overwrites the train-only fit in place, so the metrics printed earlier
# do not describe this fitted state, and re-running the evaluation cell after
# this one would score the model on rows it was trained on.
pipeline.fit(X=df_X, y=df_y)

## Save model

In [15]:
# Persist the fitted pipeline (preprocessing + forest) to the working directory.
dump(value=pipeline, filename='RFmodel.joblib')

['RFmodel.joblib']