In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the car-price dataset. The path is relative, so it resolves against the
# directory the notebook kernel was started in.
df = pd.read_csv('../../Datasets/car_price.csv')

In [3]:
# Preview the first ten rows to see the column layout and typical values.
df.head(10)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,9.25,9.83,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
7,s cross,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,8.75,8.89,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,7.45,8.92,42367,Diesel,Dealer,Manual,0


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(301, 9)

In [5]:
# Column names of the loaded frame; Selling_Price is the target fitted later on.
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [6]:
# Single-row template frame holding placeholder strings, one per feature column.
# It has every column of `df` except the target (Selling_Price), in the same order.
values = [[
    "nm", "year", "p_price", "kms",
    "fuel_type", "seller_type", "transmission", "owner",
]]
df_test = pd.DataFrame(
    values,
    columns=[
        'Car_Name', 'Year', 'Present_Price', 'Kms_Driven',
        'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner',
    ],
)
df_test

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,nm,year,p_price,kms,fuel_type,seller_type,transmission,owner


In [7]:
# Frequency of each fuel type, most common first.
df['Fuel_Type'].value_counts()

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64

In [8]:
# Frequency of each seller type, most common first.
df['Seller_Type'].value_counts()

Dealer        195
Individual    106
Name: Seller_Type, dtype: int64

In [9]:
# Frequency of each transmission type, most common first.
df['Transmission'].value_counts()

Manual       261
Automatic     40
Name: Transmission, dtype: int64

In [10]:
# Names of every non-object (numeric) column. At this point the list still
# contains the target, Selling_Price.
numeric_features = df.select_dtypes(exclude='object').columns.tolist()
numeric_features

['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner']

In [11]:
# Exclude the target so it is never fed to the model as an input feature.
# The list is rebuilt rather than mutated with list.remove(), which keeps the
# cell idempotent: re-running it no longer raises ValueError once
# 'Selling_Price' has already been taken out.
numeric_features = [col for col in numeric_features if col != 'Selling_Price']

In [12]:
# Names of every object-dtype (string) column. At this point the list still
# contains Car_Name.
cat_features = df.select_dtypes(include='object').columns.tolist()
cat_features

['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission']

In [13]:
# Leave Car_Name out of the categorical features that get encoded.
# The list is rebuilt rather than mutated with list.remove(), which keeps the
# cell idempotent: re-running it no longer raises ValueError once "Car_Name"
# has already been taken out.
cat_features = [col for col in cat_features if col != "Car_Name"]

## Pipeline Building

In [14]:
# Random-forest regressor with 100 trees.
# NOTE(review): no later cell visible in this notebook references `model_rf`;
# the pipeline built in cell 20 constructs its own RandomForestRegressor with
# the same n_estimators. Confirm whether this instance is still needed.
model_rf = RandomForestRegressor(n_estimators=100)

In [15]:
# Numeric columns: fill missing values with the column median, then min-max scale.
num_transformer = Pipeline([("imputer_n",SimpleImputer(strategy='median')),
                            ('scaler', MinMaxScaler())
                           ])
# Categorical columns: fill missing values with the most frequent label, then
# one-hot encode.
# handle_unknown='ignore' makes a label that was not present at fit time encode
# as an all-zero row instead of raising at predict time. This matters because
# the fitted pipeline is pickled for later inference and some labels are rare
# (CNG occurs only twice in Fuel_Type). Labels seen during fit encode exactly
# as before.
cat_transformer = Pipeline([("imputer_c",SimpleImputer(strategy='most_frequent')),
                            ('encoder', OneHotEncoder(handle_unknown='ignore'))
                           ])

In [19]:
# Send each column group through its own sub-pipeline. Columns listed in
# neither group are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', num_transformer, numeric_features),
        ('cat_pipe', cat_transformer, cat_features),
    ],
    remainder='drop',
)

In [20]:
# End-to-end estimator: preprocessing followed by a 100-tree random forest.
# random_state pins the forest's randomness so the score, the predictions and
# the pickled artifact are identical on every Restart & Run All. The outputs
# recorded further down came from an unseeded run and may differ slightly.
final_pipe = Pipeline([
    ('preprocess_pipe', preprocessor),
    ('model_rf', RandomForestRegressor(n_estimators=100, random_state=42)),
])

In [21]:
# Fit on every row: all columns except the target go in as X, and Selling_Price is y.
final_pipe.fit(df.drop(columns='Selling_Price'), df['Selling_Price'])

In [22]:
# R^2 on the exact rows the pipeline was fitted on in the previous cell.
# This is an in-sample score, not an estimate of performance on unseen cars.
final_pipe.score(df.drop(columns='Selling_Price'), df['Selling_Price'])

0.9889268020573583

In [23]:
# First row of the frame, kept as a one-row DataFrame. It still carries
# Selling_Price, along with every other original column.
X = df.head(1)
X

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0


In [24]:
# Predict the selling price for the first row (its recorded Selling_Price is 3.35).
# This row was part of the data used for fitting.
final_pipe.predict(X)

array([3.77])

In [25]:
# One-row frame for positional row 5 with the target column removed.
# Lower-case `x` is a different variable from the upper-case `X` defined earlier.
x = df.drop(columns='Selling_Price').iloc[[5]]
x

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
5,vitara brezza,2018,9.83,2071,Diesel,Dealer,Manual,0


In [26]:
# Predict the selling price for row 5 (its recorded Selling_Price is 9.25).
# This row was part of the data used for fitting.
final_pipe.predict(x)

array([8.8])

### Model Saving

In [27]:
import pickle

In [28]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# The context manager closes the file as soon as the dump finishes, so the
# bytes are flushed to disk and no handle is left open in the kernel.
with open("car_model_rf.pkl", "wb") as model_file:
    pickle.dump(final_pipe, model_file)

## Loading the Saved Model

In [29]:
# Restore the pipeline from disk; the context manager closes the file handle
# once loading is done.
# SECURITY: unpickling executes code embedded in the file. Only load pickle
# files that you produced yourself or otherwise fully trust.
with open("car_model_rf.pkl", "rb") as model_file:
    modelNew = pickle.load(model_file)

In [30]:
# Same column listing as shown earlier in the notebook, repeated here next to
# the reloaded model.
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [31]:
# Round-trip check: the reloaded pipeline is asked for the same first-row
# prediction as final_pipe was; both recorded outputs show 3.77.
modelNew.predict(X)

array([3.77])