In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [4]:
# Load the raw car listings from the CSV that sits next to the notebook.
car_data = pd.read_csv(filepath_or_buffer='car data.csv')

In [5]:
# Peek at the first five rows to check the columns and value formats.
car_data.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [6]:
# Frequency of each fuel type, most common first.
car_data.loc[:, 'Fuel_Type'].value_counts()

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64

In [7]:
# Count missing values per column (isna is the same check as isnull).
car_data.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [18]:
# Partition the columns by dtype: non-numeric columns go to the categorical
# encoder, numeric ones to the scaler. The target 'Selling_Price' is removed
# from the numeric group so it is never used as a feature.
category_col = car_data.select_dtypes(exclude='number').columns
integer_col = car_data.select_dtypes(include='number').columns.drop('Selling_Price')

In [13]:
# Display the numeric feature columns (the name was previously misspelled as
# `integer_cal`, which is undefined on a fresh kernel).
integer_col

Index(['Year', 'Present_Price', 'Kms_Driven', 'Owner'], dtype='object')

In [16]:
# One single-step sub-pipeline per column group:
#   - categorical columns are encoded with LeaveOneOutEncoder
#   - numeric columns are standardised with StandardScaler
category_pipe = Pipeline([('encode', LeaveOneOutEncoder())])
numeric_pipe = Pipeline([('scale', StandardScaler())])

In [20]:
# Route each column group through its own sub-pipeline.
preprocess_pipe = ColumnTransformer(
    transformers=[
        ('cat_pipe', category_pipe, category_col),
        ('numeric_pipe', numeric_pipe, integer_col),
    ]
)

In [22]:
# Full model: preprocessing followed by an ordinary least-squares regressor.
pipe = Pipeline(
    steps=[
        ('preprocess', preprocess_pipe),
        ('linearregression', LinearRegression()),
    ]
)

In [24]:
# Separate the regression target from the feature columns.
y = car_data['Selling_Price']
X = car_data.drop(columns=['Selling_Price'])

In [25]:
# Fit the preprocessing steps and the regressor in one call.
# NOTE(review): X and y cover every row of car_data; no hold-out split is made
# in the visible cells, so this fit gives no estimate of out-of-sample error.
pipe.fit(X=X, y=y)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat_pipe',
                                                  Pipeline(steps=[('encode',
                                                                   LeaveOneOutEncoder())]),
                                                  Index(['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission'], dtype='object')),
                                                 ('numeric_pipe',
                                                  Pipeline(steps=[('scale',
                                                                   StandardScaler())]),
                                                  Index(['Year', 'Present_Price', 'Kms_Driven', 'Owner'], dtype='object'))])),
                ('linearregression', LinearRegression())])

In [31]:
# Score one hand-written listing as a smoke test of the fitted pipeline.
# The row is built from a dict keyed by column name and laid out with
# X.columns, so each value is bound to its column explicitly and numeric
# columns keep a numeric dtype. Transposing a single-column frame (the previous
# approach) turns every column into `object` and relies on positional order.
sample_listing = {
    'Car_Name': 'ritz',
    'Year': 2013,
    'Present_Price': 5,
    'Kms_Driven': 50000,
    'Fuel_Type': 'Petrol',
    'Seller_Type': 'Dealer',
    'Transmission': 'Manual',
    'Owner': 0,
}
test_data = pd.DataFrame([sample_listing], columns=X.columns)

pipe.predict(test_data)

array([2.68458399])

In [32]:
# Persist the fitted pipeline (preprocessing + regressor) for later reuse.
joblib.dump(value=pipe, filename='pipe.pkl')

['pipe.pkl']