In [1]:
import pandas as pd
import numpy as np

In [2]:
car = pd.read_csv('car.csv')

In [3]:
car.shape

(892, 6)

In [4]:
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [5]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [6]:
# remove null values
car = car.dropna()

In [7]:
car.shape

(837, 6)

In [8]:
car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        837 non-null    object
 1   company     837 non-null    object
 2   year        837 non-null    object
 3   Price       837 non-null    object
 4   kms_driven  837 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 45.8+ KB


In [9]:
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


## Quality 
- year has many non-year values
- year object to int
- price has Ask for Price - Price object to int
- kms_driven has kms with integers
- kms_driven object to int
- kms_driven has nan values
- fuel type has nan value
- keep first 3 words of name


# Cleaning.

In [10]:
backup = car.copy()

In [11]:
print(car.shape)
car['year'].str.isnumeric().sum()

(837, 6)


837

In [12]:
# change car['year'] to integer
car['year'] = car['year'].astype(int)

In [13]:
car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        837 non-null    object
 1   company     837 non-null    object
 2   year        837 non-null    int32 
 3   Price       837 non-null    object
 4   kms_driven  837 non-null    object
 5   fuel_type   837 non-null    object
dtypes: int32(1), object(5)
memory usage: 42.5+ KB


In [14]:
car

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz VXI ABS,Maruti,2011,270000,"50,000 kms",Petrol
885,Tata Indica V2 DLE BS III,Tata,2009,110000,"30,000 kms",Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,"1,32,000 kms",Petrol
888,Tata Zest XM Diesel,Tata,2018,260000,"27,000 kms",Diesel


In [15]:
car.shape

(837, 6)

In [16]:
car.columns = car.columns.str.lower()

In [17]:
car['price']

0             80,000
1           4,25,000
2      Ask For Price
3           3,25,000
4           5,75,000
           ...      
883         2,70,000
885         1,10,000
886         3,00,000
888         2,60,000
889         3,90,000
Name: price, Length: 837, dtype: object

In [18]:
car['price'] = car['price'].str.replace(',', '')
car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        837 non-null    object
 1   company     837 non-null    object
 2   year        837 non-null    int32 
 3   price       837 non-null    object
 4   kms_driven  837 non-null    object
 5   fuel_type   837 non-null    object
dtypes: int32(1), object(5)
memory usage: 42.5+ KB


In [19]:
car = car[car['price'].str.isnumeric() == True]


In [20]:
car.head()

Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
6,Ford Figo,Ford,2012,175000,"41,000 kms",Diesel


In [21]:
car['kms_driven'] = car['kms_driven'].apply(lambda x: x.split(' ')[0])
car['kms_driven'].replace(',', '', regex=True, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['kms_driven'] = car['kms_driven'].apply(lambda x: x.split(' ')[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [22]:
car['kms_driven'] = car['kms_driven'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['kms_driven'] = car['kms_driven'].astype(int)


In [23]:
car['name'].apply(lambda x: x.split(' ')[:3])

0         [Hyundai, Santro, Xing]
1         [Mahindra, Jeep, CL550]
3           [Hyundai, Grand, i10]
4      [Ford, EcoSport, Titanium]
6                    [Ford, Figo]
                  ...            
883        [Maruti, Suzuki, Ritz]
885            [Tata, Indica, V2]
886      [Toyota, Corolla, Altis]
888              [Tata, Zest, XM]
889        [Mahindra, Quanto, C8]
Name: name, Length: 816, dtype: object

In [24]:
car['name'] = car['name'].str.split(' ').str[0:3].str.join(' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['name'] = car['name'].str.split(' ').str[0:3].str.join(' ')


In [25]:
car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 816 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int32 
 3   price       816 non-null    object
 4   kms_driven  816 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(2), object(4)
memory usage: 38.2+ KB


In [26]:
car.describe()

Unnamed: 0,year,kms_driven
count,816.0,816.0
mean,2012.444853,46275.531863
std,4.002992,34297.428044
min,1995.0,0.0
25%,2010.0,27000.0
50%,2013.0,41000.0
75%,2015.0,56818.5
max,2019.0,400000.0


In [27]:
car.head()


Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [28]:
car['price']= car['price'].astype(int)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['price']= car['price'].astype(int)


In [29]:
car[car['price']>6e6]

Unnamed: 0,name,company,year,price,kms_driven,fuel_type
562,Mahindra XUV500 W6,Mahindra,2014,8500003,45000,Diesel


In [30]:
car.drop([562], axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [31]:
car.shape

(815, 6)

In [32]:
car.to_csv('clean_car.csv', index=False)


# Model

In [33]:
x = car.drop(columns=['price'])
y = car['price']

In [34]:
#import train_test_split
from sklearn.model_selection import train_test_split
#import linear regression
from sklearn.linear_model import LinearRegression
# import r2_score
from sklearn.metrics import r2_score
#OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
#pipeline
from sklearn.pipeline import Pipeline
# make_column_transformer
from sklearn.compose import make_column_transformer
#make_pipeline
from sklearn.pipeline import make_pipeline

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=661)

In [36]:
car.head()

Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [37]:
ohe = OneHotEncoder()
ohe.fit(car[['name', 'company','fuel_type']])


In [38]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_), ['name', 'company','fuel_type']), remainder='passthrough')

In [39]:
lr = LinearRegression()

In [40]:
pipe = make_pipeline(column_trans, lr)
pipe

In [41]:
pipe.fit(x_train, y_train)

In [42]:
y_pred = pipe.predict(x_test)
y_pred[1], y_test.iloc[1]

(605470.5014094114, 540000)

In [43]:
r2_score(y_test, y_pred)

0.7687315589885062

In [44]:
import pickle

In [45]:
pickle.dump(pipe, open('car.pkl', 'wb'))

In [46]:
x.head()

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,36000,Diesel
6,Ford Figo,Ford,2012,41000,Diesel
