# Car price prediction 

In [3]:
import pandas as pd

In [4]:
cars = pd.read_csv('train-data.csv')

In [5]:
cars.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [6]:
cars = cars.drop(cars.columns[0], axis=1) # remove unwanted and unnamed 1st column

In [7]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [8]:
cars.shape

(6019, 13)

In [9]:
cars_bkp = cars.copy()

# PREPROCESSING

1. Mileage - type is object in the form 'amount unit'
       - Change km/kg into kmpl for all entries.
       - Then set only the amount and remove the unit.
2. Engine - remove CC and convert data dype from obj to int32
3. Power - remove bhp and convert data dype from obj to int32
4. Seats - Convert float64 to int32. In case of NaN use 4
5. New_Price -
        - For NaN values, use flagging, introduce a new column and set it 0, for nonNaN set it 1.
        - NonNaN -- 'values unit'. there are two unique types for the UNIT part [['Lakh', 'Cr']]. for every Lakh value, write it as it is, for for every cr values, multiply it by 100 to convert it into lakh {cars[~pd.isna(cars['New_Price'])]['New_Price'].str.split().str[1].unique()}

In [10]:
# Apply the conversion function to the 'Mileage' column using apply()
# cars['Mileage'].str.split().str[0].apply(float) * 1.33
#cars['Mileage'] = (cars['Mileage'].str.split().str[0].apply(float) * 1.33) if (cars['Mileage'].str.split().str[1] == 'km/kg') else (cars['Mileage'].str.split().str[0].apply(float))

cars.loc[(~cars['Mileage'].isna()) & (cars['Mileage'].str.split().str[1] == 'km/kg'), 'Mileage'] = cars.loc[(~cars['Mileage'].isna()) & (cars['Mileage'].str.split().str[1] == 'km/kg')]['Mileage'].str.split().str[0].astype(float)*1.33
cars.loc[(~cars['Mileage'].isna()) & (cars['Mileage'].str.split().str[1] == 'kmpl'), 'Mileage'] = cars.loc[(~cars['Mileage'].isna()) & (cars['Mileage'].str.split().str[1] == 'kmpl')]['Mileage'].str.split().str[0].astype(float)
cars['Mileage'] = pd.to_numeric(cars['Mileage'], errors='coerce') # make it numeric and replace non numeric with NaN
cars = cars[~cars['Mileage'].isna()]

In [11]:
# as there are only 36 rows with no engine data, we can remove them
cars = cars[~cars['Engine'].isna()]
cars['Engine'] = cars['Engine'].str.replace(' CC', '')
cars['Engine'] = pd.to_numeric(cars['Engine'], errors='coerce') # make it numeric and replace non numeric with NaN

In [12]:
cars['Engine'] = cars['Engine'].astype(float)
cars['Engine'].unique()

array([ 998., 1582., 1199., 1248., 1968.,  814., 1461., 2755., 1598.,
       1462., 1497., 2179., 2477., 1498., 2143., 1995., 1984., 1197.,
       2494., 1798., 2696., 2698., 1061., 1198., 2987.,  796.,  624.,
       1999., 1991., 2694., 1120., 2498.,  799., 2393., 1399., 1796.,
       2148., 1396., 1950., 4806., 1998., 1086., 1193., 2982., 1493.,
       2967., 2993., 1196., 1799., 2497., 2354., 1373., 2996., 1591.,
       2894., 5461., 1595.,  936., 1997., 1896., 1390., 1364., 2199.,
        993.,  999., 1405., 2956., 1794.,  995., 2496., 1599., 2400.,
       1495., 2523.,  793., 4134., 1596., 1395., 2953., 1586., 2362.,
       1496., 1368., 1298., 1956., 1299., 3498., 2835., 1150., 3198.,
       1343., 1499., 1186., 1590., 2609., 2499., 2446., 1978., 2360.,
       3436., 2198., 4367., 2706., 1422., 2979., 1969., 1489., 2489.,
       1242., 1388., 1172., 2495., 1194., 3200., 1781., 1341., 2773.,
       3597., 1985., 2147., 1047., 2999., 2995., 2997., 1948., 2359.,
       4395., 2349.,

In [13]:
cars['Engine'].info()

<class 'pandas.core.series.Series'>
Index: 5981 entries, 0 to 6018
Series name: Engine
Non-Null Count  Dtype  
--------------  -----  
5981 non-null   float64
dtypes: float64(1)
memory usage: 93.5 KB


In [14]:
# power
cars['Power'] = pd.to_numeric(cars['Power'].str.replace(' bhp', ''), errors='coerce') # make it numeric and replace non numeric with NaN
cars = cars[~cars['Power'].isna()]

In [15]:
# seats
cars = cars[~cars['Seats'].isna()]
cars['Seats'] = cars['Seats'].astype(float)

In [16]:
# New_Price
cars['hasNewPrice'] = cars['New_Price'].isna()
cars.loc[(~cars['New_Price'].isna()) & (cars['New_Price'].str.contains(" Lakh")), 'New_Price'] = cars.loc[(~cars['New_Price'].isna()) & (cars['New_Price'].str.contains(" Lakh"))]['New_Price'].str.replace(' Lakh','').astype(float)
cars.loc[(~cars['New_Price'].isna()) & (cars['New_Price'].str.contains(" Cr")), 'New_Price'] = cars.loc[(~cars['New_Price'].isna()) & (cars['New_Price'].str.contains(" Cr"))]['New_Price'].str.replace(' Cr','').astype(float) * 100

In [17]:
cars.loc[cars['New_Price'].isna(), 'New_Price'] = 0.0

In [18]:
cars['New_Price'] = cars['New_Price'].astype(float)

In [19]:
cars['New_Price'].info()

<class 'pandas.core.series.Series'>
Index: 5872 entries, 0 to 6018
Series name: New_Price
Non-Null Count  Dtype  
--------------  -----  
5872 non-null   float64
dtypes: float64(1)
memory usage: 91.8 KB


## PreProcessing Completed

In [20]:
cars.to_csv('CleanTrainingData.csv')

# MODEL

In [21]:
X = cars.drop(columns=['Name', 'Price'])
Y = cars['Price']

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

In [24]:
ohe = OneHotEncoder()
ohe.fit(X[['Location', 'Year', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Seats', 'hasNewPrice']])

In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4697 entries, 5675 to 4409
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           4697 non-null   object 
 1   Year               4697 non-null   int64  
 2   Kilometers_Driven  4697 non-null   int64  
 3   Fuel_Type          4697 non-null   object 
 4   Transmission       4697 non-null   object 
 5   Owner_Type         4697 non-null   object 
 6   Mileage            4697 non-null   float64
 7   Engine             4697 non-null   float64
 8   Power              4697 non-null   float64
 9   Seats              4697 non-null   float64
 10  New_Price          4697 non-null   float64
 11  hasNewPrice        4697 non-null   bool   
dtypes: bool(1), float64(5), int64(2), object(4)
memory usage: 444.9+ KB


In [26]:
#column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_), ['Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats', 'New_Price', 'hasNewPrice']), remainder='passthrough')
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_), ['Location', 'Year', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Seats', 'hasNewPrice']), remainder='passthrough')

In [27]:
ohe.categories_

[array(['Ahmedabad', 'Bangalore', 'Chennai', 'Coimbatore', 'Delhi',
        'Hyderabad', 'Jaipur', 'Kochi', 'Kolkata', 'Mumbai', 'Pune'],
       dtype=object),
 array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
        2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
       dtype=int64),
 array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object),
 array(['Automatic', 'Manual'], dtype=object),
 array(['First', 'Fourth & Above', 'Second', 'Third'], dtype=object),
 array([ 2.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
 array([False,  True])]

In [28]:
lr = LinearRegression()

In [29]:
pipe = make_pipeline(column_trans, lr)

In [30]:
X_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,hasNewPrice
5675,Kolkata,2013,26300,Diesel,Manual,First,25.44,936.0,57.60,5.0,0.0,True
1865,Delhi,2014,32805,Diesel,Manual,First,23.20,1248.0,73.94,5.0,0.0,True
963,Mumbai,2012,95000,Petrol,Automatic,First,13.70,1798.0,157.75,5.0,0.0,True
3973,Kolkata,2014,37298,Diesel,Manual,First,25.10,1498.0,98.60,5.0,0.0,True
5546,Hyderabad,2017,128000,Diesel,Manual,First,25.80,1498.0,98.60,5.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
206,Mumbai,2014,76000,Petrol,Automatic,First,14.70,1798.0,177.50,5.0,0.0,True
3081,Pune,2012,75000,Diesel,Manual,First,23.00,1396.0,90.00,5.0,0.0,True
5663,Bangalore,2016,41000,Petrol,Manual,Second,25.17,799.0,53.30,5.0,0.0,True
2357,Delhi,2016,25000,Petrol,Automatic,First,15.04,1991.0,183.00,5.0,0.0,True


In [31]:
pipe.fit(X_train, Y_train)

In [32]:
Y_pred = pipe.predict(X_test)

In [33]:
Y_pred

array([ 7.97733669, -1.9159809 ,  4.46754594, ...,  9.98787861,
       10.51846243,  8.86273185])

In [34]:
r2_score(Y_test, Y_pred)

0.6812647997231477

In [35]:
 import pickle

In [36]:
pickle.dump(pipe, open('LinearRegressionModel.pkl', 'wb'))

In [37]:
X_test.loc[X_test['Kilometers_Driven'] == 75000]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,hasNewPrice
2904,Chennai,2007,75000,Diesel,Automatic,First,11.0,2987.0,198.5,5.0,0.0,True
4107,Delhi,2013,75000,Petrol,Manual,First,16.47,1198.0,73.9,5.0,0.0,True
5309,Pune,2009,75000,Petrol,Manual,Second,14.7,1150.0,76.0,5.0,0.0,True
4656,Pune,2010,75000,Petrol,Manual,Second,17.92,1086.0,62.1,5.0,0.0,True
4478,Hyderabad,2008,75000,Diesel,Manual,Second,17.7,1399.0,68.0,5.0,0.0,True
2006,Bangalore,2009,75000,Diesel,Manual,First,13.8,1399.0,68.0,5.0,0.0,True


In [52]:
input_data = pd.DataFrame([['Delhi', 2012, 3000, 'Petrol', 'Manual', 'First', 23.00, 1211.0, 1222.00, 4.0, 0.00, False]], columns=['Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats', 'New_Price', 'hasNewPrice'])
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           1 non-null      object 
 1   Year               1 non-null      int64  
 2   Kilometers_Driven  1 non-null      int64  
 3   Fuel_Type          1 non-null      object 
 4   Transmission       1 non-null      object 
 5   Owner_Type         1 non-null      object 
 6   Mileage            1 non-null      float64
 7   Engine             1 non-null      float64
 8   Power              1 non-null      float64
 9   Seats              1 non-null      float64
 10  New_Price          1 non-null      float64
 11  hasNewPrice        1 non-null      bool   
dtypes: bool(1), float64(5), int64(2), object(4)
memory usage: 221.0+ bytes


In [53]:
pipe.predict(input_data)

array([152.75323678])