In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [5]:
# Build the path to the raw CSV relative to the current working directory, then load it.
base_path = os.getcwd()
raw_data_path = os.path.join(base_path, '../data_raw/raw_data.csv')
raw_data = pd.read_csv(filepath_or_buffer=raw_data_path)

# Preview the first five rows of the freshly loaded frame.
raw_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
1,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13 km/kg,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
2,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5


Get rid of the unnecessary column

In [6]:
# 'Unnamed: 0' holds what looks like a leftover row counter (1, 2, 3, 4, 6, ... in the
# preview above), not a feature of the car, so remove it.
# The result is rebound instead of using inplace=True, and errors='ignore' is passed,
# so the cell can be re-run safely: a second run no longer raises KeyError because the
# column has already been removed.
raw_data = raw_data.drop(columns=['Unnamed: 0'], errors='ignore')
raw_data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
1,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13 km/kg,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
2,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
3,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
4,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5


a) Look for the missing values in all the columns and either impute them (replace with mean,
median, or mode) or drop them. Justify your action for this task.

In [7]:
# Count the missing entries in every column of the raw frame.
raw_data.isnull().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64

Due to the large number of missing values in the New_Price column, I believe it should be dropped entirely. Any imputation done to fill these missing values could introduce a large amount of inaccuracy into the dataset. The affected entries can't be dropped either, because we would lose a significant portion of the data.

In [8]:
# (rows, columns) of the raw frame, for comparison with the per-column missing counts.
raw_data.shape

(5847, 13)

In [16]:
# Show the rows that have no Engine value. The filter tests Engine only; in the rows
# displayed, Power is blank as well.
raw_data.loc[raw_data['Engine'].isna()]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
186,Honda City 1.5 GXI,Ahmedabad,2007,60006,Petrol,Manual,First,0.0 kmpl,,,,,2.95
200,Maruti Swift 1.3 VXi,Kolkata,2010,42001,Petrol,Manual,First,16.1 kmpl,,,,,2.11
709,Maruti Swift 1.3 VXi,Chennai,2006,97800,Petrol,Manual,Third,16.1 kmpl,,,,,1.75
723,Land Rover Range Rover 3.0 D,Mumbai,2008,55001,Diesel,Automatic,Second,0.0 kmpl,,,,,26.5
1253,Honda City 1.3 DX,Delhi,2009,55005,Petrol,Manual,First,12.8 kmpl,,,,,3.2
1284,Maruti Swift 1.3 ZXI,Hyderabad,2015,50295,Petrol,Manual,First,16.1 kmpl,,,,,5.8
1339,Honda City 1.5 GXI,Pune,2004,115000,Petrol,Manual,Second,0.0 kmpl,,,,,1.5
1412,Land Rover Range Rover Sport 2005 2012 Sport,Coimbatore,2008,69078,Petrol,Manual,First,0.0 kmpl,,,,,40.88
2014,Maruti Swift 1.3 LXI,Pune,2011,24255,Petrol,Manual,First,16.1 kmpl,,,,,3.15
2036,Hyundai Santro LP zipPlus,Coimbatore,2004,52146,Petrol,Manual,First,0.0 kmpl,,,,,1.93


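Along with dropping the New_Price column, the missing values in Mileage, Engine, Power, and Seats need to be handled in some fashion. The rows with missing Mileage should be dropped, because imputing them could add error to the data (how do we know the mileage? It would be a complete guess) and there are only 2 of them, so dropping them is relatively inconsequential. In principle, Engine can be used to infer Power, and vice versa. However, in the rows shown above Engine, Power, and Seats are all missing together, so neither can be inferred from the other there; the next cell therefore drops every row that still has a missing value.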

In [32]:
# Remove the New_Price column first, then discard every row that still contains a
# missing value in any remaining column.
data_no_missing = (
    raw_data
    .drop(columns=['New_Price'])
    .dropna()
)

# (rows, columns) left after the clean-up.
data_no_missing.shape

(5807, 12)

In [18]:
# Confirm that no column of the cleaned frame has missing entries left.
data_no_missing.isnull().sum()

Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

b) Remove the units from some of the attributes and only keep the numerical values

In [33]:
# Strip units and convert to numerical format for 'Mileage', 'Engine', and 'Power'.
# Caveat: the Mileage previews contain both 'kmpl' and 'km/kg' entries; only the number
# is kept, so both units end up on the same scale without any conversion.

# Take an explicit copy before assigning columns: data_no_missing is the result of
# dropna(), and writing into such a frame can be flagged by pandas'
# SettingWithCopyWarning.
data_no_missing = data_no_missing.copy()

# Raw strings: '\d' inside a normal string literal is an invalid escape sequence that
# Python reports with a warning. The regex text itself is unchanged.
decimal_pattern = r'(\d+\.\d+|\d+)'  # first decimal number, otherwise first integer
integer_pattern = r'(\d+)'           # first run of digits

# expand=False makes str.extract return a Series for the single capture group instead
# of a one-column DataFrame. Text that contains no digit (e.g. a 'null' placeholder)
# has no match and becomes NaN, so no separate 'null' -> NaN replacement is needed.
data_no_missing['Mileage'] = data_no_missing['Mileage'].str.extract(decimal_pattern, expand=False).astype(float)
data_no_missing['Engine'] = data_no_missing['Engine'].str.extract(integer_pattern, expand=False).astype(float)
data_no_missing['Power'] = data_no_missing['Power'].str.extract(decimal_pattern, expand=False).astype(float)

# Check the correlations between numerical features
numeric_columns = ['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Price']
correlations = data_no_missing[numeric_columns].corr()

correlations

  data_no_missing['Mileage'] = data_no_missing['Mileage'].str.extract('(\d+\.\d+|\d+)').astype(float)
  data_no_missing['Engine'] = data_no_missing['Engine'].str.extract('(\d+)').astype(float)
  data_no_missing['Power'] = data_no_missing['Power'].str.replace('null', 'NaN').str.extract('(\d+\.\d+|\d+)').astype(float)


Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Price
Year,1.0,-0.169006,0.29143,-0.06826,0.015246,0.300675
Kilometers_Driven,-0.169006,1.0,-0.061852,0.09322,0.033328,-0.00833
Mileage,0.29143,-0.061852,1.0,-0.636932,-0.536329,-0.340406
Engine,-0.06826,0.09322,-0.636932,1.0,0.865013,0.656887
Power,0.015246,0.033328,-0.536329,0.865013,1.0,0.772422
Price,0.300675,-0.00833,-0.340406,0.656887,0.772422,1.0


In [25]:
# Preview the first rows to confirm Mileage, Engine, and Power are now plain numbers.
data_no_missing.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
1,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13.0,1199.0,88.7,5.0,4.5
2,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
3,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
4,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5


The lowest value for Mileage is 0. This is obviously an error or a placeholder for missing data, so these entries should be treated as missing values.

In [29]:
# Summary statistics of the numeric columns; the 'min' row is where the Mileage of 0 shows up.
data_no_missing.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0
mean,2013.475805,58349.02,18.204169,1631.839332,113.827634,5.286551,9.673429
std,3.170718,92655.56,4.289961,601.822651,53.903495,0.80679,11.292012
min,1998.0,171.0,0.0,624.0,34.2,2.0,0.44
25%,2012.0,33288.0,15.26,1198.0,78.0,5.0,3.59
50%,2014.0,52400.0,18.2,1497.0,98.6,5.0,5.75
75%,2016.0,72457.5,21.1,1991.0,139.04,5.0,10.25
max,2019.0,6500000.0,28.4,5998.0,560.0,10.0,160.0


In [34]:
# Display the rows whose parsed Mileage is exactly zero.
data_no_missing.loc[data_no_missing['Mileage'].eq(0)]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
12,Land Rover Freelander 2 TD4 SE,Pune,2012,85000,Diesel,Automatic,Second,0.0,2179.0,115.0,5.0,17.5
65,Mercedes-Benz C-Class Progressive C 220d,Coimbatore,2019,15369,Diesel,Automatic,First,0.0,1950.0,194.0,5.0,35.67
524,Mercedes-Benz New C-Class Progressive C 200,Kochi,2019,13190,Petrol,Automatic,First,0.0,1950.0,181.43,5.0,38.99
683,Mercedes-Benz M-Class ML 350 4Matic,Pune,2014,120000,Diesel,Automatic,First,0.0,2987.0,165.0,5.0,30.0
929,Mercedes-Benz C-Class Progressive C 220d,Mumbai,2018,8682,Diesel,Automatic,First,0.0,1950.0,194.0,5.0,39.5
962,Hyundai Santro Xing GL,Pune,2008,93000,Petrol,Manual,First,0.0,1086.0,62.0,5.0,1.45
1025,Hyundai Santro Xing GL,Hyderabad,2010,58163,Petrol,Manual,First,0.0,1086.0,62.0,5.0,2.45
1220,Land Rover Freelander 2 TD4 S,Bangalore,2010,125000,Diesel,Automatic,Second,0.0,2179.0,115.0,5.0,11.0
1267,Mercedes-Benz M-Class ML 350 4Matic,Bangalore,2014,33000,Diesel,Automatic,Second,0.0,2987.0,165.0,5.0,43.0
1309,Hyundai Santro Xing GL,Kochi,2011,20842,Petrol,Manual,First,0.0,1086.0,62.0,5.0,2.78


In [35]:
# (rows, columns) of the zero-Mileage subset, i.e. how many rows are affected.
data_no_missing.loc[data_no_missing['Mileage'].eq(0)].shape

(28, 12)