In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Load the Toyota dataset: the first CSV column becomes the row index and the
# placeholder strings '??' and '????' are read as missing values (NaN).
cars_data = pd.read_csv(
    'Toyota.csv',
    index_col=0,
    na_values=['??', '????'],
)

In [3]:
# Two independent working copies of the loaded frame: cars_data2 is imputed
# column by column, cars_data3 is imputed in a single apply() call.
cars_data2, cars_data3 = cars_data.copy(), cars_data.copy()

In a pandas DataFrame, missing data is represented as NaN (Not a Number).
To check for null values in a pandas DataFrame, use isnull() or isna(); isnull() is an alias of isna().
Both functions return a DataFrame of Boolean values that are True wherever a value is missing.

# Identifying missing values
- Dataframe.isna().sum()
- Dataframe.isnull().sum()

In [7]:
# Count the missing (NaN) entries in each column using isnull().
cars_data2.isnull().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [8]:
# Same per-column count of missing entries, this time using isna().
cars_data2.isna().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [9]:
# Keep only the rows in which at least one column is NaN.
missing = cars_data2.loc[cars_data2.isnull().any(axis='columns')]

In [11]:
# (rows, columns) of the subset of rows that contain at least one missing value.
missing.shape

(340, 10)

In [12]:
# Show the first five rows that contain at least one missing value.
print(missing.head())

    Price   Age       KM FuelType    HP  MetColor  Automatic    CC Doors  \
2   13950  24.0  41711.0   Diesel  90.0       NaN          0  2000     3   
6   16900  27.0      NaN   Diesel   NaN       NaN          0  2000     3   
7   18600  30.0  75889.0      NaN  90.0       1.0          0  2000     3   
9   12950  23.0  71138.0   Diesel   NaN       NaN          0  1900     3   
15  22000  28.0  18739.0   Petrol   NaN       0.0          0  1800     3   

    Weight  
2     1165  
6     1245  
7     1245  
9     1105  
15    1185  


# Approaches to fill the missing values
- Two approaches:
  1. Fill the missing values by mean/median, in case of numerical variables.
  2. Fill the missing values with mode, in case of categorical variable.

## Imputing the missing values

### DataFrame.describe()
- Generate descriptive statistics that summarize the central tendency, dispersion and shape of a dataset's distribution excluding NaN values.

In [13]:
# Summary statistics of the numeric columns (NaN values are excluded);
# used to pick the mean/median values for imputation below.
cars_data2.describe()

Unnamed: 0,Price,Age,KM,HP,MetColor,Automatic,CC,Weight
count,1436.0,1336.0,1421.0,1430.0,1286.0,1436.0,1436.0,1436.0
mean,10730.824513,55.672156,68647.239972,101.478322,0.674961,0.05571,1566.827994,1072.45961
std,3626.964585,18.589804,37333.023589,14.768255,0.468572,0.229441,187.182436,52.64112
min,4350.0,1.0,1.0,69.0,0.0,0.0,1300.0,1000.0
25%,8450.0,43.0,43210.0,90.0,0.0,0.0,1400.0,1040.0
50%,9900.0,60.0,63634.0,110.0,1.0,0.0,1600.0,1070.0
75%,11950.0,70.0,87000.0,110.0,1.0,0.0,1600.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,1.0,2000.0,1615.0


#### Imputing missing values of 'Age'
- To fill NA/NaN values using the specified value.
  - DataFrame.fillna()

In [14]:
# Replace missing 'Age' values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on cars_data2['Age']: that chained in-place call
# acts on an intermediate Series, triggers a FutureWarning in pandas 2.2 and
# no longer updates cars_data2 under Copy-on-Write (pandas 3.0).
cars_data2['Age'] = cars_data2['Age'].fillna(cars_data2['Age'].mean())

In [16]:
# Re-count missing values per column to confirm 'Age' has none left.
cars_data2.isnull().sum()

Price          0
Age            0
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

#### Imputing missing values of 'KM'

In [17]:
# Replace missing 'KM' values with the column median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on cars_data2['KM']: that chained in-place call
# acts on an intermediate Series, triggers a FutureWarning in pandas 2.2 and
# no longer updates cars_data2 under Copy-on-Write (pandas 3.0).
cars_data2['KM'] = cars_data2['KM'].fillna(cars_data2['KM'].median())

In [18]:
# Re-count missing values per column to confirm 'KM' has none left.
cars_data2.isnull().sum()

Price          0
Age            0
KM             0
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

#### Imputing missing values of 'HP'

In [19]:
# Replace missing 'HP' values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on cars_data2['HP']: that chained in-place call
# acts on an intermediate Series, triggers a FutureWarning in pandas 2.2 and
# no longer updates cars_data2 under Copy-on-Write (pandas 3.0).
cars_data2['HP'] = cars_data2['HP'].fillna(cars_data2['HP'].mean())

In [20]:
# Re-count missing values per column to confirm 'HP' has none left.
cars_data2.isnull().sum()

Price          0
Age            0
KM             0
FuelType     100
HP             0
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

#### Imputing missing values of 'FuelType'

In [23]:
# Most frequent 'FuelType' value(s); mode() returns a Series, so the value
# itself is read with [0] when imputing.
cars_data2['FuelType'].mode()

0    Petrol
Name: FuelType, dtype: object

In [27]:
# Replace missing 'FuelType' values with the most frequent category
# (mode() returns a Series, hence the [0]).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on cars_data2['FuelType']: that chained in-place
# call acts on an intermediate Series, triggers a FutureWarning in pandas 2.2
# and no longer updates cars_data2 under Copy-on-Write (pandas 3.0).
cars_data2['FuelType'] = cars_data2['FuelType'].fillna(cars_data2['FuelType'].mode()[0])

#### Imputing missing values of 'MetColor'

In [24]:
# Most frequent 'MetColor' value(s); mode() returns a Series, so the value
# itself is read with [0] when imputing.
cars_data2['MetColor'].mode()

0    1.0
Name: MetColor, dtype: float64

In [28]:
# Replace missing 'MetColor' values with the most frequent value
# (mode() returns a Series, hence the [0]).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on cars_data2['MetColor']: that chained in-place
# call acts on an intermediate Series, triggers a FutureWarning in pandas 2.2
# and no longer updates cars_data2 under Copy-on-Write (pandas 3.0).
cars_data2['MetColor'] = cars_data2['MetColor'].fillna(cars_data2['MetColor'].mode()[0])

In [29]:
# Final per-column count of missing values; every column should now report 0.
cars_data2.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

### Imputing missing values using lambda functions
- To fill the NA/NaN values in both numerical and categorical variables at one stretch.

In [31]:
# Impute every column of cars_data3 in one pass (apply() calls the lambda once
# per column):
#   * float columns      -> filled with the column mean
#   * all other columns  -> filled with the most frequent value
#   * columns with no non-missing value at all are returned unchanged, because
#     value_counts() would be empty and .index[0] would raise IndexError.
# is_float_dtype() is used instead of `x.dtype == 'float'`, which only matches
# float64 and would send other float widths down the "most frequent" branch.
# NOTE(review): a 0/1 flag that is stored as float because it contains NaN
# (MetColor in this dataset) takes the mean branch here, whereas it was imputed
# with the mode in the column-by-column approach -- confirm this is intended.
cars_data3 = cars_data3.apply(
    lambda x: (
        x.fillna(x.mean())
        if pd.api.types.is_float_dtype(x)
        else (x.fillna(x.value_counts().index[0]) if x.notna().any() else x)
    )
)