# Handling Missing Data
    1.. Replace it with some other data (mean)
    2.. removing that missing data item

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
carSalesMissing = pd.read_csv('../pandas/car-sales-extended-missing-data.csv')
carSalesMissing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [3]:
x = carSalesMissing.drop('Price', axis=1)
y = carSalesMissing['Price']

In [4]:
# Converting categorical data to numeric
categoricalFeatures = ['Make', 'Colour', 'Doors']
transformedX = pd.get_dummies(x, columns=categoricalFeatures)

In [5]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(transformedX, y, test_size = 0.2)

In [6]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

### 1. Filling the missing data with pandas! 

In [7]:
 # filling categorical data with 'missing' and numeric data with the mean of the respctive columns
    
carSalesMissing['Make'].fillna('missing', inplace = True)
carSalesMissing['Colour'].fillna('missing', inplace = True)
carSalesMissing['Odometer (KM)'].fillna(carSalesMissing['Odometer (KM)'].mean(), inplace = True)

# As the Doors column is actually Categorical and not numeric so we fill it up with 4  as its the most common category

carSalesMissing['Doors'].fillna(4, inplace = True)

In [8]:
carSalesMissing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [9]:
# Removing the Rows with missing price value as it's hard predict withot the label..
carSalesMissing.dropna(inplace = True)

In [10]:
len(carSalesMissing)

950

In [11]:
x = carSalesMissing.drop('Price', axis=1)
y = carSalesMissing['Price']

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ['Make', 'Colour', 'Doors']
oneHot = OneHotEncoder()
transformer = ColumnTransformer([('oneHot',
                                 oneHot,
                                 categoricalFeatures)],
                               remainder = 'passthrough')
transformedX = transformer.fit_transform(carSalesMissing)
print(transformedX)

[[0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 3.54310e+04
  1.53230e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 1.92714e+05
  1.99430e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 8.47140e+04
  2.83430e+04]
 ...
 [0.00000e+00 0.00000e+00 1.00000e+00 ... 0.00000e+00 6.66040e+04
  3.15700e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 2.15883e+05
  4.00100e+03]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 2.48360e+05
  1.27320e+04]]


In [13]:
pd.DataFrame(transformedX)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0,14043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0,32042.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0,5716.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0,31570.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0,4001.0


In [14]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(transformedX, y, test_size = 0.2)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

model.fit(trainX, trainY)
model.score(testX, testY)

0.9997032730125649

### 2. Filling Missing values with Scikit-learn

In [15]:
carSalesMissing = pd.read_csv('../pandas/car-sales-extended-missing-data.csv')

In [16]:
# dropping the rows without label (i.e. from the target feature - 'Price')
carSalesMissing.dropna(subset=['Price'], inplace = True)

In [17]:
carSalesMissing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [18]:
x = carSalesMissing.drop('Price', axis=1)
y = carSalesMissing['Price']

In [25]:
##### fill missing values with ScikitLearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' and numeric values with the mean
# defining what to be filled at missing places
categoricalImputer = SimpleImputer(strategy = 'constant', fill_value = 'missing')
numericImputer = SimpleImputer(strategy = 'mean')
doorImputer = SimpleImputer(strategy = 'constant', fill_value = 4)

# Define Columns whhich are to be filled 
categoricalFeatures = ['Make', 'Colour']
doorFeature = ['Doors']
numericFeatures = ['Odometer (KM)']

# Create an imputer (somethhing that actually fills the missing values)
imputer = ColumnTransformer([
    ('categoricalImputer', categoricalImputer, categoricalFeatures),
    ('numericImputer', numericImputer, numericFeatures),
    ('doorImputer', doorImputer,  doorFeature)
])
# Transform the data
transformedX = imputer.fit_transform(x)
transformedX

array([['Honda', 'White', 35431.0, 4.0],
       ['BMW', 'Blue', 192714.0, 5.0],
       ['Honda', 'White', 84714.0, 4.0],
       ...,
       ['Nissan', 'Blue', 66604.0, 4.0],
       ['Honda', 'White', 215883.0, 4.0],
       ['Toyota', 'Blue', 248360.0, 4.0]], dtype=object)

In [26]:
carSalesFilled = pd.DataFrame(transformedX, columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])
carSalesFilled

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
945,Toyota,Black,35820,4
946,missing,White,155144,3
947,Nissan,Blue,66604,4
948,Honda,White,215883,4


In [28]:
carSalesFilled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [31]:
# Noe encoding the complete data into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ['Make', 'Colour', 'Doors']
oneHot = OneHotEncoder()
transformer = ColumnTransformer([('oneHot',
                                 oneHot,
                                 categoricalFeatures)],
                                 remainder = 'passthrough')
transformedX = transformer.fit_transform(carSalesFilled)
transformedX

<950x913 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [33]:
# now we'have got our data as numbers and filled
np.random.seed(45)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(transformedX, y, test_size = 0.2)
model.fit(trainX, trainY)
model.score(testX, testY)

-0.03734639431298126