# Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv("C:\\Users\\user\\Desktop\\Repositories\\Data_Cleaning\\Datasets\\house-prices-advanced-regression-techniques\\train.csv")
test = pd.read_csv("C:\\Users\\user\\Desktop\\Repositories\\Data_Cleaning\\Datasets\\house-prices-advanced-regression-techniques\\test.csv")
print("Train = ", train.shape)
print("Test = ", test.shape)

Train =  (1460, 81)
Test =  (1459, 80)


In [4]:
X_train = train.drop(columns='SalePrice')
Y_train = train['SalePrice']
print("X_ train = ", X_train.shape)
print("Y_ train = ", Y_train.shape)
print("test = ", test.shape)

X_ train =  (1460, 80)
Y_ train =  (1460,)
test =  (1459, 80)


# Missing value imputation

In [7]:
is_null = X_train.isnull().sum()
is_null

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [10]:
num_vars = X_train.select_dtypes(include=["int64",'float64']).columns
num_vars_miss = {var for var in num_vars if is_null[var]>0}
num_vars_miss

{'GarageYrBlt', 'LotFrontage', 'MasVnrArea'}

In [13]:
cat_vars = X_train.select_dtypes(include=['O']).columns
cat_var_miss = [var for var in cat_vars if is_null[var]>0]
cat_var_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [14]:
num_var_mean = ['LotFrontage']
num_var_median = ['MasVnrArea','GarageYrBlt']
cat_vars_mode = [
    'Alley',
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
]
cat_var_constant = [
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature'
]


In [16]:
num_var_mean_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
num_var_median_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
cat_vars_mode_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent"))])
cat_vars_constant_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value = "Missing"))])

In [17]:
preprocessor = ColumnTransformer(transformers=[
    ('mean_imputer',num_var_mean_imputer, num_var_mean),
    ("median_imputer", num_var_median_imputer, num_var_median),
    ("mode_imputer", cat_vars_mode_imputer,cat_vars_mode),
    ("constant_imputer", cat_vars_constant_imputer,cat_var_constant) 
])

In [19]:
preprocessor.fit(X_train)

In [21]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('constant_i

In [23]:
preprocessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([70.04995837])

In [25]:
train["LotFrontage"].mean()

70.04995836802665

In [26]:
X_train_clean = preprocessor.transform(X_train)
test_clean = preprocessor.transform(test)

In [27]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('constant_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='Missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
  

In [28]:
X_train_miss_var = pd.DataFrame(X_train_clean, columns=num_var_mean +num_var_median + cat_vars_mode+cat_var_constant)

In [30]:
X_train_miss_var.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65.0,196.0,2003.0,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,Missing,Missing,Missing
1,80.0,0.0,1976.0,Grvl,BrkFace,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing
2,68.0,162.0,2001.0,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing
3,60.0,0.0,1998.0,Grvl,BrkFace,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,Missing,Missing,Missing
4,84.0,350.0,2000.0,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing


In [32]:
X_train_miss_var.isnull().sum().sum()

0