## Methods 2, 3, 4 -> Using the scikit-learn module (all strategies at once)

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [2]:
# dataset link -> https://drive.google.com/drive/folders/1gdfNSb3ptz0WvShshUeiM-kqfMYzQ5tm?usp=sharing
# NOTE(review): DATA_DIR is an absolute local path; change it here when the
# notebook is run on another machine.
DATA_DIR = r"C:\Users\lalit\Documents\JUPYTER NOTEBOOK\house-prices-advanced-regression-techniques"

# Load the train and test CSV files from the same folder.
train = pd.read_csv(rf"{DATA_DIR}\train.csv")
test = pd.read_csv(rf"{DATA_DIR}\test.csv")

print("shape of train df = ", train.shape)
print("shape of test df = ", test.shape)

shape of train df =  (1460, 81)
shape of test df =  (1459, 80)


In [3]:
# Split the training frame into the target (y_train) and the features
# (X_train); the test frame is copied unchanged as X_test.
y_train = train["SalePrice"]
X_train = train.drop(columns="SalePrice")
X_test = test.copy()

print("shape of X_train df = ", X_train.shape)
print("shape of y_train df = ", y_train.shape)
print("shape of X_test df = ", X_test.shape)

shape of X_train df =  (1460, 80)
shape of y_train df =  (1460,)
shape of X_test df =  (1459, 80)


## Missing value imputation

In [4]:
# Number of missing values in each feature column (Series indexed by column name).
isnull_sum = X_train.isna().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [5]:
# Numerical columns (int64 / float64) that contain at least one missing value.
num_vars = X_train.select_dtypes(include=["int64", "float64"]).columns
num_vars_miss = [col for col in num_vars if isnull_sum[col] > 0]

In [6]:
# Show the numerical columns that have at least one missing value.
num_vars_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [8]:
# Categorical columns (object dtype) that contain at least one missing value.
cat_vars = X_train.select_dtypes(include=["object"]).columns
cat_vars_miss = [col for col in cat_vars if isnull_sum[col] > 0]
cat_vars_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
# Assign every column with missing values to exactly one imputation strategy.

# Numerical columns filled with the column mean.
num_var_mean = ["LotFrontage"]

# Numerical columns filled with the column median.
num_var_median = ["MasVnrArea", "GarageYrBlt"]

# Categorical columns filled with the most frequent value (mode).
cat_var_mode = [
    "Alley",
    "MasVnrType",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "Electrical",
    "FireplaceQu",
]

# Categorical columns filled with a constant placeholder value.
cat_var_const = [
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PoolQC",
    "Fence",
    "MiscFeature",
]

In [10]:
# One single-step Pipeline per imputation strategy; each wraps a SimpleImputer
# registered under the step name "imputer".
num_var_mean_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean"))]
)
num_var_median_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)
cat_var_mode_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent"))]
)
# Missing entries in these columns are replaced by the literal placeholder string.
cat_var_const_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value="KHALI COLUMN"))]
)


In [11]:
# Attach each imputation pipeline to the columns it should fill.
# Every transformer entry is a (name, pipeline, columns) tuple.
# Columns that are not listed in any entry are dropped from the output,
# because the ColumnTransformer remainder is left at its default ("drop").
imputation_steps = [
    ("mean_imputer", num_var_mean_imputer, num_var_mean),
    ("median_imputer", num_var_median_imputer, num_var_median),
    ("mode_imputer", cat_var_mode_imputer, cat_var_mode),
    ("const_imputer", cat_var_const_imputer, cat_var_const),
]
prep = ColumnTransformer(transformers=imputation_steps)

In [12]:
# Fit all imputers on the training features, so the fill values
# (mean / median / mode) are learned from X_train.
prep.fit(X_train)

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('const_imputer',
                                 Pipeli

In [13]:
# detailed info
# NOTE(review): there are no call parentheses here, so this cell only displays
# the repr of the bound `transform` method (which embeds the transformer's
# repr); no data is transformed by this cell.
prep.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('const_impu

In [14]:
# Fill value learned by the mean imputer during fit
# (statistics_ holds one entry per column handled by that imputer).
prep.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([70.04995837])

In [15]:
# Cross-check: the LotFrontage mean computed directly with pandas, for
# comparison with the statistic stored by the mean imputer.
train["LotFrontage"].mean()

70.04995836802665

In [16]:
# Most frequent value learned for each column handled by the mode imputer,
# in the same order as cat_var_mode.
prep.named_transformers_["mode_imputer"].named_steps["imputer"].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [17]:
# Apply the already-fitted imputers to both feature sets; the test set is
# filled with the statistics learned from the training set.
X_train_clean, X_test_clean = (
    prep.transform(features) for features in (X_train, X_test)
)

In [18]:
# Inspect the imputed training data; transform() returned a NumPy array
# (no column labels).
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'KHALI COLUMN', 'KHALI COLUMN',
        'KHALI COLUMN'],
       [80.0, 0.0, 1976.0, ..., 'KHALI COLUMN', 'KHALI COLUMN',
        'KHALI COLUMN'],
       [68.0, 162.0, 2001.0, ..., 'KHALI COLUMN', 'KHALI COLUMN',
        'KHALI COLUMN'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'KHALI COLUMN', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'KHALI COLUMN', 'KHALI COLUMN',
        'KHALI COLUMN'],
       [75.0, 0.0, 1965.0, ..., 'KHALI COLUMN', 'KHALI COLUMN',
        'KHALI COLUMN']], dtype=object)

In [19]:
# Fitted transformers as (name, transformer, columns) tuples.
prep.transformers_

# The final ('remainder', 'drop', [...]) entry lists the positions of the columns
# that were not assigned to any imputer; those columns are dropped from the
# transformed output rather than passed through.

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('const_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='KHALI COLUMN',
                                 strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
  

In [20]:
# Rebuild a labelled DataFrame from the imputed array. The column names are the
# four column lists concatenated in the same order in which the transformers
# were declared in the ColumnTransformer.
imputed_columns = num_var_mean + num_var_median + cat_var_mode + cat_var_const
numeric_columns = num_var_mean + num_var_median

X_train_clean_DF = (
    # Reuse the original row index so rows stay aligned with X_train / y_train.
    pd.DataFrame(X_train_clean, columns=imputed_columns, index=X_train.index)
    # The transformed array has dtype=object (numbers and strings mixed), so
    # every column would otherwise be object dtype; restore float64 for the
    # numeric columns so numeric operations behave as expected downstream.
    .astype({col: "float64" for col in numeric_columns})
)
X_train_clean_DF

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65,196,2003,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN
1,80,0,1976,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN
2,68,162,2001,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN
3,60,0,1998,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN
4,84,350,2000,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62,0,1999,Grvl,,Gd,TA,No,Unf,Unf,SBrkr,TA,Attchd,RFn,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN
1456,85,119,1978,Grvl,Stone,Gd,TA,No,ALQ,Rec,SBrkr,TA,Attchd,Unf,TA,TA,KHALI COLUMN,MnPrv,KHALI COLUMN
1457,66,0,1941,Grvl,,TA,Gd,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,KHALI COLUMN,GdPrv,Shed
1458,68,0,1950,Grvl,,TA,TA,Mn,GLQ,Rec,FuseA,Gd,Attchd,Unf,TA,TA,KHALI COLUMN,KHALI COLUMN,KHALI COLUMN


In [21]:
# Total number of missing values remaining after imputation (0 means none left).
X_train_clean_DF.isnull().sum().sum()

0

In [23]:
# check values filled are correct or not:
# category counts for Alley in the raw training data, before imputation.
train["Alley"].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [24]:
# Category counts for Alley after imputation; Alley is in cat_var_mode, so its
# missing entries were filled with the most frequent category.
X_train_clean_DF["Alley"].value_counts()

Grvl    1419
Pave      41
Name: Alley, dtype: int64