# What If, we have to use Different Strategies for Different Variables

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
train = pd.read_csv(r"C:\Users\Dell\dataSets\train.csv")
test = pd.read_csv(r"C:\Users\Dell\dataSets\test.csv")
print("Shape of train dataframe : ", train.shape)
print("Shape of test dataframe : ", test.shape)

Shape of train dataframe :  (1460, 81)
Shape of test dataframe :  (1460, 80)


In [5]:
X_train = train.drop(columns = "SalePrice") #Saleprice is here target variable...
Y_train = train["SalePrice"]                ## So 'SalePrice' column is taken as another dataFrame... with single column.
X_test = test.copy()

print("Shape of X_train dataframe : ", X_train.shape)
print("Shape of Y_train dataframe : ", Y_train.shape)
print("Shape of X_test dataframe : ", X_test.shape)

Shape of X_train dataframe :  (1460, 80)
Shape of Y_train dataframe :  (1460,)
Shape of X_test dataframe :  (1460, 80)


## Missing Values Immputations

In [6]:
isnull_sum = X_train.isnull().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [7]:
# taking only numerical variables which have missing values...
num_vars = X_train.select_dtypes(include=["int64","float64"]).columns
num_vars_miss = [var for var in num_vars if isnull_sum[var] > 0]
num_vars_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [9]:
# taking only categorical variables which have missing values...
cat_vars = X_train.select_dtypes(include=["O"]).columns
cat_vars_miss = [var for var in cat_vars if isnull_sum[var] > 0]
cat_vars_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [11]:
# say we have to apply startegies for following variables as follows..

num_vars_mean = ['LotFrontage']  # going to impute by mean value
num_vars_median = ['MasVnrArea', 'GarageYrBlt']   # going to impute by median value

cat_vars_mode = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'Electrical', 'FireplaceQu']  # going to impute by mode value

cat_vars_const = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
# going to impute by constant value

### Using Pipeline Class... to impute Missing Values.

In [12]:
# syntax : Pipeline(name, transform_method)  --> we are using SimpleImputer() method here..

# Creation of Pipelines by applying Different Strategies and Methods..
num_vars_mean_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy = 'mean'))])
num_vars_median_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy = 'median'))])
cat_vars_mode_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy = 'most_frequent'))])
cat_vars_const_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy = 'constant', fill_value="Missing"))])

### Using ColumnTransformer

In [16]:
# we will give variables to These Pipeline Strategies.. using ColumnTransformer..

#syntax : ColumnTransformer(transformers=[('name', pipeline_variable_name, column_names_tobe_apply)])

preprocessor = ColumnTransformer(transformers = [("mean_imputer", num_vars_mean_imputer, num_vars_mean),
                                 ("median_imputer", num_vars_median_imputer, num_vars_median),
                                 ("mode_imputer", cat_vars_mode_imputer, cat_vars_mode),
                                 ("const_imputer", cat_vars_const_imputer, cat_vars_const),])
preprocessor

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('const_imputer',
                                 Pipeli

In [17]:
# now we have to use fit() method..
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('const_imputer',
                                 Pipeli

In [19]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('const_impu

In [20]:
# now to check values... we use
preprocessor.named_transformers_["mean_imputer"].named_steps['imputer'].statistics_

array([70.04995837])

In [21]:
# is it right??
train["LotFrontage"].mean()

70.04995836802665

In [22]:
preprocessor.named_transformers_["median_imputer"].named_steps['imputer'].statistics_

array([   0., 1980.])

In [23]:
preprocessor.named_transformers_["mode_imputer"].named_steps['imputer'].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [24]:
# now we have to impute these values to our main dataframe...
X_train_clean = preprocessor.transform(X_train)
X_test_clean = preprocessor.transform(X_test)
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'Missing', 'Missing', 'Missing'],
       [80.0, 0.0, 1976.0, ..., 'Missing', 'Missing', 'Missing'],
       [68.0, 162.0, 2001.0, ..., 'Missing', 'Missing', 'Missing'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'Missing', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'Missing', 'Missing', 'Missing'],
       [75.0, 0.0, 1965.0, ..., 'Missing', 'Missing', 'Missing']],
      dtype=object)

In [26]:
# Now we have to convert these 2D numpy array into dataframe...

# first check how many columns are missied...
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('const_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='Missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50

In [28]:
# creating dataframe by concatenation...

X_train_clean_miss_vars = pd.DataFrame(X_train_clean, columns = num_vars_mean + num_vars_median + cat_vars_mode + cat_vars_const)

In [29]:
X_train_clean_miss_vars.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65.0,196.0,2003.0,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,Missing,Missing,Missing
1,80.0,0.0,1976.0,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing
2,68.0,162.0,2001.0,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing
3,60.0,0.0,1998.0,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,Missing,Missing,Missing
4,84.0,350.0,2000.0,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing


In [31]:
X_train_clean_miss_vars.isnull().sum().sum()  # no NAN value will be there..

0