In [27]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.impute import SimpleImputer
class PipelineWithFeatureNames(Pipeline):
    """Pipeline that remembers its input column names.

    The stored names are handed to the final step when output feature names
    are requested, so generated columns are labelled with the original
    column names instead of positional placeholders.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Forwarded unchanged to ``Pipeline``.
    memory, verbose
        Forwarded unchanged to ``Pipeline``.
    feature_names : list of str, optional
        Names of the columns fed into the pipeline.
    """

    def __init__(self, steps, memory=None, verbose=False, feature_names=None):
        # Forward by keyword: newer scikit-learn releases declare `memory`
        # and `verbose` keyword-only, so positional forwarding raises TypeError.
        Pipeline.__init__(self, steps, memory=memory, verbose=verbose)
        self.feature_names = feature_names

    def get_feature_names(self):
        """Return the output feature names produced by the final step.

        The final step is asked for its names with ``input_features`` set to
        the stored ``feature_names``.
        """
        final_step = self.steps[-1][-1]
        legacy_getter = getattr(final_step, "get_feature_names", None)
        if legacy_getter is not None:
            return legacy_getter(input_features=self.feature_names)
        # Estimators without the legacy accessor expose `get_feature_names_out`.
        return final_step.get_feature_names_out(input_features=self.feature_names)
class SimpleImputerWithFeatureNames(SimpleImputer):
    """SimpleImputer that carries the names of the columns it imputes.

    Parameters
    ----------
    missing_values, strategy, fill_value, copy, add_indicator
        Forwarded unchanged to ``SimpleImputer``.
    verbose : int, default=0
        Stored on the instance (see the note in ``__init__``).
    feature_names : list of str, optional
        Names of the imputed columns, returned by ``get_feature_names``.
    """

    def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False, feature_names=None):
        # Forward by keyword: newer scikit-learn releases declare these
        # parameters keyword-only, so positional forwarding raises TypeError.
        SimpleImputer.__init__(
            self,
            missing_values=missing_values,
            strategy=strategy,
            fill_value=fill_value,
            copy=copy,
            add_indicator=add_indicator,
        )
        # `verbose` is set as an attribute rather than forwarded, so the class
        # still constructs on releases whose SimpleImputer no longer accepts it,
        # while `get_params` / `clone` can still read it back.
        self.verbose = verbose
        self.feature_names = feature_names

    def get_feature_names(self):
        """Return the stored column names unchanged.

        NOTE(review): the names are not adjusted for extra indicator columns
        (``add_indicator=True``) or for columns the imputer may drop; confirm
        the output width matches before relying on them.
        """
        return self.feature_names
# Load the training table and drop the identifier and target columns so that
# only predictor columns reach the transformers.
train_data = pd.read_csv('train.csv').drop(['Id', 'SalePrice'], axis=1)

# Categorical columns that are one-hot encoded without any imputation step.
categorial_features = [
    'MSSubClass', 'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl',
    'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
    'CentralAir',
    'PavedDrive', 'SaleCondition',
]
# Placeholder for ordinal categorical columns; currently empty.
categorial_features_ordinal = []
# Categorical columns whose missing values are filled with the constant 'Nan'.
categorial_features_with_missing_values_const = [
    'Alley', 'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageQual', 'GarageFinish',
    'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
# Categorical columns whose missing values are filled with the most frequent value.
categorial_features_with_missing_values_impute = [
    'SaleType', 'Functional', 'KitchenQual', 'Exterior1st', 'Exterior2nd',
    'Utilities', 'MSZoning', 'MasVnrType', 'Electrical',
]
# Numeric columns whose missing values are filled with the column median.
numeric_columns_impute = [
    'GarageArea', 'GarageCars', 'BsmtHalfBath', 'BsmtFullBath', 'TotalBsmtSF',
    'BsmtUnfSF', 'BsmtFinSF1', 'BsmtFinSF2', 'LotFrontage', 'MasVnrArea',
    'GarageYrBlt',
]

# Constant-fill imputation followed by one-hot encoding.
categorical_with_missing_values_pipeline_const = PipelineWithFeatureNames(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Nan')),
        ('onehot', OneHotEncoder(sparse=False, dtype=int, categories='auto')),
    ],
    feature_names=categorial_features_with_missing_values_const,
)

# For now impute with the most frequent value: 'median' is not supported for
# non-numeric (categorical) columns, and the mode plays a similar role here.
# A model-based imputer may be worth trying later.
categorical_with_missing_values_pipeline_impute = PipelineWithFeatureNames(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse=False, dtype=int, categories='auto')),
    ],
    feature_names=categorial_features_with_missing_values_impute,
)

# Plain one-hot encoding for categorical columns with no imputation step.
categorical_pipeline = PipelineWithFeatureNames(
    steps=[
        ('onehot', OneHotEncoder(sparse=False, dtype=int, categories='auto')),
    ],
    feature_names=categorial_features,
)


def _build_column_transformers():
    """Return the (name, transformer, columns) triples used by both ColumnTransformers.

    Each call returns a new list holding a new median imputer; the three
    categorical pipeline objects defined above are reused.
    """
    return [
        ('categorical_with_missing_values_const',
         categorical_with_missing_values_pipeline_const,
         categorial_features_with_missing_values_const),
        ('categorical_with_missing_values_impute',
         categorical_with_missing_values_pipeline_impute,
         categorial_features_with_missing_values_impute),
        ('categorical',
         categorical_pipeline,
         categorial_features),
        ('numeric_with_missing_values_impute',
         SimpleImputerWithFeatureNames(strategy='median', feature_names=numeric_columns_impute),
         numeric_columns_impute),
    ]


# Transforms only the listed columns (ColumnTransformer's default `remainder`).
categoricalTransformer = ColumnTransformer(transformers=_build_column_transformers())
# Same transformers, with `remainder` set to "passthrough".
categoricalTransformerPassthrough = ColumnTransformer(
    transformers=_build_column_transformers(),
    remainder="passthrough",
)


In [28]:
# Fit the transformer on the training data and show the transformed matrix
# labelled with the generated feature names. `data` is evaluated first, so the
# transformer is already fitted when the names are requested.
pd.DataFrame(
    data=categoricalTransformer.fit_transform(train_data),
    columns=categoricalTransformer.get_feature_names(),
)










Unnamed: 0,categorical_with_missing_values_const__Alley_Grvl,categorical_with_missing_values_const__Alley_Nan,categorical_with_missing_values_const__Alley_Pave,categorical_with_missing_values_const__BsmtCond_Fa,categorical_with_missing_values_const__BsmtCond_Gd,categorical_with_missing_values_const__BsmtCond_Nan,categorical_with_missing_values_const__BsmtCond_Po,categorical_with_missing_values_const__BsmtCond_TA,categorical_with_missing_values_const__BsmtQual_Ex,categorical_with_missing_values_const__BsmtQual_Fa,...,numeric_with_missing_values_impute__GarageCars,numeric_with_missing_values_impute__BsmtHalfBath,numeric_with_missing_values_impute__BsmtFullBath,numeric_with_missing_values_impute__TotalBsmtSF,numeric_with_missing_values_impute__BsmtUnfSF,numeric_with_missing_values_impute__BsmtFinSF1,numeric_with_missing_values_impute__BsmtFinSF2,numeric_with_missing_values_impute__LotFrontage,numeric_with_missing_values_impute__MasVnrArea,numeric_with_missing_values_impute__GarageYrBlt
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,0.0,1.0,856.0,150.0,706.0,0.0,65.0,196.0,2003.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,1.0,0.0,1262.0,284.0,978.0,0.0,80.0,0.0,1976.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,0.0,1.0,920.0,434.0,486.0,0.0,68.0,162.0,2001.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,1.0,756.0,540.0,216.0,0.0,60.0,0.0,1998.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,1.0,1145.0,490.0,655.0,0.0,84.0,350.0,2000.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,0.0,1.0,796.0,64.0,732.0,0.0,85.0,0.0,1993.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,2.0,0.0,1.0,1686.0,317.0,1369.0,0.0,75.0,186.0,2004.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,0.0,1.0,1107.0,216.0,859.0,32.0,69.0,240.0,1973.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,0.0,0.0,952.0,952.0,0.0,0.0,51.0,0.0,1931.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,991.0,140.0,851.0,0.0,50.0,0.0,1939.0


In [29]:
# List the generated output column names; each is prefixed with the name of
# the transformer that produced it, followed by "__".
categoricalTransformer.get_feature_names()

['categorical_with_missing_values_const__Alley_Grvl',
 'categorical_with_missing_values_const__Alley_Nan',
 'categorical_with_missing_values_const__Alley_Pave',
 'categorical_with_missing_values_const__BsmtCond_Fa',
 'categorical_with_missing_values_const__BsmtCond_Gd',
 'categorical_with_missing_values_const__BsmtCond_Nan',
 'categorical_with_missing_values_const__BsmtCond_Po',
 'categorical_with_missing_values_const__BsmtCond_TA',
 'categorical_with_missing_values_const__BsmtQual_Ex',
 'categorical_with_missing_values_const__BsmtQual_Fa',
 'categorical_with_missing_values_const__BsmtQual_Gd',
 'categorical_with_missing_values_const__BsmtQual_Nan',
 'categorical_with_missing_values_const__BsmtQual_TA',
 'categorical_with_missing_values_const__BsmtExposure_Av',
 'categorical_with_missing_values_const__BsmtExposure_Gd',
 'categorical_with_missing_values_const__BsmtExposure_Mn',
 'categorical_with_missing_values_const__BsmtExposure_Nan',
 'categorical_with_missing_values_const__BsmtExpos

In [30]:
# Fit and apply the variant built with remainder="passthrough"; the result is
# displayed as a raw array with no column labels attached.
categoricalTransformerPassthrough.fit_transform(train_data)

array([[0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 2.000e+00,
        2.008e+03],
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 5.000e+00,
        2.007e+03],
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 9.000e+00,
        2.008e+03],
       ...,
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 2.500e+03, 5.000e+00,
        2.010e+03],
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 4.000e+00,
        2.010e+03],
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 6.000e+00,
        2.008e+03]])