In [211]:
import pandas as pd
import numpy as np
import importlib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import helper
importlib.reload(helper)

<module 'helper' from '/Users/home/Documents/Projects/ai-engineering-portfolio/classical-ml/house-price-prediction/notebooks/helper.py'>

In [212]:
train_df = pd.read_csv("../data/raw/train.csv")
test_df = pd.read_csv("../data/raw/test.csv")

In [213]:
train_percent_nan = helper.percent_missing(train_df)
train_percent_nan

Electrical       0.068493
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtFinType1     2.534247
BsmtExposure     2.602740
BsmtFinType2     2.602740
GarageCond       5.547945
GarageQual       5.547945
GarageFinish     5.547945
GarageYrBlt      5.547945
GarageType       5.547945
LotFrontage     17.739726
FireplaceQu     47.260274
MasVnrType      59.726027
Fence           80.753425
Alley           93.767123
MiscFeature     96.301370
PoolQC          99.520548
dtype: float64

In [214]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [215]:
## create a base pipeline by replacing missing numerical values with Median and missing categorical values with mode

X = train_df.drop('SalePrice',axis=1)

##as the y label is right skewed , we will log transform as regression expects normal distributed data
y = np.log1p(train_df['SalePrice'])

X = X.drop("Id",axis=1)

num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include='object').columns

num_pipeline = Pipeline([('imputer',SimpleImputer(strategy ='median'))])
cat_pipeline = Pipeline([('imputer',SimpleImputer(strategy = 'most_frequent')),
                          ('encoder',OneHotEncoder(handle_unknown='ignore'))])


preprocessor = ColumnTransformer([('numerical',num_pipeline,num_cols),
                                  ('categorical',cat_pipeline,cat_cols)])

pipeline = Pipeline([('preprocessor',preprocessor),
                     ('model', Ridge(alpha=1))])

scores = cross_val_score(pipeline, X, y,
                             scoring = 'neg_root_mean_squared_error',
                             cv=5)

base_rmse = -scores.mean()
base_rmse

np.float64(0.15298936794199486)

In [216]:
train_exp = train_df.copy()

In [217]:
helper.percent_missing(train_exp)


Electrical       0.068493
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtFinType1     2.534247
BsmtExposure     2.602740
BsmtFinType2     2.602740
GarageCond       5.547945
GarageQual       5.547945
GarageFinish     5.547945
GarageYrBlt      5.547945
GarageType       5.547945
LotFrontage     17.739726
FireplaceQu     47.260274
MasVnrType      59.726027
Fence           80.753425
Alley           93.767123
MiscFeature     96.301370
PoolQC          99.520548
dtype: float64

In [218]:
## drop id columns
train_exp = train_exp.drop(columns='Id',axis=1)

In [219]:
##drop columns with more than 80% row missing

train_exp = train_exp.drop(columns = ['Fence','Alley','MiscFeature','PoolQC'],axis=1)



In [220]:
train_exp['Electrical'].value_counts()

Electrical
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: count, dtype: int64

In [221]:
train_exp[train_exp['Electrical'].isnull()]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1379,80,RL,73.0,9735,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,5,2008,WD,Normal,167500


In [222]:
## Electrical has only one row missing so filling it with most common value i.e SBrkr
train_exp['Electrical'] = train_exp['Electrical'].fillna('SBrkr')
train_exp['Electrical'].value_counts()

Electrical
SBrkr    1335
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: count, dtype: int64

In [223]:
train_exp[train_exp['BsmtQual'].isnull()][['BsmtQual','BsmtCond','BsmtFinType1','BsmtExposure','BsmtFinType2']]

Unnamed: 0,BsmtQual,BsmtCond,BsmtFinType1,BsmtExposure,BsmtFinType2
17,,,,,
39,,,,,
90,,,,,
102,,,,,
156,,,,,
182,,,,,
259,,,,,
342,,,,,
362,,,,,
371,,,,,


In [224]:
##we can assume that these data dont have basement and hence fill with appropriate no basement data
bsmt_nan_col = ['BsmtQual','BsmtCond','BsmtFinType1','BsmtExposure','BsmtFinType2']
train_exp[bsmt_nan_col] = train_exp[bsmt_nan_col].fillna('NA')

In [225]:
helper.percent_missing(train_exp)


MasVnrArea       0.547945
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
LotFrontage     17.739726
FireplaceQu     47.260274
MasVnrType      59.726027
dtype: float64

In [226]:
train_exp[train_exp['GarageType'].isnull()][['GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 39 to 1453
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   GarageType    0 non-null      object 
 1   GarageYrBlt   0 non-null      float64
 2   GarageFinish  0 non-null      object 
 3   GarageQual    0 non-null      object 
 4   GarageCond    0 non-null      object 
dtypes: float64(1), object(4)
memory usage: 3.8+ KB


In [227]:
##fill missing garage data rows assuming no garage
train_exp['GarageYrBlt']=train_exp['GarageYrBlt'].fillna(0)
garage_fill_col = ['GarageType','GarageFinish','GarageQual','GarageCond']
train_exp[garage_fill_col] = train_exp[garage_fill_col].fillna('NA')

In [228]:
helper.percent_missing(train_exp)

MasVnrArea      0.547945
LotFrontage    17.739726
FireplaceQu    47.260274
MasVnrType     59.726027
dtype: float64

In [229]:
train_exp[train_exp['MasVnrType'].isnull()][['MasVnrArea']].value_counts()

MasVnrArea
0.0           859
1.0             2
288.0           1
312.0           1
344.0           1
Name: count, dtype: int64

In [230]:
##fill MasVnrtype and MasVnrArea with None and 0
train_exp['MasVnrType'] = train_exp['MasVnrType'].fillna('None')
train_exp['MasVnrArea'] = train_exp['MasVnrArea'].fillna(0)

In [231]:
helper.percent_missing(train_exp)


LotFrontage    17.739726
FireplaceQu    47.260274
dtype: float64

In [232]:
train_exp[train_exp['FireplaceQu'].isnull()][['Fireplaces']].value_counts()

Fireplaces
0             690
Name: count, dtype: int64

In [233]:
## fill FireplaceQu wit NA as Fireplaces field is having 0
train_exp['FireplaceQu'] = train_exp['FireplaceQu'].fillna('NA')

In [234]:
train_exp.groupby('Neighborhood')['LotFrontage'].transform(lambda value : value.fillna(value.mean))

mean_LotFrontage_by_Neighborhood = train_exp.groupby('Neighborhood')['LotFrontage'].mean()
mean_LotFrontage_by_Neighborhood

Neighborhood
Blmngtn    47.142857
Blueste    24.000000
BrDale     21.562500
BrkSide    57.509804
ClearCr    83.461538
CollgCr    71.682540
Crawfor    71.804878
Edwards    68.217391
Gilbert    79.877551
IDOTRR     62.500000
MeadowV    27.800000
Mitchel    70.083333
NAmes      76.462366
NPkVill    32.285714
NWAmes     81.288889
NoRidge    91.878788
NridgHt    81.881579
OldTown    62.788991
SWISU      58.913043
Sawyer     74.437500
SawyerW    71.500000
Somerst    64.666667
StoneBr    62.700000
Timber     80.133333
Veenker    59.714286
Name: LotFrontage, dtype: float64

In [235]:
##based on domain knowledge which suggest LotFrontage i.e Linear feet of street connected to property is closly determined by the neighborhood , 
# fill missing LotFrontage with mean of neighborhood
train_exp['LotFrontage'] = train_exp['LotFrontage'].fillna(train_exp['Neighborhood'].map(mean_LotFrontage_by_Neighborhood))

In [236]:
helper.percent_missing(train_exp)


Series([], dtype: float64)

##we create a pipeline for all the above mentioned above

In [237]:


from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator,TransformerMixin):
    def __init__(self,columns):
        self.columns = columns
    def get_feature_names_out(self, input_features=None):
        """Allows set_output(transform='pandas') to work."""
        return input_features
    def fit(self,X , y = None):
        return self
    def transform(self,X):
        return X.drop(columns=self.columns,axis=1, errors = 'ignore')

In [238]:
fill_zero_pipeline = Pipeline([('imputer',SimpleImputer(strategy='constant',fill_value = 0))])

fill_most_frequent_pipeline = Pipeline([('imputer',SimpleImputer(strategy='most_frequent'))])

fill_NA_pipeline = Pipeline([('imputer',SimpleImputer(strategy='constant',fill_value='NA'))])

fill_None_pipeline = Pipeline([('imputer',SimpleImputer(strategy='constant',fill_value='None'))])

fill_mode_pipeline = Pipeline([('imputer',SimpleImputer(strategy='most_frequent'))])

In [239]:
##create a Imputer to fill LotFrontage based on median of each neighbourhood

class ColumnBasedMedianImputer(BaseEstimator,TransformerMixin):
    def __init__(self,fill_column,group_by_columns):
        self.fill_column = fill_column
        self.group_by_columns = group_by_columns
    
    def fit(self,X,y=None):
        X = pd.DataFrame(X).copy()

        self.mean_fill_column_by_group = X.groupby(self.group_by_columns)[self.fill_column].mean()


        self.global_mean = X[self.fill_column].mean()
        return self

    def get_feature_names_out(self, input_features=None):
        """Allows set_output(transform='pandas') to work."""
        return input_features
    
    def transform(self,X):
        X = pd.DataFrame(X).copy()

        # def impute(row):
        #     if pd.isna(row[self.fill_column]):
        #         return self.mean_fill_column_by_group.get(row[self.group_by_columns],self.global_mean)
        #     return row[self.fill_column]
        
        # X[self.fill_column] = X.apply(impute, axis=1)
        X[self.fill_column] = X[self.fill_column].fillna(X[self.group_by_columns].map(self.mean_fill_column_by_group))
        X[self.fill_column] = X[self.fill_column].fillna(self.global_mean)
        return X
    

fill_LotFrontage_imputer = Pipeline([('imputer',ColumnBasedMedianImputer(fill_column='LotFrontage',
                                                                         group_by_columns='Neighborhood'))])

In [240]:
test_exp = test_df.copy()

In [241]:
test_percent_nan = helper.percent_missing(test_exp)
test_percent_nan

TotalBsmtSF      0.068540
GarageArea       0.068540
GarageCars       0.068540
KitchenQual      0.068540
BsmtUnfSF        0.068540
BsmtFinSF2       0.068540
BsmtFinSF1       0.068540
SaleType         0.068540
Exterior1st      0.068540
Exterior2nd      0.068540
Functional       0.137080
Utilities        0.137080
BsmtHalfBath     0.137080
BsmtFullBath     0.137080
MSZoning         0.274160
MasVnrArea       1.028101
BsmtFinType1     2.878684
BsmtFinType2     2.878684
BsmtQual         3.015764
BsmtExposure     3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageCond       5.346127
GarageQual       5.346127
GarageYrBlt      5.346127
GarageFinish     5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
MasVnrType      61.274846
Fence           80.123372
Alley           92.666210
MiscFeature     96.504455
PoolQC          99.794380
dtype: float64

In [242]:
test_exp[['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath']] = test_exp[['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath']].fillna(0)

In [243]:
test_percent_nan = helper.percent_missing(test_exp)
test_percent_nan

KitchenQual      0.068540
GarageArea       0.068540
GarageCars       0.068540
Exterior2nd      0.068540
SaleType         0.068540
Exterior1st      0.068540
Utilities        0.137080
Functional       0.137080
MSZoning         0.274160
MasVnrArea       1.028101
BsmtFinType1     2.878684
BsmtFinType2     2.878684
BsmtExposure     3.015764
BsmtQual         3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageCond       5.346127
GarageQual       5.346127
GarageYrBlt      5.346127
GarageFinish     5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
MasVnrType      61.274846
Fence           80.123372
Alley           92.666210
MiscFeature     96.504455
PoolQC          99.794380
dtype: float64

In [244]:
test_exp[test_exp['GarageCars'].isnull()][['GarageArea','GarageCars','GarageType']]

Unnamed: 0,GarageArea,GarageCars,GarageType
1116,,,Detchd


In [245]:
test_exp[['GarageArea','GarageCars']] = test_exp[['GarageArea','GarageCars']].fillna(0)

In [246]:
test_percent_nan = helper.percent_missing(test_exp)
test_percent_nan

SaleType         0.068540
KitchenQual      0.068540
Exterior1st      0.068540
Exterior2nd      0.068540
Functional       0.137080
Utilities        0.137080
MSZoning         0.274160
MasVnrArea       1.028101
BsmtFinType1     2.878684
BsmtFinType2     2.878684
BsmtExposure     3.015764
BsmtQual         3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageYrBlt      5.346127
GarageFinish     5.346127
GarageQual       5.346127
GarageCond       5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
MasVnrType      61.274846
Fence           80.123372
Alley           92.666210
MiscFeature     96.504455
PoolQC          99.794380
dtype: float64

In [247]:
test_exp[test_exp['Exterior1st'].isnull()][['Exterior1st','Exterior2nd','ExterQual']]

Unnamed: 0,Exterior1st,Exterior2nd,ExterQual
691,,,TA


In [248]:
test_exp['Exterior2nd'].mode()[0]

'VinylSd'

In [249]:
test_exp['Exterior1st'] = test_exp['Exterior1st'].fillna(test_exp['Exterior1st'].mode()[0])
test_exp['Exterior2nd'] = test_exp['Exterior2nd'].fillna(test_exp['Exterior2nd'].mode()[0])

In [250]:
test_percent_nan = helper.percent_missing(test_exp)
test_percent_nan

KitchenQual      0.068540
SaleType         0.068540
Functional       0.137080
Utilities        0.137080
MSZoning         0.274160
MasVnrArea       1.028101
BsmtFinType1     2.878684
BsmtFinType2     2.878684
BsmtQual         3.015764
BsmtExposure     3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageCond       5.346127
GarageQual       5.346127
GarageYrBlt      5.346127
GarageFinish     5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
MasVnrType      61.274846
Fence           80.123372
Alley           92.666210
MiscFeature     96.504455
PoolQC          99.794380
dtype: float64

In [251]:
test_exp['KitchenQual'].value_counts()

KitchenQual
TA    757
Gd    565
Ex    105
Fa     31
Name: count, dtype: int64

In [252]:
test_exp['KitchenQual'] = test_exp['KitchenQual'].fillna(test_exp['KitchenQual'].mode()[0])

In [253]:
test_percent_nan = helper.percent_missing(test_exp)
test_percent_nan

SaleType         0.068540
Functional       0.137080
Utilities        0.137080
MSZoning         0.274160
MasVnrArea       1.028101
BsmtFinType2     2.878684
BsmtFinType1     2.878684
BsmtQual         3.015764
BsmtExposure     3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageCond       5.346127
GarageQual       5.346127
GarageYrBlt      5.346127
GarageFinish     5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
MasVnrType      61.274846
Fence           80.123372
Alley           92.666210
MiscFeature     96.504455
PoolQC          99.794380
dtype: float64

In [254]:
test_exp['SaleType'] = test_exp['SaleType'].fillna(test_exp['SaleType'].mode()[0])
test_exp['Utilities'] = test_exp['Utilities'].fillna(test_exp['Utilities'].mode()[0])
test_exp['MSZoning'] = test_exp['MSZoning'].fillna(test_exp['MSZoning'].mode()[0])
test_exp['Functional'] = test_exp['Functional'].fillna(test_exp['Functional'].mode()[0])

In [255]:
test_percent_nan = helper.percent_missing(test_exp)
test_percent_nan

MasVnrArea       1.028101
BsmtFinType2     2.878684
BsmtFinType1     2.878684
BsmtQual         3.015764
BsmtExposure     3.015764
BsmtCond         3.084304
GarageType       5.209047
GarageCond       5.346127
GarageQual       5.346127
GarageFinish     5.346127
GarageYrBlt      5.346127
LotFrontage     15.558602
FireplaceQu     50.034270
MasVnrType      61.274846
Fence           80.123372
Alley           92.666210
MiscFeature     96.504455
PoolQC          99.794380
dtype: float64

In [256]:
# create new model pipeline
X = train_df.drop('SalePrice',axis=1)

##as the y label is right skewed , we will log transform as regression expects normal distributed data
y = np.log1p(train_df['SalePrice'])

len(X.columns)

80

In [None]:
#create a preprocessor
fill_na_cols = ['FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','BsmtQual','BsmtCond','BsmtFinType1','BsmtExposure','BsmtFinType2']
fill_zero_cols = ['MasVnrArea','GarageYrBlt','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageArea','GarageCars']
fill_none_cols = ['MasVnrType']
fill_mode_cols = ['Exterior1st','Exterior2nd','SaleType','Utilities','MSZoning','Functional','KitchenQual']
drop_cols = ['Id','Fence','Alley','MiscFeature','PoolQC']
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include='object').columns

updated_num_cols = [c for c in num_cols if c not in drop_cols]
updated_cat_cols = [c for c in cat_cols if c not in drop_cols]

class DebugType(BaseEstimator, TransformerMixin):
    def __init__(self, step_name):
        self.step_name = step_name
    def fit(self, X, y=None): return self
    def get_feature_names_out(self, input_features=None):
        """Allows set_output(transform='pandas') to work."""
        return input_features
    def transform(self, X):
        print(f"{self.step_name} Data type reaching preprocessor: {type(X)}")
        if isinstance(X, pd.DataFrame):
            print(f"Columns available: {X.columns}...")
        return X

preprocessor = ColumnTransformer(transformers=[
    ('fill_na',fill_NA_pipeline,fill_na_cols),
    ('fill_zero',fill_zero_pipeline,fill_zero_cols),
    ('fill_none',fill_None_pipeline,fill_none_cols),
    ('fill_mode',fill_mode_pipeline,fill_mode_cols)
],remainder = 'passthrough',verbose_feature_names_out=False)

preprocessor.set_output(transform='pandas')


from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector

scaler_encoder = ColumnTransformer([('scaler',StandardScaler(),updated_num_cols),
                                    ('encoder',OneHotEncoder(handle_unknown='ignore',sparse_output=False,),updated_cat_cols)])

In [258]:



pipeline = Pipeline([('debug_before_drop', DebugType("debug_before_drop")),
                     ('drop_column',ColumnDropper(drop_cols)),
                     #('debug_drop', DebugType("debug_drop")),
                     ('fill_LotFrontage',ColumnBasedMedianImputer(fill_column='LotFrontage',group_by_columns='Neighborhood')),
                     #('debug_fill_lotFrontage_output', DebugType('debug_fill_lotFrontage_output')),
                     ('preprocessor',preprocessor),
                     #('debug_proprocesor_output', DebugType('debug_proprocesor_output')),
                     ('scale_and_encode',scaler_encoder),
                     #('debug_scale', DebugType('debug_scale')),
                     ('model', Ridge(alpha=1))])

scores = cross_val_score(pipeline, X, y,
                             scoring = 'neg_root_mean_squared_error',
                             cv=5)

base_rmse = -scores.mean()
base_rmse

debug_before_drop Data type reaching preprocessor: <class 'pandas.core.frame.DataFrame'>
Columns available: Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
    

np.float64(0.14440443394027708)

In [259]:
preprocessing_pipeline = pipeline[:-1] ##without model


preprocessing_pipeline.fit(X,y)

debug_before_drop Data type reaching preprocessor: <class 'pandas.core.frame.DataFrame'>
Columns available: Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
    

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [260]:
preprocessing_pipeline.set_output(transform = 'pandas')
train_processed = preprocessing_pipeline.transform(train_df.drop('SalePrice',axis=1))

debug_before_drop Data type reaching preprocessor: <class 'pandas.core.frame.DataFrame'>
Columns available: Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
    

ValueError: Pandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas output via` ohe.set_output(transform="default").

In [None]:
train_processed