In [40]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Load the raw housing data and discard the columns this notebook does not use.
house_iter4 = pd.read_csv("house_iter4.csv")
dropped_columns = ["CentralAir", "MSZoning", "Condition1", "Street"]
X = house_iter4.drop(columns=dropped_columns)

In [41]:
# X and y creation
# The target is read from the original frame and removed from X without
# mutating X in place, so this cell can be re-run safely (X.pop would raise a
# KeyError on the second run because the column is already gone).
y = house_iter4["Expensive"].copy()
X = X.drop(columns="Expensive", errors="ignore")

# Preview the feature matrix (must be the last expression to be displayed).
X.head()


In [42]:
# List the distinct Foundation categories, in order of first appearance.
pd.unique(X["Foundation"])

array(['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'], dtype=object)

In [43]:
# Encoder that maps each category to an integer-valued code.
enc = OrdinalEncoder()
enc

In [44]:
# Preview the codes the encoder assigns to the Foundation column.
# Double brackets keep the selection 2-D, as the encoder expects.
foundation_frame = X[["Foundation"]]
enc.fit_transform(foundation_frame)

array([[2.],
       [1.],
       [2.],
       ...,
       [4.],
       [1.],
       [1.]])

In [59]:
# Overwrite Foundation with its ordinal codes.
# fit_transform returns a fresh (n_rows, 1) array; flatten it to one dimension
# before storing it as a single column.
foundation_codes = enc.fit_transform(X[["Foundation"]])
X["Foundation"] = foundation_codes.ravel()

In [60]:
# X_ord is a second name for the same DataFrame object as X (no copy is made).
X_ord = X

In [61]:


# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_ord,
    y,
    test_size=0.2,
    random_state=123,
)

2.1. Replacing NaNs

2.1.1. Replacing NaNs in categorical features
In our preprocessing pipeline, we have been replacing NaNs in numerical features with the column mean. Categorical features pose a problem: they don't have a "mean". Here, we will instead replace their NaNs with a string that marks them: "N_A". It is not an elegant solution, but it will allow us to move forward.

In [62]:
# Keep only the non-numeric (categorical) training columns.
X_train_cat = X_train.select_dtypes(exclude="number")

# Missing categorical values are replaced by the literal marker "N_A";
# set_output makes the imputer return a DataFrame instead of an array.
cat_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
cat_imputer.set_output(transform="pandas")

# Fit on the categorical training columns and impute them in one step.
X_cat_imputed = cat_imputer.fit_transform(X_train_cat)

X_cat_imputed.head()

Unnamed: 0,Heating,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
318,GasA,Gd,TA,Gd,TA,Gd,GLQ,Gd,TA
580,GasA,TA,TA,TA,TA,No,BLQ,Gd,Gd
961,GasA,TA,Gd,Gd,Gd,No,ALQ,TA,TA
78,GasA,TA,TA,TA,TA,No,Unf,TA,N_A
5,GasA,TA,TA,Gd,TA,No,GLQ,TA,N_A


2.1.2. Replacing NaNs in numerical features
This is what we already did in previous notebooks: replacing numerical NaNs with the mean of their column.

In [63]:
# Keep only the numeric training columns.
X_train_num = X_train.select_dtypes(include="number")

# Missing numeric values are replaced by the mean of their column;
# set_output makes the imputer return a DataFrame instead of an array.
num_imputer = SimpleImputer(strategy="mean")
num_imputer.set_output(transform="pandas")

# Fit on the numeric training columns and impute them in one step.
X_num_imputed = num_imputer.fit_transform(X_train_num)

X_num_imputed.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Foundation
318,9900.0,90.0,1347.0,4.0,1.0,0.0,3.0,340.0,0.0,2.0
580,14585.0,69.58427,1144.0,3.0,2.0,0.0,2.0,216.0,0.0,1.0
961,12227.0,69.58427,1330.0,4.0,1.0,0.0,2.0,550.0,0.0,1.0
78,10778.0,72.0,1768.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
5,14115.0,85.0,796.0,1.0,0.0,0.0,2.0,40.0,0.0,5.0


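2.3.3. Concatenating the imputed categorical columns with the imputed numerical columns:
Now that neither the categorical nor the numerical columns contain NaNs, we can join them back together into a single DataFrame. Note that the categorical columns still hold strings at this point: they must be encoded as numbers before the dataset is ready for modelling.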

In [64]:
# Place the imputed categorical and numerical columns side by side,
# aligning rows on the shared index.
imputed_parts = (X_cat_imputed, X_num_imputed)
X_imputed = pd.concat(imputed_parts, axis="columns")

X_imputed

Unnamed: 0,Heating,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Foundation
318,GasA,Gd,TA,Gd,TA,Gd,GLQ,Gd,TA,9900.0,90.00000,1347.0,4.0,1.0,0.0,3.0,340.0,0.0,2.0
580,GasA,TA,TA,TA,TA,No,BLQ,Gd,Gd,14585.0,69.58427,1144.0,3.0,2.0,0.0,2.0,216.0,0.0,1.0
961,GasA,TA,Gd,Gd,Gd,No,ALQ,TA,TA,12227.0,69.58427,1330.0,4.0,1.0,0.0,2.0,550.0,0.0,1.0
78,GasA,TA,TA,TA,TA,No,Unf,TA,N_A,10778.0,72.00000,1768.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
5,GasA,TA,TA,Gd,TA,No,GLQ,TA,N_A,14115.0,85.00000,796.0,1.0,0.0,0.0,2.0,40.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,GasA,TA,TA,TA,TA,No,GLQ,Gd,N_A,9130.0,69.58427,800.0,4.0,0.0,0.0,2.0,0.0,0.0,1.0
1122,GasA,TA,TA,TA,TA,No,Unf,TA,N_A,8926.0,69.58427,672.0,3.0,0.0,0.0,1.0,64.0,160.0,1.0
1346,GasA,TA,TA,TA,TA,No,BLQ,TA,Gd,20781.0,69.58427,1568.0,3.0,1.0,0.0,2.0,0.0,0.0,1.0
1406,GasA,TA,TA,Gd,TA,Av,GLQ,TA,N_A,8445.0,70.00000,768.0,2.0,0.0,0.0,2.0,58.0,0.0,1.0


3.1. Creating the "numeric pipe" and the "categoric pipe"

In [68]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
# Numeric branch: fill missing values with the mean of each column.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))

# Categorical branch: mark missing values with "N_A", then ordinal-encode.
# Categories that were not seen during fit are encoded as -1 instead of
# raising an error at transform time.
# NOTE: this branch must contain transformers only. A classifier such as
# LogisticRegression has no `transform` method, so a pipe ending in one cannot
# be used as a branch of a column transformer; append the model after the
# preprocessor when building the full pipeline instead.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))




 

3.2. Using make_column_transformer on 2 branches/pipelines (the preprocessor)
We simply tell the pipeline the following:

One branch, called "numeric_pipe", will apply the steps in numeric_pipe to the columns that make_column_selector selects (dtype_include='number').
The second branch, called "categoric_pipe", will apply the steps in categoric_pipe to the columns that make_column_selector selects (dtype_include='object').
make_column_selector example: