In [125]:
# IMPORT
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [129]:
# LOAD DATA: read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# SPLIT THE DATASETS:


In [139]:
class Preprocess(BaseEstimator, TransformerMixin):
    """Stateless cleaning step applied to the raw housing DataFrame.

    ``transform`` works on a copy of its input and:

    * casts the numeric-coded ``MSSubClass``, ``YrSold`` and ``MoSold`` columns
      to strings so they are handled as categories;
    * adds a 0/1 ``HasPool`` flag derived from ``PoolQC`` and drops ``Id``,
      ``Street``, ``Utilities`` and ``PoolQC``;
    * replaces the labels of the graded categorical columns with integer codes;
    * fills missing values: ``'None'`` for the remaining object columns,
      ``YearBuilt`` for ``GarageYrBlt`` and ``0`` for ``LotFrontage``.

    Every column is written back with ``df[col] = ...``. Calling an ``inplace``
    method on a column selection (``df[col].replace(..., inplace=True)``) only
    modifies a temporary object under pandas Copy-on-Write, which would leave
    the frame unencoded.
    """

    # Numeric codes that are really category labels.
    _AS_STRING_COLS = ('MSSubClass', 'YrSold', 'MoSold')

    # Columns removed from the output (PoolQC is summarised by HasPool first).
    _DROPPED_COLS = ['Id', 'Street', 'Utilities', 'PoolQC']

    # Columns sharing the Ex/Gd/TA/Fa/Po grading; a missing grade is coded 0.
    _QUALITY_COLS = ['HeatingQC', 'KitchenQual', 'ExterQual', 'ExterCond', 'FireplaceQu',
                     'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
    _QUALITY_CODES = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}

    # Per-column label -> code tables. Where present, the NaN key is coded 0.
    # NOTE(review): some tables (e.g. 'Functional', 'LandContour') are not
    # monotone in the label order they are listed in; the codes are kept
    # exactly as they were -- confirm they are intentional.
    _ORDINAL_CODES = {
        'Alley': {'Grvl': 1, 'Pave': 2, np.nan: 0},
        'LotShape': {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1},
        'LandContour': {'Lvl': 2, 'Bnk': 3, 'HLS': 1, 'Low': 0},
        'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
        'RoofMatl': {'ClyTile': 2, 'CompShg': 4, 'Membran': 1, 'Metal': 1,
                     'Roll': 1, 'Tar&Grv': 3, 'WdShake': 1, 'WdShngl': 1},
        'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, np.nan: 0},
        'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, np.nan: 0},
        'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, np.nan: 0},
        'Functional': {'Typ': 5, 'Min1': 2, 'Min2': 2, 'Mod': 3,
                       'Maj1': 4, 'Maj2': 4, 'Sev': 0, 'Sal': 1},
        'GarageType': {'2Types': 1, 'Attchd': 2, 'Basment': 2, 'BuiltIn': 2,
                       'CarPort': 2, 'Detchd': 1, np.nan: 0},
        'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0},
        'PavedDrive': {'Y': 2, 'P': 1, 'N': 0},
        'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, np.nan: 0},
        'RoofStyle': {'Flat': 1, 'Gable': 2, 'Gambrel': 1, 'Hip': 2, 'Mansard': 1, 'Shed': 1},
    }

    def __init__(self):
        pass

    def fit(self, tr_X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    @staticmethod
    def _encode(column, codes):
        """Replace the labels of ``column`` using ``codes``.

        Labels missing from ``codes`` are left untouched. ``infer_objects`` then
        gives the result a numeric dtype whenever only numbers remain, without
        relying on ``replace`` to downcast the column by itself.
        """
        return column.replace(codes).infer_objects()

    def transform(self, X_df, y=None):
        """Return a cleaned copy of ``X_df``; the input frame is not modified.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Frame containing the columns referenced by this class.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            The transformed copy.
        """
        df = X_df.copy()

        # Treat numeric-coded attributes as categorical labels.
        for col in self._AS_STRING_COLS:
            df[col] = df[col].astype(str)

        # Add the pool indicator (1 when PoolQC is present, 0 when it is missing),
        # then drop the columns that are not needed.
        df['HasPool'] = df['PoolQC'].notna().astype('int64')
        df = df.drop(columns=self._DROPPED_COLS)

        # Convert the graded categorical columns to integer codes.
        for col in self._QUALITY_COLS:
            df[col] = self._encode(df[col], self._QUALITY_CODES)
        for col, codes in self._ORDINAL_CODES.items():
            df[col] = self._encode(df[col], codes)

        # Fill missing values in the remaining categorical (object) columns.
        cate_cols = df.select_dtypes(include='object').columns
        df[cate_cols] = df[cate_cols].fillna('None')

        # Fill GarageYrBlt with the year the house was built, and LotFrontage with 0.
        df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])
        df['LotFrontage'] = df['LotFrontage'].fillna(0)
        return df

In [140]:
# Nominal features: one-hot encoded by the pipeline below.
cate_cols = [
    'MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
    'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

# Numeric and integer-coded features: standardised by the pipeline below.
# 'SalePrice' is deliberately NOT listed: keeping it here would scale it and
# emit it as an input feature, and the fitted pipeline would then reject any
# frame without that column (presumably test.csv -- confirm). Columns that are
# not listed are dropped by the column transformer's default remainder.
num_cols = [
    '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
    'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
    'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF',
    'EnclosedPorch', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu',
    'Fireplaces', 'FullBath', 'Functional', 'GarageArea', 'GarageCars',
    'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt',
    'GrLivArea', 'HalfBath', 'HasPool', 'HeatingQC', 'KitchenAbvGr',
    'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotFrontage',
    'LotShape', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'OpenPorchSF',
    'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'RoofMatl',
    'RoofStyle', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF',
    'WoodDeckSF', 'YearBuilt', 'YearRemodAdd',
]

# Full preprocessing pipeline: clean the frame, then encode/scale by column group.
preprocess = make_pipeline(
    Preprocess(),
    make_column_transformer(
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as an all-zero group instead of raising at transform time.
        (OneHotEncoder(handle_unknown='ignore'), cate_cols),
        (StandardScaler(), num_cols),
    ),
)

In [141]:
# Display the assembled pipeline as the cell's output.
preprocess

In [143]:
# Fit the whole pipeline on the training frame and show the transformed matrix.
preprocess.fit_transform(train)

array([[ 0.        ,  0.        ,  0.        , ..., -0.75217584,
         1.05099379,  0.87866809],
       [ 0.        ,  0.        ,  0.        , ...,  1.62619479,
         0.15673371, -0.42957697],
       [ 0.        ,  0.        ,  0.        , ..., -0.75217584,
         0.9847523 ,  0.83021457],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.75217584,
        -1.00249232,  1.02402865],
       [ 0.        ,  0.        ,  0.        , ...,  2.16891024,
        -0.70440562,  0.53949344],
       [ 0.        ,  0.        ,  0.        , ...,  5.12192075,
        -0.20759447, -0.96256569]])