# Import Libraries


In [None]:
import importlib
import sys
# Register importlib under the module name 'imp' so that a later `import imp`
# resolves to it instead of failing.
# NOTE(review): presumably a shim for a dependency that still imports the
# removed `imp` module — confirm which package needs it.
sys.modules['imp'] = importlib

In [None]:
%load_ext autoreload
# %reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
# from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Experiment configuration. Every key is also exported below as a
# notebook-level global (data_dir, save_dir, seed, ...).
params_cfg = dict(
    action="train_feat02",
    feat_path="/content/drive/MyDrive/house_predict/process/exps/data.npz",
    seed=42,  # random seed
    exp_dir=os.path.abspath('/content/drive/MyDrive/house_predict/process/exps'),
    exp_name='trainbase_24102025',
    data_dir=os.path.abspath("/content/drive/MyDrive/house_predict/data"),
    verbose=True,
)

# Artifacts of this experiment are stored under <exp_dir>/<exp_name>.
params_cfg["save_dir"] = os.path.abspath(
    f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}'
)

for cfg_key, cfg_value in params_cfg.items():
    print(f'+ {cfg_key}: {cfg_value}')

# Make each setting reachable as a plain module-level variable.
globals().update(params_cfg)

+ action: train_feat02
+ feat_path: /content/drive/MyDrive/house_predict/process/exps/data.npz
+ seed: 42
+ exp_dir: /content/drive/MyDrive/house_predict/process/exps
+ exp_name: trainbase_24102025
+ data_dir: /content/drive/MyDrive/house_predict/data
+ verbose: True
+ save_dir: /content/drive/MyDrive/house_predict/process/exps/trainbase_24102025


# Data Load


In [None]:
# Load the raw train/test tables from the configured data directory.
df_train = pd.read_csv(f'{data_dir}/train.csv')
df_test = pd.read_csv(f'{data_dir}/test.csv')

if params_cfg["verbose"]:
    train_cols = set(df_train.columns)
    test_cols = set(df_test.columns)
    print("-"*10, "information", "-"*10)
    print(f'train-col: {train_cols}')
    print(f'test-col: {test_cols}')
    # Columns present in BOTH tables. This line was previously labelled
    # "Union" although it prints the intersection.
    print("Intersection:", train_cols & test_cols)
    # Columns that exist only in the training table.
    print("Difference:", train_cols - test_cols)

---------- information ----------
train-col: {'FireplaceQu', 'Street', 'Neighborhood', 'LowQualFinSF', 'Alley', 'CentralAir', 'WoodDeckSF', 'Id', 'OpenPorchSF', 'GarageType', 'BsmtFinType2', 'GarageQual', 'LotShape', 'ScreenPorch', 'YearRemodAdd', 'SaleCondition', '2ndFlrSF', 'Exterior2nd', 'RoofMatl', 'RoofStyle', 'OverallQual', 'Heating', 'MiscFeature', 'BsmtFinSF1', 'FullBath', 'Exterior1st', 'PoolArea', 'Fireplaces', 'MiscVal', 'GarageArea', 'BsmtFinType1', 'Condition1', 'BsmtExposure', 'MoSold', 'HouseStyle', 'BldgType', 'Functional', 'PoolQC', 'ExterCond', 'MSSubClass', 'MasVnrType', 'ExterQual', 'YearBuilt', 'YrSold', 'BsmtHalfBath', 'GarageFinish', 'GarageCars', '1stFlrSF', 'BsmtQual', 'OverallCond', 'Electrical', 'HalfBath', 'GarageCond', 'TotRmsAbvGrd', 'EnclosedPorch', 'LandContour', 'Foundation', 'BsmtUnfSF', 'MasVnrArea', 'KitchenAbvGr', 'MSZoning', 'LandSlope', 'TotalBsmtSF', 'GarageYrBlt', 'BsmtFinSF2', 'Condition2', '3SsnPorch', 'PavedDrive', 'SalePrice', 'Fence', 'Bedr

# Processing

## Xử lí các dữ liệu thiếu trong việc phân tích EDA

### PoolQC

In [None]:
# Distinct PoolQC labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['PoolQC'].astype(str).to_numpy())

array(['Ex', 'Fa', 'Gd', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['PoolQC']` selection: under pandas Copy-on-Write the in-place call
# only changes a temporary object and df_train keeps its NaN values.
df_train['PoolQC'] = df_train['PoolQC'].fillna('no')

**Nhận Xét:**
+ PoolQC là thuộc tính dạng chuỗi (string).
+ Điền giá trị 'no' (không có hồ bơi) cho các ô NaN vì không phải nhà nào cũng có hồ bơi để đánh giá.
+ Ta sẽ mã hóa cột PoolQC dưới dạng số: "no": 0, "Ex": 1, "Fa": 2, "Gd": 3.

In [None]:
# Integer code for every PoolQC label ('no' is the fill value for missing entries).
pool_mapping = dict(no=0, Ex=1, Fa=2, Gd=3)

In [None]:
# Start the encoded-feature table; its first column holds the PoolQC codes.
df_output = pd.DataFrame()
pool_codes = df_train['PoolQC'].apply(pool_mapping.__getitem__)
df_output['PoolQC'] = pool_codes

### MiscFeature

In [None]:
# Distinct MiscFeature labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['MiscFeature'].astype(str).to_numpy())

array(['Gar2', 'Othr', 'Shed', 'TenC', 'nan'], dtype=object)

**Nhận Xét:**
+ Biến MiscFeature thuộc kiểu dữ liệu chuỗi (string).
+ Điền các giá trị thiếu bằng 'no' (không có tiện ích phụ).

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['MiscFeature']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['MiscFeature'] = df_train['MiscFeature'].fillna('no')

In [None]:
# Integer code for every MiscFeature label ('no' is the fill value for missing entries).
cls_misc = dict(zip(['no', 'Gar2', 'Shed', 'TenC', 'Othr'], range(5)))

In [None]:
# Replace each MiscFeature label by its integer code (KeyError on an unmapped label).
df_output['MiscFeature'] = df_train['MiscFeature'].apply(cls_misc.__getitem__)

### Alley

In [None]:
# Distinct Alley labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['Alley'].astype(str).to_numpy())

array(['Grvl', 'Pave', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['Alley']` selection: under pandas Copy-on-Write the in-place call
# only changes a temporary object and df_train keeps its NaN values.
df_train['Alley'] = df_train['Alley'].fillna('no')

In [None]:
# Integer code for every Alley label ('no' is the fill value for missing entries).
cls_alley = dict(no=0, Grvl=1, Pave=2)

In [None]:
# Replace each Alley label by its integer code (KeyError on an unmapped label).
df_output['Alley'] = df_train['Alley'].apply(cls_alley.__getitem__)

### Fence

In [None]:
# Distinct Fence labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['Fence'].astype(str).to_numpy())

array(['GdPrv', 'GdWo', 'MnPrv', 'MnWw', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['Fence']` selection: under pandas Copy-on-Write the in-place call
# only changes a temporary object and df_train keeps its NaN values.
df_train['Fence'] = df_train['Fence'].fillna('no')

In [None]:
# Integer code for every Fence label ('no' is the fill value for missing entries).
cls_fence = {
    label: code
    for code, label in enumerate(['no', 'MnPrv', 'GdPrv', 'MnWw', 'GdWo'])
}

In [None]:
# Replace each Fence label by its integer code (KeyError on an unmapped label).
df_output['Fence'] = df_train['Fence'].apply(cls_fence.__getitem__)

### MasVnrType

In [None]:
# Distinct MasVnrType labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['MasVnrType'].astype(str).to_numpy())

array(['BrkCmn', 'BrkFace', 'Stone', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['MasVnrType']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['MasVnrType'] = df_train['MasVnrType'].fillna('no')

In [None]:
# Integer code for every MasVnrType label ('no' is the fill value for missing
# entries). 'CBlock' is mapped even though the training column does not list it.
cls_mas = dict(no=0, BrkCmn=1, BrkFace=2, Stone=3, CBlock=4)

In [None]:
# Replace each MasVnrType label by its integer code (KeyError on an unmapped label).
df_output['MasVnrType'] = df_train['MasVnrType'].apply(cls_mas.__getitem__)

### FireplaceQu: Chất lượng lò sưởi

In [None]:
# Distinct FireplaceQu labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['FireplaceQu'].astype(str).to_numpy())

array(['Ex', 'Fa', 'Gd', 'Po', 'TA', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['FireplaceQu']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['FireplaceQu'] = df_train['FireplaceQu'].fillna('no')

In [None]:
# Integer code for every FireplaceQu label ('no' is the fill value for missing entries).
cls_fire = dict(zip(['no', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], range(6)))

In [None]:
# Replace each FireplaceQu label by its integer code (KeyError on an unmapped label).
df_output['FireplaceQu'] = df_train['FireplaceQu'].apply(cls_fire.__getitem__)

### LotFrontage

In [None]:
# Sorted distinct LotFrontage values, including NaN when entries are missing.
np.unique(df_train['LotFrontage'].to_numpy())

array([ 21.,  24.,  30.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.,
        40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,  49.,  50.,
        51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.,  60.,  61.,
        62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,  70.,  71.,  72.,
        73.,  74.,  75.,  76.,  77.,  78.,  79.,  80.,  81.,  82.,  83.,
        84.,  85.,  86.,  87.,  88.,  89.,  90.,  91.,  92.,  93.,  94.,
        95.,  96.,  97.,  98.,  99., 100., 101., 102., 103., 104., 105.,
       106., 107., 108., 109., 110., 111., 112., 114., 115., 116., 118.,
       120., 121., 122., 124., 128., 129., 130., 134., 137., 138., 140.,
       141., 144., 149., 150., 152., 153., 160., 168., 174., 182., 313.,
        nan])

**Nhận Xét:**
+ Cột LotFrontage là dữ liệu dạng số có dữ liệu bị thiếu khoảng 17.7%
+ Tiến hành điền các giá trị thiếu bằng trung vị (median) của cột.

In [None]:
# Impute missing LotFrontage values with the column median.
lot_frontage_median = df_train['LotFrontage'].median()
df_output['LotFrontage'] = df_train['LotFrontage'].fillna(lot_frontage_median)

### GarageType

In [None]:
# Distinct GarageType labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['GarageType'].astype(str).to_numpy())

array(['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd',
       'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['GarageType']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['GarageType'] = df_train['GarageType'].fillna('no')

In [None]:
# Integer code for every GarageType label ('no' is the fill value for missing entries).
cls_gaty = {
    garage_type: code
    for code, garage_type in enumerate(
        ['no', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', 'Attchd', '2Types']
    )
}

In [None]:
# Replace each GarageType label by its integer code (KeyError on an unmapped label).
df_output['GarageType'] = df_train['GarageType'].apply(cls_gaty.__getitem__)

### GarageFinish

In [None]:
# Distinct GarageFinish labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['GarageFinish'].astype(str).to_numpy())

array(['Fin', 'RFn', 'Unf', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['GarageFinish']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['GarageFinish'] = df_train['GarageFinish'].fillna('no')

In [None]:
# Integer code for every GarageFinish label ('no' is the fill value for missing entries).
cls_gafin = dict(no=0, Unf=1, RFn=2, Fin=3)

In [None]:
# Replace each GarageFinish label by its integer code (KeyError on an unmapped label).
df_output['GarageFinish'] = df_train['GarageFinish'].apply(cls_gafin.__getitem__)

### GarageQual

In [None]:
# Distinct GarageQual labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['GarageQual'].astype(str).to_numpy())

array(['Ex', 'Fa', 'Gd', 'Po', 'TA', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['GarageQual']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['GarageQual'] = df_train['GarageQual'].fillna('no')

In [None]:
# Integer code for every GarageQual label ('no' is the fill value for missing entries).
cls_gaqua = dict(zip(['no', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], range(6)))

In [None]:
# Replace each GarageQual label by its integer code (KeyError on an unmapped label).
df_output['GarageQual'] = df_train['GarageQual'].apply(cls_gaqua.__getitem__)

### GarageCond

In [None]:
# Distinct GarageCond labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['GarageCond'].astype(str).to_numpy())

array(['Ex', 'Fa', 'Gd', 'Po', 'TA', 'nan'], dtype=object)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df_train['GarageCond']` selection: under pandas Copy-on-Write the in-place
# call only changes a temporary object and df_train keeps its NaN values.
df_train['GarageCond'] = df_train['GarageCond'].fillna('no')

In [None]:
# Integer code for every GarageCond label ('no' is the fill value for missing entries).
# NOTE(review): these codes follow alphabetical label order, unlike cls_gaqua,
# which uses Po < Fa < TA < Gd < Ex for the same labels — confirm this is intended.
cls_gacond = dict(zip(['no', 'Ex', 'Fa', 'Gd', 'Po', 'TA'], range(6)))

In [None]:
# Replace each GarageCond label by its integer code (KeyError on an unmapped label).
df_output['GarageCond'] = df_train['GarageCond'].apply(cls_gacond.__getitem__)

### GarageYrBlt

In [None]:
# Sorted distinct GarageYrBlt values, including NaN when entries are missing.
np.unique(df_train['GarageYrBlt'].to_numpy())

array([1900., 1906., 1908., 1910., 1914., 1915., 1916., 1918., 1920.,
       1921., 1922., 1923., 1924., 1925., 1926., 1927., 1928., 1929.,
       1930., 1931., 1932., 1933., 1934., 1935., 1936., 1937., 1938.,
       1939., 1940., 1941., 1942., 1945., 1946., 1947., 1948., 1949.,
       1950., 1951., 1952., 1953., 1954., 1955., 1956., 1957., 1958.,
       1959., 1960., 1961., 1962., 1963., 1964., 1965., 1966., 1967.,
       1968., 1969., 1970., 1971., 1972., 1973., 1974., 1975., 1976.,
       1977., 1978., 1979., 1980., 1981., 1982., 1983., 1984., 1985.,
       1986., 1987., 1988., 1989., 1990., 1991., 1992., 1993., 1994.,
       1995., 1996., 1997., 1998., 1999., 2000., 2001., 2002., 2003.,
       2004., 2005., 2006., 2007., 2008., 2009., 2010.,   nan])

In [None]:
# Impute missing GarageYrBlt values with the column median.
garage_year_median = df_train['GarageYrBlt'].median()
df_output['GarageYrBlt'] = df_train['GarageYrBlt'].fillna(garage_year_median)

### Bsmt: các thuộc tính tầng hầm

In [None]:
# Basement-related categorical columns.
bsmt_cols = 'BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2'.split()


In [None]:
for col in bsmt_cols:
    # Cast to str so that NaN is listed among the distinct values.
    distinct_labels = df_train[col].astype(str).unique()
    # One print call: header line, the labels, then a blank separator line.
    print(f"--- {col} ---\n{distinct_labels}\n")

--- BsmtQual ---
['Gd' 'TA' 'Ex' 'nan' 'Fa']

--- BsmtCond ---
['TA' 'Gd' 'nan' 'Fa' 'Po']

--- BsmtExposure ---
['No' 'Gd' 'Mn' 'Av' 'nan']

--- BsmtFinType1 ---
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'nan' 'LwQ']

--- BsmtFinType2 ---
['Unf' 'BLQ' 'nan' 'ALQ' 'Rec' 'LwQ' 'GLQ']



In [None]:
# Fill missing entries of all basement categorical columns with 'No' in one step.
df_train[bsmt_cols] = df_train[bsmt_cols].fillna('No')

In [None]:
# Integer codes for the basement categorical columns; 'No' (code 0) is the
# value used to fill missing entries.
cls_bsmtQual = {
    grade: code for code, grade in enumerate(['No', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])
}
cls_bsmtCond = dict(cls_bsmtQual)  # same labels and codes, separate dict object
cls_bsmtEx = {
    level: code for code, level in enumerate(['No', 'Mn', 'Av', 'Gd'])
}
cls_bsmtFinTyp1 = {
    finish: code
    for code, finish in enumerate(['No', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])
}
cls_bsmtFinTyp2 = dict(cls_bsmtFinTyp1)  # same labels and codes, separate dict object



In [None]:
# Encode every basement categorical column with its own label -> code table
# (KeyError on an unmapped label).
bsmt_encoders = [
    ('BsmtQual', cls_bsmtQual),
    ('BsmtCond', cls_bsmtCond),
    ('BsmtExposure', cls_bsmtEx),
    ('BsmtFinType1', cls_bsmtFinTyp1),
    ('BsmtFinType2', cls_bsmtFinTyp2),
]
for col, bsmt_codes in bsmt_encoders:
    df_output[col] = df_train[col].apply(bsmt_codes.__getitem__)

### MasVnrArea

In [None]:
# Sorted distinct MasVnrArea values, including NaN when entries are missing.
np.unique(df_train['MasVnrArea'].to_numpy())

array([0.000e+00, 1.000e+00, 1.100e+01, 1.400e+01, 1.600e+01, 1.800e+01,
       2.200e+01, 2.400e+01, 2.700e+01, 2.800e+01, 3.000e+01, 3.100e+01,
       3.200e+01, 3.400e+01, 3.600e+01, 3.800e+01, 4.000e+01, 4.100e+01,
       4.200e+01, 4.400e+01, 4.500e+01, 4.600e+01, 4.800e+01, 5.000e+01,
       5.100e+01, 5.300e+01, 5.400e+01, 5.600e+01, 5.700e+01, 6.000e+01,
       6.300e+01, 6.400e+01, 6.500e+01, 6.600e+01, 6.700e+01, 6.800e+01,
       7.000e+01, 7.200e+01, 7.400e+01, 7.500e+01, 7.600e+01, 8.000e+01,
       8.100e+01, 8.200e+01, 8.400e+01, 8.500e+01, 8.600e+01, 8.800e+01,
       8.900e+01, 9.000e+01, 9.200e+01, 9.400e+01, 9.500e+01, 9.600e+01,
       9.700e+01, 9.800e+01, 9.900e+01, 1.000e+02, 1.010e+02, 1.020e+02,
       1.040e+02, 1.050e+02, 1.060e+02, 1.080e+02, 1.090e+02, 1.100e+02,
       1.120e+02, 1.130e+02, 1.140e+02, 1.150e+02, 1.160e+02, 1.170e+02,
       1.190e+02, 1.200e+02, 1.220e+02, 1.230e+02, 1.250e+02, 1.260e+02,
       1.270e+02, 1.280e+02, 1.300e+02, 1.320e+02, 

In [None]:
# Impute missing MasVnrArea values with the column median.
mas_vnr_area_median = df_train['MasVnrArea'].median()
df_output['MasVnrArea'] = df_train['MasVnrArea'].fillna(mas_vnr_area_median)

### Electrical

In [None]:
# Distinct Electrical labels; casting to str makes missing values show up as 'nan'.
np.unique(df_train['Electrical'].astype(str).to_numpy())

array(['FuseA', 'FuseF', 'FuseP', 'Mix', 'SBrkr', 'nan'], dtype=object)

**Nhận Xét:**
+ Do chỉ thiếu 1 giá trị nên ta điền bằng mode (giá trị xuất hiện nhiều nhất của cột).

In [None]:
# Integer code for every Electrical label.
cls_ele = dict(zip(['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'], range(5)))

In [None]:
# Impute missing Electrical entries with the most frequent label, then encode.
electrical_mode = df_train['Electrical'].mode()[0]
electrical_filled = df_train['Electrical'].fillna(electrical_mode)
df_output['Electrical'] = electrical_filled.apply(cls_ele.__getitem__)

In [None]:
def _encode_categories(filled, mapping, col):
    """Translate an already NaN-filled categorical Series into integer codes.

    Args:
        filled: Series whose values must all be keys of ``mapping``.
        mapping: dict from category label to integer code.
        col: column name, used only in the error message.

    Returns:
        Series of codes aligned with ``filled``.

    Raises:
        KeyError: if ``filled`` holds a label that ``mapping`` does not define.
    """
    unknown = sorted(set(filled.unique()) - set(mapping), key=str)
    if unknown:
        # Name the column and the offending labels instead of a bare KeyError.
        raise KeyError(f"Column '{col}' has categories without a code: {unknown}")
    return filled.apply(lambda x: mapping[x])


def preprocessing_feature_01(df_data, is_train=True, is_debug=True, **kwargs):
    """Build the baseline numeric feature table from a raw house-price DataFrame.

    Steps, in output-column order:
      1. median-impute MasVnrArea, LotFrontage and GarageYrBlt;
      2. mode-impute Electrical and encode it;
      3. fill and encode the hand-mapped categorical columns;
      4. copy SalePrice into an 'Output' column (only when ``is_train`` is true
         and the column exists);
      5. zero-fill the numeric basement/garage columns;
      6. pass through every remaining numeric column and label-encode every
         remaining text column.

    Args:
        df_data: raw DataFrame. It is MODIFIED IN PLACE: the filled values of
            the categorical and zero-filled columns are written back into it
            (``preprocessing_feature_02`` reads them afterwards).
        is_train: when true, append the target as column 'Output'.
        is_debug: when true, print/display a summary and copy all local
            variables into the module globals for interactive inspection.
        **kwargs: accepted for signature compatibility; not used.

    Returns:
        Tuple ``(df_output, None)``.

    Raises:
        KeyError: if a hand-mapped column contains a label without a code.

    NOTE(review): medians, the Electrical mode and the LabelEncoder classes are
    all derived from ``df_data`` itself, so separate calls for train and test
    may produce different imputation values and different integer codes for
    the same label — confirm this is acceptable for the downstream models.
    """
    df_output = pd.DataFrame()

    # ====== Numeric features ======
    for col in ['MasVnrArea', 'LotFrontage', 'GarageYrBlt']:
        df_output[col] = df_data[col].fillna(df_data[col].median())

    # ====== Electrical ======
    cls_ele = {'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4}
    # The filled Electrical column is not written back to df_data.
    electrical_filled = df_data['Electrical'].fillna(df_data['Electrical'].mode()[0])
    df_output['Electrical'] = _encode_categories(electrical_filled, cls_ele, 'Electrical')

    # ====== Hand-written label -> code tables ======
    cls_gaty = {'no': 0, 'Detchd': 1, 'BuiltIn': 2, 'CarPort': 3, 'Basment': 4, 'Attchd': 5, '2Types': 6}
    cls_gafin = {'no': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
    cls_gaqua = {'no': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    # NOTE(review): GarageCond and PoolQC codes follow alphabetical label order,
    # unlike GarageQual/FireplaceQu (Po < Fa < TA < Gd < Ex) — confirm intended.
    cls_gacond = {'no': 0, 'Ex': 1, 'Fa': 2, 'Gd': 3, 'Po': 4, 'TA': 5}
    pool_mapping = {'no': 0, 'Ex': 1, 'Fa': 2, 'Gd': 3}
    cls_misc = {'no': 0, 'Gar2': 1, 'Shed': 2, 'TenC': 3, 'Othr': 4}
    cls_alley = {'no': 0, 'Grvl': 1, 'Pave': 2}
    cls_fence = {'no': 0, 'MnPrv': 1, 'GdPrv': 2, 'MnWw': 3, 'GdWo': 4}
    cls_mas = {'no': 0, 'BrkCmn': 1, 'BrkFace': 2, 'Stone': 3, 'CBlock': 4}
    cls_fire = {'no': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    cls_bsmtQual = {'No': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    cls_bsmtCond = {'No': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    # Missing BsmtExposure is filled with 'No', which is also a label the raw
    # column already uses, so both end up as code 0.
    cls_bsmtEx = {'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3}
    cls_bsmtFinTyp1 = {'No': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
    cls_bsmtFinTyp2 = {'No': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

    # (column, mapping, fill value) — list order defines the output column order.
    manual_encodings = [
        ('GarageType', cls_gaty, 'no'),
        ('GarageFinish', cls_gafin, 'no'),
        ('GarageQual', cls_gaqua, 'no'),
        ('GarageCond', cls_gacond, 'no'),
        ('PoolQC', pool_mapping, 'no'),
        ('MiscFeature', cls_misc, 'no'),
        ('Alley', cls_alley, 'no'),
        ('Fence', cls_fence, 'no'),
        ('MasVnrType', cls_mas, 'no'),
        ('FireplaceQu', cls_fire, 'no'),
        ('BsmtQual', cls_bsmtQual, 'No'),
        ('BsmtCond', cls_bsmtCond, 'No'),
        ('BsmtExposure', cls_bsmtEx, 'No'),
        ('BsmtFinType1', cls_bsmtFinTyp1, 'No'),
        ('BsmtFinType2', cls_bsmtFinTyp2, 'No'),
    ]
    for col, mapping, fill_value in manual_encodings:
        # Assign the filled column back rather than fillna(..., inplace=True)
        # on a column selection: with pandas Copy-on-Write the in-place form
        # only updates a temporary and the NaN values would survive.
        df_data[col] = df_data[col].fillna(fill_value)
        df_output[col] = _encode_categories(df_data[col], mapping, col)

    # ====== Output column (training data only) ======
    if is_train and 'SalePrice' in df_data.columns:
        df_output['Output'] = df_data['SalePrice']

    # ====== Add the columns not handled above (numeric + label-encoded text) ======
    processed_cols = list(df_output.columns)
    remaining_cols = [col for col in df_data.columns if col not in processed_cols and col != 'SalePrice']

    # Numeric basement/garage columns: missing values are replaced by 0.
    zero_fill_cols = [
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
        'GarageCars', 'GarageArea',
    ]
    for col in zero_fill_cols:
        df_data[col] = df_data[col].fillna(0)
        df_output[col] = df_data[col]

    def _is_text(series):
        # Accept both classic object columns and the pandas string dtype, so
        # text columns are not silently dropped when strings are not `object`.
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    for col in remaining_cols:
        if pd.api.types.is_numeric_dtype(df_data[col]):
            df_output[col] = df_data[col]
        elif _is_text(df_data[col]):
            df_data[col] = df_data[col].fillna('Unknown').astype(str)
            le = LabelEncoder()
            df_output[col] = le.fit_transform(df_data[col])

    # ====== Debug ======
    if is_debug:
        print("===== KIỂM TRA DỮ LIỆU SAU XỬ LÝ =====")
        print(f"Tổng số cột đầu ra: {len(df_output.columns)}")

        print("\n==> 5 dòng đầu:")
        display.display(df_output.head())

        print("\n==> Số lượng giá trị thiếu còn lại:")
        display.display(df_output.isna().sum()[df_output.isna().sum() > 0])

        print("\n==> Phân loại cột:")
        print(f"- Đã xử lý tay: {len(processed_cols)}")
        print(f"- Numeric tự thêm: {len([c for c in remaining_cols if pd.api.types.is_numeric_dtype(df_data[c])])}")
        print(f"- Object được encode: {len([c for c in remaining_cols if _is_text(df_data[c])])}")

        # Debug aid kept from the original: exposes every local (df_output,
        # mappings, ...) as a module-level name, overwriting same-named globals.
        globals().update(**locals())

    return df_output, None


In [None]:
def preprocessing_feature_02(df_data, is_train=True, is_debug=True, **kwargs):
    """Extend the ``preprocessing_feature_01`` table with engineered features.

    Added columns:
      * TotalLiving        = GrLivArea + TotalBsmtSF
      * Qual_LivArea       = OverallQual * TotalLiving
      * GarageScore        = GarageArea + GarageCars
      * TotalAmenities     = Fireplaces + FullBath + KitchenAbvGr
      * Loc_Style_Cond     = label-encoded "Neighborhood_HouseStyle_SaleCondition"
      * Qual_Exter_Overall = label-encoded "ExterQual_OverallQual"

    Args:
        df_data: raw DataFrame; it is passed to ``preprocessing_feature_01``.
        is_train, is_debug, **kwargs: forwarded to ``preprocessing_feature_01``.

    Returns:
        Tuple ``(df_output, None)``.

    NOTE(review): the LabelEncoder is fitted on the frame passed in, so the
    integer codes of the combined categorical features are not guaranteed to
    match between a train call and a test call — confirm this is acceptable.
    """
    # Reuse the baseline feature table.
    df_output, _ = preprocessing_feature_01(df_data, is_train=is_train, is_debug=is_debug, **kwargs)

    # Take the basement/garage numbers from df_output, where missing values are
    # already replaced by 0, instead of relying on preprocessing_feature_01
    # having modified df_data in place. The extra fillna(0) keeps the sums
    # NaN-free in either case.
    total_bsmt_sf = df_output['TotalBsmtSF'].fillna(0)
    garage_area = df_output['GarageArea'].fillna(0)
    garage_cars = df_output['GarageCars'].fillna(0)

    # ====== Feature engineering (new features) ======
    df_output['TotalLiving'] = df_data['GrLivArea'] + total_bsmt_sf

    df_output['Qual_LivArea'] = df_data['OverallQual'] * df_output['TotalLiving']

    df_output['GarageScore'] = garage_area + garage_cars

    df_output['TotalAmenities'] = (
        df_data['Fireplaces'] + df_data['FullBath'] + df_data['KitchenAbvGr']
    )

    # df_output['HouseAge'] = df_data['YearBuilt'] - df_data['GarageYrBlt']

    # Combined categorical features, built as "<a>_<b>_<c>" strings and then
    # label-encoded below.
    df_output['Loc_Style_Cond'] = (
        df_data['Neighborhood'].astype(str)
        + '_' + df_data['HouseStyle'].astype(str)
        + '_' + df_data['SaleCondition'].astype(str)
    )

    df_output['Qual_Exter_Overall'] = (
        df_data['ExterQual'].astype(str)
        + '_' + df_data['OverallQual'].astype(str)
    )

    # Encode every text/categorical column that is still left in the output.
    for col in df_output.select_dtypes(include=['object', 'category']).columns:
        le = LabelEncoder()
        df_output[col] = le.fit_transform(df_output[col].astype(str))

    return df_output, None


# Main

In [None]:
def main_feat01(**kwargs):
  """Run the feature-01 pipeline end to end and store the result as data01.npz.

  Reads train.csv and test.csv from the global ``data_dir``, runs
  ``preprocessing_feature_01`` on each table, and writes the resulting arrays
  and column names to ``<save_dir>/data01.npz`` (``save_dir`` is a global).

  Args:
      **kwargs: only ``global_cfg`` is read. When given, every local variable
          of this function is copied into that dict before returning.
  """
  # load data
  df_train = pd.read_csv(f'{data_dir}/train.csv')
  df_test = pd.read_csv(f'{data_dir}/test.csv')
  # preprocessing
  df_output_train, _ = preprocessing_feature_01(df_train, is_train=True, is_debug=False)
  df_output_test, _ = preprocessing_feature_01(df_test, is_train=False, is_debug=False)

  # saving
  os.makedirs(save_dir, exist_ok=True)

  # Store the DataFrame contents directly (loading needs allow_pickle=True).
  # NOTE(review): whether `allow_pickle` is treated as an option of np.savez or
  # stored as an extra array named 'allow_pickle' depends on the installed
  # NumPy version — confirm against the NumPy in use.
  np.savez(f'{save_dir}/data01.npz',
             train_data=df_output_train.values,
             test_data=df_output_test.values,
             train_columns=df_output_train.columns.values,  # store the train column names
             test_columns=df_output_test.columns.values,
             allow_pickle=True)

  print("Đã lưu DataFrame 01 với đầy đủ thông tin cột")

  # Export all locals (df_train, df_output_train, ...) into the caller's dict.
  kwargs.get('global_cfg', {}).update(**locals())

def main_feat02(**kwargs):
  """Run the feature-02 pipeline end to end and store the result as data02.npz.

  Reads train.csv and test.csv from the global ``data_dir``, runs
  ``preprocessing_feature_02`` on each table, and writes the resulting arrays
  and column names to ``<save_dir>/data02.npz`` (``save_dir`` is a global).

  Args:
      **kwargs: only ``global_cfg`` is read. When given, every local variable
          of this function is copied into that dict before returning.
  """
  # load data
  df_train = pd.read_csv(f'{data_dir}/train.csv')
  df_test = pd.read_csv(f'{data_dir}/test.csv')
  # preprocessing
  df_output_train, _ = preprocessing_feature_02(df_train, is_train=True, is_debug=False)
  df_output_test, _ = preprocessing_feature_02(df_test, is_train=False, is_debug=False)

  # saving
  os.makedirs(save_dir, exist_ok=True)

  # Store the DataFrame contents directly (loading needs allow_pickle=True).
  # NOTE(review): whether `allow_pickle` is treated as an option of np.savez or
  # stored as an extra array named 'allow_pickle' depends on the installed
  # NumPy version — confirm against the NumPy in use.
  np.savez(f'{save_dir}/data02.npz',
             train_data=df_output_train.values,
             test_data=df_output_test.values,
             train_columns=df_output_train.columns.values,  # store the train column names
             test_columns=df_output_test.columns.values,
             allow_pickle=True)

  print("Đã lưu DataFrame 02 với đầy đủ thông tin cột")

  # Export all locals (df_train, df_output_train, ...) into the caller's dict.
  kwargs.get('global_cfg', {}).update(**locals())

# Map each supported action name to the pipeline entry point that handles it.
_ACTION_HANDLERS = {
    "train_feat01": main_feat01,
    "train_feat02": main_feat02,
}

# Run the configured action; any other action name is silently ignored.
_selected_handler = _ACTION_HANDLERS.get(params_cfg["action"])
if _selected_handler is not None:
    print(f'Runing ... [{params_cfg["action"]}]')
    _selected_handler(global_cfg=globals())



Runing ... [train_feat02]
Đã lưu DataFrame 02 với đầy đủ thông tin cột


##