# Import Libraries

In [1]:
# Compatibility shim: register `importlib` in sys.modules under the name 'imp',
# so any later `import imp` in this kernel resolves to `importlib`.
# NOTE(review): presumably needed because some dependency still imports the
# legacy `imp` module, which recent Python versions no longer ship — confirm
# which package requires it. `importlib` is not a drop-in replacement for every
# `imp` function, so this only helps code that uses names both modules share.
import importlib
import sys
sys.modules['imp'] = importlib

In [2]:
%load_ext autoreload
# %reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Experiment configuration: everything a reader might want to tune lives here.
params_cfg = dict(
    action="train_feat01",
    feat_path="/content/drive/MyDrive/house_predict/process/exps/data.npz",
    seed=42,  # random seed
    exp_dir=os.path.abspath('/content/drive/MyDrive/house_predict/process/exps'),
    exp_name='trainbase_27102025',
    data_dir=os.path.abspath("/content/drive/MyDrive/house_predict/data"),
    verbose=True,
)

# Derived setting: the output folder of this experiment (<exp_dir>/<exp_name>).
params_cfg["save_dir"] = os.path.abspath(
    f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}'
)

# Echo the effective configuration so it is recorded in the cell output.
for name, value in params_cfg.items():
    print(f'+ {name}: {value}')

# Expose every config entry as a notebook-level variable (data_dir, save_dir, ...);
# later cells read these names directly.
globals().update(params_cfg)

+ action: train_feat01
+ feat_path: /content/drive/MyDrive/house_predict/process/exps/data.npz
+ seed: 42
+ exp_dir: /content/drive/MyDrive/house_predict/process/exps
+ exp_name: trainbase_27102025
+ data_dir: /content/drive/MyDrive/house_predict/data
+ verbose: True
+ save_dir: /content/drive/MyDrive/house_predict/process/exps/trainbase_27102025


# Data Load

In [4]:
# Load the raw train/test tables from the configured data directory.
df_train = pd.read_csv(f'{data_dir}/train.csv')
df_test = pd.read_csv(f'{data_dir}/test.csv')

# Optional sanity report comparing the column sets of the two tables.
if params_cfg["verbose"]:
    print("-"*10, "information", "-"*10)
    print(f'train-col: {set(df_train.columns)}')
    print(f'test-col: {set(df_test.columns)}')
    # Columns present in BOTH tables (this line was previously mislabelled "Union").
    print("Intersection:", set(df_train.columns).intersection(set(df_test.columns)))
    # Columns that exist only in the training table.
    print("Difference:", set(df_train.columns).difference(set(df_test.columns)))

---------- information ----------
train-col: {'LotShape', 'FireplaceQu', '3SsnPorch', 'BsmtFinType1', 'HeatingQC', 'CentralAir', 'Alley', '1stFlrSF', 'ExterCond', 'GarageType', 'MoSold', 'Exterior1st', 'WoodDeckSF', 'BsmtFinSF1', 'MiscVal', 'YearRemodAdd', 'KitchenAbvGr', 'Neighborhood', 'LowQualFinSF', 'GarageQual', 'OverallQual', 'Utilities', 'YrSold', 'Street', 'SaleType', 'OpenPorchSF', 'Functional', 'PoolQC', 'BsmtFullBath', 'LotFrontage', 'GrLivArea', 'Fence', 'EnclosedPorch', 'TotRmsAbvGrd', 'GarageYrBlt', 'OverallCond', 'BldgType', 'PavedDrive', 'TotalBsmtSF', 'Fireplaces', 'GarageCond', 'ExterQual', 'LotArea', 'RoofMatl', 'LandSlope', 'FullBath', 'BsmtUnfSF', '2ndFlrSF', 'Condition1', 'GarageCars', 'Condition2', 'Electrical', 'HouseStyle', 'RoofStyle', 'KitchenQual', 'BsmtCond', 'MSSubClass', 'MasVnrType', 'Exterior2nd', 'BsmtHalfBath', 'SalePrice', 'BsmtFinType2', 'BsmtQual', 'LotConfig', 'BedroomAbvGr', 'BsmtFinSF2', 'HalfBath', 'MSZoning', 'ScreenPorch', 'GarageFinish', 'Po

# Processing

## Xử lí các dữ liệu thiếu trong việc phân tích EDA

In [5]:
# Work on a copy so the raw training frame (df_train) is never modified.
df_output = df_train.copy()

### Xóa những cột trên 50% dữ liệu thiếu

In [6]:
# Fraction of missing values per column.
missing_ratio = df_output.isna().sum().div(len(df_output))
# Columns where more than half of the rows are missing are removed entirely.
cols_to_drop = missing_ratio.loc[missing_ratio.gt(0.5)].index
df_output = df_output.drop(columns=cols_to_drop)

In [7]:
# Partition the remaining columns by dtype: int64/float64 columns are treated
# as numeric, object/category columns as categorical.
num_cols = df_output.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df_output.select_dtypes(include=['object','category']).columns

### Xử lí missing value cho các cột num_cols và cat_cols

In [8]:
# Impute numeric gaps with the column median; columns without gaps are left alone.
for col in num_cols:
    column = df_output[col]
    if not column.isnull().any():
        continue
    df_output[col] = column.fillna(column.median())


In [9]:
# Categorical columns whose missing entries are replaced by the literal label 'None'.
cols_fill_none = [
    'FireplaceQu',
    'GarageFinish', 'GarageType', 'GarageQual', 'GarageCond',
    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
]
df_output[cols_fill_none] = df_output[cols_fill_none].fillna('None')

# 'Electrical' is filled with its most frequent value instead.
df_output['Electrical'] = df_output['Electrical'].fillna(
    df_output['Electrical'].mode()[0]
)


### Tạo các feature mới

In [10]:
# Build all engineered columns in one chained `assign`; each lambda receives the
# frame including the columns created before it, so 'TotalArea_Qual' can use
# 'TotalArea'. Column order matches the `new_features` list below.
# (A 'Qual_LivArea' = OverallQual * TotalLiving feature was left disabled here.)
df_output = df_output.assign(
    GarageScore=lambda d: d['GarageArea'] + d['GarageCars'],
    TotalAmenities=lambda d: d['Fireplaces'] + d['FullBath'] + d['KitchenAbvGr'],
    Loc_Style_Cond=lambda d: (
        d['Neighborhood'] + '_' + d['HouseStyle'] + '_' + d['SaleCondition']
    ),
    Qual_Exter_Overall=lambda d: d['ExterQual'] + '_' + d['OverallQual'].astype(str),
    # Total usable area
    TotalArea=lambda d: (
        d['GrLivArea'] + d['TotalBsmtSF'] +
        d['GarageArea'] + d['WoodDeckSF'] +
        d['OpenPorchSF']
    ),
    # Average living area per room (+1 in the denominator)
    AreaPerRoom=lambda d: d['GrLivArea'] / (d['TotRmsAbvGrd'] + 1),
    # Years since remodel and since construction, measured at sale time
    RemodAge=lambda d: d['YrSold'] - d['YearRemodAdd'],
    AgeSinceBuilt=lambda d: d['YrSold'] - d['YearBuilt'],
    # Total bathrooms including the basement; half baths count as 0.5
    Baths_Total=lambda d: (
        d['FullBath'] + 0.5 * d['HalfBath'] +
        d['BsmtFullBath'] + 0.5 * d['BsmtHalfBath']
    ),
    # Interactions between area and overall quality
    Qual_LotArea=lambda d: d['OverallQual'] * d['LotArea'],
    TotalArea_Qual=lambda d: d['TotalArea'] * d['OverallQual'],
    # Time from remodel to sale year (same formula as 'RemodAge')
    SoldSinceRemod=lambda d: d['YrSold'] - d['YearRemodAdd'],
)

new_features = [
    'GarageScore', 'TotalAmenities',
    'Loc_Style_Cond', 'Qual_Exter_Overall', 'TotalArea', 'AreaPerRoom',
    'RemodAge', 'AgeSinceBuilt', 'Baths_Total', 'Qual_LotArea',
    'TotalArea_Qual', 'SoldSinceRemod'
]

print("‚úÖ T·ªïng s·ªë feature m·ªõi ƒë∆∞·ª£c th√™m v√†o df_output:", len(new_features))
print(new_features)


‚úÖ T·ªïng s·ªë feature m·ªõi ƒë∆∞·ª£c th√™m v√†o df_output: 12
['GarageScore', 'TotalAmenities', 'Loc_Style_Cond', 'Qual_Exter_Overall', 'TotalArea', 'AreaPerRoom', 'RemodAge', 'AgeSinceBuilt', 'Baths_Total', 'Qual_LotArea', 'TotalArea_Qual', 'SoldSinceRemod']


In [11]:
def _fill_with_mode(df, columns):
    """Fill missing values, in place, with each column's most frequent value.

    Columns absent from ``df`` are skipped. Columns without any non-null value
    are skipped too: ``mode()`` is empty for them, so there is nothing to fill
    with (this is what happens for every column of an empty frame).
    """
    for col in columns:
        if col not in df.columns:
            continue
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])


def preprocessing_feature_01(df_data, is_train=True, is_debug=True, **kwargs):
    """
    Clean a House Prices table and add engineered features.

    All work is done on a copy of ``df_data``; the input frame is not modified.

    Steps:
      1. Drop every column with more than 50% missing values.
      2. Fill missing values: median for int64/float64 columns, the label
         'None' for a fixed list of garage/basement/fireplace columns, and the
         most frequent value for 'Electrical' and a fixed list of other
         categorical columns.
      3. Add engineered features; each one is created only when all of its
         source columns are present.

    No categorical encoding (one-hot or otherwise) is performed here.

    Every statistic (missing ratios, medians, modes) is computed from
    ``df_data`` itself, so calling this separately on train and test data can
    drop different columns and uses different fill values for each.

    Args:
        df_data (pd.DataFrame): Raw input table.
        is_train (bool): Accepted for interface compatibility; currently unused.
        is_debug (bool): When True, print a short summary of the result.
        **kwargs: Accepted for interface compatibility; currently unused.

    Returns:
        tuple: ``(df_output, None)`` — the processed DataFrame and a
        placeholder second element that is always None.
    """
    df_output = df_data.copy()

    def _has(*cols):
        # True when every listed column is available in the working frame.
        return all(c in df_output.columns for c in cols)

    # ---- Drop columns that are mostly missing ----
    # max(..., 1) keeps the ratio well-defined (0 instead of 0/0) for an empty frame.
    missing_ratio = df_output.isnull().sum() / max(len(df_output), 1)
    cols_to_drop = missing_ratio[missing_ratio > 0.5].index
    df_output = df_output.drop(cols_to_drop, axis=1)

    # ---- 1. Fill missing values in numeric columns with the median ----
    num_cols = df_output.select_dtypes(include=['int64', 'float64']).columns
    for col in num_cols:
        if df_output[col].isnull().sum() > 0:
            df_output[col] = df_output[col].fillna(df_output[col].median())

    # ---- 2. Fill missing values in categorical columns ----
    cols_fill_none = [
        'FireplaceQu', 'GarageFinish', 'GarageType', 'GarageQual', 'GarageCond',
        'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual'
    ]
    for col in cols_fill_none:
        if col in df_output.columns:
            df_output[col] = df_output[col].fillna('None')

    _fill_with_mode(df_output, ['Electrical'])

    # ---- 3. Feature engineering ----
    if _has('GarageArea', 'GarageCars'):
        df_output['GarageScore'] = df_output['GarageArea'] + df_output['GarageCars']

    if _has('Fireplaces', 'FullBath', 'KitchenAbvGr'):
        df_output['TotalAmenities'] = (
            df_output['Fireplaces'] + df_output['FullBath'] + df_output['KitchenAbvGr']
        )

    if _has('Neighborhood', 'HouseStyle', 'SaleCondition'):
        df_output['Loc_Style_Cond'] = (
            df_output['Neighborhood'] + '_' +
            df_output['HouseStyle'] + '_' +
            df_output['SaleCondition']
        )

    if _has('ExterQual', 'OverallQual'):
        df_output['Qual_Exter_Overall'] = (
            df_output['ExterQual'] + '_' + df_output['OverallQual'].astype(str)
        )

    if _has('GrLivArea', 'TotalBsmtSF', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'):
        df_output['TotalArea'] = (
            df_output['GrLivArea'] + df_output['TotalBsmtSF'] +
            df_output['GarageArea'] + df_output['WoodDeckSF'] + df_output['OpenPorchSF']
        )

    if _has('GrLivArea', 'TotRmsAbvGrd'):
        # +1 in the denominator avoids dividing by zero when no rooms are recorded.
        df_output['AreaPerRoom'] = df_output['GrLivArea'] / (df_output['TotRmsAbvGrd'] + 1)

    if _has('YrSold', 'YearRemodAdd'):
        df_output['RemodAge'] = df_output['YrSold'] - df_output['YearRemodAdd']

    if _has('YrSold', 'YearBuilt'):
        df_output['AgeSinceBuilt'] = df_output['YrSold'] - df_output['YearBuilt']

    if _has('FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'):
        # Half baths count as 0.5; basement baths are included.
        df_output['Baths_Total'] = (
            df_output['FullBath'] + 0.5 * df_output['HalfBath'] +
            df_output['BsmtFullBath'] + 0.5 * df_output['BsmtHalfBath']
        )

    if _has('OverallQual', 'LotArea'):
        df_output['Qual_LotArea'] = df_output['OverallQual'] * df_output['LotArea']

    # Depends on 'TotalArea', which exists only if it was created above
    # (or was already part of the input).
    if _has('TotalArea', 'OverallQual'):
        df_output['TotalArea_Qual'] = df_output['TotalArea'] * df_output['OverallQual']

    # Same formula as 'RemodAge'; kept as a separate trailing column so the
    # output column set and order stay unchanged.
    if _has('YrSold', 'YearRemodAdd'):
        df_output['SoldSinceRemod'] = df_output['YrSold'] - df_output['YearRemodAdd']

    # ---- 4. Fill the remaining categorical gaps with the most frequent value ----
    cols_fill_mode = [
        'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
        'KitchenQual', 'Functional', 'SaleType'
    ]
    _fill_with_mode(df_output, cols_fill_mode)

    # ---- 5. Debug summary ----
    if is_debug:
        print("‚úÖ Ho√†n t·∫•t preprocessing_feature_01")
        print(f"üîπ D·ªØ li·ªáu ƒë·∫ßu ra: {df_output.shape[0]} h√†ng, {df_output.shape[1]} c·ªôt")
        new_cols = [col for col in df_output.columns if col not in df_data.columns]
        print(f"‚ú® S·ªë l∆∞·ª£ng feature m·ªõi: {len(new_cols)}")
        print("üÜï Feature m·ªõi:", new_cols[:15], "..." if len(new_cols) > 15 else "")

    return df_output, None


In [12]:
def main_feat01(**kwargs):
    """
    Run the feature-set-01 pipeline: load, preprocess, and save train/test data.

    Reads ``train.csv`` and ``test.csv`` from the notebook-level ``data_dir``,
    runs ``preprocessing_feature_01`` on each, and writes the results to
    ``<save_dir>/data01.npz`` with four entries: ``train_data``, ``test_data``,
    ``train_columns`` and ``test_columns``.

    Args:
        **kwargs: If ``global_cfg`` (a dict) is given, every local variable of
            this function is copied into it on exit. The notebook passes
            ``globals()`` so the loaded and processed frames become
            notebook-level variables.

    Returns:
        None
    """
    # load data
    df_train = pd.read_csv(f'{data_dir}/train.csv')
    df_test = pd.read_csv(f'{data_dir}/test.csv')

    # preprocessing
    df_output_train, _ = preprocessing_feature_01(df_train, is_train=True, is_debug=False)
    df_output_test, _ = preprocessing_feature_01(df_test, is_train=False, is_debug=False)

    # saving
    os.makedirs(save_dir, exist_ok=True)

    # Store values and column names separately so the frames can be rebuilt.
    # `allow_pickle` is deliberately NOT passed to np.savez: depending on the
    # NumPy version it is either the default already or it is treated as one
    # more array to store, adding a bogus 'allow_pickle' entry to the archive.
    # Frames with mixed column types are stored as object arrays, so reading
    # them back requires np.load(..., allow_pickle=True).
    np.savez(
        f'{save_dir}/data01.npz',
        train_data=df_output_train.values,
        test_data=df_output_test.values,
        train_columns=df_output_train.columns.values,  # train column names
        test_columns=df_output_test.columns.values,    # test column names
    )

    print("ƒê√£ l∆∞u DataFrame 01 v·ªõi ƒë·∫ßy ƒë·ªß th√¥ng tin c·ªôt")

    # Export this function's locals to the caller-supplied namespace, if any.
    kwargs.get('global_cfg', {}).update(**locals())

# Run the feature-01 pipeline only when the configured action requests it.
if params_cfg["action"] == "train_feat01":
    print("Runing ... [train_feat01]")
    # globals() is passed so main_feat01 can copy its local variables
    # (loaded and processed frames) back into the notebook namespace.
    main_feat01(global_cfg = globals())

Runing ... [train_feat01]
ƒê√£ l∆∞u DataFrame 01 v·ªõi ƒë·∫ßy ƒë·ªß th√¥ng tin c·ªôt
