## Import Libraries

In [2]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold

## Data Load

In [28]:
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [4]:
df_train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [6]:
df_test.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [7]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
df_train.shape

(1460, 81)

In [9]:
df_test.shape

(1459, 80)

## Preprocessing

In [29]:
# xóa các cột có nhiều Missing Values

df_train = df_train.drop(columns=['Alley','MasVnrType', 'FireplaceQu','PoolQC','Fence','MiscFeature'])
df_test = df_test.drop(columns=['Alley','MasVnrType', 'FireplaceQu','PoolQC','Fence','MiscFeature'])
print("---")

---


In [30]:
# Điền median các cột numeric 
num_cols=df_train.drop(columns=['SalePrice']).select_dtypes(include=['int64', 'float64','int32']).columns

median_values = df_train[num_cols].median()

df_train[num_cols] = df_train[num_cols].fillna(median_values)
df_test[num_cols] = df_test[num_cols].fillna(median_values)
print("---")

---


In [31]:
for cols in df_train.drop(columns=['SalePrice']).columns:
    mode = df_train[cols].mode()[0]
    df_train[cols].fillna(mode, inplace=True)
    df_test[cols].fillna(mode, inplace=True)
print("---")

---


In [8]:
pd.set_option('display.max_rows', None)

In [9]:
df_train.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd

In [10]:
df_test.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd

## Feature Engineering

In [32]:
def create_new_features(all_data):
    """
    Tạo các features mới dựa trên domain knowledge
    """
    print("Creating new features...")
    
    # === DIỆN TÍCH ===
    # Tổng diện tích nhà
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
    print("  - Created TotalSF")
    
    # Diện tích sàn trung bình mỗi phòng
    all_data['SFPerRoom'] = all_data['TotalSF'] / all_data['TotRmsAbvGrd']
    all_data['SFPerRoom'].replace([np.inf, -np.inf], 0, inplace=True)
    print("  - Created SFPerRoom")
    
    # Tỷ lệ diện tích basement
    all_data['BsmtRatio'] = all_data['TotalBsmtSF'] / all_data['TotalSF']
    all_data['BsmtRatio'].replace([np.inf, -np.inf], 0, inplace=True)
    print("  - Created BsmtRatio")
    
    # === PHÒNG TẮM ===
    # Tổng số phòng tắm
    all_data['TotalBath'] = (all_data['FullBath'] + 
                           0.5 * all_data['HalfBath'] + 
                           all_data['BsmtFullBath'] + 
                           0.5 * all_data['BsmtHalfBath'])
    print("  - Created TotalBath")
    
    # === TUỔI VÀ THỜI GIAN ===
    # Tuổi nhà và tuổi cải tạo
    all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
    all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']
    print("  - Created HouseAge, RemodAge")
    
    # Đã cải tạo hay chưa
    all_data['IsRemodeled'] = (all_data['YearBuilt'] != all_data['YearRemodAdd']).astype(int)
    print("  - Created IsRemodeled")
    
    # === CHẤT LƯỢNG VÀ ĐIỂM SỐ ===
    # Chất lượng tổng thể (kết hợp nhiều yếu tố)
    all_data['OverallGrade'] = all_data['OverallQual'] * all_data['OverallCond']
    print("  - Created OverallGrade")
    
    # Điểm chất lượng bếp
    all_data['KitchenScore'] = all_data['KitchenQual'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    print("  - Created KitchenScore")
    
    # === BINARY FEATURES ===
    # Có garage hay không
    all_data['HasGarage'] = (all_data['GarageArea'] > 0).astype(int)
    
    # Có basement hay không
    all_data['HasBasement'] = (all_data['TotalBsmtSF'] > 0).astype(int)
    
    # Có pool hay không
    all_data['HasPool'] = (all_data['PoolArea'] > 0).astype(int)
    
    # Có lò sưởi hay không
    all_data['HasFireplace'] = (all_data['Fireplaces'] > 0).astype(int)
    print("  - Created binary features (HasGarage, HasBasement, HasPool, HasFireplace)")
    
    # === THỜI GIAN BÁN ===
    # Mùa bán nhà
    all_data['SeasonSold'] = all_data['MoSold'].map({
        12: 1, 1: 1, 2: 1,  # Winter
        3: 2, 4: 2, 5: 2,   # Spring
        6: 3, 7: 3, 8: 3,   # Summer
        9: 4, 10: 4, 11: 4  # Fall
    })
    print("  - Created SeasonSold")
    
    return all_data

In [33]:
def transform_features(all_data):
    """
    Áp dụng các transformation để cải thiện phân phối của features
    """
    print("Transforming features...")
    
    # Log transformation cho các feature skewed
    skewed_features = ['LotArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'LotFrontage', 'MasVnrArea']
    
    transformed_count = 0
    for feature in skewed_features:
        if feature in all_data.columns:
            # Thêm 1 để tránh log(0)
            all_data[f'Log_{feature}'] = np.log1p(all_data[feature])
            transformed_count += 1
    
    if transformed_count > 0:
        print(f"  - Log transformed: {transformed_count} features")
    
    # Binning cho các continuous variables (chỉ áp dụng nếu feature tồn tại)
    if 'LotArea' in all_data.columns:
        all_data['LotArea_binned'] = pd.cut(all_data['LotArea'], bins=5, labels=[1, 2, 3, 4, 5])
        all_data['LotArea_binned'].fillna(3, inplace=True)  # fill NA với category trung bình
    
    if 'TotalSF' in all_data.columns:
        all_data['TotalSF_binned'] = pd.cut(all_data['TotalSF'], bins=5, labels=[1, 2, 3, 4, 5])
        all_data['TotalSF_binned'].fillna(3, inplace=True)
        print("  - Created binned features")
    
    return all_data

In [34]:
def advanced_feature_engineering_pipeline(train, test):
    """
    Pipeline chính cho feature engineering - kết hợp tất cả chiến lược
    """
    print("Starting advanced feature engineering pipeline...")
    
    # Lưu ID và target trước khi xử lý
    train_ids = train['Id']
    test_ids = test['Id']
    y_train = train['SalePrice']
    
    # Kết hợp train và test để xử lý đồng nhất
    all_data = pd.concat([train.drop('SalePrice', axis=1), test], ignore_index=True)
    
    print(f"Initial data shape: {all_data.shape}")
    
    # Áp dụng từng chiến lược theo thứ tự
    all_data = create_new_features(all_data)
    all_data = transform_features(all_data)
    
    # Tách lại thành train và test
    train_processed = all_data.iloc[:len(train)].copy()
    test_processed = all_data.iloc[len(train):].copy()
    
    # Thêm lại target variable
    train_processed['SalePrice'] = y_train
    
    # Đảm bảo Id được giữ nguyên
    train_processed['Id'] = train_ids
    test_processed['Id'] = test_ids
    
    print(f"Final train shape: {train_processed.shape}")
    print(f"Final test shape: {test_processed.shape}")
    
    # Thông tin về features mới
    new_features = [col for col in train_processed.columns if col not in train.columns]
    print(f"Total new features created: {len(new_features)}")
    
    return train_processed, test_processed

In [35]:
def analyze_features(train_processed):
    """
    Phân tích các features đã được tạo
    """
    print("\n=== FEATURE ANALYSIS ===")
    
    # Phân loại features
    numerical = [f for f in train_processed.columns if train_processed[f].dtype in ['int64', 'float64']]
    categorical = [f for f in train_processed.columns if train_processed[f].dtype == 'object']
    
    print(f"Numerical features: {len(numerical)}")
    print(f"Categorical features: {len(categorical)}")
    
    # Features mới được tạo
    original_features = set(df_train.columns)
    new_features = set(train_processed.columns) - original_features
    
    print(f"\nNew features created ({len(new_features)}):")
    for feature in sorted(new_features):
        print(f"  - {feature}")
    
    return new_features

In [36]:
if __name__ == "__main__":
    # Giả sử train và test đã được định nghĩa
    df_train, df_test = advanced_feature_engineering_pipeline(df_train, df_test)
    
    # Phân tích kết quả
    new_features = analyze_features(df_train)
    
    print("\n=== PROCESSING COMPLETED ===")
    print("Data is ready for modeling!")

Starting advanced feature engineering pipeline...
Initial data shape: (2919, 74)
Creating new features...
  - Created TotalSF
  - Created SFPerRoom
  - Created BsmtRatio
  - Created TotalBath
  - Created HouseAge, RemodAge
  - Created IsRemodeled
  - Created OverallGrade
  - Created KitchenScore
  - Created binary features (HasGarage, HasBasement, HasPool, HasFireplace)
  - Created SeasonSold
Transforming features...
  - Log transformed: 6 features
  - Created binned features
Final train shape: (1460, 97)
Final test shape: (1459, 96)
Total new features created: 22

=== FEATURE ANALYSIS ===
Numerical features: 58
Categorical features: 37

New features created (0):

=== PROCESSING COMPLETED ===
Data is ready for modeling!


## ENCODING

 ### onehot các cột object 

In [37]:

cat_columns=df_train.select_dtypes(include=['object']).columns
num_columns=df_train.select_dtypes(include=['int64','float64']).columns
num_columns_test=df_train.select_dtypes(include=['int64','float64']).drop(columns=['SalePrice']).columns

In [38]:
ohe=OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)
encoded_train = ohe.fit_transform(df_train[cat_columns])
encoded_test = ohe.transform(df_test[cat_columns])

In [39]:
encoded_df_train = pd.DataFrame(encoded_train,columns=ohe.get_feature_names_out(cat_columns), index=df_train.index)
encoded_df_test = pd.DataFrame(encoded_test, columns=ohe.get_feature_names_out(cat_columns), index=df_test.index)

In [40]:
num_df_train = df_train[num_columns]
num_df_test = df_test[num_columns_test]

In [41]:
df_train = pd.concat([num_df_train,encoded_df_train], axis=1)
df_test = pd.concat([num_df_test,encoded_df_test], axis=1)

In [42]:
df_train.shape

(1460, 251)

In [43]:
df_test.shape

(1459, 250)

## SCALING

In [44]:
scaler = StandardScaler()


# num_columns_test không chứa cột SalePrice, được định nghĩa ở onehot 
df_train[num_columns_test] = scaler.fit_transform(df_train[num_columns_test])
df_test[num_columns_test]  = scaler.transform(df_test[num_columns_test])

print("✓ Hoàn thành Scaling")

✓ Hoàn thành Scaling


In [45]:
df_train = df_train.drop(columns=['Id'])
df_test = df_test.drop(columns=['Id'])

In [46]:
df_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.073375,-0.220875,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.46032,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,-0.084636,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.44794,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.641972,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [47]:
df_test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1460,-0.872563,0.46032,0.110763,-0.795151,0.381743,-0.340077,-1.15638,-0.57075,0.053428,0.604293,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1461,-0.872563,0.505733,0.37585,-0.071836,0.381743,-0.43944,-1.30174,0.027027,1.051363,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1462,0.073375,0.187842,0.332053,-0.795151,-0.5172,0.852269,0.6364,-0.57075,0.761852,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1463,0.073375,0.369494,-0.054002,-0.071836,0.381743,0.88539,0.6364,-0.460051,0.347326,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1464,1.492282,-1.219961,-0.552407,1.374795,-0.5172,0.686666,0.345679,-0.57075,-0.39619,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Train

In [48]:
print("\n=== TRAINING FULL MODEL (KNN & Decision Tree) ===")

# === Dữ liệu ===
X = df_train.drop('SalePrice', axis=1)
y = df_train['SalePrice']

# Log-transform target
y_log = np.log1p(y)

# Điền thiếu và one-hot
X = X.fillna(X.median(numeric_only=True))
X = pd.get_dummies(X, drop_first=True)

# === KFold ===
cv = KFold(n_splits=5, shuffle=True, random_state=42)

lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42, n_jobs=-1)

model_map = {
    'LinearRegression': lr,
    'RandomForestRegressor': rf
}

# === Đánh giá ===
best_score = np.inf
best_name = None
best_model = None
results = {}

# Scorer = RMSE trên log-target  => tương đương RMSLE
rmse_scorer = 'neg_root_mean_squared_error'

for name, clf in model_map.items():

    # Tính RMSLE (trên y_log)
    scores = cross_val_score(clf, X, y_log, cv=cv, scoring=rmse_scorer)
    rmsle_scores = -scores
    mean_rmsle = rmsle_scores.mean()
    std_rmsle = rmsle_scores.std()

    # Tính R² (trên y thật)
    r2_scores = cross_val_score(clf, X, y, cv=cv, scoring='r2')
    mean_r2 = r2_scores.mean()
    std_r2 = r2_scores.std()

    # Lưu kết quả
    results[name] = {'r2': mean_r2, 'rmsle': mean_rmsle}
    print(f"{name}: RMSLE (mean±std) = {mean_rmsle:.4f} ± {std_rmsle:.4f} | R² (mean±std) = {mean_r2:.4f} ± {std_r2:.4f}")

    # Cập nhật best model
    if mean_rmsle < best_score:
        best_score = mean_rmsle
        best_name = name
        best_model = clf

# === Tổng hợp kết quả ===
print("\n=== Tổng hợp KFold results ===")
for name, met in results.items():
    print(f"{name:22s} R²={met['r2']:.4f} | RMSLE={met['rmsle']:.4f}")

print(f"\nBest model by CV RMSLE: {best_name} (RMSLE = {best_score:.4f})")

# === Fit lại best model trên toàn bộ dữ liệu train (y_log để nhất quán) ===
best_model.fit(X, y_log)

print(f"✓ Đã huấn luyện mô hình {best_name} trên toàn bộ dữ liệu train")



=== TRAINING FULL MODEL (KNN & Decision Tree) ===
LinearRegression: RMSLE (mean±std) = 111751666.5327 ± 196369993.8687 | R² (mean±std) = -774220938719908608.0000 ± 1538703700636756992.0000
RandomForestRegressor: RMSLE (mean±std) = 0.1408 ± 0.0194 | R² (mean±std) = 0.8442 ± 0.1079

=== Tổng hợp KFold results ===
LinearRegression       R²=-774220938719908608.0000 | RMSLE=111751666.5327
RandomForestRegressor  R²=0.8442 | RMSLE=0.1408

Best model by CV RMSLE: RandomForestRegressor (RMSLE = 0.1408)
✓ Đã huấn luyện mô hình RandomForestRegressor trên toàn bộ dữ liệu train


## Submission

In [49]:
test_raw = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

pred_log = best_model.predict(df_test)
pred = np.expm1(pred_log)            # đảo log1p → SalePrice dương

submission = pd.DataFrame({"Id": test_raw["Id"], "SalePrice": pred})
submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' created.")


Submission file 'submission.csv' created.
