In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import warnings
import os

# Scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# XGBoost
import xgboost as xgb

# Display configuration: render every float in pandas output with four decimals.
pd.options.display.float_format = lambda value: '%.4f' % value
# Suppress all warnings for the rest of the session. This keeps the training
# logs short, but it also hides deprecation and numerical warnings.
warnings.filterwarnings(action='ignore')

# GPU check: TensorFlow is imported here only to list the GPUs it can see.
# NOTE(review): an empty list here reflects TensorFlow's view only; whether
# XGBoost can use CUDA is a separate question - confirm before relying on it.
import tensorflow as tf
print("üî• GPU Status:", tf.config.list_physical_devices('GPU'))
print("‚úÖ S·∫µn s√†ng chi·∫øn ƒë·∫•u v·ªõi 75k d√≤ng d·ªØ li·ªáu!")

üî• GPU Status: []
‚úÖ S·∫µn s√†ng chi·∫øn ƒë·∫•u v·ªõi 75k d√≤ng d·ªØ li·ªáu!


In [19]:
# Path to the cleaned housing dataset. The row/column count depends on the
# cleaning run, so it is printed below instead of being assumed (the run
# recorded in this notebook loaded 65,468 rows x 346 columns).
DATA_PATH = '../data/processed/clean_vn_housing.csv'

df = pd.read_csv(DATA_PATH)
print(f"üìÇ ƒê√£ load d·ªØ li·ªáu: {df.shape}")

# 1. Separate features and target.
target_col = 'Total_Price_Billion'

# Columns that must never be used as features. The cleaned file is expected to
# have dropped Address and Price_per_m2 already; they are listed again as a
# safety net. `errors='ignore'` skips any label that is not present, so no
# manual "is this column in df" filter is needed.
drop_cols = [target_col, 'Price_per_m2', 'Address', 'Unnamed: 0']
X = df.drop(columns=drop_cols, errors='ignore')

# 2. Log-transform the target (log1p) so that the wide price range is
# compressed; every model below is trained on, and predicts, this log scale.
y = np.log1p(df[target_col])

# 3. Train/test split (80/20) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Scaling with RobustScaler. The scaler is fitted on the training split
# only and then applied to the test split, so no test statistics leak into
# training. The outputs are NumPy arrays (column names are not kept).
# NOTE(review): tree ensembles are largely insensitive to monotonic feature
# scaling; the saved models nevertheless expect scaled inputs at inference.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler for the app. `exist_ok=True` creates the folder
# when missing and is a no-op otherwise (no separate existence check needed).
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler_final.pkl')

print(f"‚úÖ D·ªØ li·ªáu ƒë√£ chu·∫©n h√≥a. S·ªë Features: {X_train.shape[1]}")

üìÇ ƒê√£ load d·ªØ li·ªáu: (65468, 346)
‚úÖ D·ªØ li·ªáu ƒë√£ chu·∫©n h√≥a. S·ªë Features: 345


In [20]:
def danh_gia_hieu_nang(model, X_test, y_test_log, ten_mo_hinh):
    """Evaluate a fitted regressor on the real price scale and print a report.

    The model is expected to predict log1p-transformed prices. Both the
    predictions and the true values are mapped back with ``expm1`` before the
    metrics are computed, so MAE and RMSE are expressed in the same unit as
    the original target (billions of VND, per the printed labels).

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test_log: True target values on the log1p scale.
        ten_mo_hinh: Model name used in the printed header and in the result.

    Returns:
        dict with keys ``'Model'``, ``'MAE'``, ``'RMSE'`` and ``'R2'``.
    """
    # Predict on the log scale.
    y_pred_log = model.predict(X_test)

    # Invert the log1p transform to get real prices.
    y_true = np.expm1(y_test_log)
    # Clamp at zero: expm1 of a prediction below 0 would give a negative price.
    y_pred = np.maximum(np.expm1(y_pred_log), 0)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n{'='*40}")
    print(f"üìä K·∫æT QU·∫¢: {ten_mo_hinh.upper()}")
    print(f"{'='*40}")
    print(f"MAE  : {mae:.4f} T·ª∑")
    print(f"RMSE : {rmse:.4f} T·ª∑")
    print(f"R2   : {r2:.4f} ({r2*100:.2f}%)")

    # Tiers must be tested from the highest threshold down; testing `> 0.7`
    # first would swallow every score above 0.8 and make that tier unreachable.
    if r2 > 0.8:
        print("   üèÜ Model xu·∫•t s·∫Øc (SOTA)!")
    elif r2 > 0.7:
        print("   üöÄ Model ƒë√£ ƒë·∫°t m·ª©c ƒë·ªô ch√≠nh x√°c cao!")

    return {'Model': ten_mo_hinh, 'MAE': mae, 'RMSE': rmse, 'R2': r2}

In [21]:
print(">>> 1. Tuning Random Forest (Big Data Optimized)...")
t0 = time.time()

# Base estimator: fixed seed, each forest is allowed to use every CPU core.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Search space: 2 x 2 x 2 x 1 x 1 = 8 candidates.
param_grid_rf = dict(
    # Many trees for stable predictions.
    n_estimators=[400, 500],
    # Moderately bounded depth.
    max_depth=[25, 30],
    # Either half of the columns or sqrt(n_features) per split.
    max_features=[0.5, 'sqrt'],
    # Every leaf must hold at least two samples, so a tree cannot memorise a
    # single listing (the author's main guard against overfitting).
    min_samples_leaf=[2],
    min_samples_split=[5],
)

# 3-fold cross-validated grid search scored by R^2. The target is on the
# log1p scale, so the CV score is a log-scale R^2.
# NOTE(review): the forest and the search both request all cores
# (n_jobs=-1); this may oversubscribe the CPU - confirm on the target machine.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(X_train_scaled, y_train)

# Best configuration, refitted on the whole training split by GridSearchCV.
best_rf = grid_rf.best_estimator_
print(f"‚è±Ô∏è RF Time: {time.time() - t0:.1f}s")
print("Best Params RF:", grid_rf.best_params_)

# Persist the tuned model, then report hold-out metrics on the price scale.
joblib.dump(best_rf, '../models/rf_final.pkl')
res_rf = danh_gia_hieu_nang(best_rf, X_test_scaled, y_test, "Random Forest (Full Data)")

>>> 1. Tuning Random Forest (Big Data Optimized)...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
‚è±Ô∏è RF Time: 747.9s
Best Params RF: {'max_depth': 30, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}

üìä K·∫æT QU·∫¢: RANDOM FOREST (FULL DATA)
MAE  : 0.6049 T·ª∑
RMSE : 1.0079 T·ª∑
R2   : 0.8366 (83.66%)
   üöÄ Model ƒë√£ ƒë·∫°t m·ª©c ƒë·ªô ch√≠nh x√°c cao!


In [22]:
# Overfitting check for the tuned Random Forest: R^2 on the training split
# versus the hold-out split. Both scores are computed on the log1p target
# scale (no expm1 inversion here), so the test value is not the same number
# as the price-scale R^2 printed by danh_gia_hieu_nang.
train_r2, test_r2 = [
    r2_score(y_split, best_rf.predict(X_split))
    for X_split, y_split in ((X_train_scaled, y_train), (X_test_scaled, y_test))
]

print("Train R2:", train_r2)
print("Test  R2:", test_r2)
# A large positive gap means the model fits the training data much better
# than unseen data.
print("Gap     :", train_r2 - test_r2)

Train R2: 0.8928784011454947
Test  R2: 0.8174614186143531
Gap     : 0.0754169825311416


In [23]:
print(">>> 2. Tuning XGBoost (GPU RTX 3050 Power)...")
t0 = time.time()

# Base estimator configured for GPU training.
# NOTE(review): device='cuda' presumably needs a CUDA-enabled XGBoost build
# and a visible GPU - confirm on the machine that re-runs this cell.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',   # histogram-based tree construction
    device='cuda',        # train on the GPU
    n_jobs=1,
    random_state=42,
)

# Search space: only max_depth and reg_alpha vary -> 2 x 2 = 4 candidates.
param_grid_xgb = dict(
    # Many boosting rounds paired with a small learning rate.
    n_estimators=[3000],
    learning_rate=[0.01],
    # Deep trees, intended to capture street-level effects.
    max_depth=[10, 12],
    # Regularisation against overfitting: L1 and L2 penalties, plus row and
    # column subsampling for every tree.
    reg_alpha=[1.0, 2.0],
    reg_lambda=[5.0],
    subsample=[0.85],
    colsample_bytree=[0.6],
)

# 3-fold grid search scored by R^2 on the log1p target; candidates are
# fitted one at a time (n_jobs=1).
grid_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='r2',
    cv=3,
    n_jobs=1,
    verbose=1,
)
grid_xgb.fit(X_train_scaled, y_train)

# Best configuration, refitted on the whole training split by GridSearchCV.
best_xgb = grid_xgb.best_estimator_
print(f"‚è±Ô∏è XGB Time: {time.time() - t0:.1f}s")
print("Best Params XGB:", grid_xgb.best_params_)

# Report hold-out metrics on the price scale, then persist the model
# (per the author's note, this is the model the app will use).
res_xgb = danh_gia_hieu_nang(best_xgb, X_test_scaled, y_test, "XGBoost (Full Data)")
joblib.dump(best_xgb, '../models/xgb_final.pkl')

>>> 2. Tuning XGBoost (GPU RTX 3050 Power)...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
‚è±Ô∏è XGB Time: 558.6s
Best Params XGB: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 12, 'n_estimators': 3000, 'reg_alpha': 1.0, 'reg_lambda': 5.0, 'subsample': 0.85}

üìä K·∫æT QU·∫¢: XGBOOST (FULL DATA)
MAE  : 0.6045 T·ª∑
RMSE : 1.0191 T·ª∑
R2   : 0.8329 (83.29%)
   üöÄ Model ƒë√£ ƒë·∫°t m·ª©c ƒë·ªô ch√≠nh x√°c cao!


['../models/xgb_final.pkl']

In [24]:
# Overfitting check for the tuned XGBoost model: R^2 on the training split
# versus the hold-out split. Both scores are computed on the log1p target
# scale (no expm1 inversion here), so the test value is not the same number
# as the price-scale R^2 printed by danh_gia_hieu_nang.
train_r2, test_r2 = [
    r2_score(y_split, best_xgb.predict(X_split))
    for X_split, y_split in ((X_train_scaled, y_train), (X_test_scaled, y_test))
]

print("Train R2:", train_r2)
print("Test  R2:", test_r2)
# A large positive gap means the model fits the training data much better
# than unseen data.
print("Gap     :", train_r2 - test_r2)

Train R2: 0.8555807924910348
Test  R2: 0.8211855216874435
Gap     : 0.03439527080359128


In [26]:
import pandas as pd

# Collect the metric dictionaries returned by danh_gia_hieu_nang for the two
# tuned models. Both tuning cells must have been run before this one.
results_list = [res_rf, res_xgb]

# One row per model, ordered by R^2 from best to worst, with a fresh 0..n-1
# index after sorting.
df_results = (
    pd.DataFrame(results_list)
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
)

# Show the comparison table in the notebook.
print(">>> B·∫¢NG SO S√ÅNH K·∫æT QU·∫¢ CU·ªêI C√ôNG")
display(df_results)

# Also export the table as CSV so it can be attached to a report.
df_results.to_csv('../data/processed/ket_qua_so_sanh.csv', index=False)

>>> B·∫¢NG SO S√ÅNH K·∫æT QU·∫¢ CU·ªêI C√ôNG


Unnamed: 0,Model,MAE,RMSE,R2
0,Random Forest (Full Data),0.6049,1.0079,0.8366
1,XGBoost (Full Data),0.6045,1.0191,0.8329
