In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import pickle
import numpy as np
import pandas as pd

# Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import category_encoders as ce
from prophet import Prophet
import statsmodels.api as sm

df = pd.read_csv("train.csv")
print(f"Data shape: {df.shape}")
print(f"Columns: {len(df.columns)}")


# Train/Test split theo nƒÉm
df_train = df[df['YrSold'] < 2010].copy()
df_test  = df[df['YrSold'] >= 2010].copy()

print(f"Train: {len(df_train)} rows ({len(df_train)/len(df)*100:.1f}%)")
print(f"Test:  {len(df_test)} rows ({len(df_test)/len(df)*100:.1f}%)")

print(f"\nTrain years: {df_train['YrSold'].min()} - {df_train['YrSold'].max()}")
print(f"Test years:  {df_test['YrSold'].min()} - {df_test['YrSold'].max()}")


def basic_impute(df_in):
    """Impute missing: median for numeric, mode for categorical."""
    df = df_in.copy()

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

    for c in num_cols:
        if df[c].isna().any():
            df[c].fillna(df[c].median(), inplace=True)

    for c in cat_cols:
        if df[c].isna().any():
            mode_val = df[c].mode().iloc[0] if not df[c].mode().empty else "NA"
            df[c].fillna(mode_val, inplace=True)

    return df

print("basic_impute() defined")

quality_map = {
    "Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0
}

print("quality_map defined")
print(quality_map)


location_lot_cols = [
    "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
    "Street", "Alley", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope",
    "Neighborhood", "Condition1", "Condition2"
]

size_interior_cols = [
    "GrLivArea", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "LowQualFinSF", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotRmsAbvGrd", "BedroomAbvGr", "KitchenAbvGr",
    "GarageArea", "GarageCars", "MasVnrArea"
]

quality_condition_cols = [
    "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
    "ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
    "HeatingQC", "KitchenQual", "FireplaceQu",
    "GarageQual", "GarageCond", "PoolQC"
]

amenities_sale_time_cols = [
    "Fireplaces", "GarageYrBlt", "PoolArea", "Fence",
    "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition"
]

groups_info = {
    "Location & Lot": location_lot_cols,
    "Size & Interior": size_interior_cols,
    "Quality & Condition": quality_condition_cols,
    "Amenities, Sale & Time": amenities_sale_time_cols
}

# Ch·ªâ gi·ªØ c·ªôt t·ªìn t·∫°i trong dataset
location_lot_cols        = [c for c in location_lot_cols if c in df.columns]
size_interior_cols       = [c for c in size_interior_cols if c in df.columns]
quality_condition_cols   = [c for c in quality_condition_cols if c in df.columns]
amenities_sale_time_cols = [c for c in amenities_sale_time_cols if c in df.columns]

print("Feature groups loaded:")
print(f" Location & Lot:     {len(location_lot_cols)}")
print(f" Size & Interior:    {len(size_interior_cols)}")
print(f" Quality & Condition:{len(quality_condition_cols)}")
print(f" Amenities/Sale:     {len(amenities_sale_time_cols)}")

Data shape: (1460, 81)
Columns: 81
Train: 1285 rows (88.0%)
Test:  175 rows (12.0%)

Train years: 2006 - 2009
Test years:  2010 - 2010
basic_impute() defined
quality_map defined
{'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}
Feature groups loaded:
 Location & Lot:     14
 Size & Interior:    14
 Quality & Condition:14
 Amenities/Sale:     9


In [2]:
def strategy1_onehot_standard(df_in, reference_columns=None):
    """One-Hot Encoding cho categorical + StandardScaler cho numeric"""
    df = basic_impute(df_in)
    
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    num_cols = [c for c in df.columns if c not in cat_cols and c != "SalePrice"]
    
    # One-hot encoding
    df_cat = pd.get_dummies(df[cat_cols].astype(str), drop_first=False)
    
    # If reference columns provided, align to them
    if reference_columns is not None:
        cat_ref_cols = [c for c in reference_columns if c not in num_cols and c != "SalePrice"]
        for col in cat_ref_cols:
            if col not in df_cat.columns:
                df_cat[col] = 0
        df_cat = df_cat[cat_ref_cols]
    
    # Standard scaling
    scaler = StandardScaler()
    df_num = pd.DataFrame(
        scaler.fit_transform(df[num_cols]),
        columns=num_cols,
        index=df.index
    )
    
    result = pd.concat([df_num, df_cat], axis=1)
    if "SalePrice" in df.columns:
        result["SalePrice"] = df["SalePrice"]
    
    return result

print("strategy1_onehot_standard() defined")


proc1_train = strategy1_onehot_standard(df_train)
proc1_test = strategy1_onehot_standard(df_test, reference_columns=proc1_train.columns)

print("\n  Strategy 1 Results:")
print(f"   Train shape: {proc1_train.shape}")
print(f"   Test shape: {proc1_test.shape}")
print(f"   Total features: {proc1_train.shape[1] - 1}")
print(f"\nSample features:")
print(proc1_train.columns[:10].tolist())

# Verify columns match
assert list(proc1_train.columns) == list(proc1_test.columns), "Column mismatch!"
print("Train and test have matching columns")


print("\nTrain Statistics:")
print(proc1_train.describe().iloc[:, :5])


def strategy2_target_robust(df_in, target="SalePrice", reference_columns=None):
    """Target Encoding cho categorical + RobustScaler cho numeric"""
    if ce is None:
        print("  category_encoders not available, skipping Strategy 2")
        return None
    
    df = basic_impute(df_in)
    
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    num_cols = [c for c in df.columns if c not in cat_cols and c != target]
    
    # Target encoding
    te = ce.TargetEncoder(cols=cat_cols, smoothing=0.3)
    df_cat = te.fit_transform(
        df[cat_cols],
        df[target] if target in df.columns else None
    )
    
    # Robust scaling
    scaler = RobustScaler()
    df_num = pd.DataFrame(
        scaler.fit_transform(df[num_cols]),
        columns=num_cols,
        index=df.index
    )
    
    result = pd.concat([df_num, df_cat], axis=1)
    
    # Align columns if reference provided
    if reference_columns is not None:
        ref_cols = [c for c in reference_columns if c != target]
        for col in ref_cols:
            if col not in result.columns:
                result[col] = 0
        result = result[ref_cols]
    
    if target in df.columns:
        result[target] = df[target]
    
    return result

print("strategy2_target_robust() defined")


if ce is not None:
    proc2_train = strategy2_target_robust(df_train)
    proc2_test = strategy2_target_robust(df_test, reference_columns=proc2_train.columns)
    
    print("\n  Strategy 2 Results:")
    print(f"   Train shape: {proc2_train.shape}")
    print(f"   Test shape: {proc2_test.shape}")
    print(f"   Total features: {proc2_train.shape[1] - 1}")
    
    # Verify columns match
    assert list(proc2_train.columns) == list(proc2_test.columns), "Column mismatch!"
    print("Train and test have matching columns")
    
    print("\nTrain Statistics:")
    print(proc2_train.describe().iloc[:, :5])
else:
    proc2_train = proc2_test = None
    print("\n  Strategy 2 skipped (category_encoders not available)")


# Compare strategies
print("\n  Strategy Comparison:")
print(f"Strategy 1: {proc1_train.shape[1] - 1} features")
if proc2_train is not None:
    print(f"Strategy 2: {proc2_train.shape[1] - 1} features")

strategy1_onehot_standard() defined

  Strategy 1 Results:
   Train shape: (1285, 287)
   Test shape: (175, 287)
   Total features: 286

Sample features:
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1']
Train and test have matching columns

Train Statistics:
                 Id    MSSubClass   LotFrontage      LotArea   OverallQual
count  1.285000e+03  1.285000e+03  1.285000e+03  1285.000000  1.285000e+03
mean   1.769445e-16  7.326608e-17  1.009137e-16     0.000000  2.737110e-16
std    1.000389e+00  1.000389e+00  1.000389e+00     1.000389  1.000389e+00
min   -1.752369e+00 -8.799154e-01 -2.189551e+00    -0.892342 -3.676904e+00
25%   -8.523952e-01 -8.799154e-01 -4.357896e-01    -0.294695 -8.003002e-01
50%   -2.685988e-03 -1.642880e-01 -3.107533e-02    -0.102869 -8.114932e-02
75%    8.685652e-01  3.127969e-01  4.186071e-01     0.104862  6.380015e-01
max    1.739816e+00  3.175306e+00  1.094118e+01    19.73

In [3]:
# ============================================================
# STRATEGY 3: PCA FOR SIZE + ORDINAL FOR QUALITY
# ============================================================

def strategy3_pca_ordinal(df_in, pca_components=3, reference_columns=None):
    df = basic_impute(df_in)

    # ==== 1. SIZE FEATURES PCA ====
    size_present = [c for c in size_interior_cols if c in df.columns]

    if size_present:
        scaler_size = StandardScaler()
        size_scaled = scaler_size.fit_transform(df[size_present])

        n_components = min(pca_components, len(size_present))
        pca = PCA(n_components=n_components)
        size_pca = pca.fit_transform(size_scaled)

        size_pca_df = pd.DataFrame(
            size_pca,
            columns=[f"SizePCA{i+1}" for i in range(n_components)],
            index=df.index
        )
        print(f"[Strategy 3] PCA variance explained: {pca.explained_variance_ratio_.sum():.2%}")
    else:
        size_pca_df = pd.DataFrame(index=df.index)

    # ==== 2. QUALITY FEATURES ORDINAL ====
    quality_present = [c for c in quality_condition_cols if c in df.columns]

    quality_df = pd.DataFrame(index=df.index)
    for c in quality_present:
        if df[c].dtype == object:
            quality_df[c + "_ord"] = df[c].map(quality_map).fillna(0).astype(int)
        else:
            quality_df[c + "_num"] = df[c]

    # ==== 3. OTHER NUMERIC ====
    other_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c not in size_present + quality_present + ["SalePrice"]
    ]

    if other_cols:
        scaler_other = StandardScaler()
        other_df = pd.DataFrame(
            scaler_other.fit_transform(df[other_cols]),
            columns=other_cols,
            index=df.index
        )
    else:
        other_df = pd.DataFrame(index=df.index)

    # Combine all
    result = pd.concat([size_pca_df, quality_df, other_df], axis=1)

    # ==== ALIGN TEST SET ====
    if reference_columns is not None:
        ref_cols = [c for c in reference_columns if c != "SalePrice"]
        for col in ref_cols:
            if col not in result.columns:
                result[col] = 0
        result = result[ref_cols]

    # Add target
    if "SalePrice" in df.columns:
        result["SalePrice"] = df["SalePrice"]

    return result

print("strategy3_pca_ordinal() ready")

# Apply Strategy 3
proc3_train = strategy3_pca_ordinal(df_train, pca_components=3)
proc3_test  = strategy3_pca_ordinal(df_test, pca_components=3, reference_columns=proc3_train.columns)

print("\nStrategy 3 Results:")
print(f" Train shape: {proc3_train.shape}")
print(f" Test shape:  {proc3_test.shape}")
print(f" Features: {proc3_train.shape[1] - 1}")
assert list(proc3_train.columns) == list(proc3_test.columns)
print(" Columns match ")


# ============================================================
# STRATEGY 4: MINMAX + EMBEDDING
# ============================================================

def strategy4_minmax_embedding(df_in, reference_columns=None):
    df = basic_impute(df_in)

    # === NUMERIC ===
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    num_cols = [c for c in num_cols if c != "SalePrice"]

    scaler = MinMaxScaler()
    num_scaled = pd.DataFrame(
        scaler.fit_transform(df[num_cols]),
        columns=num_cols,
        index=df.index
    )

    # === CATEGORICAL ===
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    cat_df = pd.DataFrame(index=df.index)
    emb_info = {}

    for c in cat_cols:
        nunique = df[c].nunique(dropna=False)

        if nunique > 10:
            # Integer encoding for embedding
            codes, uniques = pd.factorize(df[c].astype(str))
            cat_df[c + "_idx"] = codes
            emb_info[c] = {
                "vocab_size": len(uniques),
                "embedding_dim": min(50, (len(uniques) + 1) // 2)
            }
        else:
            # One-hot encoding for small cardinality
            dummies = pd.get_dummies(df[c].astype(str), prefix=c, drop_first=False)
            cat_df = pd.concat([cat_df, dummies], axis=1)

    # Combine numeric + categorical
    result = pd.concat([num_scaled, cat_df], axis=1)

    # Align with train reference
    if reference_columns is not None:
        ref_cols = [c for c in reference_columns if c != "SalePrice"]
        for col in ref_cols:
            if col not in result.columns:
                result[col] = 0
        result = result[ref_cols]

    # Add target
    if "SalePrice" in df.columns:
        result["SalePrice"] = df["SalePrice"]

    return result, emb_info

print("strategy4_minmax_embedding() ready")

# Apply Strategy 4
proc4_train, emb_info = strategy4_minmax_embedding(df_train)
proc4_test, _ = strategy4_minmax_embedding(df_test, reference_columns=proc4_train.columns)

print("\nStrategy 4 Results:")
print(f" Train shape: {proc4_train.shape}")
print(f" Test shape:  {proc4_test.shape}")
print(f" Features: {proc4_train.shape[1] - 1}")
assert list(proc4_train.columns) == list(proc4_test.columns)
print(" Columns match ")

# Sample em


strategy3_pca_ordinal() ready
[Strategy 3] PCA variance explained: 58.86%
[Strategy 3] PCA variance explained: 60.05%

Strategy 3 Results:
 Train shape: (1285, 37)
 Test shape:  (175, 37)
 Features: 36
 Columns match 
strategy4_minmax_embedding() ready

Strategy 4 Results:
 Train shape: (1285, 235)
 Test shape:  (175, 235)
 Features: 234
 Columns match 


In [4]:
def compute_correlations(df_in, target="SalePrice"):
    """T√≠nh Pearson v√† Spearman correlation"""
    numeric = df_in.select_dtypes(include=[np.number]).columns.tolist()
    
    if target not in numeric:
        raise ValueError(f"{target} not in numeric columns")
    
    numeric = [c for c in numeric if c != target]
    
    pearson_dict = {}
    spearman_dict = {}
    
    for c in numeric:
        try:
            pearson_dict[c] = df_in[c].corr(df_in[target], method="pearson")
            spearman_dict[c] = df_in[c].corr(df_in[target], method="spearman")
        except:
            pearson_dict[c] = np.nan
            spearman_dict[c] = np.nan
    
    corr_df = pd.DataFrame({
        "pearson": pd.Series(pearson_dict),
        "spearman": pd.Series(spearman_dict)
    })
    
    corr_df = corr_df.sort_values(by="pearson", key=abs, ascending=False)
    return corr_df

print(" compute_correlations() defined")


df_train_imputed = basic_impute(df_train)
correlations = compute_correlations(df_train_imputed)

print("  Top 15 Features by Pearson Correlation:")
print(correlations.head(15))


top_features = correlations.head(10).index.tolist()
print("\n Top 10 Most Correlated Features:")
for i, feat in enumerate(top_features, 1):
    print(f"{i:2d}. {feat:20s} = {correlations.loc[feat, 'pearson']:+.3f}")


def group_correlation_summary(corr_df, group_map):
    """T√≠nh mean absolute correlation cho t·ª´ng nh√≥m"""
    summary = []
    
    for group_name, cols in group_map.items():
        present = [c for c in cols if c in corr_df.index]
        
        if present:
            mean_p = corr_df.loc[present, "pearson"].abs().mean()
            mean_s = corr_df.loc[present, "spearman"].abs().mean()
            max_p = corr_df.loc[present, "pearson"].abs().max()
        else:
            mean_p = mean_s = max_p = np.nan
        
        summary.append({
            "group": group_name,
            "n_features": len(present),
            "mean_abs_pearson": mean_p,
            "mean_abs_spearman": mean_s,
            "max_abs_pearson": max_p
        })
    
    return pd.DataFrame(summary).sort_values(by="mean_abs_pearson", ascending=False)

print(" group_correlation_summary() defined")


group_summary = group_correlation_summary(correlations, groups_info)

print("\n  Correlation Summary by Feature Group:")
print(group_summary)

print("\n  Detailed Group Analysis:")
for group_name, cols in groups_info.items():
    present = [c for c in cols if c in correlations.index]
    if present:
        group_corr = correlations.loc[present].sort_values(
            by="pearson", key=abs, ascending=False
        )
        print(f"\n{group_name.upper()} (top 3):")
        print(group_corr.head(3))



strong_corr = correlations[correlations['pearson'].abs() > 0.5]

print(f"\n  Strong Correlations (|r| > 0.5): {len(strong_corr)} features")
print("\nTop features:")
print(strong_corr.head(10))


 compute_correlations() defined
  Top 15 Features by Pearson Correlation:
               pearson  spearman
OverallQual   0.791936  0.806647
GrLivArea     0.707788  0.732569
GarageCars    0.641778  0.692497
GarageArea    0.625583  0.648883
TotalBsmtSF   0.604576  0.599329
1stFlrSF      0.600848  0.574046
FullBath      0.565798  0.640359
TotRmsAbvGrd  0.528824  0.532821
YearBuilt     0.516608  0.646911
YearRemodAdd  0.510059  0.578902
Fireplaces    0.470974  0.528224
MasVnrArea    0.467104  0.416202
GarageYrBlt   0.463615  0.563399
BsmtFinSF1    0.367683  0.293354
LotFrontage   0.344275  0.381988

 Top 10 Most Correlated Features:
 1. OverallQual          = +0.792
 2. GrLivArea            = +0.708
 3. GarageCars           = +0.642
 4. GarageArea           = +0.626
 5. TotalBsmtSF          = +0.605
 6. 1stFlrSF             = +0.601
 7. FullBath             = +0.566
 8. TotRmsAbvGrd         = +0.529
 9. YearBuilt            = +0.517
10. YearRemodAdd         = +0.510
 group_correlation_summ

In [5]:
df_train_clean = basic_impute(df_train)
corr_matrix = df_train_clean.corr(numeric_only=True)

print("="*100)
print("PH√ÇN T√çCH 4 NH√ìM FEATURES ‚Äî FULL AUTOMATION, NO HARD CODE")
print("="*100)


print("\n" + "="*80)
print("1. T·ªîNG QUAN 4 NH√ìM FEATURES")
print("="*80)

for name, cols in groups_info.items():
    valid = [c for c in cols if c in df_train_clean.columns]
    print(f"\n{name}")
    print(f"   S·ªë features h·ª£p l·ªá: {len(valid)}")
    if len(valid) > 0:
        print(f"   V√≠ d·ª• 5 features: {valid[:5]}")


# =============================================================================
# 2. Ph√¢n t√≠ch correlation t·ª´ng nh√≥m 
# =============================================================================
print("\n" + "="*80)
print("2. PH√ÇN T√çCH CORRELATION T·ª™NG NH√ìM")
print("="*80)

group_corr_results = {}

for name, cols in groups_info.items():
    numeric_cols = [
        c for c in cols 
        if c in corr_matrix.columns and c != 'SalePrice'
    ]

    if not numeric_cols:
        print(f"\n  {name}: Kh√¥ng c√≥ numeric feature.")
        continue

    corrs = corr_matrix.loc[numeric_cols, 'SalePrice']
    corrs = corrs.dropna()

    if corrs.empty:
        print(f"\n  {name}: Kh√¥ng c√≥ gi√° tr·ªã correlation.")
        continue

    corrs_sorted = corrs.abs().sort_values(ascending=False)

    group_corr_results[name] = {
        'mean_abs_corr': corrs_sorted.mean(),
        'top_features': corrs_sorted.head(5)
    }

    print(f"\n{name}")
    print(f"   Mean |corr|: {corrs_sorted.mean():.3f}")
    print(f"   Top 5 strongest correlated:")

    for feat, val in corrs_sorted.head(5).items():
        print(f"      {feat:25s} = {val:+.3f}")


# =============================================================================
# 3. So s√°nh s·ª©c m·∫°nh gi·ªØa c√°c nh√≥m 
# =============================================================================
print("\n" + "="*80)
print("3. SO S√ÅNH S·ª®C M·∫†NH GI·ªÆA C√ÅC NH√ìM")
print("="*80)

group_strength_df = pd.DataFrame([
    {
        'Group': name,
        'Mean_Abs_Corr': info['mean_abs_corr'],
        'Top_Feature': info['top_features'].index[0],
        'Top_Corr': info['top_features'].iloc[0]
    }
    for name, info in group_corr_results.items()
]).sort_values('Mean_Abs_Corr', ascending=False)

print(group_strength_df.to_string(index=False))


# Visualization
print("\n  Visual Ranking:")
for _, row in group_strength_df.iterrows():
    bar = "‚ñà" * int(row['Mean_Abs_Corr'] * 50)
    print(f"{row['Group']:35s} {bar} {row['Mean_Abs_Corr']:.3f}")


# =============================================================================
# 4. Ph√¢n t√≠ch chi ti·∫øt t·ª´ng nh√≥m 
# =============================================================================

# ---------- Group 1 ----------
print("\n" + "="*80)
print("4.1 CHI TI·∫æT NH√ìM 1 ‚Äî LOCATION & LOT")
print("="*80)

cat_loc = [c for c in location_lot_cols if c in df_train_clean.columns and df_train_clean[c].dtype == 'object']

print(f"\nCategorical features: {len(cat_loc)}")

for col in cat_loc[:3]:
    print(f"\nüìç {col}:")
    vals = df_train_clean[col].value_counts()
    print(f"   Unique: {df_train_clean[col].nunique()}")
    if df_train_clean[col].nunique() < 50:
        mean_price = df_train_clean.groupby(col)['SalePrice'].mean().sort_values(ascending=False).head(5)
        print("   Top 5 categories by mean price:")
        for v, p in mean_price.items():
            print(f"      {v:20s}: ${p:,.0f}")


# ---------- Group 2 ----------
print("\n" + "="*80)
print("4.2 CHI TI·∫æT NH√ìM 2 ‚Äî SIZE & INTERIOR")
print("="*80)

num_size = [c for c in size_interior_cols if c in df_train_clean.columns and np.issubdtype(df_train_clean[c].dtype, np.number)]

print(f"\nNumeric features: {len(num_size)}")

print("\nTop 8 size features by mean value:")
print(df_train_clean[num_size].describe().T[['mean','std','min','max']].sort_values('mean', ascending=False).head(8))


# ---------- Group 3 ----------
print("\n" + "="*80)
print("4.3 CHI TI·∫æT NH√ìM 3 ‚Äî QUALITY & CONDITION")
print("="*80)

if 'OverallQual' in df_train_clean.columns:
    qual_stat = df_train_clean.groupby('OverallQual')['SalePrice'].agg(['mean','count'])
    print("\n OverallQual vs Price:")
    for q, r in qual_stat.iterrows():
        bar = "‚ñà" * int(r['count'] / 10)
        print(f"   Qual {q}: ${r['mean']:,.0f} | {bar} ({int(r['count'])} homes)")


# ---------- Group 4 ----------
print("\n" + "="*80)
print("4.4 CHI TI·∫æT NH√ìM 4 ‚Äî AMENITIES, SALE & TIME")
print("="*80)

if 'YrSold' in df_train_clean.columns:
    print("\nPrice by Year Sold:")
    print(df_train_clean.groupby('YrSold')['SalePrice'].mean())

if 'SaleCondition' in df_train_clean.columns:
    cond = df_train_clean.groupby('SaleCondition')['SalePrice'].mean().sort_values(ascending=False)
    print("\nPrice by Sale Condition:")
    for c, v in cond.items():
        print(f"   {c:15s}: ${v:,.0f}")


# =============================================================================
# 5. AUTO-GENERATED INSIGHTS SUMMARY
# =============================================================================

print("\n" + "="*80)
print("5. INSIGHTS T·ª∞ ƒê·ªòNG SINH T·ª™ 4 NH√ìM (NO HARD CODE)")
print("="*80)

def auto_insight(group_name, mean_corr):
    if mean_corr > 0.45:
        return "Nh√≥m n√†y c√≥ t√°c ƒë·ªông r·∫•t m·∫°nh ƒë·∫øn SalePrice v√† c·∫ßn ∆∞u ti√™n trong modeling."
    if mean_corr > 0.30:
        return "Nh√≥m n√†y c√≥ ·∫£nh h∆∞·ªüng ƒë√°ng k·ªÉ v√† ƒë√≥ng vai tr√≤ quan tr·ªçng."
    if mean_corr > 0.15:
        return "·∫¢nh h∆∞·ªüng ·ªü m·ª©c trung b√¨nh, h·ªó tr·ª£ fine-tuning m√¥ h√¨nh."
    return "·∫¢nh h∆∞·ªüng th·∫•p, mang t√≠nh b·ªï sung."

for _, row in group_strength_df.iterrows():
    strength = row['Mean_Abs_Corr']
    stars = "" * max(1, min(5, int(strength * 10)))

    print(f"\n{row['Group']:35s} {stars}")
    print(f"   ‚Üí Mean |corr|: {strength:.3f}")
    print(f"   ‚Üí Strongest: {row['Top_Feature']} ({row['Top_Corr']:+.3f})")
    print(f"   ‚Üí Insight: {auto_insight(row['Group'], strength)}")

print("\n" + "="*80)
print("FULL FEATURE GROUP ANALYSIS COMPLETED")
print("="*80)


PH√ÇN T√çCH 4 NH√ìM FEATURES ‚Äî FULL AUTOMATION, NO HARD CODE

1. T·ªîNG QUAN 4 NH√ìM FEATURES

Location & Lot
   S·ªë features h·ª£p l·ªá: 14
   V√≠ d·ª• 5 features: ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street']

Size & Interior
   S·ªë features h·ª£p l·ªá: 14
   V√≠ d·ª• 5 features: ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF']

Quality & Condition
   S·ªë features h·ª£p l·ªá: 14
   V√≠ d·ª• 5 features: ['OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'ExterQual']

Amenities, Sale & Time
   S·ªë features h·ª£p l·ªá: 9
   V√≠ d·ª• 5 features: ['Fireplaces', 'GarageYrBlt', 'PoolArea', 'Fence', 'MiscVal']

2. PH√ÇN T√çCH CORRELATION T·ª™NG NH√ìM

Location & Lot
   Mean |corr|: 0.224
   Top 5 strongest correlated:
      LotFrontage               = +0.344
      LotArea                   = +0.257
      MSSubClass                = +0.072

Size & Interior
   Mean |corr|: 0.387
   Top 5 strongest correlated:
      GrLivArea                 =

In [None]:
def to_monthly_timeseries(df_in, reference_columns=None):
    """Chuy·ªÉn ƒë·ªïi sang monthly aggregation"""
    df = df_in.copy()
    
    if "YrSold" not in df.columns or "MoSold" not in df.columns:
        raise ValueError("Missing YrSold or MoSold columns")
    
    # Create datetime index
    df["date"] = pd.to_datetime(
        df["YrSold"].astype(int).astype(str) + "-" +
        df["MoSold"].astype(int).astype(str) + "-01"
    )
    df = df.set_index("date")
    
    # Aggregate target (mean price per month)
    monthly_target = df["SalePrice"].resample("MS").mean()
    
    # Aggregate exogenous features
    exog = pd.DataFrame(index=monthly_target.index)
    
    # Key numeric features
    if "OverallQual" in df.columns:
        exog["OverallQual_mean"] = df["OverallQual"].resample("MS").mean()
    
    if "GrLivArea" in df.columns:
        exog["GrLivArea_mean"] = df["GrLivArea"].resample("MS").mean()
    
    if "TotRmsAbvGrd" in df.columns:
        exog["TotRmsAbvGrd_mean"] = df["TotRmsAbvGrd"].resample("MS").mean()
    
    # Categorical features (proportions)
    if "SaleCondition" in df.columns:
        salecond = pd.get_dummies(df["SaleCondition"].astype(str))
        salecond_monthly = salecond.resample("MS").mean()
        salecond_monthly = salecond_monthly.add_prefix("SaleCond_")
        exog = pd.concat([exog, salecond_monthly], axis=1)
    
    # If reference columns provided, align to them
    if reference_columns is not None:
        for col in reference_columns:
            if col not in exog.columns:
                exog[col] = 0
        exog = exog[reference_columns]
    
    # Create full date range
    full_range = pd.date_range(
        start=monthly_target.index.min(),
        end=monthly_target.index.max(),
        freq="MS"
    )
    
    # Reindex
    monthly_target = monthly_target.reindex(full_range)
    exog = exog.reindex(full_range)
    
    # Interpolate missing
    monthly_target = monthly_target.interpolate(method="linear").ffill().bfill()
    exog = exog.interpolate(method="linear").ffill().bfill()
    
    # Fill any remaining NaN
    for col in exog.columns:
        if exog[col].isna().any():
            exog[col] = exog[col].fillna(exog[col].mean())
    
    # Format for Prophet
    ts_df = pd.DataFrame({
        "ds": monthly_target.index,
        "y": monthly_target.values
    })
    
    return ts_df, exog

print(" to_monthly_timeseries() defined")


# Create monthly time series
ts_train, exog_train = to_monthly_timeseries(df_train)

# Create test with same columns as train
ts_test, exog_test = to_monthly_timeseries(df_test, reference_columns=exog_train.columns)

print("\n  Monthly Time Series:")
print(f"   Train: {len(ts_train)} months")
print(f"   Test: {len(ts_test)} months")
print(f"   Exogenous features: {len(exog_train.columns)}")

# Verify columns match
assert list(exog_train.columns) == list(exog_test.columns), "Column mismatch!"
print(" Train and test have matching columns")

# Show train series
print("\n  Train Time Series (first 5 months):")
print(ts_train.head())

print("\nTrain Time Series (last 5 months):")
print(ts_train.tail())


# Show exogenous features
print("\n  Exogenous Features (first 5 months):")
print(exog_train.head())


# Show test series
print("\n  Test Time Series:")
print(ts_test.head())
print(f"\nTest period: {ts_test['ds'].min()} to {ts_test['ds'].max()}")


# Statistics
print("\n  Time Series Statistics:")
print(f"   Train mean price: ${ts_train['y'].mean():,.2f}")
print(f"   Train std: ${ts_train['y'].std():,.2f}")
print(f"   Test mean price: ${ts_test['y'].mean():,.2f}")
print(f"   Test std: ${ts_test['y'].std():,.2f}")


# Check for missing values
print("\n  Missing Values Check:")
print(f"   ts_train: {ts_train['y'].isna().sum()} NaN")
print(f"   ts_test: {ts_test['y'].isna().sum()} NaN")
print(f"   exog_train: {exog_train.isna().sum().sum()} NaN")
print(f"   exog_test: {exog_test.isna().sum().sum()} NaN")


def create_lstm_windows(series_df, exog_df=None, lags=12):
    """T·∫°o sliding windows cho LSTM"""
    s = series_df.copy().reset_index(drop=True)
    ex = exog_df.copy().reset_index(drop=True) if exog_df is not None else None
    
    values = s["y"].values
    
    X_list = []
    y_list = []
    
    for i in range(len(values) - lags):
        # Target window
        x_window = values[i:i+lags]
        y_val = values[i+lags]
        
        # Add exogenous features
        if ex is not None:
            ex_window = ex.iloc[i:i+lags].values
            x_combined = np.hstack([
                x_window.reshape(-1, 1),
                ex_window
            ])
        else:
            x_combined = x_window.reshape(-1, 1)
        
        X_list.append(x_combined)
        y_list.append(y_val)
    
    X = np.array(X_list)
    y = np.array(y_list)
    
    return X, y

print(" create_lstm_windows() defined")

# Scale target for LSTM
scaler_y = MinMaxScaler()
ts_train_scaled = ts_train.copy()
ts_train_scaled["y"] = scaler_y.fit_transform(ts_train[["y"]])

# Create LSTM windows
X_lstm_train, y_lstm_train = create_lstm_windows(
    ts_train_scaled,
    exog_df=exog_train,
    lags=12
)

print("\n  LSTM Windows:")
print(f"   X shape: {X_lstm_train.shape}")
print(f"   y shape: {y_lstm_train.shape}")
print(f"   Window size: 12 months")
print(f"   Features per timestep: {X_lstm_train.shape[2]}")

# Show sample window
print("\n  Sample LSTM Window (first window):")
print(f"   Input shape: {X_lstm_train[0].shape}")
print(f"   Output value: {y_lstm_train[0]:.4f}")

 to_monthly_timeseries() defined

  Monthly Time Series:
   Train: 48 months
   Test: 7 months
   Exogenous features: 9
 Train and test have matching columns

  Train Time Series (first 5 months):
          ds              y
0 2006-01-01  201090.000000
1 2006-02-01  194322.222222
2 2006-03-01  184982.200000
3 2006-04-01  174312.814815
4 2006-05-01  158928.289474

Train Time Series (last 5 months):
           ds              y
43 2009-08-01  165670.966667
44 2009-09-01  196849.350000
45 2009-10-01  175206.592593
46 2009-11-01  156381.818182
47 2009-12-01  164014.533333

  Exogenous Features (first 5 months):
            OverallQual_mean  GrLivArea_mean  TotRmsAbvGrd_mean  \
2006-01-01          6.000000     1517.500000           6.400000   
2006-02-01          6.777778     1758.555556           7.111111   
2006-03-01          6.360000     1558.440000           6.720000   
2006-04-01          6.259259     1541.777778           6.518519   
2006-05-01          5.605263     1365.605263      

In [7]:
def fit_prophet_model(ts_train, exog_train, ts_test, exog_test):
    """Fit Prophet model v·ªõi exogenous regressors"""
    
    if Prophet is None:
        print("  Prophet not available")
        return None
    
    print("  Checking data quality...")
    print(f"   ts_train NaN: {ts_train['y'].isna().sum()}")
    print(f"   exog_train NaN: {exog_train.isna().sum().sum()}")
    
    # Initialize Prophet
    m = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.05
    )
    
    # Add regressors
    for col in exog_train.columns:
        m.add_regressor(col)
    
    print(f"\n Added {len(exog_train.columns)} regressors")
    
    # Prepare training data
    df_train = ts_train.copy().reset_index(drop=True)
    exog_train_reset = exog_train.reset_index(drop=True)
    df_fit = pd.concat([df_train, exog_train_reset], axis=1)
    
    # Fill any NaN
    if df_fit.isna().any().any():
        print(" Filling NaN in training data...")
        df_fit = df_fit.fillna(df_fit.mean())
    
    # Fit model
    print("\n Fitting Prophet model...")
    m.fit(df_fit)
    print(" Model fitted")
    
    return m

# %%
# Train Prophet
if Prophet is not None:
    prophet_model = fit_prophet_model(ts_train, exog_train, ts_test, exog_test)
else:
    prophet_model = None
    print("  Skipping Prophet (not installed)")


def evaluate_prophet(model, ts_test, exog_test):
    """Evaluate Prophet tr√™n test set"""
    
    # Prepare test data
    df_test = ts_test.copy().reset_index(drop=True)
    exog_test_reset = exog_test.reset_index(drop=True)
    df_test_full = pd.concat([df_test[['ds']], exog_test_reset], axis=1)
    
    # Fill NaN
    if df_test_full.isna().any().any():
        print("  Filling NaN in test data...")
        df_test_full = df_test_full.fillna(exog_test_reset.mean())
    
    # Predict
    print("\nMaking predictions...")
    forecast = model.predict(df_test_full)
    
    # Calculate metrics
    y_true = ts_test["y"].values
    y_pred = forecast["yhat"].values[:len(y_true)]
    
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
    return {
        "forecast": forecast,
        "y_true": y_true,
        "y_pred": y_pred,
        "rmse": rmse,
        "mae": mae,
        "mape": mape
    }


# Evaluate Prophet
if prophet_model is not None:
    prophet_results = evaluate_prophet(prophet_model, ts_test, exog_test)
    
    print("\n  Prophet Results:")
    print(f"   RMSE: ${prophet_results['rmse']:,.2f}")
    print(f"   MAE: ${prophet_results['mae']:,.2f}")
    print(f"   MAPE: {prophet_results['mape']:.2f}%")
else:
    prophet_results = None


# Show predictions vs actual
if prophet_results is not None:
    comparison = pd.DataFrame({
        'Date': ts_test['ds'].values,
        'Actual': prophet_results['y_true'],
        'Predicted': prophet_results['y_pred'],
        'Error': prophet_results['y_true'] - prophet_results['y_pred'],
        'Error_%': ((prophet_results['y_true'] - prophet_results['y_pred']) / 
                    prophet_results['y_true'] * 100)
    })
    
    print("\n  Predictions vs Actual (first 5):")
    print(comparison.head())
    
    print("\n  Error Statistics:")
    print(f"   Mean Error: ${comparison['Error'].mean():,.2f}")
    print(f"   Std Error: ${comparison['Error'].std():,.2f}")
    print(f"   Mean Absolute Error %: {comparison['Error_%'].abs().mean():.2f}%")


if prophet_model is not None:
    # Get components
    future = prophet_model.make_future_dataframe(periods=0, freq='MS')
    
    # Add regressors to future
    exog_full = pd.concat([exog_train, exog_test]).reset_index(drop=True)
    for col in exog_train.columns:
        future[col] = exog_full[col].values[:len(future)]
    
    forecast_full = prophet_model.predict(future)
    
    print("\n  Forecast Components:")
    print(forecast_full[['ds', 'trend', 'yearly']].tail())


if prophet_results is not None:
    print("\n" + "="*50)
    print("PROPHET MODEL SUMMARY")
    print("="*50)
    print(f"Training period: {ts_train['ds'].min()} to {ts_train['ds'].max()}")
    print(f"Test period: {ts_test['ds'].min()} to {ts_test['ds'].max()}")
    print(f"\nPerformance Metrics:")
    print(f"  RMSE: ${prophet_results['rmse']:,.2f}")
    print(f"  MAE: ${prophet_results['mae']:,.2f}")
    print(f"  MAPE: {prophet_results['mape']:.2f}%")
    print("="*50)

  Checking data quality...
   ts_train NaN: 0
   exog_train NaN: 0


13:59:17 - cmdstanpy - INFO - Chain [1] start processing



 Added 9 regressors

 Fitting Prophet model...


13:59:18 - cmdstanpy - INFO - Chain [1] done processing


 Model fitted

Making predictions...

  Prophet Results:
   RMSE: $18,670.57
   MAE: $14,850.68
   MAPE: 9.59%

  Predictions vs Actual (first 5):
        Date         Actual      Predicted         Error    Error_%
0 2010-01-01  163852.600000  186575.934880 -22723.334880 -13.868156
1 2010-02-01  174823.333333  163357.686867  11465.646466   6.558419
2 2010-03-01  203181.285714  181247.127209  21934.158505  10.795364
3 2010-04-01  171344.025641  171544.564265   -200.538624  -0.117039
4 2010-05-01  178422.250000  171300.351953   7121.898047   3.991597

  Error Statistics:
   Mean Error: $-1,723.63
   Std Error: $20,080.40
   Mean Absolute Error %: 9.59%

  Forecast Components:
           ds          trend        yearly
43 2009-08-01  176631.588353   8508.731065
44 2009-09-01  176537.424116   4951.720849
45 2009-10-01  176446.297434  -2284.256042
46 2009-11-01  176352.133197  12417.765627
47 2009-12-01  176261.006515  -3647.312127

PROPHET MODEL SUMMARY
Training period: 2006-01-01 00:00:00

In [None]:
def fit_sarimax_model(ts_train, exog_train):
    """Fit SARIMAX model v√† t√≠nh AIC/BIC"""
    
    if sm is None:
        print("  statsmodels not available")
        return None
    
    try:
        # Prepare data
        s = ts_train.set_index("ds")["y"]
        
        print("  Fitting SARIMAX(1,1,1)x(1,1,1,12)...")
        
        # SARIMAX model
        mod = sm.tsa.statespace.SARIMAX(
            s,
            exog=exog_train,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 12),
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        
        # Fit
        res = mod.fit(disp=False, maxiter=200)
        
        print(" SARIMAX fitted successfully")
        
        return {
            "model": res,
            "aic": res.aic,
            "bic": res.bic,
            "params": res.params
        }
        
    except Exception as e:
        print(f"  SARIMAX error: {e}")
        return None

sarimax_results = fit_sarimax_model(ts_train, exog_train)

if sarimax_results is not None:
    print("\n  SARIMAX Model Information:")
    print(f"   AIC: {sarimax_results['aic']:.2f}")
    print(f"   BIC: {sarimax_results['bic']:.2f}")
    
    print("\n  Model Parameters (first 10):")
    print(sarimax_results['params'].head(10))


if sarimax_results is not None:
    model = sarimax_results['model']
    
    # Summary statistics
    print("\n  Model Summary:")
    print(f"   Log Likelihood: {model.llf:.2f}")
    print(f"   AIC: {model.aic:.2f}")
    print(f"   BIC: {model.bic:.2f}")
    print(f"   HQIC: {model.hqic:.2f}")


if sarimax_results is not None:
    residuals = sarimax_results['model'].resid
    
    print("\n  Residual Diagnostics:")
    print(f"   Mean: {residuals.mean():.4f}")
    print(f"   Std: {residuals.std():.2f}")
    print(f"   Min: {residuals.min():.2f}")
    print(f"   Max: {residuals.max():.2f}")


def predict_sarimax(model_result, exog_test, steps):
    """Make predictions v·ªõi SARIMAX"""
    try:
        # Forecast
        forecast = model_result['model'].forecast(
            steps=steps,
            exog=exog_test.iloc[:steps]
        )
        
        return forecast
    except Exception as e:
        print(f"  Prediction error: {e}")
        return None

if sarimax_results is not None:
    # Predict
    n_test = len(ts_test)
    sarimax_pred = predict_sarimax(sarimax_results, exog_test, n_test)
    
    if sarimax_pred is not None:
        # Calculate metrics
        y_true = ts_test['y'].values
        y_pred = sarimax_pred.values
        
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        
        print("\n  SARIMAX Test Performance:")
        print(f"   RMSE: ${rmse:,.2f}")
        print(f"   MAE: ${mae:,.2f}")
        print(f"   MAPE: {mape:.2f}%")
        
        # Store results
        sarimax_results['test_metrics'] = {
            'rmse': rmse,
            'mae': mae,
            'mape': mape,
            'y_true': y_true,
            'y_pred': y_pred
        }


if sarimax_results is not None and 'test_metrics' in sarimax_results:
    metrics = sarimax_results['test_metrics']
    
    comparison = pd.DataFrame({
        'Date': ts_test['ds'].values,
        'Actual': metrics['y_true'],
        'Predicted': metrics['y_pred'],
        'Error': metrics['y_true'] - metrics['y_pred']
    })
    
    print("\n  SARIMAX Predictions (first 5):")
    print(comparison.head())


print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)

if prophet_results is not None:
    print("\nProphet:")
    print(f"  RMSE: ${prophet_results['rmse']:,.2f}")
    print(f"  MAE: ${prophet_results['mae']:,.2f}")
    print(f"  MAPE: {prophet_results['mape']:.2f}%")

if sarimax_results is not None and 'test_metrics' in sarimax_results:
    metrics = sarimax_results['test_metrics']
    print("\nSARIMAX:")
    print(f"  RMSE: ${metrics['rmse']:,.2f}")
    print(f"  MAE: ${metrics['mae']:,.2f}")
    print(f"  MAPE: {metrics['mape']:.2f}%")
    print(f"  AIC: {sarimax_results['aic']:.2f}")
    print(f"  BIC: {sarimax_results['bic']:.2f}")

print("="*60)

if prophet_results is not None and sarimax_results is not None:
    if 'test_metrics' in sarimax_results:
        prophet_rmse = prophet_results['rmse']
        sarimax_rmse = sarimax_results['test_metrics']['rmse']
        
        best_model = "Prophet" if prophet_rmse < sarimax_rmse else "SARIMAX"
        improvement = abs(prophet_rmse - sarimax_rmse) / max(prophet_rmse, sarimax_rmse) * 100
        
        print(f"\n Best Model: {best_model}")
        print(f"   Improvement: {improvement:.2f}%")

  Fitting SARIMAX(1,1,1)x(1,1,1,12)...
 SARIMAX fitted successfully

  SARIMAX Model Information:
   AIC: 490.37
   BIC: 505.00

  Model Parameters (first 10):
OverallQual_mean     29551.214864
GrLivArea_mean         -20.278695
TotRmsAbvGrd_mean    16090.252344
SaleCond_Abnorml    -17935.374897
SaleCond_AdjLand     34249.609225
SaleCond_Alloca     -39546.453081
SaleCond_Family       -264.697633
SaleCond_Normal     -22221.174124
SaleCond_Partial     45718.090116
ar.L1                   -0.318855
dtype: float64

  Model Summary:
   Log Likelihood: -231.19
   AIC: 490.37
   BIC: 505.00
   HQIC: 493.55

  Residual Diagnostics:
   Mean: 367.9125
   Std: 18439.99
   Min: -35190.38
   Max: 44783.39

  SARIMAX Test Performance:
   RMSE: $14,379.86
   MAE: $12,791.10
   MAPE: 8.09%

  SARIMAX Predictions (first 5):
        Date         Actual      Predicted         Error
0 2010-01-01  163852.600000  182001.545790 -18148.945790
1 2010-02-01  174823.333333  163108.455690  11714.877643
2 2010-03-0

In [None]:
train_values = ts_train['y'].values.reshape(-1, 1)

scaler_y = MinMaxScaler()
train_scaled = scaler_y.fit_transform(train_values)

print(" Scaled training data")

# =============================================================================
# 2. CREATE SEQUENCES
# =============================================================================
def create_sequences(data, seq_len=12):
    X, y = [], []
    for i in range(len(data) - seq_len):
        X.append(data[i:i+seq_len])
        y.append(data[i+seq_len])
    return np.array(X), np.array(y)

SEQ_LEN = 12
X_train, y_train = create_sequences(train_scaled, SEQ_LEN)

print(f" Created sequences: X={X_train.shape}, y={y_train.shape}")


# =============================================================================
# 3. BUILD LSTM MODEL
# =============================================================================

model_lstm = Sequential([
    LSTM(64, return_sequences=True, input_shape=(SEQ_LEN, 1)),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1)
])

model_lstm.compile(optimizer="adam", loss="mse")
print(" LSTM model built")


# =============================================================================
# 4. TRAIN WITH CALLBACKS (BEST MODEL AUTO SAVE)
# =============================================================================

os.makedirs("best_model", exist_ok=True)
best_model_path = "best_model/lstm_best_model.h5"

callbacks = [
    EarlyStopping(patience=20, restore_best_weights=True),
    ModelCheckpoint(best_model_path, save_best_only=True)
]

history = model_lstm.fit(
    X_train, y_train,
    epochs=200,
    batch_size=16,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1
)

print(f" Training completed")
print(f" Best model saved automatically to {best_model_path}")

# =============================================================================
# 5. SAVE SCALER
# =============================================================================

with open("best_model/lstm_scaler.pkl", "wb") as f:
    pickle.dump(scaler_y, f)

print(" Scaler saved: best_model/lstm_scaler.pkl")

# =============================================================================
# 6. SAVE METADATA
# =============================================================================

metadata = {
    "sequence_length": SEQ_LEN,
    "n_features": 1,
    "target": "SalePrice",
    "model_type": "LSTM"
}

import json
with open("best_model/metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)

print(" Metadata saved: best_model/metadata.json")

# =============================================================================
# DONE
# =============================================================================

print("\n========================================")
print("LSTM TRAINING + BEST MODEL EXPORT DONE")
print("========================================")


 Scaled training data
 Created sequences: X=(36, 12, 1), y=(36, 1)
 LSTM model built
Epoch 1/200
[1m1/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m2s[0m 2s/step - loss: 0.1596



[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 392ms/step - loss: 0.1550 - val_loss: 0.0683
Epoch 2/200
[1m1/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m0s[0m 31ms/step - loss: 0.0994



[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 83ms/step - loss: 0.1054 - val_loss: 0.0532
Epoch 3/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 63ms/step - loss: 0.0692 - val_loss: 0.0597
Epoch 4/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.0502 - val_loss: 0.0825
Epoch 5/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 54ms/step - loss: 0.0629 - val_loss: 0.1065
Epoch 6/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 62ms/step - loss: 0.0701 - val_loss: 0.0991
Epoch 7/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 63ms/step - loss: 0.0687 - val_loss: 0.0802
Epoch 8/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î



[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 101ms/step - loss: 0.0583 - val_loss: 0.0531
Epoch 12/200
[1m1/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m0s[0m 28ms/step - loss: 0.0460



[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 90ms/step - loss: 0.0566 - val_loss: 0.0529
Epoch 13/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.0626 - val_loss: 0.0534
Epoch 14/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 54ms/step - loss: 0.0614 - val_loss: 0.0548
Epoch 15/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 52ms/step - loss: 0.0503 - val_loss: 0.0578
Epoch 16/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 59ms/step - loss: 0.0533 - val_loss: 0.0620
Epoch 17/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 63ms/step - loss: 0.0586 - val_loss: 0.0665
Epoch 18/200
[1m2/2[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [14]:
# ================================================
# PART 9 ‚Äì So S√°nh C√°c Chi·∫øn L∆∞·ª£c Ti·ªÅn X·ª≠ L√Ω
# ================================================

# 1. Summary Table
strategies_summary = []

strategies_summary.append({
    'Strategy': '1. OneHot + Standard',
    'Train_Shape': proc1_train.shape,
    'Test_Shape': proc1_test.shape,
    'N_Features': proc1_train.shape[1] - 1,
    'Description': 'One-Hot Encoding + StandardScaler'
})

if proc2_train is not None:
    strategies_summary.append({
        'Strategy': '2. Target + Robust',
        'Train_Shape': proc2_train.shape,
        'Test_Shape': proc2_test.shape,
        'N_Features': proc2_train.shape[1] - 1,
        'Description': 'Target Encoding + RobustScaler'
    })

strategies_summary.append({
    'Strategy': '3. PCA + Ordinal',
    'Train_Shape': proc3_train.shape,
    'Test_Shape': proc3_test.shape,
    'N_Features': proc3_train.shape[1] - 1,
    'Description': 'PCA + Ordinal Encoding'
})

strategies_summary.append({
    'Strategy': '4. MinMax + Embedding',
    'Train_Shape': proc4_train.shape,
    'Test_Shape': proc4_test.shape,
    'N_Features': proc4_train.shape[1] - 1,
    'Description': 'MinMaxScaler + Embedding Preparation'
})

print("=== STRATEGIES OVERVIEW ===")
print(pd.DataFrame(strategies_summary).to_string(index=False))


# 2. ƒê√°nh gi√° ch·∫•t l∆∞·ª£ng feature
def evaluate_strategy_quality(proc_train, strategy_name):
    if proc_train is None:
        return None
    
    features = [c for c in proc_train.columns if c != 'SalePrice']
    X = proc_train[features]

    n_missing = X.isna().sum().sum()
    n_constant = (X.nunique() == 1).sum()

    corr_matrix = X.corr().abs()
    upper_tri = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
    n_high_corr = (upper_tri > 0.95).sum().sum()

    memory_mb = X.memory_usage(deep=True).sum() / 1024 / 1024
    
    return {
        'strategy': strategy_name,
        'n_features': len(features),
        'n_missing': n_missing,
        'n_constant': n_constant,
        'n_high_corr_pairs': n_high_corr,
        'memory_mb': memory_mb
    }

quality_results = []
quality_results.append(evaluate_strategy_quality(proc1_train, "Strategy 1"))
if proc2_train is not None:
    quality_results.append(evaluate_strategy_quality(proc2_train, "Strategy 2"))
quality_results.append(evaluate_strategy_quality(proc3_train, "Strategy 3"))
quality_results.append(evaluate_strategy_quality(proc4_train, "Strategy 4"))

quality_df = pd.DataFrame(quality_results)
print("\n=== FEATURE QUALITY ===")
print(quality_df.to_string(index=False))


# 3. Quick Ridge Regression test
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

def quick_model_test(proc_train, proc_test, strategy_name):
    if proc_train is None or proc_test is None:
        return None
    
    y_train = proc_train['SalePrice']
    y_test = proc_test['SalePrice']

    train_cols = [c for c in proc_train.columns if c != 'SalePrice']
    test_cols = [c for c in proc_test.columns if c != 'SalePrice']
    common_cols = list(set(train_cols) & set(test_cols))

    X_train = proc_train[common_cols].fillna(0)
    X_test = proc_test[common_cols].fillna(0)

    model = Ridge(alpha=1.0)
    cv_scores = cross_val_score(model, X_train, y_train,
                                cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_mae = mean_absolute_error(y_test, y_pred)

    return {
        'strategy': strategy_name,
        'cv_rmse': cv_rmse,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'features_used': len(common_cols)
    }

model_results = []
model_results.append(quick_model_test(proc1_train, proc1_test, "Strategy 1"))

if proc2_train is not None:
    model_results.append(quick_model_test(proc2_train, proc2_test, "Strategy 2"))

model_results.append(quick_model_test(proc3_train, proc3_test, "Strategy 3"))
model_results.append(quick_model_test(proc4_train, proc4_test, "Strategy 4"))

model_df = pd.DataFrame(model_results)
print("\n=== MODEL PERFORMANCE (Ridge) ===")
print(model_df.to_string(index=False))


# 4. K·∫øt lu·∫≠n ‚Äì chi·∫øn l∆∞·ª£c t·ªët nh·∫•t
best_model = model_df.iloc[model_df['test_rmse'].idxmin()]
best_quality = quality_df.iloc[quality_df['n_features'].idxmin()]

print("\n=====================================")
print(" K·∫æT LU·∫¨N CU·ªêI C√ôNG")
print("=====================================")

print(f"\n Best RMSE: {best_model['strategy']}")
print(f"   ‚Üí Test RMSE: ${best_model['test_rmse']:,.2f}")
print(f"   ‚Üí MAE: ${best_model['test_mae']:,.2f}")

print(f"\n Most Compact: {best_quality['strategy']}")
print(f"   ‚Üí Features: {best_quality['n_features']}")
print(f"   ‚Üí Memory: {best_quality['memory_mb']:.2f} MB")

print(f"\n Best for Deep Learning: Strategy 4 (MinMax + Embedding)")
print("=====================================")


=== STRATEGIES OVERVIEW ===
             Strategy Train_Shape Test_Shape  N_Features                          Description
 1. OneHot + Standard (1285, 287) (175, 287)         286    One-Hot Encoding + StandardScaler
   2. Target + Robust  (1285, 81)  (175, 81)          80       Target Encoding + RobustScaler
     3. PCA + Ordinal  (1285, 37)  (175, 37)          36               PCA + Ordinal Encoding
4. MinMax + Embedding (1285, 235) (175, 235)         234 MinMaxScaler + Embedding Preparation

=== FEATURE QUALITY ===
  strategy  n_features  n_missing  n_constant  n_high_corr_pairs  memory_mb
Strategy 1         286          0           0                 10   0.677686
Strategy 2          80          0           0                  2   0.794106
Strategy 3          36          0           0                  0   0.313721
Strategy 4         234          0           0                  6   0.639696

=== MODEL PERFORMANCE (Ridge) ===
  strategy      cv_rmse    test_rmse     test_mae  features_us