# Linear Regression Model

+ Mục tiêu hiểu rỏ luồng khi train mô hình sẽ như thế nào.

# I.Load data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# target to show plot in jupyter notebook and not external window
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

## 1. Data normal

In [2]:
df_data = pd.read_csv("./Data/data_predict_price_house.csv")
# Display the first few rows of the dataframe
df_data.head()

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
0,Gò Vấp,147.0,479.0,17.0,18.0,16.8
1,Tân Phú,180.0,179.0,13.0,13.0,23.0
2,10,53.0,53.0,2.0,2.0,4.3
3,9,233.0,233.0,28.0,28.0,26.0
4,10,102.0,102.0,2.0,2.0,3.7


In [3]:
# Print the shape of the dataframe
print(df_data.shape)

(4293, 6)


In [4]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4293 entries, 0 to 4292
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   quan                  4293 non-null   object 
 1   dien_tich_dat_m2      4293 non-null   float64
 2   dien_tich_su_dung_m2  4293 non-null   float64
 3   phong_ngu             4293 non-null   float64
 4   nha_tam               4293 non-null   float64
 5   gia                   4293 non-null   float64
dtypes: float64(5), object(1)
memory usage: 201.4+ KB


In [5]:
# Get the names of non-object (numerical) columns
df_data.select_dtypes(exclude=['object']).shape

(4293, 5)

In [6]:
df_data.select_dtypes(exclude=['object']).columns

Index(['dien_tich_dat_m2', 'dien_tich_su_dung_m2', 'phong_ngu', 'nha_tam',
       'gia'],
      dtype='object')

In [7]:
# Get the names of object (non-numerical) columns
df_data.select_dtypes(include=['object']).shape

(4293, 1)

In [8]:
df_data.select_dtypes(include=['object']).columns

Index(['quan'], dtype='object')

In [9]:
print(df_data['quan'].unique())
df_data['quan'].value_counts()

['Gò Vấp' 'Tân Phú' '10' '9' 'Bình Tân' '5' '7' 'Phú Nhuận' '3'
 'Bình Thạnh' '1' '2' '4' '6' '8' '11' '12' 'Tân Bình'
 'Thủ Đức (TP. Thủ Đức)']


quan
9                        358
6                        334
4                        300
1                        294
8                        278
12                       266
11                       261
Tân Phú                  251
5                        230
2                        221
7                        210
10                       190
Tân Bình                 186
Bình Tân                 184
Phú Nhuận                180
Thủ Đức (TP. Thủ Đức)    157
Bình Thạnh               137
Gò Vấp                   128
3                        128
Name: count, dtype: int64

In [10]:
# Get summary statistics of numerical columns
df_data.describe()

Unnamed: 0,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
count,4293.0,4293.0,4293.0,4293.0,4293.0
mean,126.256615,551.3882,4.610063,4.554391,24.983213
std,295.29825,18319.14,4.999423,5.284465,57.364136
min,1.0,1.0,1.0,1.0,0.0
25%,58.0,82.0,3.0,2.0,5.1
50%,78.0,139.0,4.0,3.0,9.7
75%,120.0,290.0,5.0,5.0,22.0
max,10000.0,1200000.0,100.0,100.0,1279.999


## 2. Data with clean

In [11]:
df_clean = df_data[
    (df_data['dien_tich_dat_m2'] <= 1000) &
    (df_data['dien_tich_su_dung_m2'] <= 2000) &
    (df_data['phong_ngu'] <= 10) &
    (df_data['nha_tam'] <= 10)
]
print(df_clean.shape)

(4011, 6)


In [12]:
df_clean.head()

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
2,10,53.0,53.0,2.0,2.0,4.3
4,10,102.0,102.0,2.0,2.0,3.7
5,10,79.0,79.0,2.0,2.0,4.9
6,10,72.0,72.0,2.0,2.0,5.5
7,10,103.0,103.0,3.0,2.0,8.2


In [13]:
df_clean.describe()

Unnamed: 0,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
count,4011.0,4011.0,4011.0,4011.0,4011.0
mean,101.288369,212.188125,3.744453,3.621541,21.184398
std,92.054532,233.141943,1.768036,1.928491,45.9738
min,1.0,1.0,1.0,1.0,0.0
25%,57.0,80.0,2.5,2.0,4.9
50%,75.0,130.0,3.0,3.0,8.99
75%,110.0,250.0,5.0,5.0,18.5
max,1000.0,2000.0,10.0,10.0,1279.999


# II Pre-processing data

+ Ở đây có 2 trường hợp:
    + Muốn sử dụng full data để train thì: 
    
        data = df_data.copy()
    
    + Muốn sử dụng data sau khi lọc thì:
    
        data = df_clean.copy()
    

In [14]:
data = df_clean.copy()

## 1. Slipt data into train and test

### 1.1 Slipt train and test

In [15]:
# Create training and validation sets
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    data, 
    test_size=0.2, 
    random_state=42)
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (3208, 6)
Test data shape: (803, 6)


In [16]:
train_data.to_csv("./Data/train_data.csv", index=False)
test_data.to_csv("./Data/test_data.csv", index=False)

### 1.2 Extract label is gia and drop in feature

In [17]:
y_train = train_data['gia'].values
y_test = test_data['gia'].values
train_data = train_data.drop(columns=['gia'], axis=1)
test_data = test_data.drop(columns=['gia'], axis=1)

In [18]:
train_data.shape, test_data.shape

((3208, 5), (803, 5))

### 1.3 Check train data having value is non-numeric or other type

In [19]:
num_cols = [col for col in train_data.columns if train_data[col].dtype in ["float64","int64"]]
cat_cols = [col for col in train_data.columns if train_data[col].dtype not in ["float64","int64"]]

In [20]:
len(num_cols), len(cat_cols)

(4, 1)

In [21]:
# fill none for categorical columns
train_data[cat_cols] = train_data[cat_cols].fillna("none")
test_data[cat_cols] = test_data[cat_cols].fillna("none")

In [22]:
print("X_train shape:", train_data.shape)
print("X_test shape:", test_data.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3208, 5)
X_test shape: (803, 5)
y_train shape: (3208,)
y_test shape: (803,)


### 1.4 One-hot encoding feature "quan"

+ Nếu gặp một cái mới ở test thì set one-hot là 0

#### 1.4.1 One-hot encoding with catery in "quan"

In [23]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding feature(s) in `cat_cols` with robust assignment
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit encoder on training categorical columns
encoder.fit(train_data[cat_cols])

# Transform and build DataFrames with proper columns and index to avoid shape errors
encoded_train = encoder.transform(train_data[cat_cols])
encoded_test = encoder.transform(test_data[cat_cols])

encoded_cols = list(encoder.get_feature_names_out())
encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_cols, index=train_data.index)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_cols, index=test_data.index)

# Concatenate encoded columns to original dataframes (reset index to be safe)
train_data = pd.concat([train_data.reset_index(drop=True), encoded_train_df.reset_index(drop=True)], axis=1)
test_data = pd.concat([test_data.reset_index(drop=True), encoded_test_df.reset_index(drop=True)], axis=1)

#### 1.4.2 Print information

In [24]:
train_data

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,quan_1,quan_10,quan_11,quan_12,quan_2,...,quan_7,quan_8,quan_9,quan_Bình Thạnh,quan_Bình Tân,quan_Gò Vấp,quan_Phú Nhuận,quan_Thủ Đức (TP. Thủ Đức),quan_Tân Bình,quan_Tân Phú
0,4,55.0,165.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,53.0,53.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,30.0,50.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,120.0,600.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,61.0,122.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3203,6,80.0,80.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3204,7,60.0,120.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3205,5,68.0,136.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3206,Thủ Đức (TP. Thủ Đức),20.0,31.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
test_data

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,quan_1,quan_10,quan_11,quan_12,quan_2,...,quan_7,quan_8,quan_9,quan_Bình Thạnh,quan_Bình Tân,quan_Gò Vấp,quan_Phú Nhuận,quan_Thủ Đức (TP. Thủ Đức),quan_Tân Bình,quan_Tân Phú
0,2,72.0,72.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bình Thạnh,249.0,1000.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Phú Nhuận,64.0,128.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Bình Tân,96.0,96.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Tân Phú,32.0,60.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798,7,140.0,140.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
799,2,98.0,98.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
800,4,105.0,105.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
801,2,433.0,433.0,4.0,4.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# III Polynomial Feature Engineer model

In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import os

# 1. Định nghĩa Class biến đổi dữ liệu (FeatureEngineer)
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if isinstance(X, np.ndarray):
            feature_names = ['dien_tich_dat_m2', 'dien_tich_su_dung_m2', 'phong_ngu', 'nha_tam', 'quan']
            X = pd.DataFrame(X, columns=feature_names)
            
        X_new = X.copy()
        X_new['dien_tich_trung_binh_phong'] = X_new['dien_tich_su_dung_m2'] / (X_new['phong_ngu'] + 1e-5)
        X_new['tong_so_phong'] = X_new['phong_ngu'] + X_new['nha_tam']
        return X_new

# 2. Hàm huấn luyện và đánh giá
def train_polynomial_model(X_train, y_train, X_test, y_test):
    """
    Hàm xây dựng, huấn luyện mô hình Polynomial Regression với Feature Engineering 
    và trả về kết quả đánh giá chi tiết dạng DataFrame.
    """
    
    # --- 1. Định nghĩa các cột ---
    num_cols_base = ["dien_tich_dat_m2", "dien_tich_su_dung_m2", "phong_ngu", "nha_tam"]
    cat_cols = ["quan"]
    num_cols_eng = num_cols_base + ["dien_tich_trung_binh_phong", "tong_so_phong"]

    # --- 2. Xây dựng Transformers ---
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    # --- 3. Preprocessor ---
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols_eng),
            ("cat", categorical_transformer, cat_cols),
        ]
    )

    # --- 4. Pipeline ---
    model_pipeline = Pipeline(steps=[
        ("feature_eng", FeatureEngineer()), 
        ("preprocessor", preprocessor),     
        ("regressor", LinearRegression())   
    ])

    # --- 5. Huấn luyện ---
    if isinstance(y_train, np.ndarray):
        y_train = pd.Series(y_train, name='gia')
    
    print(X_train.shape, y_train.shape)
    model_pipeline.fit(X_train, y_train)
    print("Model đã được huấn luyện.")

    # --- 6. Đánh giá (Tính toán Metrics) ---
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # --- 7. Tạo DataFrame kết quả ---
    results_df = pd.DataFrame({
        "Model": ["Polynomial FeatureEng Model 4"],
        "Train RMSE": [train_rmse],
        "Test RMSE": [test_rmse],
        "Train R2": [train_r2],
        "Test R2": [test_r2]
    })
    
    print("Huấn luyện và đánh giá hoàn tất.")
    return model_pipeline, results_df

# --- Cách sử dụng ---
X_train = train_data.copy()
X_test = test_data.copy()
print(X_train.shape, X_test.shape)
model, result_poly = train_polynomial_model(X_train, y_train, X_test, y_test)
print(result_poly)

(3208, 24) (803, 24)
(3208, 24) (3208,)
Model đã được huấn luyện.
Huấn luyện và đánh giá hoàn tất.
                           Model  Train RMSE  Test RMSE  Train R2   Test R2
0  Polynomial FeatureEng Model 4    35.43985  23.689813   0.44515  0.628431


# III.1 Save Model

In [27]:
# 3. Lưu mô hình bằng joblib
# Tạo thư mục Models nếu chưa có
folder_path_save = "Full_Pipeline_Models"
if not os.path.exists(folder_path_save):
    os.makedirs(folder_path_save)

# Đặt tên file mô hình
model_filename = f"{folder_path_save}/house_price_Polynomial_FeatureEng_model_4.joblib"

# Lưu file
folder_path_save = "Full_Pipeline_Models"
if not os.path.exists(folder_path_save):
    os.makedirs(folder_path_save)
joblib.dump(model, model_filename)

['Full_Pipeline_Models/house_price_Polynomial_FeatureEng_model_4.joblib']

## III.2 Save performance

In [28]:
import pandas as pd

path = "./Performance_Model/model_performance_report.csv"

# Đọc file hiện tại
result = pd.read_csv(path)

# Lấy row log model (giả sử result_log_models là DataFrame, lấy row 0)
new_row = result_poly.iloc[0].copy()
new_row["Model"] = "Polynomial FeatureEng Model 4"   # đảm bảo tên model đúng

mask = result["Model"] == "Polynomial FeatureEng Model 4"

if mask.any():
    # Đã có rồi -> cập nhật lại toàn bộ row
    result.loc[mask, :] = new_row.values
    msg = "Updated row for 'Polynomial FeatureEng Model 4'."
else:
    # Chưa có -> thêm row mới
    result = pd.concat(
        [result, pd.DataFrame([new_row])],
        ignore_index=True
    )
    msg = "Added new row for 'Polynomial FeatureEng Model 4'."

# Ghi đè file CSV
result.sort_values(by="Test R2", ascending=False)
result.to_csv(path, index=False)
best_model_name = result.iloc[0]["Model"]
if best_model_name == "Polynomial FeatureEng Model 4":
    print("Best model is now 'Polynomial FeatureEng Model 4'.")
    model = joblib.load(f"./Full_Pipeline_Models/house_price_Polynomial_FeatureEng_model_4.joblib")
    joblib.dump(model, f"./Full_Pipeline_Models/house_price_best_model.joblib")
    print(f"Successfully updated the best model to 'Polynomial FeatureEng Model 4'.")
print(f"The best model is: {best_model_name}")

The best model is: OLS with Interaction Features


# IV Tham số của mô hình

In [29]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
# Import các thuật toán để kiểm tra
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor 

# --- 1. BẮT BUỘC: Định nghĩa lại Class FeatureEngineer ---
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if isinstance(X, np.ndarray):
            feature_names = [f"col_{i}" for i in range(X.shape[1])]
            if len(feature_names) == 5:
                 feature_names = ['dien_tich_dat_m2', 'dien_tich_su_dung_m2', 'phong_ngu', 'nha_tam', 'quan']
            X = pd.DataFrame(X, columns=feature_names)
        
        X_new = X.copy()
        if all(col in X_new.columns for col in ['dien_tich_su_dung_m2', 'phong_ngu', 'nha_tam']):
             X_new['dien_tich_trung_binh_phong'] = X_new['dien_tich_su_dung_m2'] / (X_new['phong_ngu'] + 1e-5)
             X_new['tong_so_phong'] = X_new['phong_ngu'] + X_new['nha_tam']
        return X_new

# --- 2. Load Model ---
model_path = "Full_Pipeline_Models/house_price_Polynomial_FeatureEng_model_4.joblib"
try:
    model = joblib.load(model_path)
    print(f"Đã load model từ: {model_path}")
    # In ra tên các bước để bạn kiểm tra (Ví dụ: 'preprocess', 'regressor')
    print("Các bước trong Pipeline:", list(model.named_steps.keys())) 
except Exception as e:
    print(f"Lỗi load model: {e}")
    # Tạo model giả để code dưới không crash khi demo
    exit()

# --- 3. TỰ ĐỘNG TÌM CÁC BƯỚC (FIX LỖI KEY ERROR & NAME ERROR) ---
preprocessor_step = None
regressor_step = None

for name, step in model.named_steps.items():
    # Tìm bước tiền xử lý (ColumnTransformer)
    if isinstance(step, ColumnTransformer):
        preprocessor_step = step
    
    # Tìm bước huấn luyện (Có thuộc tính coef_ hoặc là các class mô hình)
    # Lưu ý: Đã import HuberRegressor ở trên nên sẽ không bị lỗi NameError nữa
    if hasattr(step, 'coef_') or isinstance(step, (LinearRegression, Ridge, Lasso, HuberRegressor)):
        regressor_step = step

if preprocessor_step is None or regressor_step is None:
    print("LỖI: Không tìm thấy Preprocessor hoặc Regressor. Hãy kiểm tra lại Pipeline.")
else:
    # --- 4. HIỂN THỊ THAM SỐ ---
    print("\n" + "="*30)
    print("THAM SỐ CỦA MÔ HÌNH")
    print("="*30)

    # A. Hệ số chặn (Intercept)
    # Một số mô hình như Huber có thể không có intercept_, cần dùng getattr để an toàn
    intercept = getattr(regressor_step, 'intercept_', 0.0)
    print(f"Hệ số chặn (Intercept/Bias): {intercept:.4f}")

    # B. Lấy tên đặc trưng (Features)
    try:
        feature_names = preprocessor_step.get_feature_names_out()
    except Exception as e:
        print(f"Dùng index thay thế tên đặc trưng (Lỗi: {e})")
        feature_names = [f"Feature_{i}" for i in range(len(regressor_step.coef_))]

    # C. Tạo DataFrame tham số
    if len(feature_names) == len(regressor_step.coef_):
        params_df = pd.DataFrame({
            'Feature_Name': feature_names,
            'Coefficient (Weight)': regressor_step.coef_
        })

        # Sắp xếp theo độ tác động tuyệt đối
        params_df['Impact_Level'] = params_df['Coefficient (Weight)'].abs()
        params_df = params_df.sort_values(by='Impact_Level', ascending=False).drop(columns=['Impact_Level'])

        print("\nTham số từng đặc trưng:")
        # Làm sạch tên cột
        params_df['Feature_Name'] = params_df['Feature_Name'].astype(str).str.replace('num__', '').str.replace('cat__', '')
        print(params_df.to_string(index=False))
    else:
        print(f"Lệch kích thước: {len(feature_names)} tên đặc trưng vs {len(regressor_step.coef_)} hệ số.")

Đã load model từ: Full_Pipeline_Models/house_price_Polynomial_FeatureEng_model_4.joblib
Các bước trong Pipeline: ['feature_eng', 'preprocessor', 'regressor']

THAM SỐ CỦA MÔ HÌNH
Hệ số chặn (Intercept/Bias): -10.8574

Tham số từng đặc trưng:
                                   Feature_Name  Coefficient (Weight)
dien_tich_su_dung_m2 dien_tich_trung_binh_phong           2540.821520
                   dien_tich_trung_binh_phong^2          -2269.045813
                     dien_tich_trung_binh_phong            442.325528
          dien_tich_dat_m2 dien_tich_su_dung_m2           -367.768758
                       dien_tich_dat_m2 nha_tam            297.693203
                 dien_tich_su_dung_m2 phong_ngu            253.953542
                         dien_tich_su_dung_m2^2           -228.924271
                               dien_tich_dat_m2            209.339092
                     dien_tich_dat_m2 phong_ngu           -203.381383
             dien_tich_su_dung_m2 tong_so_phong           

# TEST

In [30]:
# 1. Load lại model y chang app.py
model = joblib.load(f"Full_Pipeline_Models/house_price_Polynomial_FeatureEng_model_4.joblib")

# 2. Tạo đúng input như web
input_df = pd.DataFrame({
    "quan": ["1"],
    "dien_tich_dat_m2": [40],
    "dien_tich_su_dung_m2": [200],
    "phong_ngu": [10],
    "nha_tam": [11],
})

y_pred = model.predict(input_df)
print(f"Predicted house price: {y_pred[0]}")

Predicted house price: 16.96402216841024
