# 1. Load data

In [271]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Render plots inline in the notebook instead of in an external window
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

# Load the raw house-price table from the project's Data folder.
raw_csv_path = "Data/data_predict_price_house.csv"
df_data = pd.read_csv(filepath_or_buffer=raw_csv_path)

# 2. Xử lí outlier

In [272]:
# Inclusive upper bound per column; a row is kept only when every listed column
# is at or below its bound.
outlier_caps = {
    'dien_tich_dat_m2': 1000,
    'dien_tich_su_dung_m2': 2000,
    'phong_ngu': 10,
    'nha_tam': 10,
}

keep_rows = None
for capped_col, cap_value in outlier_caps.items():
    below_cap = df_data[capped_col] <= cap_value
    # AND the per-column conditions together, in the same order as listed above.
    keep_rows = below_cap if keep_rows is None else keep_rows & below_cap

df_clean = df_data[keep_rows]

# 3. Chia tập dữ liệu

In [273]:
# Split an independent copy of the cleaned table: 20% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
data = df_clean.copy()
split_options = {"test_size": 0.2, "random_state": 42}
train_data, test_data = train_test_split(data, **split_options)

# 4. Trích xuất nhãn giá

In [274]:
# 'gia' is the price label: keep it as the target of each split and remove it
# from the feature tables.
target_col = 'gia'
y_train, y_test = train_data[target_col], test_data[target_col]
train_data = train_data.drop(columns=[target_col])
test_data = test_data.drop(columns=[target_col])

# 5. Điền giá trị none

In [275]:
# Partition the training columns by dtype: float64/int64 columns are numeric,
# everything else is treated as categorical.
numeric_dtypes = ("float64", "int64")
num_cols, cat_cols = [], []
for col_name, col_dtype in train_data.dtypes.items():
    if col_dtype in numeric_dtypes:
        num_cols.append(col_name)
    else:
        cat_cols.append(col_name)

# Missing categorical values become the literal category "none" in both splits.
for split_frame in (train_data, test_data):
    split_frame[cat_cols] = split_frame[cat_cols].fillna("none")

# 6. One-hot encoding feature "quan"

In [276]:
# Column lists consumed by the one-hot encoding cell below: float64/int64 columns
# are numeric, every other column is categorical.
# The categorical columns were already filled with "none" in step 5, so the
# identical fillna calls that used to be repeated here were no-ops and are removed.
num_cols = [col for col in train_data.columns if train_data[col].dtype in ["float64","int64"]]
cat_cols = [col for col in train_data.columns if train_data[col].dtype not in ["float64","int64"]]

In [277]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical column in cat_cols.
# The encoder is fitted on the training split only and then applied to both splits,
# so both get the same indicator columns in the same order. With
# handle_unknown="ignore", a category not seen during fit is encoded as all zeros.
#
# The original cell built, fitted and applied an identical encoder twice; the second
# pass overwrote the same columns with the same values, so a single pass is kept.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoder.fit(train_data[cat_cols])

encoded_cols = list(encoder.get_feature_names_out(cat_cols))
# Both names were defined by the original cell; keep both so any later cell that
# refers to either one keeps working.
encoder_cols = list(encoded_cols)

train_data[encoded_cols] = encoder.transform(train_data[cat_cols])
test_data[encoded_cols] = encoder.transform(test_data[cat_cols])



# 8. Xây dựng và huấn luyện mô hình

In [278]:
class OLSLinearRegression:
    '''
    Ordinary Least Squares linear regression solved in closed form.

    The fitted weights are stored in ``self.w``. No intercept is added by this
    class: to model a bias term, include a column of ones in X.

    NOTE(review): ``get_params`` / ``set_params`` here return/accept the weight
    array, which differs from the scikit-learn estimator methods of the same
    name (those exchange a dict of hyper-parameters) - keep that in mind when
    this class is placed inside scikit-learn utilities.
    '''

    def fit(self, X, y):
        '''
        Fit the model with the least-squares solution w = pinv(X) @ y.

        The Moore-Penrose pseudo-inverse is used instead of explicitly inverting
        X.T @ X: for a full-column-rank X both give the same weights, but the
        pseudo-inverse also works when X.T @ X is singular (for example when a
        feature column is all zeros or duplicated), in which case it yields the
        minimum-norm least-squares solution instead of raising an error.

        Parameters
        ----------
        X : np.array
            Input data, shape (n_samples, n_features)
        y : np.array
            Output data, shape (n_samples,) or (n_samples, n_outputs)

        Returns
        -------
        self : object
            Returns the instance of the class
        '''

        # Work on plain float arrays so array-like inputs are handled uniformly.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        self.w = np.linalg.pinv(X) @ y

        return self

    def get_params(self):
        '''
        Return the fitted weights.

        Returns
        -------
        self.w : np.array
            Weights found by ``fit`` (or stored by ``set_params``); same trailing
            shape as the ``y`` that was passed to ``fit``.
        '''

        return self.w

    def set_params(self, w):
        '''
        Overwrite the weights of the model.

        Parameters
        ----------
        w : np.array
            Weights to use in subsequent ``predict`` calls
        '''

        self.w = w

    def predict(self, X):
        '''
        Predict outputs for X using the stored weights.

        Parameters
        ----------
        X : np.array
            Input data with the same column layout that was used in ``fit``

        Returns
        -------
        X @ self.w : np.array
            Predicted output
        '''

        return X @ self.w

In [279]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Model 3 uses only interaction features: each one-hot 'quan' indicator multiplied
# by the land area, so every district gets its own slope on 'dien_tich_dat_m2'.
model_df = train_data.drop(columns=['quan'])

# After dropping the raw 'quan' column, the remaining names containing 'quan'
# are the one-hot indicator columns.
quan_cols = [name for name in model_df.columns if 'quan' in name]
print(quan_cols)

new_feature = [f"{name}_x_dientichdat" for name in quan_cols]
land_area = model_df['dien_tich_dat_m2']
for indicator_col, interaction_col in zip(quan_cols, new_feature):
    model_df[interaction_col] = model_df[indicator_col] * land_area

x_model3 = model_df[new_feature].to_numpy()

# Prepend a column of ones so the first weight acts as the intercept.
n_train_rows = x_model3.shape[0]
x_model3_bias = np.hstack([np.ones((n_train_rows, 1)), x_model3])

model3 = OLSLinearRegression()
model3.fit(x_model3_bias, y_train)

weights_model3 = np.round(model3.get_params(), 3)

print("Trọng số hồi quy:", weights_model3)

['quan_1', 'quan_10', 'quan_11', 'quan_12', 'quan_2', 'quan_3', 'quan_4', 'quan_5', 'quan_6', 'quan_7', 'quan_8', 'quan_9', 'quan_Bình Thạnh', 'quan_Bình Tân', 'quan_Gò Vấp', 'quan_Phú Nhuận', 'quan_Thủ Đức (TP. Thủ Đức)', 'quan_Tân Bình', 'quan_Tân Phú']
Trọng số hồi quy: [-1.994  0.513  0.319  0.228  0.071  0.299  0.499  0.093  0.378  0.17
  0.182  0.118  0.089  0.241  0.123  0.16   0.367  0.127  0.194  0.16 ]


# 9. Đánh giá mô hình

In [280]:
from sklearn.metrics import mean_squared_error, r2_score

# Rebuild, on the test split, the same interaction features the model was trained on.
test_df = test_data.drop(columns=['quan'])

quan_cols = [name for name in test_df.columns if 'quan' in name]
new_feature_test = [f"{name}_x_dientichdat" for name in quan_cols]

test_land_area = test_df['dien_tich_dat_m2']
for indicator_col, interaction_col in zip(quan_cols, new_feature_test):
    test_df[interaction_col] = test_df[indicator_col] * test_land_area

x_model3_test = test_df[new_feature_test].to_numpy()

# Same leading column of ones as in training so the weights line up.
n_test_rows = x_model3_test.shape[0]
x_model3_test_bias = np.hstack([np.ones((n_test_rows, 1)), x_model3_test])

y_test_pred = model3.predict(x_model3_test_bias)

# RMSE is reported in the units of the target; R2 is the coefficient of determination.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print("Độ lỗi của mô hình đo bằng RMSE là :", test_rmse)
print("Hệ số xác định R2 của mô hình trên tập test là :", test_r2)

Độ lỗi của mô hình đo bằng RMSE là : 18.874282088565977
Hệ số xác định R2 của mô hình trên tập test là : 0.76413839661441


# 10. Đóng gói mô hình

## Mô hình

In [281]:
import numpy as np

class OLSLinearRegression_1:
    """
    Closed-form OLS regressor that stores its weights as a column vector in ``self.w``.
    """

    def fit(self, X, y):
        """
        Solve the least-squares problem with the pseudo-inverse: w = pinv(X) @ y.

        X: array-like of shape (n_samples, n_features)
        y: array-like of shape (n_samples,) or (n_samples, 1)

        Returns the fitted instance.
        """
        design = np.asarray(X)
        target = np.asarray(y)

        # A flat target is promoted to a single column so w is always 2-D.
        if target.ndim == 1:
            target = target[:, np.newaxis]

        self.w = np.matmul(np.linalg.pinv(design), target)   # (n_features, 1)
        return self

    def get_params(self):
        """Return the stored weight array."""
        return self.w

    def set_params(self, w):
        """Store ``w`` (converted to an array) as the weights and return the instance."""
        self.w = np.asarray(w)
        return self

    def predict(self, X):
        """Return X @ w flattened to a 1-D array of predictions."""
        design = np.asarray(X)
        return np.ravel(design @ self.w)


## Đóng gói chuẩn hoá dữ liệu:

In [282]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class Model3Preprocessor(BaseEstimator, TransformerMixin):
    """
    Turn the raw columns 'quan' and 'dien_tich_dat_m2' into the Model 3 design matrix.

    Each output feature is a one-hot 'quan' indicator multiplied by
    'dien_tich_dat_m2'; when ``add_bias`` is true a leading column of ones is added.
    """

    def __init__(self, add_bias=True):
        self.add_bias = add_bias

    def fit(self, X, y=None):
        """Fit the one-hot encoder on the 'quan' column of X and record feature names."""
        frame = pd.DataFrame(X).copy()

        # Categories unseen at fit time are ignored by the encoder at transform time.
        quan_encoder = OneHotEncoder(handle_unknown='ignore')
        quan_encoder.fit(frame[['quan']])
        self.encoder = quan_encoder

        # Names of the one-hot columns and of the interaction features derived from them.
        self.quan_feature_names_ = quan_encoder.get_feature_names_out(['quan'])
        self.new_feature_names_ = [
            f"{name}_x_dientichdat" for name in self.quan_feature_names_
        ]
        return self

    def transform(self, X):
        """Return the interaction matrix, with a leading ones column if ``add_bias``."""
        frame = pd.DataFrame(X).copy()

        district_onehot = self.encoder.transform(frame[['quan']]).toarray()   # (n_samples, n_quan)
        land_area_col = frame['dien_tich_dat_m2'].to_numpy()[:, np.newaxis]  # (n_samples, 1)

        # Broadcasting multiplies every indicator column by the land area.
        features = district_onehot * land_area_col

        if not self.add_bias:
            return features

        ones_col = np.ones((features.shape[0], 1))
        return np.concatenate([ones_col, features], axis=1)                  # (n_samples, 1+n_quan)


## Train model

In [283]:
from sklearn.pipeline import Pipeline

X_train_raw, X_test_raw = train_data.copy(), test_data.copy()

# Model 3 only consumes 'quan' and 'dien_tich_dat_m2'.
model3_inputs = ['quan', 'dien_tich_dat_m2']
X_train_m3 = X_train_raw[model3_inputs]
X_test_m3 = X_test_raw[model3_inputs]

# NOTE(review): OLSLinearRegression_1 is defined in the packaging section above but
# the pipeline is built with OLSLinearRegression - confirm which one is intended.
preprocess_step = ("preprocess", Model3Preprocessor(add_bias=True))
regressor_step = ("regressor", OLSLinearRegression())
model3_pipeline = Pipeline(steps=[preprocess_step, regressor_step])

model3_pipeline.fit(X_train_m3, y_train)

# Evaluate on both splits.
y_train_pred = model3_pipeline.predict(X_train_m3)
y_test_pred = model3_pipeline.predict(X_test_m3)

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

metric_lines = (
    ("Train RMSE:", train_rmse),
    ("Train R2:", train_r2),
    ("Test RMSE:", test_rmse),
    ("Test R2:", test_r2),
)
for metric_label, metric_value in metric_lines:
    print(metric_label, metric_value)

# One-row summary table; it is merged into the shared performance report later on.
results = {
    "Model": ["OLS with Interaction Features"],
    "Train RMSE": [train_rmse],
    "Test RMSE": [test_rmse],
    "Train R2": [train_r2],
    "Test R2": [test_r2],
}
results_OLS = pd.DataFrame(results)
print(results_OLS)


Train RMSE: 33.81437091132598
Train R2: 0.49488037853448974
Test RMSE: 18.874282088565977
Test R2: 0.76413839661441
                           Model  Train RMSE  Test RMSE  Train R2   Test R2
0  OLS with Interaction Features   33.814371  18.874282   0.49488  0.764138


## Save model and Save performance

In [284]:
import os

import joblib
import pandas as pd

# Save model
model_path = "Full_Pipeline_Models/house_price_model_OLS_model_3.joblib"
# Create the output folder first so the dump does not fail when the folder
# has not been created yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model3_pipeline, model_path)


['Full_Pipeline_Models/house_price_model_OLS_model_3.joblib']

In [285]:
import pandas as pd
import os

path = "./Performance_Model/model_performance_report.csv"
model_name = "OLS with Interaction Features"
report_columns = ["Model", "Train RMSE", "Test RMSE", "Train R2", "Test R2"]

# Load the existing report, or start from an empty table with the expected header.
# The folder is created up front so the final to_csv cannot fail on a missing directory.
os.makedirs(os.path.dirname(path), exist_ok=True)
if os.path.exists(path):
    result = pd.read_csv(path)
else:
    result = pd.DataFrame(columns=report_columns)

# Metrics of this notebook's model: the single row of results_OLS.
new_row = results_OLS.iloc[0].copy()
new_row["Model"] = model_name   # make sure the model name is the expected one

mask = result["Model"] == model_name

if mask.any():
    # Already logged -> refresh the row. Values are written column by column, by
    # label, so the update stays correct even if the CSV columns are in a
    # different order than results_OLS.
    for column, value in new_row.items():
        result.loc[mask, column] = value
    msg = "Updated row for 'OLS with Interaction Features'."
else:
    # Not logged yet -> append a new row.
    result = pd.concat(
        [result, pd.DataFrame([new_row])],
        ignore_index=True
    )
    msg = "Added new row for 'OLS with Interaction Features'."
print(msg)

# Overwrite the CSV, best model (highest Test R2) first.
result = result.sort_values(by="Test R2", ascending=False)
result.to_csv(path, index=False)

best_model_name = result.iloc[0]['Model']
if best_model_name == model_name:
    print("Best model is now 'OLS with Interaction Features'.")
    # joblib.load unpickles the file; this is only appropriate because the file
    # is the one written by this notebook in the previous cell.
    model = joblib.load("./Full_Pipeline_Models/house_price_model_OLS_model_3.joblib")
    joblib.dump(model, "./Full_Pipeline_Models/house_price_best_model.joblib")
    print("Successfully updated the best model to 'OLS with Interaction Features'.")
print(f"The best model is: {best_model_name}")

Best model is now 'OLS with Interaction Features'.
Successfully updated the best model to 'OLS with Interaction Features'.
The best model is: OLS with Interaction Features


# TEST

In [286]:
# Reload the saved pipeline from disk and score one hand-written listing.
model_1 = joblib.load("./Full_Pipeline_Models/house_price_model_OLS_model_3.joblib")

sample_listing = {
    "quan": ["1"],
    "dien_tich_dat_m2": [40],
    "dien_tich_su_dung_m2": [200],
    "phong_ngu": [10],
    "nha_tam": [11],
}
input_df = pd.DataFrame(sample_listing)

# Only these two columns are passed to the pipeline; the other fields are not used here.
model_features = ['quan', 'dien_tich_dat_m2']
y_pred = model_1.predict(input_df[model_features])
print(y_pred)

[18.54581902]
