# Linear Regression Model

+ Mục tiêu hiểu rỏ luồng khi train mô hình sẽ như thế nào.

# I.Load data

In [588]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# target to show plot in jupyter notebook and not external window
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

## 1. Data normal

In [589]:
df_data = pd.read_csv("./Data/data_predict_price_house.csv")
# Display the first few rows of the dataframe
df_data.head()

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
0,Gò Vấp,147.0,479.0,17.0,18.0,16.8
1,Tân Phú,180.0,179.0,13.0,13.0,23.0
2,10,53.0,53.0,2.0,2.0,4.3
3,9,233.0,233.0,28.0,28.0,26.0
4,10,102.0,102.0,2.0,2.0,3.7


In [590]:
# Print the shape of the dataframe
print(df_data.shape)

(4293, 6)


In [591]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4293 entries, 0 to 4292
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   quan                  4293 non-null   object 
 1   dien_tich_dat_m2      4293 non-null   float64
 2   dien_tich_su_dung_m2  4293 non-null   float64
 3   phong_ngu             4293 non-null   float64
 4   nha_tam               4293 non-null   float64
 5   gia                   4293 non-null   float64
dtypes: float64(5), object(1)
memory usage: 201.4+ KB


In [592]:
# Get the names of non-object (numerical) columns
df_data.select_dtypes(exclude=['object']).shape

(4293, 5)

In [593]:
df_data.select_dtypes(exclude=['object']).columns

Index(['dien_tich_dat_m2', 'dien_tich_su_dung_m2', 'phong_ngu', 'nha_tam',
       'gia'],
      dtype='object')

In [594]:
# Get the names of object (non-numerical) columns
df_data.select_dtypes(include=['object']).shape

(4293, 1)

In [595]:
df_data.select_dtypes(include=['object']).columns

Index(['quan'], dtype='object')

In [596]:
print(df_data['quan'].unique())
df_data['quan'].value_counts()

['Gò Vấp' 'Tân Phú' '10' '9' 'Bình Tân' '5' '7' 'Phú Nhuận' '3'
 'Bình Thạnh' '1' '2' '4' '6' '8' '11' '12' 'Tân Bình'
 'Thủ Đức (TP. Thủ Đức)']


quan
9                        358
6                        334
4                        300
1                        294
8                        278
12                       266
11                       261
Tân Phú                  251
5                        230
2                        221
7                        210
10                       190
Tân Bình                 186
Bình Tân                 184
Phú Nhuận                180
Thủ Đức (TP. Thủ Đức)    157
Bình Thạnh               137
Gò Vấp                   128
3                        128
Name: count, dtype: int64

In [597]:
# Get summary statistics of numerical columns
df_data.describe()

Unnamed: 0,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
count,4293.0,4293.0,4293.0,4293.0,4293.0
mean,126.256615,551.3882,4.610063,4.554391,24.983213
std,295.29825,18319.14,4.999423,5.284465,57.364136
min,1.0,1.0,1.0,1.0,0.0
25%,58.0,82.0,3.0,2.0,5.1
50%,78.0,139.0,4.0,3.0,9.7
75%,120.0,290.0,5.0,5.0,22.0
max,10000.0,1200000.0,100.0,100.0,1279.999


## 2. Data with clean

In [598]:
df_clean = df_data[
    (df_data['dien_tich_dat_m2'] <= 1000) &
    (df_data['dien_tich_su_dung_m2'] <= 2000) &
    (df_data['phong_ngu'] <= 10) &
    (df_data['nha_tam'] <= 10)
]
print(df_clean.shape)

(4011, 6)


In [599]:
df_clean.head()

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
2,10,53.0,53.0,2.0,2.0,4.3
4,10,102.0,102.0,2.0,2.0,3.7
5,10,79.0,79.0,2.0,2.0,4.9
6,10,72.0,72.0,2.0,2.0,5.5
7,10,103.0,103.0,3.0,2.0,8.2


In [600]:
df_clean.describe()

Unnamed: 0,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
count,4011.0,4011.0,4011.0,4011.0,4011.0
mean,101.288369,212.188125,3.744453,3.621541,21.184398
std,92.054532,233.141943,1.768036,1.928491,45.9738
min,1.0,1.0,1.0,1.0,0.0
25%,57.0,80.0,2.5,2.0,4.9
50%,75.0,130.0,3.0,3.0,8.99
75%,110.0,250.0,5.0,5.0,18.5
max,1000.0,2000.0,10.0,10.0,1279.999


# II Pre-processing data

+ Ở đây có 2 trường hợp:
    + Muốn sử dụng full data để train thì: 
    
        data = df_data.copy()
    
    + Muốn sử dụng data sau khi lọc thì:
    
        data = df_clean.copy()
    

In [601]:
data = df_clean.copy()

## 1. Slipt data into train and test

### 1.1 Slipt train and test

In [602]:
# Create training and validation sets
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    data, 
    test_size=0.2, 
    random_state=42)
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (3208, 6)
Test data shape: (803, 6)


In [603]:
train_data.to_csv("./Data/train_data.csv", index=False)
test_data.to_csv("./Data/test_data.csv", index=False)

### 1.2 Extract label is gia and drop in feature

In [604]:
y_train = train_data['gia'].values
y_test = test_data['gia'].values
train_data = train_data.drop(columns=['gia'], axis=1)
test_data = test_data.drop(columns=['gia'], axis=1)

In [605]:
train_data.shape, test_data.shape

((3208, 5), (803, 5))

### 1.3 Check train data having value is non-numeric or other type

In [606]:
num_cols = [col for col in train_data.columns if train_data[col].dtype in ["float64","int64"]]
cat_cols = [col for col in train_data.columns if train_data[col].dtype not in ["float64","int64"]]

In [607]:
len(num_cols), len(cat_cols)

(4, 1)

In [608]:
# fill none for categorical columns
train_data[cat_cols] = train_data[cat_cols].fillna("none")
test_data[cat_cols] = test_data[cat_cols].fillna("none")

In [609]:
print("X_train shape:", train_data.shape)
print("X_test shape:", test_data.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3208, 5)
X_test shape: (803, 5)
y_train shape: (3208,)
y_test shape: (803,)


### 1.4 One-hot encoding feature "quan"

+ Nếu gặp một cái mới ở test thì set one-hot là 0

#### 1.4.1 One-hot encoding with catery in "quan"

In [610]:
from sklearn.preprocessing import OneHotEncoder

### One-hot encoding feature "quan"
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(train_data[cat_cols])

encoder_cols = list(encoder.get_feature_names_out(cat_cols))
train_data[encoder_cols] = encoder.transform(train_data[cat_cols])
test_data[encoder_cols] = encoder.transform(test_data[cat_cols])

#### 1.4.2 Print information

In [611]:
train_data

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,quan_1,quan_10,quan_11,quan_12,quan_2,...,quan_7,quan_8,quan_9,quan_Bình Thạnh,quan_Bình Tân,quan_Gò Vấp,quan_Phú Nhuận,quan_Thủ Đức (TP. Thủ Đức),quan_Tân Bình,quan_Tân Phú
650,4,55.0,165.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1534,8,53.0,53.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2339,12,30.0,50.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1702,8,120.0,600.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1699,8,61.0,122.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182,6,80.0,80.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1357,7,60.0,120.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
901,5,68.0,136.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3751,Thủ Đức (TP. Thủ Đức),20.0,31.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [612]:
test_data

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,quan_1,quan_10,quan_11,quan_12,quan_2,...,quan_7,quan_8,quan_9,quan_Bình Thạnh,quan_Bình Tân,quan_Gò Vấp,quan_Phú Nhuận,quan_Thủ Đức (TP. Thủ Đức),quan_Tân Bình,quan_Tân Phú
336,2,72.0,72.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2753,Bình Thạnh,249.0,1000.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3090,Phú Nhuận,64.0,128.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2629,Bình Tân,96.0,96.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3470,Tân Phú,32.0,60.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,7,140.0,140.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
294,2,98.0,98.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
743,4,105.0,105.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
285,2,433.0,433.0,4.0,4.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# III Normalization

+ Mục tiêu đưa các giá trị về [0, 1]
+ Chuẩn hoá theo min, max
+ value_new = (value_old - min)/(max - min)

## 1. Normalization

In [613]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train_num_feature = scaler.fit_transform(train_data[num_cols])
test_num_feature = scaler.transform(test_data[num_cols])

In [614]:
print("X_train shape:", train_num_feature.shape)
print("X_test shape:", test_num_feature.shape)

X_train shape: (3208, 4)
X_test shape: (803, 4)


In [615]:
train_num_feature

array([[0.05405405, 0.08204102, 0.33333333, 0.22222222],
       [0.05205205, 0.02601301, 0.11111111, 0.11111111],
       [0.02902903, 0.02451226, 0.11111111, 0.11111111],
       ...,
       [0.06706707, 0.06753377, 0.11111111, 0.11111111],
       [0.01901902, 0.0150075 , 0.11111111, 0.        ],
       [0.12612613, 0.19009505, 0.22222222, 0.22222222]])

In [616]:
test_num_feature

array([[0.07107107, 0.03551776, 0.11111111, 0.11111111],
       [0.24824825, 0.49974987, 0.66666667, 0.66666667],
       [0.06306306, 0.06353177, 0.22222222, 0.33333333],
       ...,
       [0.1041041 , 0.05202601, 0.22222222, 0.11111111],
       [0.43243243, 0.21610805, 0.33333333, 0.33333333],
       [0.04204204, 0.05952976, 0.33333333, 0.44444444]])

## 2. Add X_train, X_test in stack follow horizontal

+ Mục tiêu: nối theo chiều ngang (mảng các vector).

In [617]:
X_train = np.hstack([train_num_feature, train_data[encoder_cols].values])
X_test = np.hstack([test_num_feature, test_data[encoder_cols].values])

In [618]:
X_train.shape, X_test.shape

((3208, 23), (803, 23))

In [619]:
X_train

array([[0.05405405, 0.08204102, 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.05205205, 0.02601301, 0.11111111, ..., 0.        , 0.        ,
        0.        ],
       [0.02902903, 0.02451226, 0.11111111, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06706707, 0.06753377, 0.11111111, ..., 0.        , 0.        ,
        0.        ],
       [0.01901902, 0.0150075 , 0.11111111, ..., 1.        , 0.        ,
        0.        ],
       [0.12612613, 0.19009505, 0.22222222, ..., 0.        , 0.        ,
        1.        ]])

In [620]:
X_test

array([[0.07107107, 0.03551776, 0.11111111, ..., 0.        , 0.        ,
        0.        ],
       [0.24824825, 0.49974987, 0.66666667, ..., 0.        , 0.        ,
        0.        ],
       [0.06306306, 0.06353177, 0.22222222, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.1041041 , 0.05202601, 0.22222222, ..., 0.        , 0.        ,
        0.        ],
       [0.43243243, 0.21610805, 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.04204204, 0.05952976, 0.33333333, ..., 0.        , 0.        ,
        0.        ]])

In [621]:
y_train

array([ 1.1  ,  1.739,  1.38 , ...,  1.24 ,  1.8  , 19.5  ])

In [622]:
y_test

array([5.99900e+00, 6.30000e+01, 5.00000e+01, 7.98000e+00, 3.59000e+00,
       2.45500e+00, 6.50000e+00, 1.55000e+01, 2.90000e+00, 9.80000e+00,
       4.60000e+00, 9.28000e-01, 8.60000e+00, 1.80000e+01, 1.45000e+01,
       4.50000e+00, 7.50000e+00, 5.90000e+01, 2.69000e+00, 1.89000e+00,
       8.80000e-01, 2.65000e+01, 2.25000e+01, 5.10000e+00, 3.40000e+01,
       5.90000e+00, 3.60000e+01, 4.30000e+00, 2.75000e+00, 7.90000e+00,
       9.70000e-01, 9.50000e+00, 4.80000e+00, 9.98000e+00, 1.10000e+01,
       6.70000e+00, 1.90000e+01, 2.00000e+01, 3.20000e+00, 1.68000e+01,
       1.50000e+01, 8.70000e+00, 1.70500e+00, 1.03300e+00, 7.80000e+01,
       8.90000e+00, 1.00000e-01, 8.90000e+00, 1.20000e+01, 3.35000e+01,
       5.35000e+00, 6.58000e+00, 9.90000e+00, 4.40000e+00, 9.10000e+00,
       2.80000e+01, 1.29000e+01, 5.00000e+01, 7.95000e+00, 3.79990e+01,
       6.99900e+00, 9.70000e-01, 2.30000e+01, 2.28000e+00, 1.55000e+01,
       2.25000e+01, 3.60000e+01, 3.95000e+00, 5.50000e+00, 8.200

# III Train Regression Model

+ Ridge = Linear Regression + L2 regularization

+ Lasso = Linear Regression + L1 regularization

## 2. Model 1: Training model linear basic (degree 1) with full feature

In [623]:
# Tạo thư mục lưu mô hình
import os
folder_path_save = "Models"
if not os.path.exists(folder_path_save):
    os.makedirs(folder_path_save)


+ Hàm fit() ở đây dùng để train model bằng cách:
    +  Giải một hệ phương trình tuyến tính để tìm w tốt nhất (ít sai số bình phương nhất).
    + w_best = (X^T * X)^{-1} * X^T * y - với linear và 2 còn lại là thêm regularization.

### 2.1 Setup Model

In [624]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Tạo danh sách model
models_1 = {
    "Linear_Basic_model_1": LinearRegression(),
    "Ridge_Basic_model_1": Ridge(),
    "Lasso_Basic_model_1": Lasso()
}

### 2.2 Training

In [625]:
import joblib # Lib save model
    
def train_and_evaluate(models, X_train, y_train, X_test, y_test, folder_path_save="Models"):
    model_names = []
    train_rmse_results = []
    test_rmse_results = []
    train_r2_results = []
    test_r2_results = []

    for name, model in models.items():
        # Train model
        regression = model.fit(X_train, y_train)
        joblib.dump(regression, f"{folder_path_save}/house_price_{name}.joblib")

        # Predict
        y_train_pred = regression.predict(X_train)
        y_test_pred = regression.predict(X_test)
        # Calculate metrics
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

        # R^2 score
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        # Store results
        model_names.append(name)
        train_rmse_results.append(train_rmse)
        test_rmse_results.append(test_rmse)
        train_r2_results.append(train_r2)
        test_r2_results.append(test_r2)

    
    # Create results DataFrame
    results_df = pd.DataFrame({
        "Model": model_names,
        "Train RMSE": train_rmse_results,
        "Test RMSE": test_rmse_results,
        "Train R2": train_r2_results,
        "Test R2": test_r2_results
    }).sort_values(by="Test R2", ascending=False)
    return results_df

In [626]:
result_model_1 = train_and_evaluate(models_1, X_train, y_train, X_test, y_test, folder_path_save)
result_model_1

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_Basic_model_1,36.204135,24.655174,0.420961,0.597531
1,Ridge_Basic_model_1,36.210789,24.672542,0.420748,0.596964
2,Lasso_Basic_model_1,38.501908,28.165345,0.345128,0.474774


## 3. Model 2: Training Poly-nomial Regression Model (degree 2) with full feature

In [627]:
train_data[num_cols]

Unnamed: 0,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam
650,55.0,165.0,4.0,3.0
1534,53.0,53.0,2.0,2.0
2339,30.0,50.0,2.0,2.0
1702,120.0,600.0,3.0,4.0
1699,61.0,122.0,3.0,2.0
...,...,...,...,...
1182,80.0,80.0,2.0,2.0
1357,60.0,120.0,3.0,3.0
901,68.0,136.0,2.0,2.0
3751,20.0,31.0,2.0,1.0


### 3.1 Init model poly-monial

+ Using model degree 2

In [628]:
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
train_poly_feat = poly_features.fit_transform(train_data[num_cols])
test_poly_feat = poly_features.transform(test_data[num_cols])

In [629]:
train_poly_feat

array([[  55.,  165.,    4., ...,  660.,  495.,   12.],
       [  53.,   53.,    2., ...,  106.,  106.,    4.],
       [  30.,   50.,    2., ...,  100.,  100.,    4.],
       ...,
       [  68.,  136.,    2., ...,  272.,  272.,    4.],
       [  20.,   31.,    2., ...,   62.,   31.,    2.],
       [ 127.,  381.,    3., ..., 1143., 1143.,    9.]])

In [630]:
test_poly_feat

array([[7.200e+01, 7.200e+01, 2.000e+00, ..., 1.440e+02, 1.440e+02,
        4.000e+00],
       [2.490e+02, 1.000e+03, 7.000e+00, ..., 7.000e+03, 7.000e+03,
        4.900e+01],
       [6.400e+01, 1.280e+02, 3.000e+00, ..., 3.840e+02, 5.120e+02,
        1.200e+01],
       ...,
       [1.050e+02, 1.050e+02, 3.000e+00, ..., 3.150e+02, 2.100e+02,
        6.000e+00],
       [4.330e+02, 4.330e+02, 4.000e+00, ..., 1.732e+03, 1.732e+03,
        1.600e+01],
       [4.300e+01, 1.200e+02, 4.000e+00, ..., 4.800e+02, 6.000e+02,
        2.000e+01]])

In [631]:
X_train_poly = np.hstack([train_poly_feat, train_data[encoder_cols].values])
X_test_poly = np.hstack([test_poly_feat, test_data[encoder_cols].values])

In [632]:
X_train_poly

array([[ 55., 165.,   4., ...,   0.,   0.,   0.],
       [ 53.,  53.,   2., ...,   0.,   0.,   0.],
       [ 30.,  50.,   2., ...,   0.,   0.,   0.],
       ...,
       [ 68., 136.,   2., ...,   0.,   0.,   0.],
       [ 20.,  31.,   2., ...,   1.,   0.,   0.],
       [127., 381.,   3., ...,   0.,   0.,   1.]])

In [633]:
X_train_poly.shape, X_test_poly.shape

((3208, 29), (803, 29))

In [634]:
y_train.shape, y_test.shape

((3208,), (803,))

In [635]:
models_2 = {
    "Linear_Poly_model_2": LinearRegression(),
    "Ridge_Poly_model_2": Ridge(),
    "Lasso_Poly_model_2": Lasso()
}
result_poly = train_and_evaluate(models_2, X_train_poly, y_train, X_test_poly, y_test, folder_path_save)
result_poly

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_Poly_model_2,35.835354,24.076107,0.432697,0.616214
1,Ridge_Poly_model_2,35.835512,24.089027,0.432692,0.615802
2,Lasso_Poly_model_2,37.009355,26.583048,0.394917,0.532129


# IV Result Compare Model 1 and Model 2:

In [636]:
def compare_results(result1, result2):
    combined_results = pd.concat([result1, result2], axis=0).sort_values(by="Test R2", ascending=False)
    return combined_results
result_compare = compare_results(result_model_1, result_poly)
result_compare

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_Poly_model_2,35.835354,24.076107,0.432697,0.616214
1,Ridge_Poly_model_2,35.835512,24.089027,0.432692,0.615802
0,Linear_Basic_model_1,36.204135,24.655174,0.420961,0.597531
1,Ridge_Basic_model_1,36.210789,24.672542,0.420748,0.596964
2,Lasso_Poly_model_2,37.009355,26.583048,0.394917,0.532129
2,Lasso_Basic_model_1,38.501908,28.165345,0.345128,0.474774


## Nhận xét:

+ Về mặt metric tốt nhất: LinearRegression_poly

+ Về mặt ổn định/regularization: Ridge_poly cũng là lựa chọn rất hợp lý (gần như ngang nhau).
+ Nhóm sử dụng các chỉ số RMSE và R² trên tập validation/test để chọn mô hình.
Trong các mô hình so sánh, LinearRegression_poly cho Test RMSE thấp nhất (~28.40) và Test R² cao nhất (~0.615), do đó được chọn là mô hình tốt nhất cho bài toán này.

+ Mô hình Ridge_poly cho kết quả rất gần tương đương, có thể được cân nhắc khi ưu tiên tính ổn định do có regularization L2.

# V Full pipeline from normalization to predict

In [637]:
folder_full_pipeline = "Full_Pipeline_Models"
if not os.path.exists(folder_full_pipeline):
    os.makedirs(folder_full_pipeline)

## 1. Setup dataset

In [638]:
data_model = data.copy()

In [639]:
data_model

Unnamed: 0,quan,dien_tich_dat_m2,dien_tich_su_dung_m2,phong_ngu,nha_tam,gia
2,10,53.0,53.0,2.0,2.0,4.3
4,10,102.0,102.0,2.0,2.0,3.7
5,10,79.0,79.0,2.0,2.0,4.9
6,10,72.0,72.0,2.0,2.0,5.5
7,10,103.0,103.0,3.0,2.0,8.2
...,...,...,...,...,...,...
4288,1,50.0,50.0,1.0,1.0,5.6
4289,1,50.0,50.0,1.0,1.0,6.0
4290,1,72.0,72.0,2.0,2.0,8.0
4291,1,121.0,121.0,3.0,2.0,16.5


In [640]:
# Create training and validation sets
from sklearn.model_selection import train_test_split

train_data_new, test_data_new = train_test_split(
    data_model, 
    test_size=0.2, 
    random_state=42)
print("Train data shape:", train_data_new.shape)
print("Test data shape:", test_data_new.shape)

Train data shape: (3208, 6)
Test data shape: (803, 6)


In [641]:
y_train_new = train_data_new['gia'].values
y_test_new = test_data_new['gia'].values
train_data_new = train_data_new.drop(columns=['gia'], axis=1)
test_data_new = test_data_new.drop(columns=['gia'], axis=1)


In [642]:
print("X_train shape:", train_data_new.shape)
print("X_test shape:", test_data_new.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3208, 5)
X_test shape: (803, 5)
y_train shape: (3208,)
y_test shape: (803,)


## 2.Setup model

In [643]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Tạo danh sách model
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}
# List storage for results
train_rmse_results = []
test_rmse_results = []
train_r2_results = []
test_r2_results = []
model_names = []

## 3. Train model

### 3.1 Model 1: Model linear basic (degree 1) with full data and full feature:

#### 3.1.1 Pack preprocessing data and norm:

In [644]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

num_cols = ["dien_tich_dat_m2", "dien_tich_su_dung_m2", "phong_ngu", "nha_tam"]
cat_cols = ["quan"]

numeric_transformer_1 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess_1 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_1, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)


#### 3.1.2 Train model 1

In [645]:
import joblib # Lib save model

def train_and_evaluate_degree_1(models, X_train_new, y_train_new, X_test_new, y_test_new, folder_path_save="Models", names_list=[], preprocess=None):
    model_names = []
    train_rmse_results = []
    test_rmse_results = []
    train_r2_results = []
    test_r2_results = []


    for name in names_list.keys():
        # Train model
        regression = Pipeline(steps=[
            ("preprocess", preprocess),
            ("regressor", models[names_list[name]])
        ])
        regression = regression.fit(X_train_new, y_train_new)
        joblib.dump(regression, f"{folder_path_save}/house_price_{name}.joblib")

        # Predict
        y_train_pred = regression.predict(X_train_new)
        y_test_pred = regression.predict(X_test_new)
        # Calculate metrics
        train_rmse = np.sqrt(mean_squared_error(y_train_new, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test_new, y_test_pred))

        # R^2 score
        train_r2 = r2_score(y_train_new, y_train_pred)
        test_r2 = r2_score(y_test_new, y_test_pred)
        # Store results
        model_names.append(name)
        train_rmse_results.append(train_rmse)
        test_rmse_results.append(test_rmse)
        train_r2_results.append(train_r2)
        test_r2_results.append(test_r2)

    
    # Create results DataFrame
    results_df = pd.DataFrame({
        "Model": model_names,
        "Train RMSE": train_rmse_results,
        "Test RMSE": test_rmse_results,
        "Train R2": train_r2_results,
        "Test R2": test_r2_results
    }).sort_values(by="Test R2", ascending=False)
    return results_df

In [646]:
names_list_1 = {
    "Linear_basic_model_1": "LinearRegression",
    "Ridge_basic_model_1": "Ridge",
    "Lasso_basic_model_1": "Lasso"
}
result_1 = train_and_evaluate_degree_1(models, train_data_new, y_train_new, test_data_new, y_test_new, folder_full_pipeline, names_list_1, preprocess_1)
result_1

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_basic_model_1,36.204135,24.655176,0.420961,0.597531
1,Ridge_basic_model_1,36.210789,24.672541,0.420748,0.596964
2,Lasso_basic_model_1,38.501908,28.165345,0.345128,0.474774


### 3.2 Model 2: Model linear poly (degree 2) with full data and full feature:

#### 3.2.1 Pack pre-processing data and norm

In [647]:
numeric_transformer_2 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scaler", MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess_2 = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, cat_cols),
        ("num", numeric_transformer_2, num_cols),
    ]
)

#### 3.2.2 Train model 2

In [648]:
import joblib # Lib save model

def train_and_evaluate_poly(models, X_train_new, y_train_new, X_test_new, y_test_new, folder_path_save="Models", names_list=None, preprocess=None):
    model_names = []
    train_rmse_results = []
    test_rmse_results = []
    train_r2_results = []
    test_r2_results = []


    for name in names_list.keys():
        # Train model
        regression = Pipeline(steps=[
            ("preprocess", preprocess),
            ("regressor", models[names_list[name]])
        ])
        regression = regression.fit(X_train_new, y_train_new)
        joblib.dump(regression, f"{folder_path_save}/house_price_{name}.joblib")

        # Predict
        y_train_pred = regression.predict(X_train_new)
        y_test_pred = regression.predict(X_test_new)
        # Calculate metrics
        train_rmse = np.sqrt(mean_squared_error(y_train_new, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test_new, y_test_pred))

        # R^2 score
        train_r2 = r2_score(y_train_new, y_train_pred)
        test_r2 = r2_score(y_test_new, y_test_pred)
        # Store results
        model_names.append(name)
        train_rmse_results.append(train_rmse)
        test_rmse_results.append(test_rmse)
        train_r2_results.append(train_r2)
        test_r2_results.append(test_r2)

    
    # Create results DataFrame
    results_df = pd.DataFrame({
        "Model": model_names,
        "Train RMSE": train_rmse_results,
        "Test RMSE": test_rmse_results,
        "Train R2": train_r2_results,
        "Test R2": test_r2_results
    }).sort_values(by="Test R2", ascending=False)
    return results_df

In [649]:
names_list_poly = {
    "Linear_poly_model_2": "LinearRegression",
    "Ridge_poly_model_2": "Ridge",
    "Lasso_poly_model_2": "Lasso",
}
result_model_2= train_and_evaluate_poly(models, train_data_new, y_train_new, test_data_new, y_test_new, folder_full_pipeline, names_list_poly, preprocess_2)
result_model_2

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_poly_model_2,35.835354,24.076107,0.432697,0.616214
1,Ridge_poly_model_2,36.007808,24.687376,0.427224,0.596479
2,Lasso_poly_model_2,38.501901,28.165338,0.345129,0.474774


## 4. Final Result

In [650]:
result_compare = compare_results(result_model_1, result_model_2)
result_compare = result_compare.reset_index(drop=True)
result_compare

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_poly_model_2,35.835354,24.076107,0.432697,0.616214
1,Linear_Basic_model_1,36.204135,24.655174,0.420961,0.597531
2,Ridge_Basic_model_1,36.210789,24.672542,0.420748,0.596964
3,Ridge_poly_model_2,36.007808,24.687376,0.427224,0.596479
4,Lasso_poly_model_2,38.501901,28.165338,0.345129,0.474774
5,Lasso_Basic_model_1,38.501908,28.165345,0.345128,0.474774


In [651]:
result_old = pd.read_csv("./Performance_Model/model_performance_report.csv")

# Merge
result_all = pd.concat([result_old, result_compare], axis=0, ignore_index=True)

# Sort to prioritize higher Test R2
result_all = result_all.sort_values(by="Test R2", ascending=False)

# Delete duplicate models, keep the first (best) one
result_all = result_all.drop_duplicates(subset=["Model"], keep="first").reset_index(drop=True)

result_all


Unnamed: 0,Model,Train RMSE,Test RMSE,Train R2,Test R2
0,Linear_poly_model_2,35.835354,24.076107,0.432697,0.616214
1,Linear_Basic_model_1,36.204135,24.655174,0.420961,0.597531
2,Ridge_Basic_model_1,36.210789,24.672542,0.420748,0.596964
3,Ridge_poly_model_2,36.007808,24.687376,0.427224,0.596479
4,Log Model 3,37.959297,27.570429,0.363457,0.496727
5,Lasso_poly_model_2,38.501901,28.165338,0.345129,0.474774
6,Lasso_Basic_model_1,38.501908,28.165345,0.345128,0.474774


In [652]:
result_all.to_csv(f"./Performance_Model/model_performance_report.csv", index=False)
best_model_name = result_all.iloc[0]['Model']

In [653]:
# Save the best model
if best_model_name in names_list_1 or best_model_name in names_list_poly:
    best_model = joblib.load(f"{folder_full_pipeline}/house_price_{best_model_name}.joblib")
    joblib.dump(best_model, f"{folder_full_pipeline}/house_price_best_model.joblib")
print(f"Best model '{best_model_name}")


Best model 'Linear_poly_model_2


# VI Test Model

In [654]:
# 1. Load lại model y chang app.py
model = joblib.load(f"{folder_full_pipeline}/house_price_best_model.joblib")

# 2. Tạo đúng input như web
input_df = pd.DataFrame({
    "quan": ["1"],
    "dien_tich_dat_m2": [40],
    "dien_tich_su_dung_m2": [200],
    "phong_ngu": [10],
    "nha_tam": [11],
})

y_pred = model.predict(input_df)
print(f"Predicted house price: {y_pred[0]}")

Predicted house price: 37.62237406067139
