# 1. Load data

In [13]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# target to show plot in jupyter notebook and not external window
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

df_data = pd.read_csv("Data/data_predict_price_house.csv")




# 2. Xử lí outlier

In [14]:
# Keep only rows whose size/room counts are at or below the chosen upper bounds.
# A NaN in any of these columns compares as False, so such rows are dropped too.
within_bounds = (
    df_data['dien_tich_dat_m2'].le(1000)
    & df_data['dien_tich_su_dung_m2'].le(2000)
    & df_data['phong_ngu'].le(10)
    & df_data['nha_tam'].le(10)
)
df_clean = df_data.loc[within_bounds]

# 3. Chia tập dữ liệu

In [15]:
# Split a copy so that df_clean itself is never touched by the later cells.
data = df_clean.copy()

# 80/20 hold-out split; the fixed seed makes the partition reproducible.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# 4. Trích xuất nhãn giá

In [16]:
# Take the target column ('gia') out of each split before removing it from the features.
y_train, y_test = train_data['gia'], test_data['gia']

# From here on train_data / test_data contain feature columns only.
train_data, test_data = (
    split.drop(columns='gia') for split in (train_data, test_data)
)

# 5. Điền giá trị none

In [17]:
# Partition the feature columns by dtype: float64/int64 columns are numeric,
# every other column is treated as categorical.
num_cols, cat_cols = [], []
for col in train_data.columns:
    is_numeric = train_data[col].dtype in ["float64", "int64"]
    (num_cols if is_numeric else cat_cols).append(col)

# Missing categorical values become the literal category "none" in both splits.
for frame in (train_data, test_data):
    frame[cat_cols] = frame[cat_cols].fillna("none")

# 6. One-hot encoding feature "quan"

In [18]:
# This cell used to repeat the step-5 cell verbatim (same num_cols / cat_cols
# partition, same fillna), which was a no-op. The lists computed there are still
# in scope, so only verify the invariant the one-hot encoder relies on:
# no missing values remain in the categorical columns of either split.
assert not train_data[cat_cols].isna().any().any(), \
    "train_data still has missing categorical values"
assert not test_data[cat_cols].isna().any().any(), \
    "test_data still has missing categorical values"

In [19]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column listed in cat_cols (not only "quan").
# The encoder is fitted on the training split only, so the test split never
# influences the learned categories; handle_unknown='ignore' makes categories
# that appear only in the test split encode as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(train_data[cat_cols])

# Names of the generated indicator columns, in the encoder's output order.
encoder_cols = list(encoder.get_feature_names_out(cat_cols))

# Append the indicator columns next to the original features in both splits.
train_data[encoder_cols] = encoder.transform(train_data[cat_cols])
test_data[encoder_cols] = encoder.transform(test_data[cat_cols])

# The original cell fitted a second, identical encoder and re-assigned the same
# columns under the name `encoded_cols`. That pass was redundant; the name is
# kept as a separate list in case other cells refer to it.
encoded_cols = list(encoder_cols)



# 8. Xây dựng và huấn luyện mô hình

In [20]:
class OLSLinearRegression:
    '''
    Linear regression fitted in closed form by ordinary least squares.

    The model is y ~ X @ w with no intercept of its own: callers that want a
    bias term must add a column of ones to X before calling fit/predict.
    '''

    def fit(self, X, y):
        '''
        Estimate the weights that minimise the squared error ||X @ w - y||^2.

        The weights are computed as pinv(X) @ y. For a full-column-rank X this
        is the classic (X^T X)^-1 X^T y solution; for a rank-deficient X
        (e.g. duplicated or perfectly collinear columns) the pseudo-inverse
        yields the minimum-norm least-squares solution, whereas explicitly
        inverting X^T X would raise LinAlgError or return unstable values.

        Parameters
        ----------
        X : np.array
            Input data, one row per sample and one column per feature.
        y : np.array
            Output data, one target per sample.

        Returns
        -------
        self : object
            The fitted instance, so calls can be chained.
        '''

        # Moore-Penrose pseudo-inverse instead of inv(X.T @ X) @ X.T:
        # same result when X.T @ X is invertible, well defined when it is not.
        self.w = np.linalg.pinv(X) @ y

        return self


    def get_params(self):
        '''
        Return the weights learned by fit.

        Returns
        -------
        self.w : np.array
            Fitted weights, one per column of the X passed to fit.

        Raises
        ------
        AttributeError
            If fit has not been called yet (self.w is only set there).
        '''

        return self.w


    def predict(self, X):
        '''
        Predict targets for new samples as the linear combination X @ w.

        Parameters
        ----------
        X : np.array
            Input data with the same columns (and order) used in fit.

        Returns
        -------
        X @ self.w : np.array
            Predicted output, one value per row of X.

        Raises
        ------
        AttributeError
            If fit has not been called yet (self.w is only set there).
        '''

        return X @ self.w

In [21]:

from sklearn.metrics import mean_absolute_error

# Training frame for model 3: the encoded features without the raw 'quan' column.
model_df = train_data.copy().drop(columns='quan')

# Every remaining column whose name contains 'quan' (the one-hot indicators).
quan_cols = [name for name in model_df.columns if 'quan' in name]
print(quan_cols)

# One interaction feature per indicator column: indicator * land area.
new_feature = [f"{name}_x_dientichdat" for name in quan_cols]
for indicator_col, interaction_col in zip(quan_cols, new_feature):
    model_df[interaction_col] = model_df[indicator_col] * model_df['dien_tich_dat_m2']

# Design matrix: a leading column of ones (bias) followed by the interactions.
x_model3 = model_df[new_feature].to_numpy()
x_model3_bias = np.column_stack((np.ones(x_model3.shape[0]), x_model3))

model3 = OLSLinearRegression()
model3.fit(x_model3_bias, y_train)

# Weights rounded to 3 decimals for display; index 0 is the bias term.
weights_model3 = np.round(model3.get_params(), decimals=3)

print("Trọng số hồi quy:",weights_model3)





['quan_1', 'quan_10', 'quan_11', 'quan_12', 'quan_2', 'quan_3', 'quan_4', 'quan_5', 'quan_6', 'quan_7', 'quan_8', 'quan_9', 'quan_Bình Thạnh', 'quan_Bình Tân', 'quan_Gò Vấp', 'quan_Phú Nhuận', 'quan_Thủ Đức (TP. Thủ Đức)', 'quan_Tân Bình', 'quan_Tân Phú']
Trọng số hồi quy: [-1.994  0.513  0.319  0.228  0.071  0.299  0.499  0.093  0.378  0.17
  0.182  0.118  0.089  0.241  0.123  0.16   0.367  0.127  0.194  0.16 ]


# 9. Đánh giá mô hình

In [22]:
from sklearn.metrics import mean_squared_error

# Rebuild the model-3 features on the held-out split, mirroring the training cell.
test_df = test_data.copy()
test_df = test_df.drop(columns=['quan'])

# Every remaining column whose name contains 'quan' (the one-hot indicators).
quan_cols = [col for col in test_df.columns if 'quan' in col]

# One interaction feature per indicator column: indicator * land area.
new_feature_test = [f"{col}_x_dientichdat" for col in quan_cols]
for col, interaction_col_name in zip(quan_cols, new_feature_test):
    test_df[interaction_col_name] = test_df[col] * test_df['dien_tich_dat_m2']

x_model3_test = test_df[new_feature_test].values

# Leading column of ones so the first weight acts as the bias term.
x_model3_test_bias = np.c_[np.ones(x_model3_test.shape[0]), x_model3_test]

# Guard against a train/test feature mismatch before multiplying by the weights.
assert x_model3_test_bias.shape[1] == len(model3.get_params()), \
    "test design matrix width differs from the number of fitted weights"

y_test_pred = model3.predict(x_model3_test_bias)

test_mse = mean_squared_error(y_test, y_test_pred)

# The reported figure is the root of the MSE, i.e. the RMSE (same unit as 'gia');
# the label previously said "MSE" while printing this value.
test_rmse = np.sqrt(test_mse)
print("Độ lỗi của mô hình đo bằng RMSE là :" ,test_rmse)




Độ lỗi của mô hình đo bằng MSE là : 18.874282088565973
