In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

from xgboost import XGBRegressor
from catboost import CatBoostRegressor



In [70]:
df = pd.read_csv('./data/muanhadat_data_adjusted.csv')

print(df['address'].unique())

['Phường Vĩnh Tuy' 'Phường Tương Mai' 'Phường Lĩnh Nam' 'Phường Bạch Mai'
 'Phường Hoàng Mai' 'Phường Hai Bà Trưng' 'Phường Yên Sở'
 'Phường Kiến Hưng' 'Xã Bình Minh' 'Phường Cầu Diễn' 'Phường Việt Hưng'
 'Phường Nghĩa Đô' 'Phường Từ Liêm' 'Phường Hoàng Liệt' 'Phường Vĩnh Hưng'
 'Phường Định Công' 'Phường Thanh Xuân' 'Phường Khương Mai'
 'Phường Thanh Xuân Nam' 'Phường Thượng Đình' 'Phường Thanh Xuân Bắc'
 'Phường Thanh Xuân Trung' 'Xã Hoài Đức' 'Phường Xuân Phương'
 'Phường Long Biên' 'Phường Phúc Lợi' 'Phường Kim Liên' 'Phường Phú Diễn'
 'Phường Xuân Canh' 'Phường Ba Đình' 'Phường Cầu Giấy'
 'Phường Khương Đình' 'Phường Dương Nội' 'Phường Đống Đa' 'Phường Hà Đông']


In [71]:
df['direction'].unique()

array(['Không xác định', 'Tây - Bắc', 'Bắc', 'Đông', 'Nam', 'Đông - Bắc',
       'Đông - Nam', 'Tây', 'Tây - Nam'], dtype=object)

In [72]:
tier_map = {
    'Phường Ba Đình': 1,
    'Phường Đống Đa': 1,
    'Phường Bạch Mai' : 1,
    'Phường Hai Bà Trưng': 1,
    'Phường Kim Liên': 2,
    'Phường Cầu Giấy': 2,
    'Phường Long Biên': 2,
    
    'Phường Thanh Xuân': 3,
    'Phường Thanh Xuân Nam': 3,
    'Phường Thanh Xuân Bắc': 3,
    'Phường Thanh Xuân Trung': 3,
    'Phường Thượng Đình': 3,
    'Phường Nghĩa Đô': 3,
    
    'Phường Vĩnh Tuy': 4,
    'Phường Tương Mai': 4,
    'Phường Lĩnh Nam': 4,
    'Phường Hoàng Mai': 4,
    'Phường Cầu Diễn': 4,
    'Phường Từ Liêm' : 4,
    'Phường Việt Hưng': 4,
    'Phường Định Công': 4,
    'Phường Khương Mai': 4,
    'Phường Xuân Phương': 4,
    'Phường Phúc Lợi': 4,
    'Phường Phú Diễn': 4,
    'Phường Khương Đình': 4,
    
    'Phường Yên Sở': 5,
    'Phường Kiến Hưng': 5,
    'Phường Hoàng Liệt': 5,
    'Phường Vĩnh Hưng': 5,
    'Phường Dương Nội': 5,
    'Xã Hoài Đức': 5,
    'Xã Bình Minh': 5,
    'Phường Xuân Canh': 5,
    'Phường Hà Đông': 5,
}


df['address'] = df['address'].map(tier_map)

missing = df[df['address'].isna()]['address'].unique()
print("Chưa có tier cho:", missing)


tier_dir = {
    'Không xác định' : 0, 'Tây - Bắc' : 1, 'Bắc' : 1, 'Đông' : 3, 'Nam' : 3, 'Đông - Bắc' : 2,
       'Đông - Nam' : 3, 'Tây' : 1, 'Tây - Nam' : 2
}

df['direction'] = df['direction'].map(tier_dir)


missing = df[df['direction'].isna()]['direction'].unique()
print("Chưa có tier direction cho:", missing)


shuffled_df = df.sample(frac = 1).reset_index(drop = True)

Chưa có tier cho: []
Chưa có tier direction cho: []


In [73]:
shuffled_df

Unnamed: 0,area,address,street_in_front_of_house,width,height,floor_number,bedroom_number,bathroom_number,direction,law,price,room_density,bath_per_bed,wide_ratio,distance_center
0,38.0,4,3.0,9.0,4.0,4,4,4,0,1,7.00,0.105263,0.80,2.250000,8.847671
1,39.0,4,3.0,7.8,5.0,4,3,3,0,1,8.15,0.076923,0.75,1.560000,4.245306
2,45.0,4,4.0,11.0,4.0,4,4,3,3,1,7.95,0.088889,0.60,2.750000,6.954405
3,73.0,1,3.0,14.6,5.0,3,3,2,0,1,12.40,0.041096,0.50,2.920000,3.243790
4,88.0,4,3.0,22.0,4.0,4,4,3,0,1,14.20,0.045455,0.60,5.500000,2.957363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,39.0,4,3.0,13.0,3.0,5,3,4,0,1,10.60,0.076923,1.00,4.333333,3.264079
403,34.0,4,3.0,11.3,3.0,5,3,4,0,1,7.70,0.088235,1.00,3.766667,5.594222
404,41.0,4,3.0,8.2,5.0,5,4,3,0,1,10.50,0.097561,0.60,1.640000,3.304077
405,47.0,4,3.0,11.8,4.0,5,4,4,0,1,9.30,0.085106,0.80,2.950000,3.852082


In [74]:
X = shuffled_df.drop(['law', 'price'], axis = 1)
y = shuffled_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

import numpy as np

y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [75]:
model = XGBRegressor (n_estimators = 600, max_depth = 4, 
            learning_rate = 0.05, subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            reg_alpha=0.5,
            random_state=42)

model.fit(X_train, y_train, eval_set = [(X_test, y_test)], verbose = True)

model.set_params(early_stopping_rounds = 5)



# model = CatBoostRegressor(
#     depth=6,
#     learning_rate=0.05,
#     n_estimators=1200,
#     loss_function='MAE',
#     random_seed=42,
#     verbose=False
# )

# model.fit(X_train, y_train)

[0]	validation_0-rmse:6.37434
[1]	validation_0-rmse:6.13537
[2]	validation_0-rmse:5.91954
[3]	validation_0-rmse:5.71628
[4]	validation_0-rmse:5.51873
[5]	validation_0-rmse:5.35320
[6]	validation_0-rmse:5.17587
[7]	validation_0-rmse:5.03798
[8]	validation_0-rmse:4.85660
[9]	validation_0-rmse:4.72228
[10]	validation_0-rmse:4.60454
[11]	validation_0-rmse:4.49415
[12]	validation_0-rmse:4.39901
[13]	validation_0-rmse:4.27482
[14]	validation_0-rmse:4.16964
[15]	validation_0-rmse:4.08339
[16]	validation_0-rmse:4.00227
[17]	validation_0-rmse:3.91301
[18]	validation_0-rmse:3.82537
[19]	validation_0-rmse:3.76542
[20]	validation_0-rmse:3.69055
[21]	validation_0-rmse:3.57188
[22]	validation_0-rmse:3.49449
[23]	validation_0-rmse:3.41969
[24]	validation_0-rmse:3.34262
[25]	validation_0-rmse:3.27519
[26]	validation_0-rmse:3.22590
[27]	validation_0-rmse:3.17654
[28]	validation_0-rmse:3.11179
[29]	validation_0-rmse:3.05702
[30]	validation_0-rmse:3.02967
[31]	validation_0-rmse:2.99976
[32]	validation_0-

In [76]:
predictions = model.predict(X_test)

e = mean_absolute_error(predictions, y_test)
print(e)

1.5254265050190252


In [77]:
import numpy as np

print(np.mean(y))

11.349582309582308


In [78]:
X

Unnamed: 0,area,address,street_in_front_of_house,width,height,floor_number,bedroom_number,bathroom_number,direction,room_density,bath_per_bed,wide_ratio,distance_center
0,38.0,4,3.0,9.0,4.0,4,4,4,0,0.105263,0.80,2.250000,8.847671
1,39.0,4,3.0,7.8,5.0,4,3,3,0,0.076923,0.75,1.560000,4.245306
2,45.0,4,4.0,11.0,4.0,4,4,3,3,0.088889,0.60,2.750000,6.954405
3,73.0,1,3.0,14.6,5.0,3,3,2,0,0.041096,0.50,2.920000,3.243790
4,88.0,4,3.0,22.0,4.0,4,4,3,0,0.045455,0.60,5.500000,2.957363
...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,39.0,4,3.0,13.0,3.0,5,3,4,0,0.076923,1.00,4.333333,3.264079
403,34.0,4,3.0,11.3,3.0,5,3,4,0,0.088235,1.00,3.766667,5.594222
404,41.0,4,3.0,8.2,5.0,5,4,3,0,0.097561,0.60,1.640000,3.304077
405,47.0,4,3.0,11.8,4.0,5,4,4,0,0.085106,0.80,2.950000,3.852082


In [79]:
model.save_model("house_price.model")

  self.get_booster().save_model(fname)


In [80]:
model.save_model("house_price_json.json")