In [2]:
!brew install libomp

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
astroterm           icann-rdap          terraform-cleaner   xlsclients
behaviortree.cpp    lazysql             tf-summarize        xprop
catgirl             martin              tfprovidercheck     xwininfo
hypopg              sdl3_image          xeyes
[34m==>[0m [1mNew Casks[0m
dana-dex                   gpt4all                    leader-key
freelens                   ik-product-manager         turbotax-2024

You have [1m11[0m outdated formulae installed.

[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/libomp/manifests/19.1.7[0m
######################################################################### 100.0%
[32m==>[0m [1mFetchin

In [3]:
!export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH

In [4]:
!echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> ~/.zshrc
!source ~/.zshrc

In [11]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os
os.chdir(os.getcwd().replace('/notebooks', ''))

# Đọc dữ liệu đã tiền xử lý
train_filepath = './data/preprocessed/train_preprocessed.csv'
test_filepath = './data/preprocessed/test_preprocessed.csv'

train_data = pd.read_csv(train_filepath)
test_data = pd.read_csv(test_filepath)

# Tách features và target
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Chia dữ liệu train - validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Tạo model XGBoost
model = xgb.XGBRegressor(
    n_estimators=500, 
    learning_rate=0.05, 
    max_depth=6, 
    subsample=0.8, 
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

# Huấn luyện mô hình
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Dự đoán trên tập test
test_predictions = model.predict(test_data)

with open('data/preprocessed/scaling_params.txt', 'r') as f:
    lines = f.readlines()
    mean = float(lines[0].split()[1])
    std = float(lines[1].split()[1])

# Đảo ngược chuẩn hóa
test_predictions = test_predictions * std + mean

# Lưu kết quả
output_dir = './data/output'
os.makedirs(output_dir, exist_ok=True)
output_filepath = os.path.join(output_dir, 'predictions.csv')
test_ids = pd.read_csv(test_filepath)['Id']
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv(output_filepath, index=False)

print(f"Predictions saved to {output_filepath}")

[0]	validation_0-rmse:1.06107
[1]	validation_0-rmse:1.02413
[2]	validation_0-rmse:0.98983
[3]	validation_0-rmse:0.95461
[4]	validation_0-rmse:0.92251
[5]	validation_0-rmse:0.89140
[6]	validation_0-rmse:0.86123
[7]	validation_0-rmse:0.83152
[8]	validation_0-rmse:0.80454
[9]	validation_0-rmse:0.78070
[10]	validation_0-rmse:0.75530
[11]	validation_0-rmse:0.73777
[12]	validation_0-rmse:0.71523
[13]	validation_0-rmse:0.69432
[14]	validation_0-rmse:0.67218
[15]	validation_0-rmse:0.65066
[16]	validation_0-rmse:0.63298
[17]	validation_0-rmse:0.61538
[18]	validation_0-rmse:0.59851
[19]	validation_0-rmse:0.58245
[20]	validation_0-rmse:0.56690
[21]	validation_0-rmse:0.55417
[22]	validation_0-rmse:0.54141
[23]	validation_0-rmse:0.53066
[24]	validation_0-rmse:0.52115
[25]	validation_0-rmse:0.51022
[26]	validation_0-rmse:0.49639
[27]	validation_0-rmse:0.48588
[28]	validation_0-rmse:0.47714
[29]	validation_0-rmse:0.46903
[30]	validation_0-rmse:0.46164
[31]	validation_0-rmse:0.45551
[32]	validation_0-