In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import optuna

# Load the pre-encoded dataset.
# NOTE: point DATA_PATH at your own CSV file if it is named differently.
DATA_PATH = 'encode_data.csv'
FEATURE_COLUMNS = [
    'Quận (mã hóa)',
    'Loại Nhà (mã hóa)',
    'Diện tích (m²)',
    'Số phòng ngủ',
    'Số toilet',
]
TARGET_COLUMN = 'Mức giá (tỷ)'

data = pd.read_csv(DATA_PATH)
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# Split the data into a training set (80%) and a test set (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define objective function for Optuna
def objective(trial):
    """Return the score Optuna should maximize for one hyperparameter trial.

    Samples a RandomForestRegressor configuration from ``trial`` and scores
    it with 5-fold cross-validation on the training split only.  The test
    split (``X_test``/``y_test``) is deliberately not touched here: tuning
    against it would leak hold-out information into model selection and make
    every later test-set metric optimistically biased.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean cross-validated score (the estimator's default scorer) over the
        training data, as a plain float.
    """
    # Define parameters to be tuned
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Build the model with the suggested parameters; the fixed seed keeps
    # trials comparable with each other.
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # cross_val_score clones and fits the model on each fold, so no explicit
    # fit is needed.  This costs roughly 5x the time of a single fit per trial.
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    return float(np.mean(fold_scores))

# Create an in-memory study that maximizes the objective, then run 30 trials
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=30)

# Pull out the winning hyperparameters and the objective value they achieved
best_params, best_score = study.best_params, study.best_value

print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

# Train the final model with the best parameters.  The seed is fixed to the
# same value the objective uses, so this is the same (reproducible) model
# configuration that was tuned rather than a differently-seeded forest.
best_model = RandomForestRegressor(**study.best_params, random_state=42)
best_model.fit(X_train, y_train)

# Cross-validate the tuned configuration on the full dataset.
# cross_val_score refits clones of the estimator on each of the 10 folds and
# uses the estimator's default scorer, which for a regressor is R^2 -- not
# accuracy -- so the labels below say so.
cv_scores = cross_val_score(best_model, X, y, cv=10)
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print("Cross-validated scores:", cv_scores)
print("Mean R^2:", mean_cv_score)
print("Standard deviation of R^2:", std_cv_score)
print()

# Predict on test set
y_pred = best_model.predict(X_test)

# Regression metrics on the hold-out set.  The target is continuous, so these
# are the primary measures of model quality; errors are in the target's units.
test_errors = np.asarray(y_test) - y_pred
print("Test R^2:", best_model.score(X_test, y_test))
print("Test MAE:", np.mean(np.abs(test_errors)))
print("Test RMSE:", np.sqrt(np.mean(test_errors ** 2)))
print()

# Optional classification-style view: binarize target and prediction at a
# cut-off.  NOTE(review): 0.5 is a placeholder threshold -- replace it with a
# cut-off that is meaningful for your data before relying on these numbers.
BINARY_THRESHOLD = 0.5
y_pred_binary = (y_pred > BINARY_THRESHOLD).astype(int)
y_test_binary = (y_test > BINARY_THRESHOLD).astype(int)

# Classification report
print(classification_report(y_test_binary, y_pred_binary))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary))


[I 2024-05-12 23:14:43,275] A new study created in memory with name: no-name-f2dabd14-fc53-48e9-871e-c6e66ee6f373
[I 2024-05-12 23:14:48,099] Trial 0 finished with value: 0.7107930811570277 and parameters: {'n_estimators': 814, 'max_depth': 11, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.7107930811570277.
[I 2024-05-12 23:14:54,404] Trial 1 finished with value: 0.7503226727987742 and parameters: {'n_estimators': 704, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.7503226727987742.
[I 2024-05-12 23:14:58,992] Trial 2 finished with value: 0.7247107853276828 and parameters: {'n_estimators': 872, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.7503226727987742.
[I 2024-05-12 23:15:01,626] Trial 3 finished with value: 0.670518817756302 and parameters: {'n_estimators': 663, 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 1 with value: