In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from random import randint
from sklearn.model_selection import RandomizedSearchCV



In [3]:
# Load the processed housing table and prepare it for modelling:
# (1) trim the top of the target distribution, (2) add a geographic cluster
# feature, (3) drop unwanted columns, (4) log-transform the target.
# The cell re-reads the CSV each time, so it can be re-run safely.
from sklearn.cluster import KMeans  # NOTE(review): consider moving to the imports cell

# NOTE(review): absolute, machine-specific path; prefer a path relative to a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = 'C:/Users/monte/OneDrive/Documentos/codigos/projeto-house-price-prediction/house-price-prediction/data/processed/housing_final.csv'
df = pd.read_csv(DATA_PATH)

# 1. Remove truncated target values: keep only rows whose target is at or
#    below the 95th percentile.
# NOTE(review): the threshold is computed on the full dataset, before the
# train/test split, so the test rows are filtered on their own target —
# confirm this is intended.
q_max = df['median_house_value'].quantile(0.95)
df = df[df['median_house_value'] <= q_max]

# 2. Geographic feature engineering: cluster the coordinates with KMeans.
# Rows without coordinates are dropped from df itself (not only from the
# clustering input) so kmeans.labels_ has exactly one label per remaining row;
# .copy() makes df an independent frame, so the column assignment below does
# not write into a slice of the frame that was read from disk.
# NOTE(review): the clusters are fitted on all rows, including those that
# later end up in the test set — confirm this is acceptable.
df = df.dropna(subset=['longitude', 'latitude']).copy()
coords = df[['longitude', 'latitude']]
kmeans = KMeans(n_clusters=15, random_state=42).fit(coords)
df['geo_cluster'] = kmeans.labels_

# 3. Feature selection: drop these columns when present
#    (errors='ignore' skips any that are missing).
cols_to_drop = [
    'median_income_squared', 'total_rooms', 'total_bedrooms',
    'bedrooms_per_household', 'bedrooms_per_room', 'rooms_per_household'
]
df = df.drop(columns=cols_to_drop, errors='ignore')

# 4. Log-transform the target (log1p) to normalise its distribution;
#    predictions must be mapped back with np.expm1 before reporting.
df['median_house_value'] = np.log1p(df['median_house_value'])


In [4]:
# Split the frame into the feature matrix and the (log-transformed) target.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Hyperparameter search for the random forest regressor.
# Every entry is a finite list, so the search space is a discrete grid.
param_dist = {
    'max_depth': [None, 15, 30, 45],
    'max_features': ['sqrt', 'log2', 0.8],
    'bootstrap': [True, False]
}

# Number of distinct combinations in param_dist (4 * 3 * 2 = 24 here).
# Asking for more iterations than that makes scikit-learn warn and fall back
# to evaluating every combination, so n_iter is capped at the grid size.
n_candidates = 1
for values in param_dist.values():
    n_candidates *= len(values)

rf = RandomForestRegressor(random_state=42)
search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=min(30, n_candidates),
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42,  # make the sampled candidates reproducible across runs
)

# NOTE: this is the expensive cell of the notebook (candidates x 3 folds fits).
search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has already been refitted on
# the whole of X_train using the best parameters found.
best_model = search.best_estimator_



Fitting 3 folds for each of 24 candidates, totalling 72 fits


KeyboardInterrupt: 

In [14]:
# 6. Evaluate the final model on the held-out test set, in the original
#    (non-log) scale of the target.
# best_model is search.best_estimator_, which RandomizedSearchCV has already
# refitted on X_train (default refit=True), so it is not fitted again here.

# Predictions come out in log1p space; map them back with expm1.
y_pred = np.expm1(best_model.predict(X_test))

# Rebuild the original-scale test target from the untouched log-scale `y`
# instead of transforming y_test in place: re-running this cell then always
# yields the same values rather than applying expm1 a second time.
# X_test keeps the row labels of the split, so y.loc[X_test.index] selects
# exactly the test rows.
y_test = np.expm1(y.loc[X_test.index])

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nMelhores Parâmetros: {search.best_params_}")
print(f"MSE Final: {mse:.2f}")
print(f"R² Final: {r2:.2f}")


Melhores Parâmetros: {'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}
MSE Final: 1725168608.47
R² Final: 0.82


In [1]:
# Scatter of actual vs. predicted values with a dashed red identity line;
# points on the line are perfect predictions.
# Requires the imports cell and the evaluation cell to have been run first.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("Valores Reais")
ax.set_ylabel("Valores Preditos")
ax.set_title("Comparação: Valores Reais vs Preditos")

# The identity line spans the observed range of the actual values.
value_range = [y_test.min(), y_test.max()]
ax.plot(value_range, value_range, 'r--')
plt.show()


NameError: name 'plt' is not defined