In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [5]:
# Load the dataset.
import os

# The CSV location defaults to the original author's local path. Set the
# APARTMENTS_CSV environment variable to point at the file on another machine
# instead of editing this cell.
file_path = os.environ.get(
    "APARTMENTS_CSV",
    'C:\\Users\\User\\Desktop\\AASE\\datasets\\apartments_for_rent_classified_100K.csv',
)
# The file is semicolon-delimited.
data = pd.read_csv(file_path, sep=';')

In [6]:
# Select the model inputs (features) and the value to predict (target).
features = [
    "bathrooms", "bedrooms", "square_feet", "pets_allowed_transformed", "has_photo_transformed", "cityname", "AC",
    "Alarm", "Basketball", "Cable or Satellite", "Clubhouse", "Dishwasher", "Doorman",
    "Elevator", "Fireplace", "Garbage Disposal", "Gated", "Golf", "Gym", "Hot Tub",
    "Internet Access", "Luxury", "Parking", "Patio/Deck", "Playground", "Pool",
    "Refrigerator", "Storage", "TV", "Tennis", "View", "Washer Dryer", "Wood Floors"
]
target = "price"

# Handle missing target values by dropping those rows rather than filling them
# with the mean. Mean-filling invents labels the model is then trained and
# scored on, and because it runs before the train/test split it lets test-set
# statistics leak into the training data. When no target value is missing the
# resulting X and y are the same as selecting the columns directly.
has_target = data[target].notna()
X = data.loc[has_target, features]
y = data.loc[has_target, target]

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Column groups used by the preprocessing step: numeric columns get the
# numeric pipeline, every other selected column is treated as categorical.
numeric_features = [
    "bathrooms",
    "bedrooms",
    "square_feet",
]
categorical_features = [
    "pets_allowed_transformed",
    "has_photo_transformed",
    "cityname",
    "AC",
    "Alarm",
    "Basketball",
    "Cable or Satellite",
    "Clubhouse",
    "Dishwasher",
    "Doorman",
    "Elevator",
    "Fireplace",
    "Garbage Disposal",
    "Gated",
    "Golf",
    "Gym",
    "Hot Tub",
    "Internet Access",
    "Luxury",
    "Parking",
    "Patio/Deck",
    "Playground",
    "Pool",
    "Refrigerator",
    "Storage",
    "TV",
    "Tennis",
    "View",
    "Washer Dryer",
    "Wood Floors",
]

In [9]:
# Numeric preprocessing: fill gaps with the column mean, then standardize.
_numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=_numeric_steps)

In [10]:
# Categorical preprocessing: fill gaps with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform time.
_categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=_categorical_steps)

In [11]:
# Route each column group through its own preprocessing pipeline.
_column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=_column_routes)

In [12]:
# Full model: preprocessing followed by a decision-tree regressor (seeded for
# repeatable results).
_tree_regressor = DecisionTreeRegressor(random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", _tree_regressor),
    ]
)

In [13]:
# Hyperparameter candidates for the tree. Keys are prefixed with the pipeline
# step name ("regressor") so the grid search can route them to that step.
_tree_grid = {
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
param_grid = {f"regressor__{option}": candidates for option, candidates in _tree_grid.items()}

In [15]:
# Exhaustive search over param_grid with 10-fold cross-validation, scored by
# negative MSE and run on all available cores.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
)
search.fit(X_train, y_train)

In [None]:
# Best model: the estimator the grid search selected (its best_estimator_).
best_model = search.best_estimator_

In [None]:
# Predict the target for the held-out test features with the best model.
y_pred = best_model.predict(X_test)

In [None]:
# Evaluate the decision-tree predictions on the test set.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Regression metrics. NMAE is the MAE expressed as a percentage of the mean
# absolute actual value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
_mean_abs_actual = np.mean(np.abs(y_test))
nmae = mae / _mean_abs_actual * 100
r2 = r2_score(y_test, y_pred)

# Print one line per metric.
_dt_report = [
    ("Decision Tree- Mean Squared Error (MSE):", mse),
    ("Decision Tree- Mean Absolute Error (MAE):", mae),
    ("Decision Tree- Root Mean Squared Error (RMSE):", rmse),
    ("Decision Tree- Normalized Mean Absolute Error (NMAE):", f"{nmae:.2f}%"),
    ("Decision Tree- R² Score:", r2),
]
for _label, _value in _dt_report:
    print(_label, _value)

Mean Squared Error (MSE): 295778.683059709
Mean Absolute Error (MAE): 295.674739172424
Root Mean Squared Error (RMSE): 543.8553880028302
Normalized Mean Absolute Error (NMAE): 19.38%
R² Score: 0.5638865272291576


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score
)
import numpy as np
import pandas as pd

# Split the training columns by dtype: int64/float64 are treated as numeric,
# object/category as categorical.
# NOTE(review): a column of any other dtype (e.g. bool) lands in neither list
# and would not reach the model under ColumnTransformer's default remainder —
# confirm that is intended.
num_features = list(X_train.select_dtypes(include=["int64", "float64"]).columns)
cat_features = list(X_train.select_dtypes(include=["object", "category"]).columns)

# Numeric columns: mean imputation followed by standardization.
num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: most-frequent imputation followed by one-hot encoding
# (categories unseen during fit are ignored).
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# NOTE: `preprocessor` and `pipeline` rebind names used by earlier cells.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

# Preprocessing + a seeded 100-tree random forest with unlimited depth.
_rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", _rf_regressor),
    ]
)

# Fit on the training split, then predict the held-out test split.
pipeline.fit(X_train, y_train)
y_pred_rf = pipeline.predict(X_test)

# Test-set metrics. NMAE is the MAE as a percentage of the mean absolute
# actual value.
test_mse_rf = mean_squared_error(y_test, y_pred_rf)
test_rmse_rf = np.sqrt(test_mse_rf)
test_mae_rf = mean_absolute_error(y_test, y_pred_rf)
test_nmae_rf = test_mae_rf / np.mean(np.abs(y_test)) * 100
test_r2_rf = r2_score(y_test, y_pred_rf)

# Print one line per metric.
_rf_report = [
    ("Random Forest - Mean Squared Error (MSE):", test_mse_rf),
    ("Random Forest - Mean Absolute Error (MAE):", test_mae_rf),
    ("Random Forest - Root Mean Squared Error (RMSE):", test_rmse_rf),
    ("Random Forest - Normalized Mean Absolute Error (NMAE):", f"{test_nmae_rf:.2f}%"),
    ("Random Forest - R² Score:", test_r2_rf),
]
for _label, _value in _rf_report:
    print(_label, _value)


Random Forest - Mean Squared Error (MSE): 202983.6544567652
Random Forest - Mean Absolute Error (MAE): 229.33681142628072
Random Forest - Root Mean Squared Error (RMSE): 450.5370733433212
Random Forest - Normalized Mean Absolute Error (NMAE): 15.03%
Random Forest - R² Score: 0.7007089708253715


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Cross-validated evaluation of the random-forest pipeline on the training split.
# Requires X_train / y_train from the train/test split cell: run that cell
# first, otherwise this cell fails with a NameError on a fresh kernel.

# Split the training columns by dtype: int64/float64 are treated as numeric,
# object/category as categorical.
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric columns: mean imputation followed by standardization.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: most-frequent imputation followed by one-hot encoding
# (categories unseen during fit are ignored).
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

# Preprocessing + a seeded 100-tree random forest.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=100, max_depth=None))
])

# 5 shuffled folds; the fixed seed makes every call to cv.split() yield the
# same partition.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Single cross-validated pass: each training row is predicted by the model
# fitted on the other four folds. Previously the forest was refit for every
# metric (3 x cross_val_score + cross_val_predict = 20 fits) and these
# predictions were never used; deriving the metrics from them needs 5 fits.
y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=cv)

# Per-fold metrics from the out-of-fold predictions. The folds are the ones
# cross_val_predict used, because cv is seeded.
y_true_cv = np.asarray(y_train)
fold_test_indices = [test_idx for _, test_idx in cv.split(X_train)]
mse_scores = np.array([
    mean_squared_error(y_true_cv[idx], y_pred_cv[idx]) for idx in fold_test_indices
])
mae_scores = np.array([
    mean_absolute_error(y_true_cv[idx], y_pred_cv[idx]) for idx in fold_test_indices
])
r2_scores = np.array([
    r2_score(y_true_cv[idx], y_pred_cv[idx]) for idx in fold_test_indices
])

# Aggregate metrics: RMSE is the square root of the mean fold MSE; NMAE is the
# mean fold MAE as a percentage of the mean absolute training target.
cv_rmse = np.sqrt(np.mean(mse_scores))
cv_nmae = np.mean(mae_scores) / np.mean(np.abs(y_train)) * 100

# Print the cross-validation results.
print("Cross-Validation - Mean Squared Error (MSE):", np.mean(mse_scores))
print("Cross-Validation - Mean Absolute Error (MAE):", np.mean(mae_scores))
print("Cross-Validation - Root Mean Squared Error (RMSE):", cv_rmse)
print("Cross-Validation - Normalized Mean Absolute Error (NMAE):", f"{cv_nmae:.2f}%")
print("Cross-Validation - R² Score:", np.mean(r2_scores))


NameError: name 'X_train' is not defined