In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Carregar dados brutos novamente para garantir consistência

In [5]:
# Read the raw training CSV into a DataFrame; the relative path is resolved
# against the notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer='../data/raw/train.csv',
)

# Selecionar features

In [6]:
# Columns of `data` used as model inputs.
features = [
    'GrLivArea',
    'OverallQual',
    'YearBuilt',
    'GarageCars',
    'TotalBsmtSF',
]

# X: input columns selected from the raw frame; y: the 'SalePrice' target column.
X, y = data[features], data['SalePrice']

# Pré-processamento

In [7]:
# Split BEFORE fitting any preprocessing step. Fitting the imputer/scaler on the
# full dataset would let test-row statistics (medians, means, standard deviations)
# leak into the training features and bias the hold-out metrics.
# The same random_state and sample count yield the same row partition as splitting
# the already-transformed array, so y_train / y_test are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Median imputation and standardisation are learned from the training rows only...
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))

# ...and then merely applied (transform, never fit) to the held-out rows.
X_test = scaler.transform(imputer.transform(X_test_raw))

# Full-dataset views kept so that any other cell referencing these names still
# works. They are produced with the train-fitted transformers; do not use them
# for model evaluation, since they mix training and test rows.
X_imputed = imputer.transform(X)
X_scaled = scaler.transform(X_imputed)

# Treinar modelos

In [8]:
# Candidate regressors, keyed by the label printed in the report below.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=0.1),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
)

# Fit every regressor on the training split, then report RMSE and R² on the
# held-out split, one line per model.
for name in models:
    model = models[name]
    # fit() returns the estimator itself, so prediction can be chained onto it.
    preds = model.fit(X_train, y_train).predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"{name} -> RMSE: {rmse:.2f} | R²: {r2:.3f}")

LinearRegression -> RMSE: 39763.30 | R²: 0.794
Ridge -> RMSE: 39767.67 | R²: 0.794
Lasso -> RMSE: 39763.32 | R²: 0.794
RandomForest -> RMSE: 28863.31 | R²: 0.891
