In [3]:
!pip install kaggle



In [7]:
# Create the ~/.kaggle directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy kaggle.json from the current working directory into it
# (presumably the Kaggle API credentials file -- confirm it was uploaded first).
!cp kaggle.json ~/.kaggle/
# Restrict the copied file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Download the archive for the house-prices competition with the Kaggle CLI.
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 421MB/s]


In [8]:
# Descompacta todos os arquivos no diretório atual
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [10]:
# Instalar (se necessário, mas já fizemos no seu caso)
# !pip install pandas scikit-learn numpy

# Importar bibliotecas
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
# Load the training data extracted from the competition archive.
df_train = pd.read_csv('train.csv')

# Target: the sale price column.
y = df_train['SalePrice']
# Features: every remaining column except the target and the row identifier.
X = df_train.drop(columns=['SalePrice', 'Id'])

# Print the first rows of both to confirm the data loaded as expected.
print(X.head())
print(y.head())

   MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0          60       RL         65.0     8450   Pave   NaN      Reg   
1          20       RL         80.0     9600   Pave   NaN      Reg   
2          60       RL         68.0    11250   Pave   NaN      IR1   
3          70       RL         60.0     9550   Pave   NaN      IR1   
4          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities LotConfig  ... ScreenPorch PoolArea PoolQC Fence  \
0         Lvl    AllPub    Inside  ...           0        0    NaN   NaN   
1         Lvl    AllPub       FR2  ...           0        0    NaN   NaN   
2         Lvl    AllPub    Inside  ...           0        0    NaN   NaN   
3         Lvl    AllPub    Corner  ...           0        0    NaN   NaN   
4         Lvl    AllPub       FR2  ...           0        0    NaN   NaN   

  MiscFeature MiscVal  MoSold  YrSold  SaleType  SaleCondition  
0         NaN       0       2    2008        WD         N

In [12]:
# 3. Split the feature columns into two groups by dtype: object columns are
#    treated as categorical, every other dtype as numerical.
object_columns = X.select_dtypes(include=['object']).columns
other_columns = X.select_dtypes(exclude=['object']).columns
categorical_features = list(object_columns)
numerical_features = list(other_columns)

# Numerical branch: fill missing values with the column median.
numerical_steps = [('imputer', SimpleImputer(strategy='median'))]
numerical_transformer = Pipeline(numerical_steps)

# Categorical branch: replace missing values with the literal 'missing'
# category, then one-hot encode. handle_unknown='ignore' lets transform()
# accept categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own branch and combine the results.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [13]:
# 4. Build the final pipeline: preprocessing followed by a random forest
#    regressor (100 trees, fixed seed, all available cores).
forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [14]:
# 5. Hold out 20% of the rows for evaluation; the fixed seed makes the split
#    reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 6. Fit the whole pipeline (preprocessing + regressor) on the training part.
print("Iniciando treinamento...")
model.fit(X_train, y_train)
print("Treinamento concluído!")

# 7. Predict sale prices for the held-out rows.
y_pred = model.predict(X_test)

Iniciando treinamento...
Treinamento concluído!


In [15]:
# 8. Score the held-out predictions: RMSE (square root of the mean squared
#    error) and the R² coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Assemble the report first, then emit it with a single print call.
report_lines = [
    "--- Resultados de Avaliação (Random Forest) ---",
    f"RMSE (Root Mean Squared Error): ${rmse:,.2f}",
    f"R² (Coeficiente de Determinação): {r2:.4f}",
]
print("\n".join(report_lines))

--- Resultados de Avaliação (Random Forest) ---
RMSE (Root Mean Squared Error): $28,937.00
R² (Coeficiente de Determinação): 0.8908
