# 1 - Carregar o dataset

In [20]:
import pandas as pd
import numpy as np
df = pd.read_csv('/home/dataset_casas.csv')
print(df.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

# 2 - Análise Descritiva - Categórica e Numérica

In [21]:
# Análise descritiva das variáveis numéricas
print(df.describe())

# Análise descritiva das variáveis categóricas
print(df.describe(include=['object']))

# Verificar valores nulos
print(df.isnull().sum())

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [22]:
# Verificar apenas as colunas com valores nulos
null_columns = df.columns[df.isnull().any()]
print(df[null_columns].isnull().sum())

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


# 3 - Feature Engineering

In [23]:
from sklearn.preprocessing import LabelEncoder

# Tratar valores nulos
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
df['Alley'] = df['Alley'].fillna('NoAlley')
df['PoolQC'] = df['PoolQC'].fillna('NoPool')
df['Fence'] = df['Fence'].fillna('NoFence')
df['MiscFeature'] = df['MiscFeature'].fillna('NoMiscFeature')

# Criar novas colunas
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['TotalBath'] = df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']

# Aplicar Label Encoding para variáveis categóricas
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col].astype(str))

# Aplicar get_dummies para variáveis categóricas
df = pd.get_dummies(df, columns=df.select_dtypes(include=['object']).columns, drop_first=True)

# Verificar o dataframe após a engenharia de features
print(df.head())

   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0   1          60         3         65.0     8450       1      1         3   
1   2          20         3         80.0     9600       1      1         3   
2   3          60         3         68.0    11250       1      1         0   
3   4          70         3         60.0     9550       1      1         0   
4   5          60         3         84.0    14260       1      1         0   

   LandContour  Utilities  ...  Fence  MiscFeature  MiscVal  MoSold  YrSold  \
0            3          0  ...      4            1        0       2    2008   
1            3          0  ...      4            1        0       5    2007   
2            3          0  ...      4            1        0       9    2008   
3            3          0  ...      4            1        0       2    2006   
4            3          0  ...      4            1        0      12    2008   

   SaleType  SaleCondition  SalePrice  TotalSF  TotalBat

# 4 - Modelagem (baseline)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.impute import SimpleImputer

# Separar features e target
X = df.drop(['SalePrice', 'Id'], axis=1)
y = df['SalePrice']

# Dividir o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tratar valores nulos usando SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Treinar o modelo baseline
model_baseline = LinearRegression()
model_baseline.fit(X_train_imputed, y_train)

# Fazer previsões
y_pred_baseline = model_baseline.predict(X_test_imputed)

# Calcular MAPE
mape_baseline = mean_absolute_percentage_error(y_test, y_pred_baseline)
print(f'MAPE do modelo baseline: {mape_baseline}')

MAPE do modelo baseline: 0.12950479231609247


# 5 - Modelagem (modelo melhorado com RandomSearch)

In [25]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Definir o espaço de busca para os hiperparâmetros
param_dist = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20)
}

# Inicializar o modelo e o RandomizedSearchCV
model = DecisionTreeRegressor(random_state=42)
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)

# Treinar o modelo
random_search.fit(X_train, y_train)

# Melhor modelo
best_model = random_search.best_estimator_

# Fazer previsões com o melhor modelo
y_pred_improved = best_model.predict(X_test)

# Calcular MAPE
mape_improved = mean_absolute_percentage_error(y_test, y_pred_improved)
print(f'MAPE do modelo melhorado: {mape_improved}')

# Melhores hiperparâmetros
print(f'Melhores hiperparâmetros: {random_search.best_params_}')

MAPE do modelo melhorado: 0.1447581088204801
Melhores hiperparâmetros: {'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 4}


# 6 - Métricas (MAPE para modelo baseline e melhorado)

In [26]:
print(f'MAPE do modelo baseline: {mape_baseline}')
print(f'MAPE do modelo melhorado: {mape_improved}')

MAPE do modelo baseline: 0.12950479231609247
MAPE do modelo melhorado: 0.1447581088204801
