In [138]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Leitura dos dados
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')  # Corrigir para 'test.csv'

# Escolha das features
features = ['Id', 'MSSubClass', 'LotFrontage', 'MiscVal', 'MoSold', 'YrSold']
X_train = train[features]
y_train = train.SalePrice

# Divisão dos dados de treino
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Configuração do modelo
my_model = XGBRegressor(n_estimators=500, learning_rate=0.05, enable_categorical=True)
my_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)

# Previsões no conjunto de validação
prediction = my_model.predict(X_valid)
mae = mean_absolute_error(prediction, y_valid)
print("Mean Absolute Error on Validation Set:", mae)

# Previsões no conjunto de teste
p = my_model.predict(test[features].fillna(0))  # Certifique-se de tratar NaNs no conjunto de teste adequadamente




Mean Absolute Error on Validation Set: 44447.62217465753




Vamos tentar melhorar esse modelo

NOVAS VARIÁVEIS

In [139]:
test['Age']=test['YrSold']-test['YearBuilt']
train['Age']=train['YrSold']-train['YearBuilt']
train['RemodelAge']=train['YearRemodAdd']!=train['YearBuilt'].astype(int)
test['RemodelAge']=test['YearRemodAdd']!=test['YearBuilt'].astype(int)

train['PoolQCELITE'] = train['PoolQC'] == 3            #qualidade maxima da piscina
train['TotalOutdoorArea'] = train['OpenPorchSF'] + train['EnclosedPorch'] + train['3SsnPorch'] + train['ScreenPorch'] + train['WoodDeckSF'] #area total da varanda
train['TotalGarage'] = train['GarageCars'] + train['GarageArea']       #total de garagens


test['PoolQCELITE'] = test['PoolQC'] == 3            #qualidade maxima da piscina
test['TotalOutdoorArea'] = test['OpenPorchSF'] + test['EnclosedPorch'] + test['3SsnPorch'] + test['ScreenPorch'] + test['WoodDeckSF'] #area total da varanda
test['TotalGarage'] = test['GarageCars'] + test['GarageArea']       #total de garagens

In [140]:
# Verificar quais colunas têm valores nulos
colunas_com_nulos = train.isnull().sum()

# Filtrar apenas as colunas com valores nulos
colunas_com_nulos = colunas_com_nulos[colunas_com_nulos > 0]

# Exibir as colunas com valores nulos e suas contagens
print(colunas_com_nulos)

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


dropando as colunas com muitos valores nulos

In [141]:
columns=['Alley','MasVnrType','FireplaceQu','PoolQC','Fence','MiscFeature','Utilities','Street']
train=train.drop(columns,axis=1)
test=test.drop(columns,axis=1)

In [142]:
#codigo para transformar variaveis categoricas em numericas
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for i in train.columns:
    if train[i].dtype=='object':
        train[i]=le.fit_transform(train[i])
for i in test.columns:
    if test[i].dtype=='object':
        test[i]=le.fit_transform(test[i])

In [143]:
train['Age']

0        5
1       31
2        7
3       91
4        8
        ..
1455     8
1456    32
1457    69
1458    60
1459    43
Name: Age, Length: 1460, dtype: int64

In [144]:
variaveis = ['Id', 'MSSubClass', 'LotFrontage','MiscVal','MoSold','Age','RemodelAge','TotalGarage','PoolQCELITE','TotalOutdoorArea']

X = train[variaveis].fillna(-1)
y = train.SalePrice

In [145]:
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Age,RemodelAge,PoolQCELITE,TotalOutdoorArea,TotalGarage
0,1461,20,2,80.0,11622,1,2,3,3,0,...,0,6,2010,8,4,49,False,False,260,731.0
1,1462,20,3,81.0,14267,1,2,0,3,0,...,12500,6,2010,8,4,52,False,False,429,313.0
2,1463,60,3,74.0,13830,1,2,0,3,0,...,0,3,2010,8,4,13,True,False,246,484.0
3,1464,60,3,78.0,9978,1,2,0,3,0,...,0,6,2010,8,4,12,False,False,396,472.0
4,1465,120,3,43.0,5005,1,2,0,1,0,...,0,1,2010,8,4,18,False,False,226,508.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,4,21.0,1936,1,2,3,3,0,...,0,6,2006,8,4,36,False,False,0,0.0
1455,2916,160,4,21.0,1894,1,2,3,3,0,...,0,4,2006,8,0,36,False,False,24,287.0
1456,2917,20,3,160.0,20000,1,2,3,3,0,...,0,9,2006,8,0,46,True,False,474,578.0
1457,2918,85,3,62.0,10441,1,2,3,3,0,...,700,7,2006,8,4,14,False,False,112,0.0


In [146]:
from sklearn.ensemble import RandomForestRegressor
my_model2 = RandomForestRegressor(n_estimators=150,random_state=42)

In [147]:

my_model.fit(X,y)
p = my_model.predict(test[variaveis].fillna(-1))


In [148]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(my_model, X, y, scoring='neg_mean_absolute_error', cv=5)

# Exiba a média dos scores
print('Média do MAE na validação cruzada:', -scores.mean())

Média do MAE na validação cruzada: 32034.312724743148


Média do MAE na validação cruzada: 31755.23785049229


In [151]:
submission2=pd.DataFrame()
submission2['Id']=test['Id']

submission2['SalePrice']=p
submission2.to_csv('modelo.csv',index=False)