In [57]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Read the training and test CSV files, using the 'Id' column as the row
# index of each frame.
X, test_data = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [58]:
# Summarise the test frame: per-column non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuilt 

In [59]:
# Rows without a target value cannot be used for supervised training, so drop
# them before separating the target from the features.
# Reassign instead of using inplace=True so the frame is never mutated behind
# the back of other references to it.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']

# Remove the target and a fixed set of feature columns from the feature matrix.
# NOTE(review): the removed feature columns look like the mostly-empty ones
# (e.g. 'Alley' has only 107 non-null values out of 1459 in test_data) --
# confirm this holds for the others.
X = X.drop(columns=['SalePrice', 'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

# Split the data into training and validation sets (80% / 20%, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Column lists are derived from the training split only.
# Categorical: object-dtype columns with fewer than 10 distinct values.
# Numerical:   int64 / float64 columns.
# NOTE: object columns with 10 or more distinct values end up in neither list,
# so a ColumnTransformer built from these lists with remainder='drop' will
# discard them.
categorial_cols = [
    col for col in X_train.columns
    if X_train[col].nunique() < 10 and X_train[col].dtype == 'object'
]
numerical_cols = [
    col for col in X_train.columns
    if X_train[col].dtype in ['int64', 'float64']
]

# Preprocessing the data

In [60]:
# Numerical columns: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time (handle_unknown='ignore').
_categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore')),
]
categorial_transformer = Pipeline(steps=_categorical_steps)

# Route each column list to its transformer.
_column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorial_transformer, categorial_cols),
]
preprocessor = ColumnTransformer(transformers=_column_routes)

# Define model

In [61]:
# Base regressor. The keyword is `n_estimators` (plural); a misspelled keyword
# is not a recognised XGBRegressor parameter and would leave the number of
# boosting rounds at its default.
# These values are only the defaults of the base estimator: a hyper-parameter
# search that sets `model__n_estimators` / `model__learning_rate` overrides
# them for each candidate.
model = XGBRegressor(n_estimators=700, learning_rate=0.05, verbosity=0, random_state=42)

# Full pipeline: preprocessing followed by the regressor, so the same
# transformations are fitted and applied together with the model.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# Define parameter grid for GridSearchCV and run it

In [62]:
# Candidate values for the regressor step of the pipeline. The `model__`
# prefix addresses the pipeline step named "model". Five parameters with two
# values each give 32 combinations.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[3, 5],
    model__learning_rate=[0.05, 0.1],
    model__subsample=[0.8, 1.0],
    model__colsample_bytree=[0.7, 1.0],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative mean absolute error (higher is better, i.e. lower MAE).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=0,
)
grid.fit(X_train, y_train)

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__colsample_bytree': [0.7, 1.0], 'model__learning_rate': [0.05, 0.1], 'model__max_depth': [3, 5], 'model__n_estimators': [100, 200], ...}"
,scoring,'neg_mean_absolute_error'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


# Best model pipeline 

In [None]:
# Pipeline with the best parameter combination found by the grid search
# (GridSearchCV refits it on the full training split when refit=True).
final_pipeline = grid.best_estimator_

# Evaluate on the held-out validation set.
# `mean_absolute_error` is already imported in the notebook's import cell.
preds = final_pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)

# Author's recorded runs: validation MAE went from 15859.45 to 15525.01,
# i.e. the error dropped by roughly 2% (lower MAE is better).
print("Validation MAE:", mae)
print("Best Hyperparameters:", grid.best_params_)

Validation MAE: 15525.0068359375
Best Hyperparameters: {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.8}


In [64]:
# Submission step (currently disabled). When uncommented, these lines predict
# on `test_data` with the tuned pipeline and write an Id/SalePrice table to
# 'submission.csv' in the working directory.
#predictions = final_pipeline.predict(test_data)
#output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
#output.to_csv('submission.csv', index=False)