In [2]:
import numpy as numpy
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [4]:
# Read the two competition CSVs: the training file (which carries the
# 'SalePrice' column used as the target further down) and the test file
# that predictions are produced for.
train_csv = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

prices_full = pd.read_csv(train_csv)
prices_test = pd.read_csv(test_csv)

# Bare last expression so the notebook renders the training frame.
prices_full

In [5]:
# Summary statistics of the training frame, using DataFrame.describe() defaults.
prices_full.describe()

In [6]:
# Heatmap of pairwise correlations between the numeric columns.
# The correlation is computed on numeric columns only: calling corr() on a
# frame that also contains non-numeric (e.g. string) columns raises on
# pandas >= 2.0, where such columns are no longer skipped silently.
fig, ax = plt.subplots(figsize=(15, 15))
numeric_corr = prices_full.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, ax=ax)
# Trailing ';' keeps the Text repr of set_title out of the cell output.
ax.set_title('Correlation between numeric columns');

In [7]:
# Per-column dtype and non-null count, to spot columns with missing values.
prices_full.info()

In [8]:
# Missing-value map: the boolean isna() mask is drawn as a heatmap, one
# cell per (row, column) entry of the training frame.
fig, ax = plt.subplots(figsize=(10, 10))
missing_mask = prices_full.isna()
sns.heatmap(missing_mask, ax=ax)

In [9]:
# Separate the target from the predictors. drop() returns a new frame, so
# prices_full itself still contains 'SalePrice' afterwards.
y_train = prices_full['SalePrice']
X_train = prices_full.drop(columns='SalePrice')

# The test file is used as-is; X_test is just another name for prices_test.
X_test = prices_test

### Drop Columns

In [10]:
# Columns to leave out of the model.
drop_cols = ['Alley', 'PoolQC', 'MiscFeature', 'Fence']

# X_train was produced by an earlier prices_full.drop(...) call, which returns
# a new frame, so dropping from prices_full at this point would not change the
# data that is actually fed to the model (and X_test would keep the columns
# as well). Drop from the feature frames instead, without mutating the raw
# frames. errors='ignore' keeps the cell re-runnable once the columns are gone.
X_train = X_train.drop(columns=drop_cols, errors='ignore')
X_test = X_test.drop(columns=drop_cols, errors='ignore')

In [11]:
# Partition the feature columns by dtype, keeping the frame's column order:
# object-dtype columns are treated as categorical, everything else as numeric.
# NOTE(review): any numeric identifier column (e.g. 'Id', if present in
# X_train) lands in num_cols and is therefore fed to the model - confirm
# that this is intended.
is_categorical = {col: X_train[col].dtype == 'object' for col in X_train}
cat_cols = [col for col, flag in is_categorical.items() if flag]
num_cols = [col for col, flag in is_categorical.items() if not flag]

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler

# Numeric columns: impute with the 'constant' strategy (no fill_value is
# given, so the imputer's default is used), then scale with MaxAbsScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', MaxAbsScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: impute with the most frequent value, then one-hot
# encode; handle_unknown='ignore' lets transform() accept categories that
# were not seen during fit instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

In [13]:
from xgboost import XGBRegressor

In [14]:
# Baseline regressor. Both hyperparameters set here are also part of the grid
# searched later via the 'model__' prefixed entries.
model = XGBRegressor(
    learning_rate=0.03,
    n_estimators=250,
)

In [15]:
# End-to-end estimator: preprocessing followed by the regressor. The step
# names matter: the grid-search keys address the regressor as 'model__...'.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
# Exhaustive search over 5 x 5 = 25 hyperparameter combinations. The
# 'model__' prefix routes each value to the pipeline step named 'model'.
# Cross-validation and scoring are left at GridSearchCV's defaults.
n_estimators_grid = [100, 200, 300, 400, 500]
learning_rate_grid = [0.3, 0.03, 0.09, 0.003, 0.009]
params = {
    'model__n_estimators': n_estimators_grid,
    'model__learning_rate': learning_rate_grid,
}

search_model = GridSearchCV(estimator=my_pipeline, param_grid=params)

# fit() is the last expression, so the fitted search object is displayed.
search_model.fit(X_train, y_train)

In [22]:
# Look the winning hyperparameters up by name instead of unpacking
# best_params_.values(): positional unpacking depends on the dict's key order
# and would silently swap the two values if that order were ever different.
best_params = search_model.best_params_
learning_rate = best_params['model__learning_rate']
n_estimators = best_params['model__n_estimators']

print(n_estimators, learning_rate)

## Best Model

In [23]:
# Fresh regressor configured with the hyperparameters picked by the search.
best_model = XGBRegressor(
    learning_rate=learning_rate,
    n_estimators=n_estimators,
)

In [24]:
# Final estimator: the same preprocessor object used in my_pipeline, followed
# by the regressor built from the best hyperparameters.
best_steps = [
    ('preprocessor', preprocessor),
    ('model', best_model),
]
best_pipeline = Pipeline(steps=best_steps)

In [25]:
# Fit the final pipeline (preprocessing + regressor) on the full training data.
best_pipeline.fit(X_train, y_train)

In [26]:
# Predict the target for the test features with the fitted final pipeline.
preds = best_pipeline.predict(X_test)

In [27]:
# Pair each test row's Id with its predicted SalePrice and write the result
# to ./submission.csv without the DataFrame index.
submission = pd.DataFrame({'Id': prices_test['Id'], 'SalePrice': preds})
submission.to_csv('./submission.csv', index=False)