# Introduction
***
## House Prices: Advanced Regression Techniques

In this notebook, we will explore the dataset of house prices in Ames, Iowa and build a machine learning model that predicts the sale price of each house.

> Our goal is to build a model that accurately predicts the sale price of each house

This notebook ranked in the top 20% of all participants. Let's get started!

## What you will learn

- Create easy to use **data processing pipelines** using scikit-learn
- Apply a **quantile transformer** to normalize the data and reduce the influence of outliers
- Use **`OneHotEncoder`** and **`StandardScaler`** to encode categorical features and scale numerical features
- Optimize the model parameters using **hyperopt** library
- Fit and train **`XGBRegressor`**

# Libraries
***

In [None]:
# Holy grail
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

# Visualization
from matplotlib import pyplot as plt

# Machine learning model
from xgboost import XGBRegressor

# Hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Single seed shared by the imputer, the quantile transformer and the train/CV split.
RANDOM_STATE = 2137

# Load data
***

In [None]:
# Training data indexed by the `Id` column; it carries the `SalePrice` target used later on.
train = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)

In [None]:
# Competition test features, indexed by `Id` in the same way as the training data.
x_test = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    index_col='Id',
)
x_test.shape

# Data analysis
***

In [None]:
# Summary of the training frame: column names, dtypes and non-null counts.
train.info()

In [None]:
# Grid of histograms, one per numeric column of the training frame.
train.hist(figsize=(16,16))
plt.show()

In [None]:
# Separate the target column from the predictors.
y = train['SalePrice']
x = train.drop(columns='SalePrice')
print(x.shape, y.shape)

# Create pipeline for data processing

In [None]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and 1.4 removed the old spelling. Try the new name first and fall back to the
# old one so the cell runs on both older and current releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode every object-dtype column and pass all other columns through
# unchanged. Categories not seen during fit are ignored instead of raising.
my_transformer = make_column_transformer(
    (
        one_hot_encoder,
        make_column_selector(dtype_include='object')
    ),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Preprocessing chain: encode -> impute missing values -> standardize ->
# map every feature onto a normal distribution.
pipe = make_pipeline(
    my_transformer,
    IterativeImputer(random_state=RANDOM_STATE),
    StandardScaler(),
    QuantileTransformer(output_distribution='normal', random_state=RANDOM_STATE)
)

In [None]:
# Fit the preprocessing pipeline and rebuild a labelled DataFrame from its array output.
# The input is taken from `train` rather than from `x`, so re-running this cell never
# feeds already-transformed data back into the pipeline. The original `Id` index is
# kept so that `x` and `y` stay aligned by label as well as by position.
# NOTE(review): the pipeline is fitted on every training row before the train/CV split
# made in a later cell, so the CV rows contribute to the fitted preprocessing statistics.
raw_features = train.drop(columns='SalePrice')
x = pd.DataFrame(
    pipe.fit_transform(raw_features),
    columns=my_transformer.get_feature_names_out(),
    index=raw_features.index,
)
x.head()

# Split into train and cross-validation

In [None]:
# Keep 80% of the rows for training and hold the rest out for validation;
# the fixed seed makes the split repeatable.
x_train, x_cv, y_train, y_cv = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=RANDOM_STATE,
)

# Shapes of the (features, target) pair for each split.
for features_part, target_part in ((x_train, y_train), (x_cv, y_cv)):
    print(features_part.shape, target_part.shape)

# Modeling
***

## Tune hyperparameters using Hyperopt

In [None]:
# Hyperopt search space consumed by `objective`.
# hp.uniform draws a float between the two bounds; hp.quniform draws a multiple of
# the step (last argument) and returns it as a float, hence the int() casts in `objective`.
# NOTE(review): `objective` does not forward 'gamma' or 'seed' to the model, so
# sampling 'gamma' currently has no effect on the fitted regressor.
space = {
    'gamma': hp.uniform('gamma', 0, 5),
    'eta': hp.uniform('eta', 0, .3),
    'subsample': hp.uniform('subsample', 0, 1),
    'n_estimators': 180,
    'max_depth': hp.quniform('max_depth', 1, 20, 1),
    # `objective` forwards this entry to the model, so it must be sampled here to be tuned.
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 1),
    'seed': 0
}

In [None]:
def objective(space):
    """Hyperopt objective: fit an XGBRegressor with sampled parameters and score it on the CV split.

    Reads the module-level `x_train`, `y_train`, `x_cv` and `y_cv`.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space. The keys 'eta', 'n_estimators',
        'max_depth', 'reg_lambda', 'subsample' and 'colsample_bytree' are required;
        'min_child_weight' is optional. Any other entry (e.g. 'gamma', 'seed') is ignored.

    Returns
    -------
    dict
        'loss' (mean squared error on the CV split), 'status' (STATUS_OK) and
        'model' (the fitted regressor).
    """
    model = XGBRegressor(
        eta=space['eta'],
        # Quantized hyperopt samples arrive as floats; XGBoost needs integers here.
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        reg_lambda=space['reg_lambda'],
        subsample=space['subsample'],
        # Fall back to XGBoost's default of 1 when the search space does not sample
        # this parameter, instead of failing the trial with a KeyError.
        min_child_weight=int(space.get('min_child_weight', 1)),
        colsample_bytree=space['colsample_bytree'],
        eval_metric="rmse",
        early_stopping_rounds=10
    )

    # XGBoost uses the last entry of eval_set (the CV split) for early stopping.
    evaluation = [(x_train, y_train), (x_cv, y_cv)]

    model.fit(
        x_train, y_train,
        eval_set=evaluation,
        verbose=False
    )

    pred = model.predict(x_cv)
    # Plain MSE is the default. The `squared` keyword is no longer passed because
    # recent scikit-learn releases removed it.
    mse = mean_squared_error(y_cv, pred)
    return {'loss': mse, 'status': STATUS_OK, 'model': model}

In [None]:
# Hyperopt container that would record every evaluated trial of the search below.
trials = Trials()

# The search itself is left disabled: it requests 5000 evaluations, each of which fits a model.
# Uncomment to re-run it. Presumably this is the source of the hard-coded values used
# further down; confirm before relying on that.
#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals=5000, trials = trials)

In [None]:
#best_hyperparams

In [None]:
# Hyperparameters for the final model, grouped by what they control.

# Boosting schedule
n_estimators = 180
eta = 0.06258588022261978

# Tree complexity
max_depth = 5
min_child_weight = 4

# Regularisation and row/column sampling
reg_lambda = 6.675546531790164
subsample = 0.3974055570153105
colsample_bytree = 0.4396092226617401

## Fit XGBRegressor with the best hyperparameters

In [None]:
# Final regressor configured with the hyperparameters defined earlier in the notebook.
model = XGBRegressor(
    n_estimators=n_estimators,
    eta=eta,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    reg_lambda=reg_lambda,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    early_stopping_rounds=10,
)

# Both splits are evaluated during training; the per-round log is silenced.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_cv, y_cv)],
    verbose=False,
)

## Finally, let's measure the model's performance

In [None]:
# Predictions on both splits, so the training error can be compared with the held-out error.
train_pred = model.predict(x_train)
cv_pred = model.predict(x_cv)

print(f'Train MSE: {mean_squared_error(y_train , train_pred)}')
print(f'CV MSE: {mean_squared_error(y_cv , cv_pred)}')

# `score` on a regressor reports R^2 on the given data. The label previously read
# "CGBoost", a typo for the model family in use.
print(f'XGBoost score: {model.score(x_cv, y_cv)}')

# Submission
***

In [None]:
# Apply the already-fitted pipeline (transform only, no refit) to the test features.
# The `Id` index is carried over so every transformed row can still be matched to its house.
# NOTE: this cell rebinds `x_test`. Run it once per kernel session, because a second
# run would pass the transformed frame back into the pipeline.
x_test = pd.DataFrame(
    pipe.transform(x_test),
    columns=my_transformer.get_feature_names_out(),
    index=x_test.index,
)

In [None]:
# Start from the competition's sample file so the output keeps its columns and row order.
# The absolute path matches the other data loads and does not depend on the working directory.
sample_submission_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Predictions are assigned positionally.
# NOTE(review): this relies on sample_submission.csv listing the houses in the same
# order as test.csv. Confirm, or align on `Id`.
sample_submission_df['SalePrice'] = model.predict(x_test)
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)

# Preview only the first rows instead of rendering the whole frame.
sample_submission_df.head()