# 🏠 Kaggle Competition for House Prices 💵: Advanced Regression Techniques 
***

# 📤 Import the Libraries

In [None]:
!pip install -q hvplot

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas

import optuna
import xgboost as xgb
from optuna.samplers import TPESampler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

%matplotlib inline

# Show every column of wide frames, but cap the number of displayed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# 💾 Load Data

In [None]:
# Directory holding the competition CSV files (relative to the notebook).
input_dir = '../input/house-prices-advanced-regression-techniques'

train = pd.read_csv(f'{input_dir}/train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')
sample_submission = pd.read_csv(f'{input_dir}/sample_submission.csv')

# Preview the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the sample submission frame.
sample_submission.head()

# 📊 Exploratory Data Analysis (EDA)

In this section we analyze the data to find out the following:
1. Missing Values
1. All The Numerical Variables
1. Distribution of the Numerical Variables
1. Categorical Variables
1. Cardinality of Categorical Variables
1. Outliers
1. Relationship between independent and dependent feature(SalePrice)

In [None]:
# Report (rows, columns) of both splits.
print(f"Train data shape {train.shape}")
print(f"Test data shape {test.shape}")

In [None]:
# Histogram of the target variable.
train.hvplot.hist(y="SalePrice", title="Sales Price Distribution")

In [None]:
# Summary statistics of the target column.
train.loc[:, 'SalePrice'].describe()

In [None]:
# Shape of the subset of houses sold above 500000.
is_expensive = train['SalePrice'] > 500000
train.loc[is_expensive].shape

Only 9 houses in the training data have a sale price of more than 500,000.

## Missing Values

In [None]:
# Null count per column, restricted to columns that actually have gaps and
# ordered ascending for the horizontal bar chart.
missing = train.isnull().sum()
missing = missing.loc[missing > 0].sort_values()
missing.hvplot.barh(title="Missing Values (Training Data)")

In [None]:
# Null count per column, restricted to columns that actually have gaps and
# ordered ascending for the horizontal bar chart.
missing = test.isnull().sum()
missing = missing.loc[missing > 0].sort_values()
missing.hvplot.barh(title="Missing Values (Testing Data)", height=500)

In [None]:
# Print the missing-value count and percentage of every training column that
# has gaps, and collect the columns where more than a third of rows are missing.
n_train_rows = train.shape[0]
train_missing = []
for column, n_missing in train.isna().sum().items():
    if n_missing == 0:
        continue
    print(f"{column:-<{30}}: {n_missing} ({n_missing / n_train_rows * 100:.2f}%)")
    if n_missing > n_train_rows / 3:
        train_missing.append(column)

In [None]:
# Print the missing-value count and percentage of every test column that has
# gaps, and collect the columns where more than a third of rows are missing.
n_test_rows = test.shape[0]
test_missing = []
for column, n_missing in test.isna().sum().items():
    if n_missing == 0:
        continue
    print(f"{column:-<{30}}: {n_missing} ({n_missing / n_test_rows * 100:.2f}%)")
    if n_missing > n_test_rows / 3:
        test_missing.append(column)

In [None]:
# Columns flagged as more than one-third missing in each split.
print(train_missing)
print(test_missing)

In [None]:
# Drop the sparsely populated columns. The list derived from the training data
# is applied to both frames so that they keep the same set of columns.
# Reassigning (instead of inplace=True) with errors='ignore' lets this cell be
# re-run without a KeyError for columns that were already removed.
train = train.drop(columns=train_missing, errors='ignore')
test = test.drop(columns=train_missing, errors='ignore')

## `MSSubClass`, `MSZoning`

In [None]:
# Cardinality of each feature in the train and test splits.
for feature in ('MSZoning', 'MSSubClass'):
    print(f"{feature} number of unique values (Train): {train[feature].nunique()}")
    print(f"{feature} number of unique values (Test): {test[feature].nunique()}")
    if feature == 'MSZoning':
        print()

In [None]:
# Snapshot of the current training column names as a plain list.
all_columns = list(train.columns)

In [None]:
# Frequency of each MSSubClass value in the training data.
train['MSSubClass'].value_counts().hvplot.bar(title="MSSubClass (Training Data)")

In [None]:
# Frequency of each MSSubClass value in the test data.
subclass_counts_test = test['MSSubClass'].value_counts()
subclass_counts_test.hvplot.bar(title="MSSubClass (Testing Data)")

## `LotArea`, `LotFrontage`, `LotShape`, `LotConfig`

In [None]:
# Cardinality of each lot-related feature in the train and test splits.
lot_features = ('LotArea', 'LotFrontage', 'LotShape', 'LotConfig')
for position, feature in enumerate(lot_features):
    if position:
        print()  # blank line between features
    print(f"{feature} number of unique values (Train): {train[feature].nunique()}")
    print(f"{feature} number of unique values (Test): {test[feature].nunique()}")

In [None]:
# Titled like the other figures so the plot stands on its own when skimmed.
train.hvplot.scatter(x='LotArea', y='SalePrice', title="LotArea vs SalePrice (Training Data)")

In [None]:
# Titled like the other figures so the plot stands on its own when skimmed.
train.hvplot.scatter(x='LotFrontage', y='SalePrice', title="LotFrontage vs SalePrice (Training Data)")

In [None]:
# The title distinguishes this chart from the otherwise identical test-data one.
train['LotShape'].value_counts().hvplot.bar(title="LotShape (Training Data)")

In [None]:
# The title distinguishes this chart from the otherwise identical training-data one.
test['LotShape'].value_counts().hvplot.bar(title="LotShape (Testing Data)")

In [None]:
# The title distinguishes this chart from the otherwise identical test-data one.
train['LotConfig'].value_counts().hvplot.bar(title="LotConfig (Training Data)")

In [None]:
# The title distinguishes this chart from the otherwise identical training-data one.
test['LotConfig'].value_counts().hvplot.bar(title="LotConfig (Testing Data)")

In [None]:
# Titled like the other figures so the plot stands on its own when skimmed.
train.hvplot.scatter(x='GrLivArea', y='SalePrice', title="GrLivArea vs SalePrice (Training Data)")

In [None]:
# Titled like the other figures so the plot stands on its own when skimmed.
train.hvplot.scatter(x='TotalBsmtSF', y='SalePrice', title="TotalBsmtSF vs SalePrice (Training Data)")

In [None]:
# SalePrice distribution per OverallQual level; titled like the other figures.
train.hvplot.box(by='OverallQual', y='SalePrice', title="SalePrice by OverallQual (Training Data)")

In [None]:
# Correlation heatmap of the numeric features. Numeric columns are selected
# explicitly because DataFrame.corr() raises on string columns in
# pandas >= 2.0 (older versions silently ignored them).
numeric_corr = train.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_corr, vmax=.8, square=True)

In [None]:
# The 15 numeric columns most correlated with SalePrice (SalePrice itself
# included). Numeric columns are selected explicitly because DataFrame.corr()
# raises on string columns in pandas >= 2.0.
cols = (
    train.select_dtypes(include='number')
    .corr()
    .nlargest(15, 'SalePrice')['SalePrice']
    .index
)
plt.figure(figsize=(10, 8))
sns.heatmap(train[cols].corr(), annot=True, vmax=.8, square=True)

In [None]:
# From each pair of highly correlated features, the one less correlated with
# SalePrice is removed.
redundant_features = ['GarageArea', '1stFlrSF', 'TotRmsAbvGrd', '2ndFlrSF']

print(f"Train dataset shape before removing: {train.shape}")
print(f"Test dataset shape before removing: {test.shape}")

train = train.drop(columns=redundant_features)
test = test.drop(columns=redundant_features)

# removing outliers (currently disabled)
# train = train[train['GrLivArea'] < 4500]
# Rebuild a clean 0..n-1 index so positional and label-based row lookups agree
# (this only changes anything when rows were filtered out above). The original
# author notes this is important to make optuna work.
train = train.reset_index(drop=True)

print(f"Train dataset shape after removing: {train.shape}")
print(f"Test dataset shape after removing: {test.shape}")

# 📦 Data Pre-processing

In [None]:
# Training columns that still contain missing values.
missing_features = [col for col, n_missing in train.isna().sum().items() if n_missing != 0]
# Training columns stored with the generic object dtype (treated as categorical).
categorical_col = [col for col, dtype in train.dtypes.items() if dtype == object]

In [None]:
# One list per line: columns with gaps, then the categorical columns.
print(missing_features, categorical_col, sep='\n')

In [None]:
# Separate the predictors from the target; 'Id' is removed from both frames.
X = train.drop(['Id', 'SalePrice'], axis=1)
y = train['SalePrice']
test = test.drop(columns='Id')

# Fill gaps with each column's most frequent value. The imputer is fitted on
# the training predictors only and then applied unchanged to the test frame.
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical columns; every other column passes through.
column_transformer = make_column_transformer(
    (ohe, categorical_col),
    remainder='passthrough'
)

# The imputed frames hold mixed string/number columns, so the stacked output
# can come back as dtype=object; cast to float so the model receives a proper
# numeric matrix instead of relying on implicit conversion.
X = column_transformer.fit_transform(X).astype(float)
test = column_transformer.transform(test).astype(float)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Test shape: {test.shape}")

# 🤖 Model Building & Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of an XGBoost regressor for one trial.

    Reads the notebook-level feature matrix ``X`` and target ``y``, samples a
    set of hyperparameters from ``trial``, fits an ``XGBRegressor`` on 67% of
    the rows and scores it on the remaining 33%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    # A fixed seed scores every trial on the same split, so score differences
    # reflect the hyperparameters rather than the luck of the split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.33, random_state=42)

    # suggest_float(..., step=...) replaces the deprecated
    # suggest_discrete_uniform; `step` is passed by keyword throughout.
    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500, step=50),
        'eta': trial.suggest_float('eta', 0.01, 0.1, step=0.01),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
    }

    # eval_metric is given to the constructor because passing it to fit() is
    # deprecated in recent XGBoost releases.
    # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost; newer releases
    # prefer tree_method='hist' with device='cuda' - confirm for the installed version.
    reg = xgb.XGBRegressor(tree_method='gpu_hist', eval_metric='rmse', **param_grid)
    # TODO: PRUNING
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')
    reg.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

    # sqrt(MSE) instead of mean_squared_error(squared=False): the `squared`
    # argument is deprecated/removed in newer scikit-learn versions.
    return np.sqrt(mean_squared_error(y_valid, reg.predict(X_valid)))

In [None]:
train_time = 10 * 60  # tuning budget in seconds (10 minutes)
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# The number of completed trials still depends on how many fit in the budget.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42), study_name='XGBRegressor')
study.optimize(objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial  # reused by the final-training cell

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
# Train one model per fold with the best hyperparameters and average the
# per-fold predictions on the test matrix.

# Copy so the best trial's own parameter dict is left untouched.
xgb_params = dict(trial.params)
# xgb_params['eta'] = 0.01
xgb_params['tree_method'] = 'gpu_hist'
# Passed via the constructor because eval_metric in fit() is deprecated in
# recent XGBoost releases.
xgb_params['eval_metric'] = 'rmse'

n_splits = 10
# Seeded so the folds (and therefore the reported scores) are repeatable.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_test_preds = []
kf_rmse = []

for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
    # KFold yields positional indices, so the Series `y` is indexed with .iloc
    # rather than by label.
    X_train, y_train = X[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X[valid_idx], y.iloc[valid_idx]

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              verbose=False)

    valid_pred = model.predict(X_valid)
    # sqrt(MSE) works across scikit-learn versions (squared=False was removed).
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(f'Fold {fold+1}/{n_splits} RMSE: {rmse:.4f}')
    kf_rmse.append(rmse)

    fold_test_preds.append(model.predict(test))

# Element-wise mean of the per-fold test predictions.
test_preds = np.mean(fold_test_preds, axis=0)
print(f'Average KFold RMSE: {np.mean(kf_rmse):.5f}')

In [None]:
# Replace the placeholder prices with the averaged predictions and write the
# submission file (without the index column).
sample_submission = sample_submission.assign(SalePrice=test_preds)
sample_submission.to_csv('submission.csv', index=False)