<a href="https://colab.research.google.com/github/Phanttan/Kaggle_house-prices-advanced-regression-techniques/blob/main/Pipeline_RandomizedSearchCV_XGBoostRegression_on_Ames_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Dattasets from Kaggle API

In [1]:
# Author:  Tan Phan <phanttan@gmail.com>
# License: BSD 3 clause

import os
import shutil
# Create kaggle folder in Colab
kaggle_dir = "/kaggle/input"
directory = "house-prices-advanced-regression-techniques"
path = os.path.join(kaggle_dir, directory)
try: 
    os.makedirs(kaggle_dir, exist_ok=True)
    print("Directory '%s' created " %kaggle_dir)
except OSError as error:
    print("Directory '%s' can not be created")

# Access to Kaggle Database
os.environ['KAGGLE_USERNAME'] = "phanttan" 
os.environ['KAGGLE_KEY'] = "9bfa667f159eeb635c1bd4db0d37c45e" 
!kaggle competitions download -c house-prices-advanced-regression-techniques -p house-prices-advanced-regression-techniques

# Move dataset to new folder 
shutil.move(directory, kaggle_dir)

Directory '/kaggle/input' created 
Downloading data_description.txt to house-prices-advanced-regression-techniques
  0% 0.00/13.1k [00:00<?, ?B/s]
100% 13.1k/13.1k [00:00<00:00, 12.6MB/s]
Downloading sample_submission.csv to house-prices-advanced-regression-techniques
  0% 0.00/31.2k [00:00<?, ?B/s]
100% 31.2k/31.2k [00:00<00:00, 32.0MB/s]
Downloading train.csv to house-prices-advanced-regression-techniques
  0% 0.00/450k [00:00<?, ?B/s]
100% 450k/450k [00:00<00:00, 81.7MB/s]
Downloading test.csv to house-prices-advanced-regression-techniques
  0% 0.00/441k [00:00<?, ?B/s]
100% 441k/441k [00:00<00:00, 62.3MB/s]


'/kaggle/input/house-prices-advanced-regression-techniques'

# Import packages and load dataset

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [3]:
train_full = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_full = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
train_full.shape, test_full.shape

((1460, 81), (1459, 80))

# Data Preprocessing

1.   Check duplications in Data
2.   Transformation pipelines





In [4]:
# Check for duplicates for train data
idsUnique = len(set(train_full.Id))
idsTotal = train_full.shape[0]
idsDupli = idsTotal - idsUnique

if idsDupli != 0:
    print("Have duplicated Id in train")
else:
    print("No dupicated Id in train")

# Check for duplicates for test data
idsUnique = len(set(test_full.Id))
idsTotal = test_full.shape[0]
idsDupli = idsTotal - idsUnique

if idsDupli != 0:
    print("Have duplicated Id in test")
else:
    print("No dupicated Id in test")

No dupicated Id in train
No dupicated Id in test


In [5]:
train = train_full.drop('Id', axis=1)
test = test_full.drop('Id', axis=1)
X = train.drop('SalePrice', axis=1)
y = train.SalePrice

In [6]:
# List out all features that are categorical
numerical_cols = X.select_dtypes(include= ['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include = ["object"]).columns

In [7]:
# Build Transformer pipeline
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                         ('scaler', StandardScaler(with_mean=False))
                                         ])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore')),
                                           ('scaler', StandardScaler(with_mean=False))
                                           ])
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),
                                              ('cat', categorical_transformer, categorical_cols)])

### Note: 


1.   We chose OHE(OneHot Encoder), not Label Encoder because Label Encoder is not fair with all Categories Variables. 
2.   A disadvantage of OHE is high cardinality, we can use PCA to reduce data dimension. 



In [8]:
# Split data for Training and Validation
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Define the Model

In [9]:
xgb_reg = xgb.XGBRegressor(random_state=42, seed=123, silent=True)
ames_pipe = Pipeline(steps=[('preprocessor_transformers', preprocessor),
                            ('model', xgb_reg)])

ames_pipe

Pipeline(memory=None,
         steps=[('preprocessor_transformers',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                  

# Fine-Tuning Model : Using RandomizedSearchCV


In [10]:
xgb_param_grid = {'model__subsample': np.arange(0.05, 1, 0.05),
                  'model__max_depth': np.arange(3,20,1),
                  'model__colsample_bytree': np.arange(0.1, 1.05, 0.05)}
randomized_neg_mse = RandomizedSearchCV(estimator= ames_pipe,
                                        param_distributions=xgb_param_grid,
                                        scoring='neg_mean_squared_error',
                                        cv=4)
randomized_neg_mse.fit(X_train, y_train)
# best_rmse = np.sqrt(np.abs(randomized_neg_mse.best_score_))
# best_param = randomized_neg_mse.best_params_
best_estimator= randomized_neg_mse.best_estimator_

In [None]:
y_pred = best_estimator.predict(X_val)
xgb_score = np.sqrt(mean_squared_error(y_val, y_pred))

In [11]:
def rmse_cv(model, X, y, n_folds):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmse= np.sqrt(-cross_val_score(model, X, np.log(y), scoring="neg_mean_squared_error", cv = kf))
    return rmse.mean()

In [12]:
rmse = rmse_cv(best_estimator, X_val, y_val, 5)
rmse

0.15679716433683744

In [None]:
y_pred = best_estimator.predict(test)

array([123202.29, 155386.17, 178618.3 , ..., 168859.95, 118827.16,
       233785.52], dtype=float32)

# Create Submission File

In [None]:
sample_sub = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
sample_sub.Id.shape, y_pred.shape

((1459,), (1459,))

In [None]:
final_data = {'Id': sample_sub.Id, 'SalePrice': y_pred}
final_submission = pd.DataFrame(data=final_data).to_csv('submission_file.csv', 
                                                        index=False)