# Slowly building the preprocessor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# The training CSV lives on Google Drive: turn the share link into a
# direct-download URL by extracting the file id (second-to-last path segment).
url = "https://drive.google.com/file/d/1iVBv5R6U53mofNpI9EkpFUQfwhYBk9MZ/view?usp=sharing"
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
data = pd.read_csv(path)

# Target is "SalePrice"; the feature frame is everything except "Id" and the target.
y = data["SalePrice"]
X = data.drop(columns=["Id", "SalePrice"])

# Fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=1,
)

# Split the feature frame by dtype; only the column labels of these two frames
# are used further down to route columns into the right sub-pipeline.
X_num = X.select_dtypes(include="number").copy()
X_cat = X.select_dtypes(exclude="number").copy()

# Numeric branch: impute missing values (SimpleImputer's default strategy here;
# the hyper-parameter search later overrides the strategy).
numerical_pipe = make_pipeline(SimpleImputer())

# Category order for each ordinal feature, lowest level first.
# "N_A" is the placeholder the categorical pipeline's imputer writes for
# missing values, so it is listed first as the lowest level.
ExterQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]

# Pair every ordinal column with its category list in one place so the column
# order and the `categories=` order of the encoder cannot drift apart.
ordinal_categories = {
    "ExterQual": ExterQual_cats,
    "ExterCond": ExterCond_cats,
    "BsmtQual": BsmtQual_cats,
    "BsmtCond": BsmtCond_cats,
    "BsmtExposure": BsmtExposure_cats,
    "BsmtFinType1": BsmtFinType1_cats,
    "KitchenQual": KitchenQual_cats,
    "FireplaceQu": FireplaceQu_cats,
}
ordinal_columns = list(ordinal_categories)
cats_ord = list(ordinal_categories.values())

# Positional indices (not labels) are used to address the columns inside the
# categorical ColumnTransformer.
columns_to_ordinal = X_cat.columns.get_indexer(ordinal_columns)

# get_indexer returns -1 for a label it cannot find, which would silently
# select the *last* column; fail loudly instead.
missing_ordinal = [
    col for col, pos in zip(ordinal_columns, columns_to_ordinal) if pos == -1
]
assert not missing_ordinal, f"ordinal columns missing from the data: {missing_ordinal}"

ordinal_encoder = OrdinalEncoder(categories=cats_ord)

# Every remaining categorical column is one-hot encoded.
# The exclusion must compare column *names* with column *names*: subtracting the
# integer positions in `columns_to_ordinal` from a set of names removes nothing
# and would one-hot encode the ordinal columns as well. Iterating the Index
# (instead of a set) also keeps the column order deterministic across runs.
columns_to_ohe = X_cat.columns.get_indexer(
    [col for col in X_cat.columns if col not in ordinal_categories]
)

# One-hot encoder for the nominal columns; categories unseen during fit are
# ignored at transform time instead of raising.
ohe_encoder = OneHotEncoder(handle_unknown="ignore")

# Route each group of categorical columns to its encoder.
categorical_transformers = [
    ("cat_ordinal", ordinal_encoder, columns_to_ordinal),
    ("cat_onehot", ohe_encoder, columns_to_ohe),
]
categorical_encoder = ColumnTransformer(transformers=categorical_transformers)

# Categorical branch: fill missing values with the "N_A" placeholder, then encode.
categorical_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
categorical_pipe = make_pipeline(categorical_imputer, categorical_encoder)

# Full preprocessor: numeric columns go through the numeric pipeline,
# non-numeric columns through the categorical pipeline.
branch_transformers = [
    ("num_pipe", numerical_pipe, X_num.columns),
    ("cat_pipe", categorical_pipe, X_cat.columns),
]
final_preprocessor = ColumnTransformer(transformers=branch_transformers)

# Modelling with GridSearchCV or RandomizedSearchCV

## GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression

# Full modelling pipeline: preprocess -> scale -> drop constant features ->
# keep the k best features by univariate F-test -> gradient boosting.
# with_mean=False scales without centering, which stays valid if the
# preprocessor emits a sparse matrix (possible with one-hot output).
final_pipe_gb = make_pipeline(final_preprocessor,
                              StandardScaler(with_mean=False),
                              VarianceThreshold(),
                              SelectKBest(score_func=f_regression),
                              GradientBoostingRegressor(random_state=123))

# Search space. Keys follow make_pipeline's auto-generated step names
# (lower-cased class names), nested with "__".
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "gradientboostingregressor__loss": ["squared_error", "absolute_error", "huber", "quantile"],
    "gradientboostingregressor__criterion": ["friedman_mse", "squared_error"],
    "gradientboostingregressor__n_estimators": range(50, 150, 20),
    "selectkbest__k": range(5, 85, 5)
}

# random_state pins which 50 candidates are sampled, so the search (and the
# reported best score) is reproducible after Restart & Run All.
gb_search = RandomizedSearchCV(final_pipe_gb,
                               param_grid,
                               cv=5,
                               n_iter=50,
                               scoring='neg_root_mean_squared_error',
                               random_state=123,
                               verbose=1)

gb_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrS...
                   param_distributions={'columntransformer__num_pipe__simpleimputer__strategy': ['mean',
                                                                                                 'median'],
                                        'gradientboostingregressor__criterion': ['

In [None]:
# Mean cross-validated score of the best candidate found by the search.
# The search is scored with 'neg_root_mean_squared_error', so this value is the
# negated RMSE: it is negative, and closer to zero is better.
gb_search.best_score_

-30064.16864199557

# Making a submission CSV

In [None]:
# Load the Kaggle competition data from Google Drive: the share link is turned
# into a direct-download URL using the file id (second-to-last path segment).
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
competition_file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + competition_file_id
competition_data = pd.read_csv(path)

In [None]:
# Feature frame for prediction: the competition data without its "Id" column.
my_test_X = competition_data.drop("Id", axis=1)

In [None]:
# Build the submission frame: the "Id" column next to the predicted "SalePrice".
# gb_search.predict uses the best pipeline found by the search.
my_submission = pd.DataFrame(competition_data["Id"])
my_submission["SalePrice"] = gb_search.predict(my_test_X)

my_submission.to_csv('my_submission_1.csv', index=False)

# Extras needed on colab: trigger a browser download of the file.
# The import is guarded so the cell also runs outside Colab, where the CSV
# simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; 'my_submission_1.csv' was written to the working directory.")
else:
    files.download("my_submission_1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>