In [1]:
from settings import DATA_ROOT, CATEGORICAL_COLS, FLOAT_COLS, RESPONSE, RESULTS_DIR
import pandas as pd
import numpy as np

train = pd.read_csv(DATA_ROOT / "train.csv")
test = pd.read_csv(DATA_ROOT / "test.csv")
print(train)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ... PoolArea PoolQC  Fe

In [2]:
cat_train = train[CATEGORICAL_COLS].astype(str)
cat_test = test[CATEGORICAL_COLS].astype(str)

from sklearn.model_selection import train_test_split

enc_data = pd.concat([cat_train, cat_test], axis=0)
# for encoder in encoders:
#     encoder.fit(enc_data)

X_num = train[FLOAT_COLS].astype(float).fillna(0.)
X_cat = train[CATEGORICAL_COLS].astype(str).fillna("")

assert set(X_cat.columns) == set(cat_train.columns) == set(cat_test.columns)

y = train[RESPONSE]

(
    X_cat_train,
    X_cat_val,
    X_num_train,
    X_num_val,
    y_train,
    y_val
) = train_test_split(
    X_cat,
    X_num,
    y,
    test_size=.33)

X_val = np.hstack([X_cat_val, X_num_val])
X_train = np.hstack([X_cat_train, X_num_train])

assert X_cat_val.shape[0] == X_num_val.shape[0] == X_val.shape[0] == len(y_val)
assert X_cat_train.shape[0] == X_num_train.shape[0] == X_train.shape[0] == len(y_train)
assert X_cat_val.shape[1] == X_cat_train.shape[1]
assert X_num_val.shape[1] == X_num_train.shape[1]
assert X_val.shape[1] == X_train.shape[1]


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor

def get_predictor_with_enc(encoder) -> Pipeline:
    steps = [
        ('encoder', encoder),
        ('feature_selection', SelectFromModel(RandomForestRegressor(n_jobs=-1))),
        ('regression', RandomForestRegressor(n_jobs=-1))
    ]
    return Pipeline(steps)

def get_predictor() -> Pipeline:
    steps = [
        ('feature_selection', SelectFromModel(RandomForestRegressor(n_jobs=-1))),
        ('regression', RandomForestRegressor(n_jobs=-1))
    ]
    return Pipeline(steps)


In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    CountEncoder,
    HashingEncoder,
    HelmertEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    PolynomialEncoder,
    SumEncoder
)

encoders = [
    BackwardDifferenceEncoder(),
    BaseNEncoder(),
    CountEncoder(),
    HashingEncoder(),
    HelmertEncoder(),
    OrdinalEncoder(),
    OneHotEncoder(),
    PolynomialEncoder(),
    SumEncoder()
]

predictors = [
    (encoder.__class__.__name__, get_predictor_with_enc(encoder))
    for encoder in  encoders]

regr_cat = StackingRegressor(predictors)

regr_cat.fit(X_cat_train, y_train)


StackingRegressor(estimators=[('BackwardDifferenceEncoder',
                               Pipeline(steps=[('encoder',
                                                BackwardDifferenceEncoder()),
                                               ('feature_selection',
                                                SelectFromModel(estimator=RandomForestRegressor(n_jobs=-1))),
                                               ('regression',
                                                RandomForestRegressor(n_jobs=-1))])),
                              ('BaseNEncoder',
                               Pipeline(steps=[('encoder', BaseNEncoder()),
                                               ('feature_selection',
                                                SelectFromModel(estima...
                               Pipeline(steps=[('encoder', PolynomialEncoder()),
                                               ('feature_selection',
                                                SelectFromMo

In [5]:
regr_num = get_predictor()
regr_num.fit(X_num_train, y_train)

predictions_cat = regr_cat.predict(X_cat_val)
predictions_num = regr_num.predict(X_num_val)

errs = [
    predictions_cat - y_val,
    predictions_num - y_val
]

# These will live on past this cell, therefore capitalized.
COVMAT = np.cov(errs)
BIASES = [np.mean(arr) for arr in errs]
VAL_RATIO = X_val.shape[0] / (X_val.shape[0] + X_train.shape[0])

def aggregate_predictions(covmat, biases, predictions):
    # Aggregate as if errors are independent
    w = 1. / covmat.diagonal()
    w /= sum(w)
    return np.matmul(
        w.T,
        np.vstack([
            prd - bias * VAL_RATIO
            for prd, bias in zip(predictions, biases)
        ])
    )

def aggregate_predictions_inv(covmat, biases, predictions):
    # Aggregate taking covariances into account
    w = np.linalg.inv(covmat).sum(axis=0)
    w /= sum(w)
    return np.matmul(
        w.T,
        np.vstack([
            prd - bias * VAL_RATIO
            for prd, bias in zip(predictions, biases)
        ])
    )

predictions_agg = aggregate_predictions_inv(
    COVMAT,
    BIASES,
    [predictions_cat, predictions_num])

def evaluate(y_pred, y_true):
    """Evaluate predictions"""
    err = y_pred - y_true
    standardized_bias = np.mean(err) / np.std(y_true)
    standardized_rmse = np.sqrt(np.mean(err * err)) / np.std(y_true)
    return standardized_bias, standardized_rmse

for prediction in [
    {"name": "cat", "predictions": predictions_cat},
    {"name": "num", "predictions": predictions_num},
    {"name": "agg", "predictions": predictions_agg},
]:
    name = prediction["name"]
    predictions = prediction["predictions"]
    standardized_bias, standardized_rmse = evaluate(predictions, y_val)
    print(f"{name}: std bias={standardized_bias:.3f}, std rmse={standardized_rmse:.3f}")

cat: std bias=-0.006, std rmse=0.486
num: std bias=-0.030, std rmse=0.520
agg: std bias=-0.011, std rmse=0.437


In [6]:

X_num_test = test[FLOAT_COLS].astype(float).fillna(0.)
X_cat_test = test[CATEGORICAL_COLS].astype(str).fillna("")

final_predictions_num = regr_num.predict(X_num_test)
final_predictions_cat = regr_cat.predict(X_cat_test)

final_predictions_agg = aggregate_predictions_inv(
    COVMAT,
    BIASES,
    [final_predictions_cat, final_predictions_num])

result = pd.DataFrame(
    index=test.index,
    columns=[RESPONSE],
    data=final_predictions_agg.T)

result.index.name = "Id"

filename = RESULTS_DIR / "submission.csv"
i = 0
while filename.exists():
    i += 1
    filename = RESULTS_DIR / f"submission_{i}.csv"
print(result)
result.to_csv(filename)

          SalePrice
Id                 
0     119772.479367
1     152419.100533
2     181710.942624
3     187818.427587
4     194800.017460
...             ...
1454   82169.150832
1455   85595.879704
1456  142465.351841
1457  111075.005139
1458  220658.371234

[1459 rows x 1 columns]
