In [10]:
# Target mean encoding
# Encode x with P(Y | X = x), so it includes some target information

import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.base import BaseEstimator, TransformerMixin

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y):
        # Require X to be a pandas dataframe
        self.encodings = {}
        for col in self.columns:
            new_X = pd.concat([X, y], axis=1)
            mean_encoded = new_X.groupby(col)[y.name].mean()
            self.encodings[col] = mean_encoded.to_dict()
            # frequencies = X[col].value_counts(dropna=False, normalize=True)
            # self.encodings[col] = frequencies.to_dict()
        return self

    def transform(self, X):
        # Apply the frequency encoding to each column
        X_transformed = X.copy()
        for col in self.columns:
            # Map the frequencies, using .get to handle missing categories; encode NaN explicitly as -1
            X_transformed[col] = X[col].apply(lambda x: self.encodings[col].get(x, -1)).astype(float).fillna(-1)
        return X_transformed

In [20]:
def prepare_dataset(df, one_hot=False, fillna=True):
    # Encode categorical features with one hot, and replace NA with "Missing"
    # With one hot encoding support
    categorical_columns = df.select_dtypes(include=['object']).columns
    for key in categorical_columns:
        if fillna:
            df[key] = df[key].astype('category').cat.add_categories("Missing").fillna("Missing")
        else:
            df[key] = df[key].astype('category')
    if not one_hot:
        return df
    
    categorical_columns = df.select_dtypes(include=['category']).columns
    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(df[categorical_columns])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns.to_list()))
    df = pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)
    return df

In [30]:
train_file_path = "data/house-prices/train.csv"
dataset_df = pd.read_csv(train_file_path)
dataset_df = dataset_df.drop('Id', axis=1)
target_column = 'SalePrice'
dataset_df = prepare_dataset(dataset_df, fillna=True)
y = dataset_df.pop(target_column)

# Apply target mean encoding
target_mean_encoder = TargetMeanEncoder(dataset_df.select_dtypes(include=['category']).columns.to_list())
target_mean_encoder.fit(dataset_df, y)
dataset_df = target_mean_encoder.transform(dataset_df)
dataset_df.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,191004.994787,65.0,8450,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,2,2008,173401.836622,175202.219533
1,20,191004.994787,80.0,9600,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,177934.574468,...,0,0,180404.663455,187596.837998,182046.410384,0,5,2007,173401.836622,175202.219533
2,60,191004.994787,68.0,11250,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,9,2008,173401.836622,175202.219533


In [46]:
# Grid Search
from sklearn.model_selection import GridSearchCV

# Define the model
model = CatBoostRegressor()

# Set up the parameter grid
param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1],
    'iterations': [500, 1000]
}

# Configure GridSearchCV
# When cv=None, default is 5-fold cross validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           cv=None, scoring='neg_mean_squared_error', verbose=2)

# Fit GridSearchCV
grid_search.fit(dataset_df, y, verbose=200)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
0:	learn: 80103.5933022	total: 4.83ms	remaining: 2.41s
200:	learn: 34034.6389040	total: 313ms	remaining: 465ms
400:	learn: 24647.6812363	total: 638ms	remaining: 157ms
499:	learn: 22618.7420197	total: 754ms	remaining: 0us
[CV] END ........depth=4, iterations=500, learning_rate=0.01; total time=   0.8s
0:	learn: 78511.7333316	total: 6.66ms	remaining: 3.32s
200:	learn: 33482.4918013	total: 248ms	remaining: 369ms
400:	learn: 24199.3529531	total: 602ms	remaining: 149ms
499:	learn: 22301.9375708	total: 775ms	remaining: 0us
[CV] END ........depth=4, iterations=500, learning_rate=0.01; total time=   0.9s
0:	learn: 76878.1769580	total: 9.94ms	remaining: 4.96s
200:	learn: 32971.2740881	total: 333ms	remaining: 495ms
400:	learn: 24046.6793153	total: 645ms	remaining: 159ms
499:	learn: 22083.4436745	total: 758ms	remaining: 0us
[CV] END ........depth=4, iterations=500, learning_rate=0.01; total time=   0.8s
0:	learn: 80766.9143854	total: 2

In [47]:
# Initialize CatBoostRegressor
model = CatBoostRegressor(
    # cat_features=categorical_columns.to_list(),
    verbose=200,
    depth=6,
    iterations=4000,
    learning_rate=0.03,
)

# Train the model
model.fit(dataset_df, y)

0:	learn: 77967.5878978	total: 11.7ms	remaining: 46.9s
200:	learn: 18614.0104039	total: 468ms	remaining: 8.85s
400:	learn: 14488.9763256	total: 1.02s	remaining: 9.2s
600:	learn: 11620.3199021	total: 1.53s	remaining: 8.69s
800:	learn: 9693.9819328	total: 2.06s	remaining: 8.22s
1000:	learn: 8293.6791266	total: 3.02s	remaining: 9.05s
1200:	learn: 7244.3281424	total: 3.46s	remaining: 8.06s
1400:	learn: 6369.5198586	total: 3.95s	remaining: 7.33s
1600:	learn: 5605.1456507	total: 4.56s	remaining: 6.83s
1800:	learn: 5004.2358738	total: 5.19s	remaining: 6.33s
2000:	learn: 4474.4077560	total: 6.3s	remaining: 6.29s
2200:	learn: 4003.0588562	total: 7.29s	remaining: 5.96s
2400:	learn: 3591.9676977	total: 7.73s	remaining: 5.15s
2600:	learn: 3229.2612186	total: 8.37s	remaining: 4.5s
2800:	learn: 2914.2544037	total: 8.81s	remaining: 3.77s
3000:	learn: 2634.2680809	total: 9.17s	remaining: 3.05s
3200:	learn: 2382.4211922	total: 9.78s	remaining: 2.44s
3400:	learn: 2164.7749692	total: 10.4s	remaining: 1.8

<catboost.core.CatBoostRegressor at 0x7f595191df10>

In [48]:
feature_importances = model.get_feature_importance()
print(pd.DataFrame({'Feature': dataset_df.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False))

         Feature  Importance
16   OverallQual   17.606288
45     GrLivArea   11.742681
11  Neighborhood    7.637292
33    BsmtFinSF1    4.481826
3        LotArea    4.319600
..           ...         ...
71        PoolQC    0.007019
5          Alley    0.006610
4         Street    0.002425
38       Heating    0.002069
8      Utilities    0.000296

[79 rows x 2 columns]


In [49]:
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

test_data = prepare_dataset(test_data, fillna=False)

# Apply frequency encoding
test_data= target_mean_encoder.transform(test_data)
test_data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,131558.375,80.0,11622,181130.538514,-1.0,164754.818378,180183.746758,180950.95682,176938.047529,...,120,0,-1.0,148751.089172,-1.0,0,6,2010,173401.836622,175202.219533
1,20,191004.994787,81.0,14267,181130.538514,-1.0,206101.665289,180183.746758,180950.95682,181623.425856,...,0,0,-1.0,-1.0,170750.0,12500,6,2010,173401.836622,175202.219533
2,60,191004.994787,74.0,13830,181130.538514,-1.0,206101.665289,180183.746758,180950.95682,176938.047529,...,0,0,-1.0,148751.089172,-1.0,0,3,2010,173401.836622,175202.219533


In [51]:
sample_submission_df = pd.read_csv('data/house-prices/sample_submission.csv')
sample_submission_df['SalePrice'] = model.predict(test_data)
sample_submission_df.to_csv('working/catboost-cv-target-iter4000.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120334.059544
1,1462,161813.79753
2,1463,189527.006337
3,1464,196146.848975
4,1465,183771.587064
