In [9]:
# Frequency encoding

import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.base import BaseEstimator, TransformerMixin

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode categories by their relative frequency in the fitted data.

    Parameters
    ----------
    columns : list of str or None
        Names of the columns to encode. When None, every column of object or
        category dtype found in the frame passed to ``fit`` is encoded.

    Attributes
    ----------
    encodings : dict
        Set by ``fit``. Maps each encoded column name to a
        ``{category: frequency}`` dict. Frequencies are normalised over all
        rows, missing values included, so a NaN entry may be present.

    Notes
    -----
    ``transform`` encodes both missing values and categories that were not
    seen during ``fit`` as -1, regardless of the column dtype.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode; None -> inferred in fit

    def fit(self, X, y=None):
        """Learn the per-column category frequencies from ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        if self.columns is None:
            columns = X.select_dtypes(include=['object', 'category']).columns.to_list()
        else:
            columns = list(self.columns)

        self.encodings = {}
        for col in columns:
            # NaN is counted so that the frequencies of the real categories
            # are relative to the total number of rows.
            frequencies = X[col].value_counts(dropna=False, normalize=True)
            self.encodings[col] = frequencies.to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns frequency-encoded.

        Raises ``AttributeError`` if called before ``fit`` and ``KeyError`` if
        a fitted column is absent from ``X``.
        """
        X_transformed = X.copy()
        for col, encoding in self.encodings.items():
            # Drop the NaN key: a dict lookup of NaN depends on object identity,
            # which made the encoding of missing values dtype-dependent.
            known = {key: freq for key, freq in encoding.items() if pd.notna(key)}
            # Going through object dtype sidesteps the category-level mapping
            # rules of categorical columns; anything absent from `known`
            # (NaN or unseen category) comes back as NaN and is set to -1.
            X_transformed[col] = (
                X[col].astype(object).map(known).astype(float).fillna(-1.0)
            )
        return X_transformed

In [10]:
def prepare_dataset(df, one_hot=False, fillna=True):
    """Convert object columns to categoricals and optionally one-hot encode them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a processed copy is returned.
    one_hot : bool, default False
        When True, every category-dtype column is replaced by one-hot
        indicator columns produced by ``OneHotEncoder``.
    fillna : bool, default True
        When True, missing values in the converted columns are replaced by
        the category "Missing". When False, NaN values are left in place.

    Returns
    -------
    pandas.DataFrame
        The processed frame, with the same index as ``df``.
    """
    df = df.copy()  # keep the caller's frame untouched

    object_columns = df.select_dtypes(include=['object']).columns
    for key in object_columns:
        column = df[key].astype('category')
        if fillna:
            # add_categories raises if the category already exists, which
            # happens when the data itself contains the value "Missing".
            if "Missing" not in column.cat.categories:
                column = column.cat.add_categories("Missing")
            column = column.fillna("Missing")
        df[key] = column

    if not one_hot:
        return df

    categorical_columns = df.select_dtypes(include=['category']).columns
    if len(categorical_columns) == 0:
        # Nothing to encode.
        return df

    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(df[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_columns.to_list()),
        index=df.index,  # align with df so concat does not introduce NaN rows
    )
    return pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)

In [11]:
# Read the training set and drop the Id column
train_file_path = "data/house-prices/train.csv"
target_column = 'SalePrice'
dataset_df = pd.read_csv(train_file_path).drop('Id', axis=1)

# Object columns become categoricals; NaN values are kept (fillna=False)
dataset_df = prepare_dataset(dataset_df, fillna=False)

# Separate the target from the features
y = dataset_df.pop(target_column)

# Fit the frequency encoder on the categorical columns, then encode them
categorical_feature_names = dataset_df.select_dtypes(include=['category']).columns.to_list()
freq_encoder = FrequencyEncoder(categorical_feature_names)
freq_encoder.fit(dataset_df)
dataset_df = freq_encoder.transform(dataset_df)
dataset_df.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,0.788356,65.0,8450,0.99589,-1.0,0.633562,0.897945,0.999315,0.720548,...,0,0,-1.0,-1.0,-1.0,0,2,2008,0.867808,0.820548
1,20,0.788356,80.0,9600,0.99589,-1.0,0.633562,0.897945,0.999315,0.032192,...,0,0,-1.0,-1.0,-1.0,0,5,2007,0.867808,0.820548
2,60,0.788356,68.0,11250,0.99589,-1.0,0.331507,0.897945,0.999315,0.720548,...,0,0,-1.0,-1.0,-1.0,0,9,2008,0.867808,0.820548


In [13]:
# Hyper-parameter search over tree depth and learning rate
from sklearn.model_selection import GridSearchCV

# Estimator whose parameters are searched
model = CatBoostRegressor()

# Candidate values: 4 depths x 4 learning rates, with a fixed iteration count
param_grid = dict(
    depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    iterations=[2000],
)

# cv=None selects scikit-learn's default of 5-fold cross validation
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=None,
    verbose=2,
)

# verbose=200 is forwarded to the estimator's fit call
grid_search.fit(dataset_df, y, verbose=200)

# The score is a negated MSE, so flip the sign before taking the square root
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
0:	learn: 80159.6146033	total: 55.8ms	remaining: 1m 51s
200:	learn: 35409.5338369	total: 305ms	remaining: 2.73s
400:	learn: 26092.6323684	total: 535ms	remaining: 2.13s
600:	learn: 22343.4575102	total: 749ms	remaining: 1.74s
800:	learn: 20051.8232089	total: 960ms	remaining: 1.44s
1000:	learn: 18519.2748939	total: 1.41s	remaining: 1.41s
1200:	learn: 17446.6305283	total: 1.68s	remaining: 1.12s
1400:	learn: 16524.0261016	total: 1.89s	remaining: 810ms
1600:	learn: 15659.3799764	total: 2.1s	remaining: 523ms
1800:	learn: 14928.4004540	total: 2.31s	remaining: 256ms
1999:	learn: 14269.1103835	total: 2.52s	remaining: 0us
[CV] END .......depth=4, iterations=2000, learning_rate=0.01; total time=   2.7s
0:	learn: 78548.0455479	total: 5.09ms	remaining: 10.2s
200:	learn: 34369.8447128	total: 417ms	remaining: 3.73s
400:	learn: 25164.4167366	total: 646ms	remaining: 2.58s
600:	learn: 21836.9663217	total: 849ms	remaining: 1.98s
800:	learn: 1996

In [14]:
# Settings for the final regressor.
# cat_features is deliberately not passed (it was commented out in the
# constructor call): the model is fitted on the frequency-encoded dataset_df.
catboost_params = {
    'verbose': 200,
    'depth': 6,
    'iterations': 5000,
    'learning_rate': 0.1,
}
model = CatBoostRegressor(**catboost_params)

# Fit on the full training set
model.fit(dataset_df, y)

0:	learn: 74743.2006185	total: 4.09ms	remaining: 20.5s
200:	learn: 11092.8869642	total: 456ms	remaining: 10.9s
400:	learn: 6495.1309476	total: 888ms	remaining: 10.2s
600:	learn: 4478.3946063	total: 1.23s	remaining: 9.01s
800:	learn: 3028.2066617	total: 1.73s	remaining: 9.07s
1000:	learn: 2171.6745407	total: 2.23s	remaining: 8.89s
1200:	learn: 1568.6269405	total: 2.58s	remaining: 8.15s
1400:	learn: 1150.6230630	total: 2.92s	remaining: 7.51s
1600:	learn: 854.5225657	total: 3.31s	remaining: 7.03s
1800:	learn: 644.4384279	total: 3.7s	remaining: 6.57s
2000:	learn: 492.2263700	total: 4.13s	remaining: 6.2s
2200:	learn: 379.6572582	total: 4.53s	remaining: 5.76s
2400:	learn: 288.0842573	total: 4.95s	remaining: 5.36s
2600:	learn: 227.9633542	total: 5.4s	remaining: 4.98s
2800:	learn: 176.9350821	total: 5.79s	remaining: 4.55s
3000:	learn: 139.0410035	total: 6.17s	remaining: 4.11s
3200:	learn: 107.4489067	total: 6.88s	remaining: 3.87s
3400:	learn: 84.3362876	total: 7.24s	remaining: 3.4s
3600:	learn

<catboost.core.CatBoostRegressor at 0x7fd9d58ed4d0>

In [15]:
# Pair each feature name with its importance and list them from highest to lowest
feature_importances = model.get_feature_importance()
importance_table = pd.DataFrame({'Feature': dataset_df.columns, 'Importance': feature_importances})
print(importance_table.sort_values(by='Importance', ascending=False))

        Feature  Importance
16  OverallQual   20.307890
45    GrLivArea   13.592415
42     1stFlrSF    5.841140
33   BsmtFinSF1    4.721947
37  TotalBsmtSF    3.988734
..          ...         ...
74      MiscVal    0.003778
21     RoofMatl    0.002275
38      Heating    0.001621
4        Street    0.000035
8     Utilities    0.000002

[79 rows x 2 columns]


In [16]:
# Read the test set and pull the Id column out of the feature frame
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

# Same preparation as the training data, then encode with the already fitted encoder
test_data = freq_encoder.transform(prepare_dataset(test_data, fillna=False))
test_data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,0.010959,80.0,11622,0.99589,-1.0,0.633562,0.897945,0.999315,0.720548,...,120,0,-1.0,0.107534,-1.0,0,6,2010,0.867808,0.820548
1,20,0.788356,81.0,14267,0.99589,-1.0,0.331507,0.897945,0.999315,0.180137,...,0,0,-1.0,-1.0,0.00137,12500,6,2010,0.867808,0.820548
2,60,0.788356,74.0,13830,0.99589,-1.0,0.331507,0.897945,0.999315,0.720548,...,0,0,-1.0,0.107534,-1.0,0,3,2010,0.867808,0.820548


In [18]:
# Use the sample submission as the template for the output file
sample_submission_df = pd.read_csv('data/house-prices/sample_submission.csv')

# Predictions are assigned by position, so the template rows must be in the
# same order as the test rows they were predicted from.
if sample_submission_df['Id'].tolist() != ids.tolist():
    raise ValueError("sample_submission.csv Ids do not match the Ids of test.csv")

sample_submission_df['SalePrice'] = model.predict(test_data)
sample_submission_df.to_csv('working/catboost-cv-freq-na-1-iter5000.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,122118.179563
1,1462,167993.4843
2,1463,192027.140258
3,1464,192401.291172
4,1465,180398.154761
