# CatBoost for Predictions
This notebook attempts to build the best possible CatBoost classifier for the earthquake dataset.  

## Imports

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the third-party packages imported below.
# NOTE(review): versions are unpinned, so a fresh runtime may pull different releases — TODO pin.
%pip install -q optuna category-encoders catboost

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/365.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m358.4/365.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import f1_score

# Hyperparameter Optimization
import optuna

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer
from category_encoders import LeaveOneOutEncoder, TargetEncoder, CatBoostEncoder
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin

from pathlib import Path
import pickle
from os import PathLike
import torch

## Constants

In [4]:
# Project root, given relative to the current working directory.
BASE_DIR = Path('drive').joinpath('MyDrive', 'ml-competitions', 'gorkha_earthquake')

# Sub-directories for input data, stored models and generated submissions.
DATA_DIR = BASE_DIR.joinpath('data')
MODEL_DIR = BASE_DIR.joinpath('models')
SUBMISSION_DIR = BASE_DIR.joinpath('submissions')

# CSV files read from the data directory.
TRAINING_FEATURES_PATH = DATA_DIR.joinpath("train_values.csv")
TRAINING_LABELS_PATH = DATA_DIR.joinpath("train_labels.csv")
TEST_FEATURES_PATH = DATA_DIR.joinpath("test_values.csv")
SUBMISSION_FORMAT_PATH = DATA_DIR.joinpath("submission_format.csv")


## Data Loading

In [5]:
# Training features, indexed by the first CSV column.
features_df = pd.read_csv(TRAINING_FEATURES_PATH, index_col=0)

# Training labels, shifted down by one; predictions are shifted back up by one
# when the submission is written.
labels_df = pd.read_csv(TRAINING_LABELS_PATH, index_col=0).sub(1)

# Features of the rows to predict, indexed the same way.
test_features_df = pd.read_csv(TEST_FEATURES_PATH, index_col=0)

## Data Preprocessing

In [6]:
# Setup

# Geographic id columns, from the first to the third level.
geo_level_columns = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

# Columns handled as numerical features.
numerical_columns = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# Columns handled as categorical features.
categorical_columns = [
    'foundation_type',
    'ground_floor_type',
    'land_surface_condition',
    'legal_ownership_status',
    'other_floor_type',
    'plan_configuration',
    'position',
    'roof_type',
]

In [7]:

class DREncoder(torch.nn.Module):
    """Embed three geo-level id columns and compress them into one latent vector.

    Each of the three input columns has its own embedding table (16, 128 and
    128 wide). The embeddings are concatenated, passed through ReLU and then
    projected linearly to ``latent_dim`` values.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        lv1_width, lv2_width, lv3_width = 16, 128, 128
        # Sub-modules are created in this fixed order and under these names so
        # that the state-dict keys stay the same.
        self.geo_lv1_embedder = torch.nn.Embedding(geo_lv1_size, lv1_width)
        self.geo_lv2_embedder = torch.nn.Embedding(geo_lv2_size, lv2_width)
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, lv3_width)
        self.compressor = torch.nn.Linear(lv1_width + lv2_width + lv3_width, latent_dim)

    def forward(self, x):
        """Map a two-dimensional id tensor (columns 0, 1, 2) to latent vectors."""
        embedders = (
            self.geo_lv1_embedder,
            self.geo_lv2_embedder,
            self.geo_lv3_embedder,
        )
        # Column i of the input is looked up in the i-th embedding table.
        embedded = [embed(x[:, column]) for column, embed in enumerate(embedders)]
        hidden = torch.nn.functional.relu(torch.cat(embedded, dim=1))
        return self.compressor(hidden)


# Mixins are listed before BaseEstimator, as the scikit-learn developer guide
# asks, so that mixin behaviour takes precedence in the MRO.
class GeoDimensionReduction(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Scikit-learn transformer that applies a pre-trained ``DREncoder``.

    The encoder weights are loaded from ``path`` when the object is
    constructed; ``fit`` learns nothing from the data.

    Parameters
    ----------
    path : PathLike
        File holding the state dict for the ``DREncoder``.
    latent_dim : int
        Width of the vectors returned by ``transform``.
    geo_lv1_size, geo_lv2_size, geo_lv3_size : int
        Number of rows in the three embedding tables. These defaults are passed
        explicitly to ``DREncoder`` and differ from that class's own defaults
        for levels 2 and 3.
    device : str
        Torch device used both for loading the weights and for running the
        encoder.
    """

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv1_size: int=31,
            geo_lv2_size: int=1418,
            geo_lv3_size: int=11861, 
            device="cpu") -> None:
        super().__init__()
        self.path = path
        self.model = DREncoder(
            latent_dim, 
            geo_lv1_size,
            geo_lv2_size,
            geo_lv3_size
        )
        self.latent_dim = latent_dim
        self.geo_lv1_size = geo_lv1_size
        self.geo_lv2_size = geo_lv2_size
        self.geo_lv3_size = geo_lv3_size
        self.device = device
        self.model.load_state_dict(torch.load(path, map_location=device))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Record the output width and return ``self``; no parameters are learned."""
        # ClassNamePrefixFeaturesOutMixin.get_feature_names_out reads this attribute.
        self._n_features_out = self.latent_dim
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Return the latent vectors for ``X`` as a NumPy array.

        ``X`` must hold integer ids in three columns (geo level 1, 2, 3).
        """
        # Accept DataFrames as well as arrays.
        ids = np.asarray(X)
        # Run on the requested device; ``.to`` is a no-op when already there.
        model = self.model.to(self.device)
        model.eval()
        ids = torch.from_numpy(ids).type(torch.long).to(self.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            latent = model(ids)
        # ``.cpu()`` is required before ``.numpy()`` when the device is a GPU.
        return latent.cpu().numpy()
    
class RollUpGeoLv3Encoder(torch.nn.Module):
    """Embed a single geo-level-3 id column and compress it to a latent vector.

    The id is looked up in a 128-wide embedding table, passed through ReLU and
    projected linearly to ``latent_dim`` values.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        embedding_width = 128
        # Names and creation order are kept so the state-dict keys stay the same.
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, embedding_width)
        self.compressor = torch.nn.Linear(embedding_width, latent_dim)

    def forward(self, x):
        """Map an id tensor to latent vectors, dropping axis 1 after the lookup."""
        embedded = self.geo_lv3_embedder(x)
        # Remove axis 1 of the lookup result when it has size one.
        flattened = embedded.squeeze(1)
        activated = torch.nn.functional.relu(flattened)
        return self.compressor(activated)


# Mixins are listed before BaseEstimator, as the scikit-learn developer guide
# asks, so that mixin behaviour takes precedence in the MRO.
class GeoLv3Rollup(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Scikit-learn transformer that applies a pre-trained ``RollUpGeoLv3Encoder``.

    The encoder weights are loaded from ``path`` when the object is
    constructed; ``fit`` learns nothing from the data.

    Parameters
    ----------
    path : PathLike
        File holding the state dict for the ``RollUpGeoLv3Encoder``.
    latent_dim : int
        Width of the vectors returned by ``transform``.
    geo_lv3_size : int
        Number of rows in the embedding table.
    device : str
        Torch device used both for loading the weights and for running the
        encoder.
    """

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv3_size: int=11861,
            device="cpu") -> None:
        super().__init__()
        self.path = path
        self.model = RollUpGeoLv3Encoder(
            latent_dim, 
            geo_lv3_size
        )
        self.latent_dim = latent_dim
        self.geo_lv3_size = geo_lv3_size
        self.device = device
        self.model.load_state_dict(torch.load(path, map_location=device))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Record the output width and return ``self``; no parameters are learned."""
        # ClassNamePrefixFeaturesOutMixin.get_feature_names_out reads this attribute.
        self._n_features_out = self.latent_dim
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Return the latent vectors for the integer ids in ``X`` as a NumPy array."""
        # Accept DataFrames as well as arrays.
        ids = np.asarray(X)
        # Run on the requested device; ``.to`` is a no-op when already there.
        model = self.model.to(self.device)
        model.eval()
        ids = torch.from_numpy(ids).type(torch.long).to(self.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            latent = model(ids)
        # ``.cpu()`` is required before ``.numpy()`` when the device is a GPU.
        return latent.cpu().numpy()

In [8]:
# Load the stored encoders for the three geo levels.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
def _load_pickle(file_path):
    """Return the object unpickled from ``file_path``."""
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


le1 = _load_pickle(MODEL_DIR / 'geo-lv-1-label-encoder.pickle')
le2 = _load_pickle(MODEL_DIR / 'geo-lv-2-label-encoder.pickle')
le3 = _load_pickle(MODEL_DIR / 'geo-lv-3-label-encoder.pickle')


# Prepare Transformers
def _label_encoding_transformer(encoder):
    """Wrap ``encoder`` in a FunctionTransformer that emits one encoded column."""
    def encode(x):
        # Flatten the single input column, encode it, and restore the column shape.
        return np.array(encoder.transform(x.values.ravel())).reshape(-1, 1)

    return FunctionTransformer(func=encode, feature_names_out='one-to-one')


geo_lv1_le = _label_encoding_transformer(le1)
geo_lv2_le = _label_encoding_transformer(le2)
geo_lv3_le = _label_encoding_transformer(le3)

# Device passed to the embedding transformers below.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Dimension reduction: label-encode all three geo levels, then embed them jointly.
geo_dim_reduction_preprocessor = ColumnTransformer(
    transformers=[
        ('geo1_le', geo_lv1_le, ['geo_level_1_id']),
        ('geo2_le', geo_lv2_le, ['geo_level_2_id']),
        ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)
# set_output returns the transformer itself, so it can be applied in place.
geo_dim_reduction_preprocessor.set_output(transform='pandas')

geo_dim_reduction_pipe = Pipeline(steps=[
    ('label_encoder', geo_dim_reduction_preprocessor),
    ('embedder', GeoDimensionReduction(
        path=MODEL_DIR / 'dim-reduction-32',
        latent_dim=32,
        device=DEVICE,
    )),
])

# Rollup: label-encode geo level 3 only, then embed it on its own.
geo3_rollup_preprocessor = ColumnTransformer(
    transformers=[
        ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)
geo3_rollup_preprocessor.set_output(transform='pandas')

geo_rollup_pipe = Pipeline(steps=[
    ('label_encoder', geo3_rollup_preprocessor),
    ('embedder', GeoLv3Rollup(
        path=MODEL_DIR / 'geo3-rollup-16',
        device=DEVICE,
    )),
])

# Casts the selected columns to pandas' category dtype.
_category_caster = FunctionTransformer(
    lambda x: x.astype('category'),
    feature_names_out='one-to-one',
)

# The category block is listed first so that the leading output columns are the
# categorical ones; the classifier's `cat_features` indices rely on that order.
# Disabled variants (log(1+x) on the numerical columns, CatBoostEncoder on the
# geo columns, pandas output) are recorded with their scores in the notes below.
preprocessor = ColumnTransformer(
    transformers=[
        ('category', _category_caster, categorical_columns + geo_level_columns),
        ('geo_dim_reduction', geo_dim_reduction_pipe, geo_level_columns),
        ('geo_rollup', geo_rollup_pipe, geo_level_columns),
    ],
    remainder='passthrough',
)

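See scikit-learn's notes on the security and maintainability limitations of pickle-based model persistence: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations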


## Evaluate Model

In [None]:
# The categorical columns are the first outputs of the preprocessor, so their
# positions are simply 0..n-1.
n_categorical = len(categorical_columns + geo_level_columns)

hyperparams = {
    'task_type': "GPU",
    'logging_level': 'Silent',
    'random_state': 69,
    'cat_features': list(range(n_categorical)),
}
clf = CatBoostClassifier(**hyperparams)

# Preprocessing and classifier are cross-validated together as one estimator.
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])

# Five stratified folds, scored with micro-averaged F1.
results = cross_val_score(
    estimator=pipe,
    X=features_df,
    y=labels_df.to_numpy().squeeze(),
    scoring='f1_micro',
    cv=StratifiedKFold(n_splits=5),
    verbose=100,
)

print(f'{results.mean():.5f}')

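Scores (mean micro-F1 from the 5-fold cross-validation above; each score is followed by the `preprocessor` variant that produced it):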

0.74893 - 
```
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', FunctionTransformer(lambda x: x.astype('category'), feature_names_out='one-to-one'), categorical_columns),
        # ('bool', FunctionTransformer(lambda x: np.log(1+x), feature_names_out='one-to-one'), numerical_columns),
        ('geo_dim_reduction', geo_dim_reduction_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
        ('geo_rollup', geo_rollup_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
        ('geos', CatBoostEncoder(cols=geo_level_columns), ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
    ],
    remainder='passthrough'
)
```
0.74918 -
```
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', FunctionTransformer(lambda x: x.astype('category'), feature_names_out='one-to-one'), categorical_columns),
        ('bool', FunctionTransformer(lambda x: np.log(1+x), feature_names_out='one-to-one'), numerical_columns),
        ('geo_dim_reduction', geo_dim_reduction_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
        ('geo_rollup', geo_rollup_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
        ('geos', CatBoostEncoder(cols=geo_level_columns), ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
    ],
    remainder='passthrough'
)
```
0.74925 -
```
preprocessor = ColumnTransformer(
    transformers=[
        ('category', FunctionTransformer(
            lambda x: x.astype('category'), 
            feature_names_out='one-to-one'), 
         categorical_columns + geo_level_columns
        ),
        ('numeric', FunctionTransformer(
              lambda x: np.log(1+x), 
              feature_names_out='one-to-one'), 
          numerical_columns
        ),
        ('geo_dim_reduction', geo_dim_reduction_pipe, geo_level_columns),
        ('geo_rollup', geo_rollup_pipe, geo_level_columns),
        # ('geos', CatBoostEncoder(cols=geo_level_columns), geo_level_columns),
    ],
    remainder='passthrough'
)
```

In [77]:
from typing import Literal
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.model_selection import KFold, StratifiedKFold

# ClassifierMixin is listed before BaseEstimator, as the scikit-learn developer
# guide asks, so that the classifier behaviour takes precedence in the MRO.
class CVBaggingClassifier(ClassifierMixin, BaseEstimator):
    """Weighted ensemble of clones of one estimator, one clone per CV fold.

    Each clone is trained on the training part of a fold and scored with
    micro-averaged F1 on the held-out part. The normalised scores become the
    weights used to average the clones' predicted probabilities.

    Parameters
    ----------
    estimator : BaseEstimator
        Template estimator; it is cloned once per fold and never fitted itself.
    cv : KFold
        Splitter providing ``split(X, y)``.
    num_classes : int
        Number of probability columns every fitted clone returns. ``predict``
        returns column indices, so labels are expected to be
        ``0 .. num_classes - 1``.
    """

    def __init__(self, estimator: BaseEstimator, cv: KFold, num_classes: int):
        self.cv = cv
        self.num_classes = num_classes
        self.estimator = estimator

    @staticmethod
    def _take_rows(X, indices):
        """Select rows by position from a DataFrame or an array-like."""
        return X.iloc[indices] if hasattr(X, 'iloc') else X[indices]

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        """Fit one clone per fold, derive the ensemble weights, return ``self``."""
        # Positional indexing below must not depend on a pandas index.
        y = np.asarray(y)

        estimators = []
        scores = []
        for train_idx, valid_idx in self.cv.split(X, y):
            model = clone(self.estimator)
            model.fit(self._take_rows(X, train_idx), y[train_idx])
            fold_pred = model.predict(self._take_rows(X, valid_idx))
            scores.append(f1_score(y[valid_idx], fold_pred, average='micro'))
            estimators.append(model)

        weights = np.asarray(scores, dtype=float)
        total = weights.sum()
        if total > 0:
            weights = weights / total
        else:
            # Every fold scored 0: fall back to equal weights instead of dividing by zero.
            weights = np.full(len(scores), 1.0 / len(scores))

        self.estimators = estimators
        self.weights = weights
        # predict() returns argmax column indices, i.e. these labels.
        self.classes_ = np.arange(self.num_classes)
        return self

    def predict_proba(self, X, *_):
        """Return the weighted average of the clones' class probabilities."""
        y_pred = np.zeros((X.shape[0], self.num_classes))
        for estimator, weight in zip(self.estimators, self.weights):
            y_pred += estimator.predict_proba(X) * weight
        return y_pred

    def predict(self, X, *_):
        """Return the index of the most probable class for every row of ``X``."""
        return self.predict_proba(X).argmax(axis=1)
    

In [78]:
# Cast the categorical and geo id columns to pandas' category dtype.
to_category = FunctionTransformer(
    lambda x: x.astype('category'),
    feature_names_out='one-to-one',
)

# Category casts come first, followed by the two geo embeddings; every
# remaining column passes through unchanged.
preprocessor = ColumnTransformer(
    [
        ('category', to_category, categorical_columns + geo_level_columns),
        ('geo_dim_reduction', geo_dim_reduction_pipe, geo_level_columns),
        ('geo_rollup', geo_rollup_pipe, geo_level_columns),
    ],
    remainder='passthrough',
)

# The categorical columns occupy the first positions of the preprocessor output.
n_cat_columns = len(categorical_columns + geo_level_columns)
hyperparams = {
    'task_type': "GPU",
    'logging_level': 'Silent',
    'random_state': 69,
    'cat_features': list(range(n_cat_columns)),
}

clf = CatBoostClassifier(**hyperparams)

pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])

In [82]:
# Hold-out evaluation of the bagged ensemble.
bg_clf = CVBaggingClassifier(
    estimator=pipe, 
    cv=StratifiedKFold(10, shuffle=True, random_state=69), 
    num_classes=3
)

# A fixed random_state keeps the hold-out split identical between runs, so
# scores of different ensemble variants are measured on the same rows.
# NOTE(review): the scores recorded below were produced without a seed and
# will not reproduce exactly.
X_train, X_valid, y_train, y_valid = train_test_split(
    features_df,
    labels_df.values.ravel(),
    stratify=labels_df.values.ravel(),
    random_state=69,
)

bg_clf.fit(X_train, y_train)
f1_score(y_valid, bg_clf.predict(X_valid), average='micro')

0.7533115378121594

0.751454313824807

Equal Weights 0.7512087304876364  
F1 Weighted 0.751454313824807  
F1 Weighted Stratified K-Fold 0.7533115378121594  

In [81]:
from os import PathLike
import pandas as pd


def create_submission(predictions, submission_formats_path: PathLike):
    """Build a submission frame shaped like the submission-format CSV.

    The format file (first column used as index) supplies the index and the
    column names; ``predictions`` supplies the values. The ``damage_grade``
    column is cast to ``int``.
    """
    template = pd.read_csv(submission_formats_path, index_col=0)
    filled = pd.DataFrame(predictions, index=template.index, columns=template.columns)
    return filled.astype({'damage_grade': int})

# Train the bagged ensemble on the full training set and write the submission.
# A StratifiedKFold-based ensemble used to be constructed here and was then
# immediately overwritten by the KFold-based one; only the effective assignment
# is kept.
# NOTE(review): the hold-out notes above list the stratified variant as the
# best scorer — confirm which splitter the submission should use.
bg_clf = CVBaggingClassifier(pipe, cv=KFold(10, shuffle=True, random_state=69), num_classes=3)
bg_clf.fit(features_df, labels_df.values.ravel())

# Labels were shifted down by one when loaded, so shift the predictions back.
submission = create_submission(bg_clf.predict(test_features_df) + 1, submission_formats_path=SUBMISSION_FORMAT_PATH)

# to_csv does not create missing directories.
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)
submission.to_csv(SUBMISSION_DIR / "catboost-DR-32-RU-16-bagged-f1weighted.csv")

## Hyperparameter Optimization

### Objective Function

In [68]:
# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial: optuna.Trial, X_train: pd.DataFrame, y_train: np.ndarray, X_valid: pd.DataFrame, y_valid: np.ndarray):
    """Train one CatBoost model for ``trial`` and return its validation micro-F1.

    Parameters
    ----------
    trial : optuna.Trial
        Supplies the sampled ``learning_rate`` and ``depth``.
    X_train, y_train
        Preprocessed training features and labels.
    X_valid, y_valid
        Preprocessed validation features and labels; also used as the
        early-stopping evaluation set.

    Returns
    -------
    float
        Micro-averaged F1 of the fitted model on the validation data.
    """
    hyperparams = {
        'task_type': "GPU",
        'logging_level': 'Silent',
        'random_state': 69,
        'cat_features': list(range(len(categorical_columns + geo_level_columns))),

        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "depth": trial.suggest_int("depth", 4, 10),
        # Search dimensions currently switched off:
        #   colsample_bylevel  float in [0.01, 0.1]
        #   l2_leaf_reg        int   in [2, 10]
        #   random_strength    float in [0, 10]
    }

    # No pruning callback is created: it was never passed to fit(), CatBoost
    # does not support user callbacks with task_type="GPU", and recent Optuna
    # releases need the separate optuna-integration package to construct it.
    clf = CatBoostClassifier(**hyperparams)

    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=False,
    )
    results = f1_score(y_valid, clf.predict(X_valid), average='micro')
    return float(results)

### Study

In [None]:
from sklearn.model_selection import train_test_split


# The study is stored in a local SQLite file and resumed if it already exists.
# NOTE(review): the objective never reports intermediate values, so the
# MedianPruner presumably has nothing to act on — confirm before relying on it.
study = optuna.create_study(
    study_name='catboost-study', 
    storage='sqlite:///catboost.db', 
    load_if_exists=True,
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), 
)

# A fixed random_state keeps the hold-out split identical across runs, so
# trials added to a resumed study are scored on the same validation rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    features_df,
    labels_df.values.ravel(),
    stratify=labels_df.values.ravel(),
    random_state=69,
)

# Preprocess once up front; every trial reuses the same matrices.
X_train = preprocessor.fit_transform(X_train, y_train)
X_valid = preprocessor.transform(X_valid)
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), 
    n_trials=100)

In [None]:
# Show the best trial's score, then display its parameters as the cell output.
best_trial = study.best_trial
print(f"Best score: {best_trial.value}")
best_trial.params

### Evaluate Best Model

## Submission